In [1]:
import requests
import lxml
import pandas as pd
from bs4 import BeautifulSoup, SoupStrainer
import re

In [2]:
def load_to_csv(df: pd.DataFrame, path: str = 'dataset.csv') -> None:
    """Write ``df`` to a CSV file, leaving out the index column.

    Args:
        df: The frame to persist.
        path: Destination file. Defaults to ``dataset.csv`` (resolved against
            the current working directory), which is what this function
            always wrote to before the parameter existed.
    """
    df.to_csv(path, index=False)

In [3]:
def ranges_splitter(ranges):
    """Split a ``"<low> .. <high>"`` range string into its two bounds.

    Every whitespace character (spaces, tabs, newlines, non-breaking spaces)
    and every comma is removed, so ``"50,000 .. 100,000"`` becomes
    ``("50000", "100000")``.

    Args:
        ranges: Text containing exactly one ``..`` separator.

    Returns:
        A ``(low, high)`` tuple of strings; converting them to numbers is
        left to the caller.

    Raises:
        ValueError: If ``ranges`` does not contain exactly one ``..``.
    """
    # str.split() with no argument splits on any Unicode whitespace, which
    # also covers the non-breaking spaces that scraped HTML tends to contain.
    compact = "".join(ranges.split())
    # Named low/high rather than min/max so the builtins are not shadowed.
    low, high = compact.split('..')
    return low.replace(",", ""), high.replace(",", "")

In [11]:
def get_data_from_steam(steam_store_link: str, timeout: float = 30.0):
    """Read the user-review summary from a Steam store page.

    Only the ``<div id="userReviews">`` part of the page is parsed. Inside
    it, the ``<div class="summary column">`` elements are inspected: when
    there is exactly one it is used, otherwise the second one is used
    (presumably the all-time summary that follows a "recent" one -- TODO
    confirm against a live page).

    Args:
        steam_store_link: URL of the store page.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A ``(review_perc, max_reviewers)`` tuple of two ints.
        ``review_perc`` is the whole-number percentage found in the third
        ``<span>`` (0 when that span is absent or has no percentage).
        ``max_reviewers`` is built from every digit in the second
        ``<span>``. ``(0, 0)`` is returned when the page says
        ``No user reviews`` or carries no review summary at all.

    Raises:
        requests.RequestException: On connection problems, timeouts or a
            4xx/5xx response.
    """
    only_reviews = SoupStrainer('div', attrs={"id": "userReviews"})
    # Without a timeout a single stalled connection would hang the whole crawl.
    response = requests.get(steam_store_link, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml', parse_only=only_reviews)
    summary_columns = soup("div", attrs={"class": "summary column"})

    # Pages without any review block used to crash on summary_columns[1].
    if not summary_columns:
        return 0, 0

    if len(summary_columns) == 1:
        summary_text = summary_columns[0].get_text()
        if " ".join(summary_text.split()) == 'No user reviews':
            return 0, 0
        spans = summary_columns[0].find_all("span")
    else:
        spans = summary_columns[1].find_all("span")

    if len(spans) < 2:
        return 0, 0

    # Keep digits only: thousands separators of any kind (",", "'", ".")
    # are dropped, and the count is returned as an int so that both return
    # paths of this function yield the same types.
    digits = ''.join(re.findall(r"\d+", spans[1].get_text()))
    max_reviewers = int(digits) if digits else 0

    review_perc = 0
    if len(spans) == 3:
        # The span text looks like "- 97% of the ..." judging by how it was
        # parsed before -- TODO confirm; take the number in front of '%'.
        percent_match = re.search(r"(\d+)\s*%", spans[2].get_text())
        if percent_match:
            review_perc = int(percent_match.group(1))

    return review_perc, max_reviewers

In [14]:
def get_data_from_steamspy_game_page(app_id: str, timeout: float = 30.0):
    """Collect genres and Steam review figures from one SteamSpy page.

    Only ``<div class="panel-body">`` elements are parsed, and only the
    anchors inside the first ``<p>`` found there are examined.

    Args:
        app_id: Path of the page relative to ``https://steamspy.com/``.
            Leading slashes are tolerated so that an href such as
            ``/app/123`` does not produce a double slash in the URL.
        timeout: Seconds to wait for the SteamSpy response before giving up.

    Returns:
        ``False`` when one of the anchors points at ``/tag/Software`` (the
        entry is software, not a game). Otherwise a
        ``(genres, review_perc, max_reviewers)`` tuple, where ``genres`` is
        a ``", "``-joined string of the ``/genre/`` anchor texts and the
        other two values come from ``get_data_from_steam``. Both default to
        0 when the page has no usable 'Store' link.

    Raises:
        requests.RequestException: On connection problems, timeouts or a
            4xx/5xx response.
    """
    url = f"https://steamspy.com/{app_id.lstrip('/')}"
    only_panels = SoupStrainer('div', attrs={"class": "panel-body"})
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml', parse_only=only_panels)

    genre_names = []
    # Defaults keep the return statement valid when no 'Store' anchor exists;
    # previously that case raised UnboundLocalError.
    review_perc, max_reviewers = 0, 0

    paragraphs = soup("p")
    anchors = paragraphs[0].find_all("a") if paragraphs else []
    for anchor in anchors:
        # Anchors without an href are treated as having an empty one.
        href = anchor.get('href', '')

        if '/tag/Software' in href:  # detects if the entry is software
            return False  # not a game

        if anchor.get_text() == 'Store' and href:
            review_perc, max_reviewers = get_data_from_steam(href)

        if '/genre/' in href:
            genre_names.append(anchor.get_text())

    return ', '.join(genre_names), review_perc, max_reviewers

In [17]:
def _parse_price(price_text: str):
    """Turn a price cell into ``None`` ('N/A'), ``0`` ('Free') or a float."""
    cleaned = price_text.strip()
    if cleaned == 'N/A':
        return None
    if cleaned == 'Free':
        return 0
    # Take what follows the currency sign and drop thousands separators.
    return float(cleaned.split('$')[-1].replace(',', ''))


def get_data_from_steamspy_into_dataframe(year: int, timeout: float = 30.0):
    """Scrape the SteamSpy listing for ``year`` into a DataFrame.

    The first ``<table class="table">`` on ``https://steamspy.com/year/<year>``
    is walked row by row. For every row with ``<td>`` cells, the linked
    SteamSpy page is fetched through ``get_data_from_steamspy_game_page``.
    Rows whose name contains ``Steam Deck`` (hardware) and rows that the
    game-page lookup reports as software are skipped. Each kept name is
    printed as a progress indicator.

    Args:
        year: Release year used to build the listing URL; also stored in
            the ``Release year`` column of every row.
        timeout: Seconds to wait for the listing response before giving up.

    Returns:
        A DataFrame with the columns ``Game Name``, ``Developer``,
        ``Release year``, ``Price``, ``Genres``, ``Review %``,
        ``Max Reviewers``, ``Minimum Owners`` and ``Maximum Owners``.
        ``Review %`` is the review percentage divided by 100.

    Raises:
        requests.RequestException: On connection problems, timeouts or a
            4xx/5xx response for the listing page.
        IndexError: If the page contains no ``<table class="table">``.
    """
    column_order = ['Game Name', 'Developer', 'Release year', 'Price',
                    'Genres', 'Review %', 'Max Reviewers',
                    'Minimum Owners', 'Maximum Owners']

    url = f"https://steamspy.com/year/{year}"
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    only_the_table = SoupStrainer("table")
    soup = BeautifulSoup(response.content, 'lxml', parse_only=only_the_table)
    tbl = soup("table", attrs={"class": "table"})[0]

    # One dict per game instead of eight parallel lists: the columns of a
    # row can no longer drift out of step with each other.
    records = []
    for row in tbl("tr"):
        cells = row("td")
        if not cells:  # header rows carry no <td> cells
            continue

        link = cells[1].a
        name = link.get_text()
        if 'Steam Deck' in name:  # Steam Deck is hardware, not a game
            continue

        result = get_data_from_steamspy_game_page(link['href'])
        if result is False:  # the game page flagged this entry as software
            continue
        genres, review_perc, max_reviewers = result

        print(name)
        # low/high rather than min/max so the builtins are not shadowed.
        low_owners, high_owners = ranges_splitter(cells[5].get_text())
        records.append({
            'Game Name': name,
            'Developer': cells[7].get_text(),
            'Release year': year,
            'Price': _parse_price(cells[3].get_text()),
            'Genres': genres,
            'Review %': float(review_perc / 100),
            'Max Reviewers': max_reviewers,
            'Minimum Owners': int(low_owners),
            'Maximum Owners': int(high_owners),
        })

    # Passing the columns explicitly keeps their order, and keeps them
    # present even when no row was collected.
    return pd.DataFrame(records, columns=column_order)

In [18]:
# Scrape the SteamSpy listing for 2021 (one HTTP round-trip per listed entry,
# so this cell is slow), then persist the resulting frame as CSV.
df = get_data_from_steamspy_into_dataframe(year=2021)
load_to_csv(df=df)

 OPUS: Echo of Starsong
 Chicory: A Colorful Tale
 Psychonauts 2
 It Takes Two
 Lacuna – A Sci-Fi Noir Adventure
 Blind Drive
 Ragnarock
 Wildermyth
 Mini Motorways
 ENDER LILIES: Quietus of the Knights
 DEATHLOOP
 F1 2021
 Mass Effect Legendary Edition
 Nioh 2 – The Complete Edition
 The Last Friend
 Cook-Out
 Flynn: Son of Crimson
 HUNTDOWN
 Death's Door
 GUILTY GEAR -STRIVE-
 The Legend of Tianding
 Atelier Ryza 2: Lost Legends & the Secret Fairy
 The Forgotten City
 Unpacking
 Everhood
 Griftlands
 Tales of Arise
 Astalon: Tears of the Earth
 UNSIGHTED
 Fuga: Melodies of Steel
 The Darkside Detective: A Fumble in the Dark
 Sunlight
 Papetura
 Demeo
 Trials of Fire
 The Great Ace Attorney Chronicles
 The Riftbreaker
 Pathfinder: Wrath of the Righteous
 Clone Drone in the Danger Zone
 Little Nightmares II
 Resident Evil Village
 Ruined King: A League of Legends Story
 Eastward
 Rainbow Billy: The Curse of the Leviathan
 Bonfire Peaks
 Star Drift Evolution
 Kaze and the Wild Masks
 Cy

In [20]:
# Read the CSV back from the working directory; the bare `df` on the last
# line makes the frame this cell's displayed output.
df = pd.read_csv('./dataset.csv')
df

Unnamed: 0,Game Name,Developer,Release year,Price,Genres,Review %,Max Reviewers,Minimum Owners,Maximum Owners
0,OPUS: Echo of Starsong,SIGONO INC.,2021,17.99,"Adventure, Indie",0.97,2984.0,50000,100000
1,Chicory: A Colorful Tale,"Greg Lobanov, Alexis Dean-Jones, Lena Raine, M...",2021,19.99,"Adventure, Indie, RPG",0.98,916.0,0,20000
2,Psychonauts 2,Double Fine Productions,2021,59.99,"Action, Adventure",0.98,4621.0,100000,200000
3,It Takes Two,Hazelight,2021,39.99,"Action, Adventure",0.96,52183.0,2000000,5000000
4,Lacuna – A Sci-Fi Noir Adventure,DigiTales Interactive,2021,9.59,"Adventure, Indie",0.94,596.0,200000,500000
...,...,...,...,...,...,...,...,...,...
10291,Overcooked! All You Can Eat,"Team17 Digital, Ghost Town Games",2021,19.99,"Casual, Indie, Simulation, Strategy",0.69,1017.0,100000,200000
10292,Shadow Tactics: Aiko's Choice,Mimimi Games,2021,19.99,"Indie, Strategy",0.91,964.0,50000,100000
10293,Warbox,Mantaliss,2021,9.99,"Action, Early Access",0.85,1047.0,20000,50000
10294,Across the Obelisk,Dreamsite Games,2021,17.99,"Adventure, Indie, RPG, Strategy, Early Access",0.92,907.0,20000,50000
