In [1]:
import requests
import lxml
import pandas as pd
from bs4 import BeautifulSoup, SoupStrainer
import re

In [2]:
def load_to_csv(df: pd.DataFrame, path: str = 'dataset.csv'):
    """Write ``df`` to a CSV file, omitting the index column.

    Args:
        df: The frame to persist.
        path: Destination file path. Defaults to ``'dataset.csv'`` (relative
            to the current working directory), so existing calls that pass
            only ``df`` keep writing to the same file.
    """
    df.to_csv(path, index=False)

In [3]:
def ranges_splitter(ranges):
    """Split a ``"<low> .. <high>"`` range string into its two bounds.

    Spaces are removed first, the text is split on ``..``, and commas are
    stripped from each side. Both bounds are returned as strings in a
    ``(low, high)`` tuple. ``ValueError`` is raised when the split does not
    produce exactly two parts.
    """
    compact = ranges.replace(" ", "")
    lower, upper = compact.split('..')
    return tuple(bound.replace(",", "") for bound in (lower, upper))

In [7]:
def get_data_from_steam(steam_store_link: str, s: requests.Session,
                        max_retries: int = 5, retry_delay: float = 1.0):
    """Scrape the review percentage and review count from a Steam store page.

    Args:
        steam_store_link: URL of the game's Steam store page.
        s: Session used for every HTTP request, including the retries.
        max_retries: How many additional attempts are made after a non-200
            response before giving up.
        retry_delay: Seconds to wait before each retry.

    Returns:
        ``False`` when the page could not be fetched with status 200, when it
        contains no review summary, or when the summary says there are no /
        not enough user reviews. Otherwise a ``(review_perc, max_reviewers)``
        tuple: ``review_perc`` is an ``int`` percentage (``0`` when the
        summary does not have exactly three spans) and ``max_reviewers`` is
        the text matched by the count pattern in the second span, joined
        into one string.
    """
    import time  # local import: only needed for the retry back-off

    only_reviews = SoupStrainer('div', attrs={"id": "userReviews"})

    # Retry through the same session, but a bounded number of times: a page
    # that never answers 200 (e.g. a removed app) must not loop forever.
    response = s.get(steam_store_link)
    attempts_left = max_retries
    while response.status_code != 200 and attempts_left > 0:
        time.sleep(retry_delay)
        response = s.get(steam_store_link)
        attempts_left -= 1
    if response.status_code != 200:
        return False

    soup = BeautifulSoup(response.content, 'lxml', parse_only=only_reviews)
    summary_columns = soup("div", attrs={"class": "summary column"})
    if not summary_columns:
        # No review block on the page at all: nothing to report.
        return False

    if len(summary_columns) == 1:
        # Collapse all whitespace runs so the comparisons are layout-independent.
        summary_text = " ".join(summary_columns[0].get_text().split())
        if summary_text == 'No user reviews':
            return False
        if 'Need more user reviews to generate a score' in summary_text:
            return False
        spans = summary_columns[0].find_all("span")
    else:
        # NOTE(review): presumably the second column is the all-time summary
        # when a recent-reviews column is also present - confirm.
        spans = summary_columns[1].find_all("span")

    if len(spans) < 2:
        # The count span is missing, so the review data cannot be read.
        return False

    # NOTE(review): the pattern also accepts apostrophes, so any apostrophe
    # in the count text is kept in the result - confirm this is intended.
    count_parts = re.findall(r"[\d']+", spans[1].get_text())
    max_reviewers = ''.join(count_parts)

    if len(spans) == 3:
        # Take the number between the leading '-' and the '%' sign.
        percent_text = spans[2].get_text().replace(" ", "")
        review_perc = int(percent_text.split('%')[0].split('-')[1])
    else:
        review_perc = 0

    return review_perc, max_reviewers

In [4]:
def get_data_from_steamspy_game_page(app_id: str, s: requests.Session):
    """Collect genres and Steam review data for one SteamSpy game page.

    Args:
        app_id: Path of the game page relative to ``https://steamspy.com/``.
            A leading ``/`` is tolerated.
        s: Session used for the SteamSpy request and passed on to
            ``get_data_from_steam``.

    Returns:
        ``False`` when the entry carries a ``/tag/Software`` link, when the
        page has no paragraph of links or no usable ``Store`` link, or when
        ``get_data_from_steam`` returns ``False``. Otherwise a
        ``(genres, review_perc, max_reviewers)`` tuple, where ``genres`` is
        the genre names joined with ``', '``.
    """
    # Strip a leading slash so the joined URL has no empty path segment.
    url = f"https://steamspy.com/{app_id.lstrip('/')}"
    only_panels = SoupStrainer('div', attrs={"class": "panel-body"})
    response = s.get(url)
    soup = BeautifulSoup(response.content, 'lxml', parse_only=only_panels)

    paragraphs = soup("p")
    if not paragraphs:
        return False

    steam_store_link = None
    genre_names = []
    for anchor in paragraphs[0].find_all("a"):
        href = anchor.get('href', '')
        if '/tag/Software' in href:  # tagged as software, i.e. not a game
            return False
        if anchor.get_text() == 'Store':
            steam_store_link = href
        if '/genre/' in href:
            genre_names.append(anchor.get_text())

    # Without a store link there is no review data to return; previously this
    # path reached the return statement with unbound variables.
    if not steam_store_link:
        return False

    # Query Steam only after every link has been inspected, so entries that
    # are rejected as software never cost an extra request.
    result = get_data_from_steam(steam_store_link, s)
    if result is False:
        return False
    review_perc, max_reviewers = result

    return ', '.join(genre_names), review_perc, max_reviewers

In [8]:
def _parse_price(price_text: str):
    """Convert a price cell's text to ``None`` ('N/A'), ``0`` ('Free') or a float."""
    price_text = price_text.strip()
    if price_text == 'N/A':
        return None
    if price_text == 'Free':
        return 0
    _, dollar_sign, amount = price_text.partition('$')
    if not dollar_sign:
        # Unrecognised format: record the price as missing rather than abort
        # the whole scrape.
        return None
    return float(amount.replace(',', ''))


def get_data_from_steamspy_into_dataframe(years: list):
    """Scrape SteamSpy's per-year listings into a single DataFrame.

    For every year in ``years`` the listing table at
    ``https://steamspy.com/year/<year>`` is read row by row. Rows whose name
    contains 'Steam Deck' and rows for which
    ``get_data_from_steamspy_game_page`` returns ``False`` are skipped. Each
    kept game's name is printed as progress output.

    Args:
        years: Release years to scrape.

    Returns:
        A DataFrame with one row per kept game from all requested years and
        the columns 'Game Name', 'Developer', 'Release year', 'Price',
        'Genres', 'Review %', 'Max Reviewers', 'Minimum Owners' and
        'Maximum Owners'. 'Release year' is the year of the listing the row
        came from and 'Review %' is the review percentage divided by 100.
        The frame is empty (with these columns) when nothing was collected.
    """
    columns = ['Game Name', 'Developer', 'Release year', 'Price', 'Genres',
               'Review %', 'Max Reviewers', 'Minimum Owners', 'Maximum Owners']
    if not years:
        return pd.DataFrame(columns=columns)

    # One record list shared by all years: re-creating the accumulators per
    # year would keep only the rows of the last year processed.
    records = []
    only_the_table = SoupStrainer("table")
    with requests.Session() as s:  # closes pooled connections when done
        for year in years:
            url = f"https://steamspy.com/year/{year}"
            response = s.get(url)
            soup = BeautifulSoup(response.content, 'lxml', parse_only=only_the_table)
            tbl = soup("table", attrs={"class": "table"})[0]
            for row in tbl("tr"):
                cells = row("td")
                if len(cells) == 0:
                    # Rows without <td> cells carry no game data.
                    continue
                game_link = cells[1].a
                game_name = game_link.get_text()
                # Steam deck is a hardware and not a game
                if 'Steam Deck' in game_name:
                    continue
                result = get_data_from_steamspy_game_page(game_link['href'], s)
                if result is False:
                    continue
                genres, review_perc, max_reviewers = result
                print(game_name)
                min_owners, max_owners = ranges_splitter(cells[5].get_text())
                records.append({
                    'Game Name': game_name,
                    'Developer': cells[7].get_text(),
                    'Release year': year,
                    'Price': _parse_price(cells[3].get_text()),
                    'Genres': genres,
                    'Review %': review_perc / 100,
                    'Max Reviewers': max_reviewers,
                    'Minimum Owners': int(min_owners),
                    'Maximum Owners': int(max_owners),
                })

    return pd.DataFrame(records, columns=columns)


In [None]:
# Scrape the SteamSpy listings for release years 2021 and 2020, then save the
# resulting frame with load_to_csv (which writes dataset.csv).
# The scrape issues HTTP requests for each listed game, so this cell can take
# a long time to finish.
df = get_data_from_steamspy_into_dataframe([2021, 2020])
load_to_csv(df)


In [None]:
# Reload the saved CSV from the working directory and show it as the cell output.
df = pd.read_csv('./dataset.csv')
df