In [1]:
# standard library imports
import csv
import datetime as dt
import json
import os
import statistics
import time

# third-party imports
import numpy as np
import pandas as pd
import requests

# customisations - let pandas display up to 100 columns before truncating wide tables
pd.set_option("display.max_columns",100)

In [2]:
def get_request(url, parameters=None):
    """Return json-formatted response of a get request using optional parameters.

    The request is retried until it succeeds: after an SSL error the function
    counts down 5 seconds, and after a falsy response (``requests`` evaluates
    a response with a 4xx/5xx status code as False) it waits 10 seconds.

    Parameters
    ----------
    url : string
    parameters : {'parameter': 'value'}
        parameters to pass as part of get request

    Returns
    -------
    json_data
        json-formatted response (dict-like)
    """
    # loop rather than recurse so a long outage cannot exhaust the recursion limit
    while True:
        try:
            response = requests.get(url=url, params=parameters)
        except requests.exceptions.SSLError as s:
            # SSLError must be referenced through the requests package; a bare
            # `SSLError` is not defined and would raise NameError here
            print('SSL Error:', s)

            for i in range(5, 0, -1):
                print(f'\rWaiting... ({i})', end='')
                time.sleep(1)
            print('\rRetrying.' + ' '*10)
            continue

        if response:
            return response.json()

        # an error status usually means too many requests. Wait and try again
        print('No response, waiting 10 seconds...')
        time.sleep(10)
        print('Retrying.')

In [11]:
# SteamSpy endpoint and query used to download the full app listing
url = "https://steamspy.com/api.php"
parameters = {"request": "all"}

# request 'all' from steam spy and parse into dataframe (one row per top-level key)
json_data = get_request(url, parameters=parameters)
steam_spy_all = pd.DataFrame.from_dict(json_data, orient='index')

# (disabled) generate sorted app_list from steamspy data
# app_list = steam_spy_all[['appid', 'name']].sort_values('appid').reset_index(drop=True)

# NOTE(review): this export is currently active, so the stored csv is overwritten
# with a fresh download every time the cell runs; comment it out to keep the
# app list consistent across download sessions
steam_spy_all.to_csv('/content/sample_data/app.csv', index=False)

# read the app list back from the stored csv
app_list = pd.read_csv('/content/sample_data/app.csv')

# display first few rows
app_list.head()

Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,ccu
0,1619450,Heart of a Warrior,Techworld Communication,Techworld Communication,,0,1,0,"2,000,000 .. 5,000,000",0,0,0,0,1099,1099,0,0
1,1380400,Saving Grace,Thomas JC Ironmonger,Ironmonger Games,,28,2,0,"1,000,000 .. 2,000,000",0,0,0,0,199,199,0,0
2,1887990,Deep Night Runner,DODRECK,kazakovstudios,,14,2,0,"1,000,000 .. 2,000,000",0,0,0,0,249,2499,90,1
3,715210,The Pillage,Bishop Armstrong,Bishop Armstrong,,48,7,0,"1,000,000 .. 2,000,000",81,0,32,0,299,299,0,0
4,389460,Robot vs Birds Zombies,yfyx games,yfyx games,,35,79,0,"1,000,000 .. 2,000,000",257,0,274,0,99,99,0,0


In [12]:
def get_app_data(start, stop, parser, pause, app_list=None):
    """Return list of app data generated from parser.

    start : position of the first row to process
    stop : position one past the last row to process
    parser : function to handle request, called as parser(appid, name)
    pause : time in seconds to wait after each parser call
    app_list : dataframe with 'appid' and 'name' columns; when omitted the
        notebook-level ``app_list`` variable is used (original behaviour)
    """
    if app_list is None:
        # backward compatibility for callers relying on the global dataframe
        app_list = globals()['app_list']

    app_data = []

    # iterate through each row of app_list, confined by start and stop (positional)
    for index, row in app_list.iloc[start:stop].iterrows():
        print(f'Current index: {index}', end='\r')

        # retrieve app data for a row, handled by supplied parser, and append to list
        app_data.append(parser(row['appid'], row['name']))

        time.sleep(pause)  # prevent overloading api with requests

    return app_data


def process_batches(parser, app_list, download_path, data_filename, index_filename,
                    columns, begin=0, end=-1, batchsize=100, pause=1):
    """Process app data in batches, writing directly to file.

    After every batch the rows are appended to the data file and the position
    of the next unprocessed row is stored in the index file, so an interrupted
    run can be resumed by passing that stored value as ``begin``.

    parser : custom function to format request
    app_list : dataframe of appid and name
    download_path : path to store data
    data_filename : filename to save app data
    index_filename : filename to store highest index written
    columns : column names for file (record keys not listed are ignored)

    Keyword arguments:

    begin : starting index (get from index_filename, default 0)
    end : index to finish (defaults to end of app_list; larger values are
        clamped to the length of app_list)
    batchsize : number of apps to write in each batch (default 100)
    pause : time to wait after each api request (default 1)

    returns: none
    """
    print(f'Starting at index {begin}:\n')

    total_apps = len(app_list)

    # by default, process all apps in app_list; never run past its last row,
    # otherwise empty batches are processed and a too-large index is stored
    if end == -1 or end > total_apps:
        end = total_apps

    data_path = os.path.join(download_path, data_filename)
    idx_path = os.path.join(download_path, index_filename)

    # first row of every batch; the last batch may be shorter than batchsize
    batch_starts = range(begin, end, batchsize)
    n_batches = len(batch_starts)

    apps_written = 0
    batch_times = []

    for i, start in enumerate(batch_starts):
        start_time = time.time()
        stop = min(start + batchsize, end)

        # pass the dataframe explicitly so the app_list argument is honoured
        app_data = get_app_data(start, stop, parser, pause, app_list=app_list)

        # short countdown so the user does not interrupt in the middle of a write
        for j in range(3, 0, -1):
            print(f"\rAbout to write data, don't stop script! ({j})", end='')
            time.sleep(0.5)

        # writing app data to file
        with open(data_path, 'a', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=columns, extrasaction='ignore')
            writer.writerows(app_data)
        print('\rExported lines {}-{} to {}.'.format(start, stop-1, data_filename), end=' ')

        apps_written += len(app_data)

        # writing last index to file (position of the next row to process)
        with open(idx_path, 'w') as f:
            print(stop, file=f)

        # logging time taken
        time_taken = time.time() - start_time
        batch_times.append(time_taken)
        mean_time = statistics.mean(batch_times)

        est_remaining = (n_batches - i - 1) * mean_time

        remaining_td = dt.timedelta(seconds=round(est_remaining))
        time_td = dt.timedelta(seconds=round(time_taken))
        mean_td = dt.timedelta(seconds=round(mean_time))

        print('Batch {} time: {} (avg: {}, remaining: {})'.format(i, time_td, mean_td, remaining_td))

    print('\nProcessing batches complete. {} apps written'.format(apps_written))

In [5]:
def reset_index(download_path, index_filename):
    """Overwrite the index file so that it holds 0 again."""
    index_path = os.path.join(download_path, index_filename)

    # 'w' truncates any existing content before the single line is written
    with open(index_path, 'w') as index_file:
        index_file.write('0\n')
        

def get_index(download_path, index_filename):
    """Read the stored index from file; a missing file counts as index 0."""
    index_path = os.path.join(download_path, index_filename)

    try:
        index_file = open(index_path, 'r')
    except FileNotFoundError:
        # nothing stored yet, so start from the beginning
        return 0

    with index_file:
        # only the first line is read and it must parse as an integer
        return int(index_file.readline())


def prepare_data_file(download_path, filename, index, columns):
    """Create file and write headers if index is 0.

    Any existing file at that location is overwritten. For every other index
    the file is left untouched so that rows can keep being appended to it.

    download_path : directory holding the data file
    filename : name of the data file
    index : index to resume from; only 0 triggers (re)creation
    columns : column names written as the csv header
    """
    if index != 0:
        return

    rel_path = os.path.join(download_path, filename)

    # utf-8 to match the encoding used when the data rows are appended later
    with open(rel_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=columns)
        writer.writeheader()

In [6]:
def parse_steam_request(appid, name):
    """Unique parser to handle data from Steam Store API.

    Requests the app details for ``appid`` and returns the 'data' entry of the
    response. When the response is empty, does not contain the app id, is not
    flagged as successful or has no 'data' entry, a minimal record made of the
    supplied name and app id is returned instead, so one bad response does not
    abort a whole batch.

    appid : id of the app to look up
    name : app name used for the fallback record

    Returns : json formatted data (dict-like)
    """
    url = "http://store.steampowered.com/api/appdetails/"
    parameters = {"appids": appid}

    json_data = get_request(url, parameters=parameters)

    # the payload is keyed by the app id as a string; tolerate None / other shapes
    json_app_data = json_data.get(str(appid)) if isinstance(json_data, dict) else None

    if (isinstance(json_app_data, dict)
            and json_app_data.get('success')
            and 'data' in json_app_data):
        return json_app_data['data']

    return {'name': name, 'steam_appid': appid}


# Set file parameters: output directory, data file and the file storing the resume index
download_path = '/content/sample_data'
steam_app_data = 'steam_app_data.csv'
steam_index = 'steam_index.txt'

# Columns written to the csv; keys of a parsed record that are not listed here
# are dropped when the rows are written
steam_columns = [
    'type', 'name', 'steam_appid', 'required_age', 'is_free', 'controller_support',
    'dlc', 'detailed_description', 'about_the_game', 'short_description', 'fullgame',
    'supported_languages', 'header_image', 'website', 'pc_requirements', 'mac_requirements',
    'linux_requirements', 'legal_notice', 'drm_notice', 'ext_user_account_notice',
    'developers', 'publishers', 'demos', 'price_overview', 'packages', 'package_groups',
    'platforms', 'metacritic', 'reviews', 'categories', 'genres', 'screenshots',
    'movies', 'recommendations', 'achievements', 'release_date', 'support_info',
    'background', 'content_descriptors'
]

# Overwrites last index for demonstration (would usually store highest index so can continue across sessions).
# Because the index is then 0, the data file is wiped further down - remove this call to resume a download
reset_index(download_path, steam_index)

# Retrieve last index downloaded from file
index = get_index(download_path, steam_index)

# Wipe or create data file and write headers if index is 0
prepare_data_file(download_path, steam_app_data, index, steam_columns)

# Set end and batchsize for demonstration - remove them to run through entire app list
# NOTE(review): the recorded output of this cell reports 10 apps written, which
# does not correspond to end=28000 - confirm the intended demonstration value
process_batches(
    parser=parse_steam_request,
    app_list=app_list,
    download_path=download_path,
    data_filename=steam_app_data,
    index_filename=steam_index,
    columns=steam_columns,
    begin=index,
    end=28000,
    batchsize=5
)

Starting at index 0:

Exported lines 0-4 to steam_app_data.csv. Batch 0 time: 0:00:08 (avg: 0:00:08, remaining: 0:00:08)
Exported lines 5-9 to steam_app_data.csv. Batch 1 time: 0:00:08 (avg: 0:00:08, remaining: 0:00:00)

Processing batches complete. 10 apps written


In [7]:
# inspect downloaded data: read the Steam Store csv back in and show the first rows
pd.read_csv('/content/sample_data/steam_app_data.csv').head()

Unnamed: 0,type,name,steam_appid,required_age,is_free,controller_support,dlc,detailed_description,about_the_game,short_description,fullgame,supported_languages,header_image,website,pc_requirements,mac_requirements,linux_requirements,legal_notice,drm_notice,ext_user_account_notice,developers,publishers,demos,price_overview,packages,package_groups,platforms,metacritic,reviews,categories,genres,screenshots,movies,recommendations,achievements,release_date,support_info,background,content_descriptors
0,game,Gumboy - Crazy Adventures™,2520,0,False,,,Gumboy has fun and novel gameplay set in a ric...,Gumboy has fun and novel gameplay set in a ric...,Gumboy has fun and novel gameplay set in a ric...,,"English, Polish, Russian",https://cdn.akamai.steamstatic.com/steam/apps/...,http://www.gumboycrazyadventures.com/,{'minimum': '<strong>Minimum:</strong> Windows...,[],[],,,,"['CINEMAX, s.r.o.']","['CINEMAX, s.r.o.']",,"{'currency': 'USD', 'initial': 499, 'final': 2...",[234],"[{'name': 'default', 'title': 'Buy Gumboy - Cr...","{'windows': True, 'mac': False, 'linux': False}","{'score': 69, 'url': 'https://www.metacritic.c...","<h2 class=""bb_tag"">GameTunnel - 8/10 and 2006 ...","[{'id': 2, 'description': 'Single-player'}]","[{'id': '4', 'description': 'Casual'}, {'id': ...","[{'id': 0, 'path_thumbnail': 'https://cdn.akam...",,,,"{'coming_soon': False, 'date': 'Dec 19, 2006'}","{'url': '', 'email': ''}",https://cdn.akamai.steamstatic.com/steam/apps/...,"{'ids': [], 'notes': None}"
1,game,X: Tension,2850,0,False,,,X-Tension is the eagerly awaited expansion pac...,X-Tension is the eagerly awaited expansion pac...,X-Tension is the eagerly awaited expansion pac...,,"English<strong>*</strong>, German<strong>*</st...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://www.egosoft.com/games/x_tension/info_en...,{'minimum': '<strong>Minimum:</strong><br><ul ...,[],[],,,,['Egosoft'],['Egosoft'],,"{'currency': 'USD', 'initial': 499, 'final': 9...",[6354],"[{'name': 'default', 'title': 'Buy X: Tension'...","{'windows': True, 'mac': False, 'linux': False}",,,"[{'id': 2, 'description': 'Single-player'}, {'...","[{'id': '28', 'description': 'Simulation'}, {'...","[{'id': 0, 'path_thumbnail': 'https://cdn.akam...","[{'id': 5968, 'name': 'X Superbox Trailer', 't...",,,"{'coming_soon': False, 'date': 'Oct 8, 2010'}","{'url': 'www.egosoft.com', 'email': 'support@e...",https://cdn.akamai.steamstatic.com/steam/apps/...,"{'ids': [], 'notes': None}"
2,game,X-COM: Interceptor,7730,0,False,,,They're back...only this time the heinous alie...,They're back...only this time the heinous alie...,They're back...only this time the heinous alie...,,English,https://cdn.akamai.steamstatic.com/steam/apps/...,,"{'minimum': '<ul class=""bb_ul""><li><strong>Sup...",[],[],,,,"['MicroProse Software, Inc']",['2K'],,"{'currency': 'USD', 'initial': 499, 'final': 4...","[962, 964]","[{'name': 'default', 'title': 'Buy X-COM: Inte...","{'windows': True, 'mac': False, 'linux': False}",,,"[{'id': 2, 'description': 'Single-player'}, {'...","[{'id': '2', 'description': 'Strategy'}]","[{'id': 0, 'path_thumbnail': 'https://cdn.akam...",,{'total': 124},,"{'coming_soon': False, 'date': 'Sep 4, 2008'}","{'url': '', 'email': ''}",https://cdn.akamai.steamstatic.com/steam/apps/...,"{'ids': [], 'notes': None}"
3,game,Mata Hari,18480,0,False,,,TRUST NO ONE BUT YOURSELF as you adventure int...,TRUST NO ONE BUT YOURSELF as you adventure int...,TRUST NO ONE BUT YOURSELF as you adventure int...,,"English, German, Italian, French",https://cdn.akamai.steamstatic.com/steam/apps/...,,"{'minimum': '<ul class=""bb_ul""><li><strong>OS:...",[],[],,,,['4Head Studios'],['Viva Media'],,"{'currency': 'USD', 'initial': 999, 'final': 9...",[1906],"[{'name': 'default', 'title': 'Buy Mata Hari',...","{'windows': True, 'mac': False, 'linux': False}","{'score': 63, 'url': 'https://www.metacritic.c...",,"[{'id': 2, 'description': 'Single-player'}]","[{'id': '25', 'description': 'Adventure'}]","[{'id': 0, 'path_thumbnail': 'https://cdn.akam...",,,,"{'coming_soon': False, 'date': 'Jul 29, 2009'}","{'url': '', 'email': ''}",https://cdn.akamai.steamstatic.com/steam/apps/...,"{'ids': [], 'notes': None}"
4,game,Mayhem Intergalactic,18600,0,False,,,Wage war on your friends and enemies in this s...,Wage war on your friends and enemies in this s...,Wage war on your friends and enemies in this s...,,English,https://cdn.akamai.steamstatic.com/steam/apps/...,http://www.inventivedingo.com/mayhemig,"{'minimum': '<ul class=""bb_ul""><li><strong>Ope...",[],[],,,,['Inventive Dingo'],['Inventive Dingo'],"[{'appid': 18610, 'description': ''}]","{'currency': 'USD', 'initial': 999, 'final': 9...",[1316],"[{'name': 'default', 'title': 'Buy Mayhem Inte...","{'windows': True, 'mac': False, 'linux': False}",,,"[{'id': 2, 'description': 'Single-player'}, {'...","[{'id': '23', 'description': 'Indie'}, {'id': ...","[{'id': 0, 'path_thumbnail': 'https://cdn.akam...",,,"{'total': 21, 'highlighted': [{'name': 'Natura...","{'coming_soon': False, 'date': 'Jan 22, 2009'}","{'url': '', 'email': ''}",https://cdn.akamai.steamstatic.com/steam/apps/...,"{'ids': [], 'notes': None}"


In [8]:
def parse_steamspy_request(appid, name):
    """Fetch the SteamSpy 'appdetails' record for a single app.

    ``name`` is accepted so the signature matches what the batch downloader
    passes to a parser, but it is not used for the request.

    Returns : json formatted data (dict-like), exactly as delivered by the api
    """
    query = {"request": "appdetails", "appid": appid}
    return get_request("https://steamspy.com/api.php", query)


# set files and columns: output directory, data file and the file storing the resume index
download_path = '/content/sample_data'
steamspy_data = 'steamspy_data.csv'
steamspy_index = 'steamspy_index.txt'

# columns written to the csv; keys of a parsed record that are not listed here
# are dropped when the rows are written
steamspy_columns = [
    'appid', 'name', 'developer', 'publisher', 'score_rank', 'positive',
    'negative', 'userscore', 'owners', 'average_forever', 'average_2weeks',
    'median_forever', 'median_2weeks', 'price', 'initialprice', 'discount',
    'languages', 'genre', 'ccu', 'tags'
]

# start from scratch: store 0 as the index, then read it back
# (remove the reset_index call to resume a previous download instead)
reset_index(download_path, steamspy_index)
index = get_index(download_path, steamspy_index)

# Wipe data file if index is 0
prepare_data_file(download_path, steamspy_data, index, steamspy_columns)

# NOTE(review): the recorded output of this cell reports 20 apps written, which
# does not correspond to end=28000 - confirm the intended demonstration value
process_batches(
    parser=parse_steamspy_request,
    app_list=app_list,
    download_path=download_path, 
    data_filename=steamspy_data,
    index_filename=steamspy_index,
    columns=steamspy_columns,
    begin=index,
    end=28000,
    batchsize=5,
)

Starting at index 0:

Exported lines 0-4 to steamspy_data.csv. Batch 0 time: 0:00:05 (avg: 0:00:05, remaining: 0:00:14)
Exported lines 5-9 to steamspy_data.csv. Batch 1 time: 0:00:05 (avg: 0:00:05, remaining: 0:00:09)
Exported lines 10-14 to steamspy_data.csv. Batch 2 time: 0:00:05 (avg: 0:00:05, remaining: 0:00:05)
Exported lines 15-19 to steamspy_data.csv. Batch 3 time: 0:00:05 (avg: 0:00:05, remaining: 0:00:00)

Processing batches complete. 20 apps written


In [9]:
# inspect downloaded steamspy data: read the csv back in and show the first rows
pd.read_csv('/content/sample_data/steamspy_data.csv').head()

Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,languages,genre,ccu,tags
0,2520,Gumboy - Crazy Adventures,"CINEMAX, s.r.o.","CINEMAX, s.r.o.",,61,60,0,"50,000 .. 100,000",11,0,17,0,249,499,50,"English, Polish, Russian","Casual, Indie",0,"{'Casual': 36, 'Indie': 32, 'Physics': 28, 'Ac..."
1,2850,X: Tension,Egosoft,Egosoft,,83,29,0,"200,000 .. 500,000",10,0,10,0,99,499,80,"English, German","Simulation, Strategy",2,"{'Simulation': 57, 'Strategy': 39, 'Space': 36..."
2,7730,X-COM: Interceptor,"MicroProse Software, Inc",2K,,90,78,0,"200,000 .. 500,000",631,0,631,0,499,499,0,English,Strategy,1,"{'Strategy': 42, 'Sci-fi': 22, 'Space': 17, 'S..."
3,18480,Mata Hari,4Head Studios,Viva Media,,70,49,0,"50,000 .. 100,000",16,0,16,0,999,999,0,"English, German, Italian, French",Adventure,1,"{'Adventure': 42, 'Point & Click': 30, 'Female..."
4,18600,Mayhem Intergalactic,Inventive Dingo,Inventive Dingo,,13,12,0,"50,000 .. 100,000",0,0,0,0,999,999,0,English,"Indie, Strategy",0,"{'PvP': 130, 'Turn-Based Strategy': 124, '4X':..."
