# Data Collection

## Aims
- Collect data from Steam
- Collect data from SteamSpy
- [Dead end] Collect data from SteamDB


**Outline:**

- Create the app list from the SteamSpy API (the Steam API's app list includes too many videos and demos)
- Create the first SteamSpy list from the 'all' request
- Retrieve individual app data from Steam, using the app list
- Retrieve individual app data from SteamSpy, using the app list
- Save the app list, the Steam data and the second SteamSpy list to CSV files

**API references:**

- https://partner.steamgames.com/doc/webapi/ISteamApps
- https://steamapi.xpaw.me/#
- https://wiki.teamfortress.com/wiki/User:RJackson/StorefrontAPI
- https://steamspy.com/api.php

**Useful links:**

- https://steamdb.info/app/271590/graphs/  
- https://steamdb.info/apps/page3/

In [2]:
# load extensions and magics

# reload imported modules before running code, so edits to them are picked up
# https://ipython.readthedocs.io/en/stable/config/extensions/autoreload.html
%load_ext autoreload
%autoreload 2

# print a table with the versions of the listed packages
# http://raw.github.com/jrjohansson/version_information/master/version_information.py
%load_ext version_information
%version_information numpy, pandas, requests

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The version_information extension is already loaded. To reload it, use:
  %reload_ext version_information


Software,Version
Python,3.7.3 64bit [MSC v.1915 64 bit (AMD64)]
IPython,7.5.0
OS,Windows 10 10.0.17763 SP0
numpy,1.16.3
pandas,0.24.2
requests,2.21.0
Wed May 01 17:50:51 2019 GMT Summer Time,Wed May 01 17:50:51 2019 GMT Summer Time


In [3]:
# Standard library
import csv
import datetime as dt
import json
import os
import statistics
import time

# Third party imports
import numpy as np
import pandas as pd
import requests

In [66]:
# define initial function
def get_request(url, parameters=None):
    """Return the decoded json body of a GET request, retrying until it succeeds.

    Parameters
    ----------
    url : str
        Address to request.
    parameters : dict, optional
        Query-string parameters passed to ``requests.get`` as ``params``.

    Returns
    -------
    The value produced by ``response.json()`` for the first successful response.

    Notes
    -----
    There is no retry limit: the function keeps waiting 10 seconds and trying
    again for as long as the response is falsy (requests treats 4xx/5xx
    responses as falsy).
    """
    # loop instead of recursing, so a long outage cannot exhaust the call stack
    while True:
        response = requests.get(url=url, params=parameters)

        if response:
            return response.json()

        print('No response, waiting 10 seconds...')
        time.sleep(10)
        print('Retrying.')


# request 'all' from steam spy
url = "https://steamspy.com/api.php"
parameters = {"request": "all"}

# get_request keeps retrying until the request succeeds, then returns the decoded json
json_data = get_request(url, parameters=parameters)
# orient='index' makes one row per top-level key of the response
steam_spy_all = pd.DataFrame.from_dict(json_data, orient='index')

# no longer using this data
# steam_spy_all.to_csv('data/steam_spy_all.csv', index=False)
# steam_spy_all

# generate app_list from steamspy data, to be used for retrieving individual app data from steam
# (sorted by appid, with a fresh 0..n-1 index so row positions can serve as resume points)
app_list = steam_spy_all[['appid', 'name']].sort_values('appid').reset_index(drop=True)
app_list.to_csv('data/app_list.csv', index=False)

# reading app_list from file for consistency
app_list = pd.read_csv('data/app_list.csv')
app_list

Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount
10,10,Counter-Strike,Valve,Valve,,124534,3339,0,"10,000,000 .. 20,000,000",17612,709,317,26,999,999,0
1000080,1000080,神明在上(Zengeon),IndieLeague Studio,IndieLeague Studio,,350,59,0,"0 .. 20,000",0,0,0,0,899,899,0
1000100,1000100,干支セトラ　陽ノ卷｜干支etc.　陽之卷,七月九日,Starship Studio,,10,4,0,"0 .. 20,000",0,0,0,0,1299,1299,0
1000110,1000110,Jumping Master(跳跳大咖),重庆环游者网络科技,重庆环游者网络科技,,18,6,0,"0 .. 20,000",0,0,0,0,0,0,0
1000130,1000130,Cube Defender,Simon Codrington,Simon Codrington,,2,0,0,"0 .. 20,000",0,0,0,0,299,299,0
1000370,1000370,SurReal Subway,LFiO Studio,LFiO Studio,,0,1,0,"0 .. 20,000",0,0,0,0,399,399,0
1000380,1000380,Rogue Reaper,Fireroot Studios,Fireroot Studios,,214,69,0,"50,000 .. 100,000",0,0,0,0,0,0,0
1000460,1000460,Arsonist,EvilCoGames,EvilCoGames,,0,0,0,"0 .. 20,000",0,0,0,0,0,0,0
1000480,1000480,Battle Motion,Meadow Games,Meadow Games,,8,2,0,"0 .. 20,000",0,0,0,0,999,999,0
1000510,1000510,The Marvellous Machine,1337 Game Design,1337 Game Design,,16,7,0,"0 .. 20,000",0,0,0,0,299,299,0


In [None]:
def get_app_data(start, stop, parser, pause):
    """Run ``parser`` on every app in rows ``start`` to ``stop - 1`` of ``app_list``.

    Parameters
    ----------
    start, stop : int
        Slice bounds applied to the module-level ``app_list`` (``stop`` is exclusive).
    parser : callable
        Called as ``parser(appid)`` once per row; its return value is collected.
    pause : float
        Seconds to sleep after each call.

    Returns
    -------
    list
        The parser results, in row order.
    """
    app_data = []

    # NOTE: rows are read from the module-level app_list, not from an argument
    for index, row in app_list[start:stop].iterrows():
        # '\r' keeps the progress message on a single, overwritten line
        print('Current index: {}'.format(index), end='\r')

        data = parser(row['appid'])
        app_data.append(data)

        time.sleep(pause) # prevent overloading api with requests

    return app_data


def process_chunks(parser, app_list, data_filename, index_filename, columns, begin=0, end=-1, chunksize=100, pause=1):
    """Download app data chunk by chunk, appending each chunk to a csv file.

    After every chunk the rows are appended to ``data/<data_filename>`` and the
    position of the next unprocessed row is written to ``data/<index_filename>``,
    so the stored value can be passed back in as ``begin`` on a later run.

    Parameters
    ----------
    parser : callable
        Handed to ``get_app_data``, which calls it once per app.
    app_list : sized
        Only its length is used here, to work out the default ``end``.
        NOTE(review): ``get_app_data`` reads its rows from the module-level
        ``app_list``, not from this argument -- confirm both are the same.
    data_filename : str
        Name of the csv file inside ``data/`` that rows are appended to.
    index_filename : str
        Name of the file inside ``data/`` that stores the resume position.
    columns : list of str
        Csv field names; keys not listed here are dropped when writing.
    begin : int, default 0
        Position of the first row to process.
    end : int, default -1
        Position one past the last row to process; -1 means "to the end".
    chunksize : int, default 100
        Number of rows downloaded between two writes.
    pause : float, default 1
        Seconds to wait between two requests.
    """
    print('Starting at index {}:\n'.format(begin))

    if end == -1:
        # stop values are exclusive, so the last boundary is len(app_list);
        # len + 1 would report and store a position one past the real end
        end = len(app_list)

    # chunk boundaries: begin, begin + chunksize, ..., end
    chunks = np.arange(begin, end, chunksize)
    chunks = np.append(chunks, end)

    rel_path = os.path.join('data', data_filename)
    idx_path = os.path.join('data', index_filename)

    apps_written = 0
    chunk_times = []

    # pair every boundary with the next one to get (start, stop) per chunk
    for i, (start, stop) in enumerate(zip(chunks[:-1], chunks[1:])):
        start_time = time.time()

        app_data = get_app_data(start, stop, parser, pause)

        with open(rel_path, 'a', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=columns, extrasaction='ignore')

            # short countdown before the write starts
            print('About to write data (3)', end='\r')
            time.sleep(0.5)
            print('About to write data (2)', end='\r')
            time.sleep(0.5)
            print('About to write data (1)', end='\r')
            time.sleep(0.5)

            writer.writerows(app_data)
            print('Exported lines {}-{} to {}.'.format(start, stop-1, data_filename), end=' ')

        apps_written += len(app_data)

        # checkpoint: store where the next run should start
        with open(idx_path, 'w') as f:
            print(stop, file=f)

        time_taken = time.time() - start_time

        chunk_times.append(time_taken)
        mean_time = statistics.mean(chunk_times)

        # chunks still to do after this one, times the average chunk duration
        est_remaining = (len(chunks) - i - 2) * mean_time

        remaining_td = dt.timedelta(seconds=round(est_remaining))
        time_td = dt.timedelta(seconds=round(time_taken))
        mean_td = dt.timedelta(seconds=round(mean_time))

        print('Chunk {} time: {} (avg: {}, remaining: {})'.format(i, time_td, mean_td, remaining_td))

    print('\nProcessing chunks complete. {} apps written'.format(apps_written))

    
def reset_index(index_filename):
    """Overwrite ``data/<index_filename>`` so that it holds the position 0."""
    index_path = os.path.join('data', index_filename)

    with open(index_path, 'w') as index_file:
        index_file.write('0\n')
        

def get_index(index_filename):
    """Read the stored position from ``data/<index_filename>``.

    Returns the integer on the first line of the file, or 0 when the file
    does not exist.
    """
    index_path = os.path.join('data', index_filename)

    try:
        with open(index_path, 'r') as index_file:
            return int(index_file.readline())
    except FileNotFoundError:
        # no checkpoint yet: start from the beginning
        return 0


def create_data_file(filename, index, columns=None):
    """Create (or wipe) ``data/<filename>`` and write only the csv header row.

    Nothing happens unless ``index`` is 0, so a run that resumes from a stored
    position never loses the rows already downloaded.

    Parameters
    ----------
    filename : str
        Name of the csv file inside ``data/``.
    index : int
        Current resume position; the file is only (re)created when it is 0.
    columns : list of str, optional
        Header fields. When omitted, the module-level ``columns`` is used,
        which is what this function always did before the parameter existed.

    Raises
    ------
    ValueError
        If no column names are available. Raised before the file is opened,
        so an existing file is not truncated by a failed call.
    """
    if index != 0:
        return

    fieldnames = columns if columns is not None else globals().get('columns')
    if fieldnames is None:
        raise ValueError('No column names available: pass columns explicitly.')

    rel_path = os.path.join('data', filename)

    # utf-8 matches the encoding used when rows are appended to the file later
    with open(rel_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()

        
def parse_steam_request(appid, name=None):
    """Fetch the store details of one app from the steam appdetails endpoint.

    Parameters
    ----------
    appid : int
        Id of the app to look up.
    name : str, optional
        App name, used only for the placeholder row returned when steam
        reports no data. Defaults to None so the function can still be called
        with the appid alone.

    Returns
    -------
    dict
        The ``data`` entry of the response when ``success`` is true, otherwise
        a placeholder holding just ``name`` and ``steam_appid``.
    """
    url = "http://store.steampowered.com/api/appdetails/"
    parameters = {"appids": appid}

    json_data = get_request(url, parameters=parameters)
    # the response is keyed by the appid as a string
    json_app_data = json_data[str(appid)]

    if json_app_data['success']:
        return json_app_data['data']

    # keep a row for the app so the output stays aligned with the app list
    return {'name': name, 'steam_appid': appid}

            
# list of apps to download
app_list = pd.read_csv('data/app_list.csv')

# output file, checkpoint file and the fields to keep as csv columns
steam_app_data = 'steam_app_data.csv'
steam_index = 'steam_index.txt'
steam_columns = ['type', 'name', 'steam_appid', 'required_age', 'is_free', 'controller_support', 'dlc', 'detailed_description', 'about_the_game', 'short_description', 'fullgame', 'supported_languages', 'header_image', 'website', 'pc_requirements', 'mac_requirements', 'linux_requirements', 'legal_notice', 'drm_notice', 'ext_user_account_notice', 'developers', 'publishers', 'demos', 'price_overview', 'packages', 'package_groups', 'platforms', 'metacritic', 'reviews', 'categories', 'genres', 'screenshots', 'movies', 'recommendations', 'achievements', 'release_date', 'support_info', 'background', 'content_descriptors']

# temporarily overwrite last_index every time
# reset_index(steam_index)

# resume from the stored position (0 when no index file exists yet)
index = get_index(steam_index)

# Wipe or create data file and write headers if index is 0
# create_data_file(steam_app_data, index)

# download everything from the stored position onwards
process_chunks(
    parser=parse_steam_request,
    app_list=app_list,
    data_filename=steam_app_data,
    index_filename=steam_index,
    columns=steam_columns,
    begin=index
)

Starting at index 5300:

Exported lines 5300-5399 to steam_app_data.csv. Chunk 0 time: 0:02:53 (avg: 0:02:53, remaining: 11:30:23)
Exported lines 5400-5499 to steam_app_data.csv. Chunk 1 time: 0:02:54 (avg: 0:02:54, remaining: 11:29:06)
Exported lines 5500-5599 to steam_app_data.csv. Chunk 2 time: 0:02:56 (avg: 0:02:55, remaining: 11:29:25)
Current index: 5622

# Old code

In [146]:
def get_steam_app_info(appid, name=None):
    """Fetch the store details of one app from the steam appdetails endpoint.

    Parameters
    ----------
    appid : int
        Id of the app to look up.
    name : str, optional
        App name, used only for the placeholder row returned when steam
        reports no data. Defaults to None so the function can still be called
        with the appid alone.

    Returns
    -------
    dict
        The ``data`` entry of the response when ``success`` is true, otherwise
        a placeholder holding just ``name`` and ``steam_appid``.
    """
    url = "http://store.steampowered.com/api/appdetails/"
    parameters = {"appids": appid}

    json_data = get_request(url, parameters=parameters)
    # the response is keyed by the appid as a string
    json_app_data = json_data[str(appid)]

    if json_app_data['success']:
        return json_app_data['data']

    # keep a row for the app so the output stays aligned with the app list
    return {'name': name, 'steam_appid': appid}


def get_steam_app_data(start=0, stop=-1):
    """Download steam details for rows ``start`` to ``stop - 1`` of ``app_list``.

    Prints a single, continuously overwritten progress line while running and
    a summary line at the end.

    Parameters
    ----------
    start : int, default 0
        Position of the first row to download.
    stop : int, default -1
        Position one past the last row; -1 means "to the end of app_list".

    Returns
    -------
    list of dict
        One entry per row, as returned by ``get_steam_app_info``.
    """
    if stop == -1:
        # stop is exclusive, so len(app_list) already covers the last row;
        # len + 1 inflated the totals shown in the progress output
        stop = len(app_list)

    app_data = []
    all_times = []

    # NOTE: rows are read from the module-level app_list
    for index, row in app_list[start:stop].iterrows():
        start_time = time.time()

        appid = row['appid']

        data = get_steam_app_info(appid)
        app_data.append(data)

        time.sleep(1) # prevent overloading api with requests

        time_taken = time.time() - start_time

        all_times.append(time_taken)
        mean_time = statistics.mean(all_times)

        # progress figures assume the row label equals the row position
        total = stop - start
        complete = index + 1 - start
        todo = total - complete

        est_remaining = todo * mean_time
        remaining_time = dt.timedelta(seconds=round(est_remaining))

        template = '  {} of {}: appid {}, idx {}. this: {:.2}s, avg: {:.2}s, left: {}'
        print(template.format(complete+start, total+start, appid, index, time_taken, mean_time, remaining_time), end='\r')

    total_time_taken = dt.timedelta(seconds=sum(all_times))
    template = '\n  Finished chunk {}-{}. Time taken: {}. Apps retrieved: {}'
    print(template.format(start, stop, total_time_taken, len(app_data)))

    return app_data


def process_chunks(filename, last_index_filename, begin=0, end=-1, chunksize=100):
    """Download steam app data chunk by chunk, appending each chunk to a csv file.

    After every chunk the rows are appended to ``data/<filename>`` and the
    position of the next unprocessed row is written to
    ``data/<last_index_filename>``, so a later run can resume from there.

    Parameters
    ----------
    filename : str
        Name of the csv file inside ``data/`` that rows are appended to.
    last_index_filename : str
        Name of the file inside ``data/`` that stores the resume position.
    begin : int, default 0
        Position of the first row to process.
    end : int, default -1
        Position one past the last row; -1 means "to the end of app_list".
    chunksize : int, default 100
        Number of rows downloaded between two writes.

    Notes
    -----
    Reads the module-level ``app_list`` (for its length) and ``columns``
    (for the csv field names).
    """
    print('Beginning at index {}:'.format(begin))

    if end == -1:
        # stop values are exclusive, so the last boundary is len(app_list);
        # len + 1 would report and store a position one past the real end
        end = len(app_list)

    # chunk boundaries: begin, begin + chunksize, ..., end
    chunks = np.arange(begin, end, chunksize)
    chunks = np.append(chunks, end)

    data_path = os.path.join('data', filename)
    index_path = os.path.join('data', last_index_filename)

    apps_written = 0
    chunk_times = []

    # pair every boundary with the next one to get (start, stop) per chunk
    for i, (start, stop) in enumerate(zip(chunks[:-1], chunks[1:])):
        start_time = time.time()

        app_data = get_steam_app_data(start, stop)

        with open(data_path, 'a', newline='', encoding='utf-8') as f:
            # keys that are not in columns are dropped instead of raising
            writer = csv.DictWriter(f, fieldnames=columns, extrasaction='ignore')
            writer.writerows(app_data)
            print('  Successfully wrote apps (idx) {} to {}.'.format(start, stop-1))

        apps_written += len(app_data)

        # checkpoint: store where the next run should start
        with open(index_path, 'w') as f:
            print(stop, file=f)

        time_taken = time.time() - start_time

        chunk_times.append(time_taken)
        mean_time = statistics.mean(chunk_times)

        # chunks still to do after this one, times the average chunk duration
        est_remaining = (len(chunks) - i - 2) * mean_time

        remaining_td = dt.timedelta(seconds=round(est_remaining))
        time_td = dt.timedelta(seconds=round(time_taken))
        mean_td = dt.timedelta(seconds=round(mean_time))

        print('\nChunk {}, time: {}, avg: {}, remaining: {}\n'.format(i, time_td, mean_td, remaining_td))

    print('\nProcessing chunks complete. {} apps written'.format(apps_written))

    
# individual app data from steam
filename = 'steam_app_data.csv'
last_index_filename = 'steam_last_index.txt'
columns = ['type', 'name', 'steam_appid', 'required_age', 'is_free', 'controller_support', 'dlc', 'detailed_description', 'about_the_game', 'short_description', 'fullgame', 'supported_languages', 'header_image', 'website', 'pc_requirements', 'mac_requirements', 'linux_requirements', 'legal_notice', 'drm_notice', 'ext_user_account_notice', 'developers', 'publishers', 'demos', 'price_overview', 'packages', 'package_groups', 'platforms', 'metacritic', 'reviews', 'categories', 'genres', 'screenshots', 'movies', 'recommendations', 'achievements', 'release_date', 'support_info', 'background', 'content_descriptors']


def reset_last_index():
    """Overwrite the checkpoint file named by ``last_index_filename`` with 0."""
    index_path = os.path.join('data', last_index_filename)

    with open(index_path, 'w') as index_file:
        index_file.write('0\n')


# temporarily overwrite last_index every time
# reset_last_index()

# read the stored resume position; a missing checkpoint file means start from 0
try:
    rel_path = os.path.join('data', last_index_filename)
    
    with open(rel_path, 'r') as f:
        index = int(f.readline())
except FileNotFoundError:
    index = 0

# starting from scratch: (re)create the data file with just the header row
# WARNING: opening with 'w' wipes any rows already in the file
if index == 0:
    rel_path = os.path.join('data', filename)
    
    with open(rel_path, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=columns)
        writer.writeheader()

# download everything from the stored position onwards
process_chunks(filename, last_index_filename, begin=index)

Beginning at index 5100:
  5102 of 5200: appid 343560, idx 5101. this: 1.7s, avg: 1.7s, left: 0:02:43

KeyboardInterrupt: 

In [None]:
# get list of apps and app ids
response = requests.get("https://api.steampowered.com/ISteamApps/GetAppList/v2/")
json_data = response.json()
# the app entries sit under the 'applist' -> 'apps' keys of the response
# NOTE: this rebinds app_list, which other cells use for the csv-based dataframe
app_list = json_data['applist']['apps']

# display first five apps
app_list[:5]

In [None]:
# convert to pandas dataframe and export to csv
steam_app_ids = pd.DataFrame(app_list)

# disabled for consistency
# steam_app_ids.to_csv('data/steam_app_ids.csv', index=False)

# reading from csv for consistency
# NOTE: this replaces the dataframe built above with the previously saved copy
steam_app_ids = pd.read_csv('data/steam_app_ids.csv')

# sorting values to make downloading easier
# (the index is rebuilt as 0..n-1 so that row labels match row positions)
steam_app_ids = steam_app_ids.sort_values('appid').reset_index(drop=True)
steam_app_ids

In [None]:
# get individual app information from steam

def get_app_info(app_id, attempt=0):
    """Fetch the store details of one app, retrying until steam responds.

    Parameters
    ----------
    app_id : int
        Id of the app to look up.
    attempt : int, default 0
        Seconds to wait before the first retry; every later retry waits 10.

    Returns
    -------
    tuple
        ``(data, None)`` when steam reports success, where ``data`` is the
        ``data`` entry of the response; ``(None, app_id)`` when steam responds
        but reports no data for the app.
    """
    url = "http://store.steampowered.com/api/appdetails/"
    parameters = {"appids": app_id}

    # loop instead of recursing, so a long outage cannot exhaust the call stack
    while True:
        response = requests.get(url, params=parameters)

        if response:
            # the response is keyed by the app id as a string
            json_data = response.json()[str(app_id)]

            if json_data['success']:
                return json_data['data'], None

            # steam answered but has nothing for this id: report it as an error id
            return None, app_id

        print('no response, trying again in {} seconds (appid: {})'.format(attempt, app_id))
        time.sleep(attempt)
        attempt = 10
        

def steam_scrape(start=0, stop=-1, verbose=False):
    """Download steam details for rows ``start`` to ``stop - 1`` of ``steam_app_ids``.

    Parameters
    ----------
    start : int, default 0
        Position of the first row to download.
    stop : int, default -1
        Position one past the last row; -1 means "to the end".
    verbose : bool, default False
        Print per-app timing, an estimate of the time left and a final summary.

    Returns
    -------
    tuple of (list, list)
        The data of every app steam returned details for, and the ids of the
        apps it reported no data for.
    """
    total_rows = len(steam_app_ids)

    if stop == -1:
        # stop is exclusive, so the number of rows already covers the last one
        stop = total_rows

    app_data = []
    errors = []
    all_times = []

    # NOTE: rows are read from the module-level steam_app_ids
    for index, row in steam_app_ids[start:stop].iterrows():
        if verbose:
            start_time = time.time()

        app_id = row['appid']

        data, error_id = get_app_info(app_id)

        if data:
            app_data.append(data)
        if error_id is not None:
            errors.append(error_id)

        if verbose:
            time_taken = time.time() - start_time
            all_times.append(time_taken)
            mean_time = sum(all_times) / len(all_times)

            # rows still to download after this one; the previous formula,
            # stop - (index - start), grew with start and never reached zero.
            # Assumes the row label equals the row position.
            apps_left = max(min(stop, total_rows) - index - 1, 0)
            est_remaining = apps_left * mean_time
            remaining_time = dt.timedelta(seconds=est_remaining)

            template = 'appid {}, idx {}. this: {:.2}s, avg: {:.2}s, left: {}.'
            print(template.format(app_id, index, time_taken, mean_time, remaining_time))

    if verbose:
        total_time_taken = dt.timedelta(seconds=sum(all_times))

        template = '\nFinished! Total time taken: {}. Apps: {}, errors: {}.'
        print(template.format(total_time_taken, len(app_data), len(errors)))

    return app_data, errors


# trial run on the first ten rows, shown as a dataframe
first_10, first_10_errors = steam_scrape(stop=10, verbose=True)
pd.DataFrame(first_10)

In [None]:
# download of the first 1000 rows, currently disabled
# NOTE(review): a later cell reads first_1000_data, which is only defined by this line
# first_1000_data, first_1000_errors = steam_scrape(start=0, stop=1000, verbose=True)

print('Finished.')

In [None]:
# fields to keep as csv columns
columns = [
    'type',
    'name',
    'steam_appid',
    'required_age',
    'is_free',
    'controller_support',
    'dlc',
    'detailed_description',
    'about_the_game',
    'short_description',
    'fullgame',
    'supported_languages',
    'header_image',
    'website',
    'pc_requirements',
    'mac_requirements',
    'linux_requirements',
    'legal_notice',
    'drm_notice',
    'ext_user_account_notice',
    'developers',
    'publishers',
    'demos',
    'price_overview',
    'packages',
    'package_groups',
    'platforms',
    'metacritic',
    'reviews',
    'categories',
    'genres',
    'screenshots',
    'movies',
    'recommendations',
    'achievements',
    'release_date',
    'support_info',
    'background',
    'content_descriptors',
    'alternate_appid'
]

# build a dataframe restricted to those columns and export it
# NOTE(review): first_1000_data must already exist from an earlier download
steam_app_data_0_1000 = pd.DataFrame(first_1000_data, columns=columns)
steam_app_data_0_1000.to_csv('data/steam_app_data_0_1000.csv', index=False)

steam_app_data_0_1000

In [None]:
def process_chunks(begin=0, end=-1, chunksize=100, pause=10, verbose=False):
    """Scrape steam app data in chunks, pausing between chunks.

    Parameters
    ----------
    begin : int, default 0
        Position of the first row to process.
    end : int, default -1
        Position one past the last row; -1 means "to the end of steam_app_ids".
    chunksize : int, default 100
        Number of rows handed to ``steam_scrape`` per chunk.
    pause : float, default 10
        Seconds to sleep after every chunk.
    verbose : bool, default False
        Print pause/resume messages and pass ``verbose`` on to ``steam_scrape``.

    Returns
    -------
    tuple of (list, list)
        All app data and all error ids, accumulated over every chunk.
    """
    if end == -1:
        # stop values are exclusive, so the last boundary is the number of rows
        end = len(steam_app_ids)

    # chunk boundaries: begin, begin + chunksize, ..., end
    chunks = np.arange(begin, end, chunksize)
    chunks = np.append(chunks, end)

    all_data = []
    all_errors = []

    # pair every boundary with the next one to get (start, stop) per chunk
    for start, stop in zip(chunks[:-1], chunks[1:]):
        app_data, errors = steam_scrape(start, stop, verbose=verbose)

        # extend in place; the earlier itertools.chain version needed an
        # import that is not among the notebook's imports
        all_data.extend(app_data)
        all_errors.extend(errors)

        if verbose:
            print('\nPausing {} seconds...'.format(pause))

        time.sleep(pause)

        if verbose:
            print('Resuming.\n')

    return all_data, all_errors

# trial run: first ten rows in chunks of two, with a one second pause between chunks
data, errors = process_chunks(begin=0, end=10, chunksize=2, pause=1, verbose=True)

pd.DataFrame(data, columns=columns)

In [None]:
# experiment: create the test csv with only a header row ('w' wipes any existing file)
with open('data/steam_app_data_test.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=columns)
    
    writer.writeheader()
    
# ...then reopen it in append mode and add two rows below the header
with open('data/steam_app_data_test.csv', 'a', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=columns)
    writer.writerow(data[0])
    writer.writerow(data[1])
    
# experiment: store a position in the index file
with open('scraping/last_index.txt', 'w') as f:
    index = 515
    print(index, file=f)

# ...and read it back; readline returns text here, it is not converted to int
with open('scraping/last_index.txt', 'r') as f:
    index = f.readline()

print('index:', index)



In [None]:
def process_chunks(filename, begin=0, end=-1, chunksize=100, pause=10, verbose=False):
    """Scrape steam app data in chunks, writing every chunk to disk.

    After each chunk the app data is appended to ``filename``, any error ids
    are appended to ``scraping/errors.txt`` as one timestamped row, and the
    position of the next unprocessed row is written to
    ``scraping/last_index.txt``.

    Parameters
    ----------
    filename : str
        Path of the csv file that app data is appended to.
    begin : int, default 0
        Position of the first row to process.
    end : int, default -1
        Position one past the last row; -1 means "to the end of steam_app_ids".
    chunksize : int, default 100
        Number of rows handed to ``steam_scrape`` per chunk.
    pause : float, default 10
        Seconds to sleep after every chunk.
    verbose : bool, default False
        Print pause/resume messages and pass ``verbose`` on to ``steam_scrape``.

    Returns
    -------
    str
        The fixed message ``'Processing chunks complete'``.

    Notes
    -----
    Reads the module-level ``steam_app_ids`` (for its length) and ``columns``
    (for the csv field names).
    """
    if end == -1:
        # stop values are exclusive, so the last boundary is the number of rows;
        # len + 1 would store a position one past the real end
        end = len(steam_app_ids)

    # chunk boundaries: begin, begin + chunksize, ..., end
    chunks = np.arange(begin, end, chunksize)
    chunks = np.append(chunks, end)

    # pair every boundary with the next one to get (start, stop) per chunk
    for start, stop in zip(chunks[:-1], chunks[1:]):
        app_data, errors = steam_scrape(start, stop, verbose=verbose)

        if app_data:
            with open(filename, 'a', newline='', encoding='utf-8') as f:
                # drop fields that are not in columns; without this a single
                # unlisted key makes DictWriter raise ValueError mid-download
                writer = csv.DictWriter(f, fieldnames=columns, extrasaction='ignore')
                writer.writerows(app_data)

        if errors:
            with open('scraping/errors.txt', 'a', newline='') as f:
                writer = csv.writer(f)
                timestamp = dt.datetime.today().strftime('%Y/%m/%d %H:%M:%S')
                writer.writerow([timestamp, filename, errors])

        # checkpoint: store where the next run should start
        with open('scraping/last_index.txt', 'w') as f:
            print(stop, file=f)

        if verbose:
            print('\nPausing {} seconds...'.format(pause))

        time.sleep(pause)

        if verbose:
            print('Resuming.\n')

    return 'Processing chunks complete' # all_data, all_errors

# test run against a separate data file
filename = 'data/steam_app_data_test.csv'

# read the stored resume position; an empty or non-numeric file counts as 0
with open('scraping/last_index.txt', 'r') as f:
    try:
        index = int(f.readline())
    except ValueError:
        index = 0
        
    # starting from scratch: empty the error log and rewrite the data file header
    # NOTE: the inner with blocks rebind f while the index file is still open
    if index == 0:
        with open('scraping/errors.txt', 'w') as f:
            pass
        
        with open(filename, 'w', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=columns)
            writer.writeheader()
    
print('Starting index:', index)

# process_chunks(filename, begin=index, end=index+10, chunksize=2, pause=1, verbose=True)

# return total apps written to file to verify?

# return total apps written to file to verify?

In [None]:
# full run against the real data file
filename = 'data/steam_app_data.csv'

# read the stored resume position; an empty or non-numeric file counts as 0
with open('scraping/last_index.txt', 'r') as f:
    try:
        index = int(f.readline())
    except ValueError:
        index = 0
        
    # starting from scratch: empty the error log and rewrite the data file header
    # WARNING: opening the data file with 'w' wipes any rows already in it
    if index == 0:
        with open('scraping/errors.txt', 'w') as f:
            pass
        
        with open(filename, 'w', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=columns)
            writer.writeheader()
    
print('Starting index:', index)

# download from the stored position to the end, pausing a minute after every 100 apps
process_chunks(filename, begin=index, chunksize=100, pause=60, verbose=True)

# idx 6200

# idx 6200

In [None]:
# csv_input = pd.read_csv('data/steam_app_data.csv')
# csv_input['alternate_appid'] = ''
# csv_input.to_csv('data/steam_app_data2.csv')

In [None]:
# two ways of building a timestamp string: manual formatting (fields are not zero-padded)...
t = dt.datetime.today()
timestamp = '{}/{}/{} {}:{}:{}'.format(t.year, t.month, t.day, t.hour, t.minute, t.second)
timestamp

# ...and strftime, which zero-pads every field
timestamp = dt.datetime.today().strftime('%Y/%m/%d %H:%M:%S')
timestamp

In [None]:
# convert to dataframe and export to csv
steam_app_data = pd.DataFrame(steam_app_list)
steam_app_data.to_csv('data/steam_app_data.csv', index=False)
steam_app_data

In [None]:
# request 'all' from steam spy
url = "https://steamspy.com/api.php"
parameters = {"request": "all"}
response = requests.get(url=url, params=parameters)
json_data = response.json()

steam_spy = pd.DataFrame.from_dict(json_data, orient='index')
steam_spy

In [None]:
# request individual apps from steam spy
url = "https://steamspy.com/api.php"

steam_spy_list = []

for app_id in app_ids[:20]:
    try:
        parameters = {"request": "appdetails", "appid": app_id}
        response = requests.get(url=url, params=parameters)
        steam_spy_list.append(response.json())
        time.sleep(0.5)
    except:
        pass
    
steam_spy_individual = pd.DataFrame(steam_spy_list)
steam_spy_individual

# steamspy

In [None]:
def get_request(url, parameters=None):
    """Return the decoded json body of a GET request, retrying until it succeeds.

    Parameters
    ----------
    url : str
        Address to request.
    parameters : dict, optional
        Query-string parameters passed to ``requests.get`` as ``params``.

    Returns
    -------
    The value produced by ``response.json()`` for the first successful response.

    Notes
    -----
    There is no retry limit. An SSL error is retried after a 5 second
    countdown, a falsy response (requests treats 4xx/5xx as falsy) after
    10 seconds. Any other exception from ``requests.get`` propagates.
    """
    # loop instead of recursing, so a long outage cannot exhaust the call stack
    while True:
        try:
            response = requests.get(url=url, params=parameters)
        # fully qualified: a bare SSLError is not imported anywhere, so the
        # handler itself raised NameError as soon as any request failed
        except requests.exceptions.SSLError as s:
            print('SSL Error:', s)

            for i in range(5, 0, -1):
                print('Waiting... ({})'.format(i), end='\r')
                time.sleep(1)
            # trailing spaces blank out what is left of the countdown line
            print('Retrying.          ')
            continue

        if response:
            return response.json()

        print('No response, waiting 10 seconds...')
        time.sleep(10)
        print('Retrying.')


def get_app_data(start, stop, parser, pause):
    """Run ``parser`` on every app in rows ``start`` to ``stop - 1`` of ``app_list``.

    ``parser`` is called as ``parser(appid, name)`` once per row and its
    results are returned as a list, in row order. The function sleeps for
    ``pause`` seconds after each call.
    """
    collected = []

    # NOTE: rows are read from the module-level app_list, not from an argument
    for position, entry in app_list[start:stop].iterrows():
        # '\r' keeps the progress message on a single, overwritten line
        print('Current index: {}'.format(position), end='\r')

        collected.append(parser(entry['appid'], entry['name']))

        # prevent overloading api with requests
        time.sleep(pause)

    return collected


def process_chunks(parser, app_list, data_filename, index_filename, columns, begin=0, end=-1, chunksize=100, pause=1):
    """Download app data chunk by chunk, appending each chunk to a csv file.

    After every chunk the rows are appended to ``data/<data_filename>`` and the
    position of the next unprocessed row is written to ``data/<index_filename>``,
    so the stored value can be passed back in as ``begin`` on a later run.

    Parameters
    ----------
    parser : callable
        Handed to ``get_app_data``, which calls it once per app.
    app_list : sized
        Only its length is used here, to work out the default ``end``.
        NOTE(review): ``get_app_data`` reads its rows from the module-level
        ``app_list``, not from this argument -- confirm both are the same.
    data_filename : str
        Name of the csv file inside ``data/`` that rows are appended to.
    index_filename : str
        Name of the file inside ``data/`` that stores the resume position.
    columns : list of str
        Csv field names; keys not listed here are dropped when writing.
    begin : int, default 0
        Position of the first row to process.
    end : int, default -1
        Position one past the last row to process; -1 means "to the end".
    chunksize : int, default 100
        Number of rows downloaded between two writes.
    pause : float, default 1
        Seconds to wait between two requests.
    """
    print('Starting at index {}:\n'.format(begin))

    if end == -1:
        # stop values are exclusive, so the last boundary is len(app_list);
        # len + 1 would report and store a position one past the real end
        end = len(app_list)

    # chunk boundaries: begin, begin + chunksize, ..., end
    chunks = np.arange(begin, end, chunksize)
    chunks = np.append(chunks, end)

    rel_path = os.path.join('data', data_filename)
    idx_path = os.path.join('data', index_filename)

    apps_written = 0
    chunk_times = []

    # pair every boundary with the next one to get (start, stop) per chunk
    for i, (start, stop) in enumerate(zip(chunks[:-1], chunks[1:])):
        start_time = time.time()

        app_data = get_app_data(start, stop, parser, pause)

        with open(rel_path, 'a', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=columns, extrasaction='ignore')

            # short countdown before the write starts
            print('About to write data (3)', end='\r')
            time.sleep(0.5)
            print('About to write data (2)', end='\r')
            time.sleep(0.5)
            print('About to write data (1)', end='\r')
            time.sleep(0.5)

            writer.writerows(app_data)
            print('Exported lines {}-{} to {}.'.format(start, stop-1, data_filename), end=' ')

        apps_written += len(app_data)

        # checkpoint: store where the next run should start
        with open(idx_path, 'w') as f:
            print(stop, file=f)

        time_taken = time.time() - start_time

        chunk_times.append(time_taken)
        mean_time = statistics.mean(chunk_times)

        # chunks still to do after this one, times the average chunk duration
        est_remaining = (len(chunks) - i - 2) * mean_time

        remaining_td = dt.timedelta(seconds=round(est_remaining))
        time_td = dt.timedelta(seconds=round(time_taken))
        mean_td = dt.timedelta(seconds=round(mean_time))

        print('Chunk {} time: {} (avg: {}, remaining: {})'.format(i, time_td, mean_td, remaining_td))

    print('\nProcessing chunks complete. {} apps written'.format(apps_written))


def reset_index(index_filename):
    """Overwrite ``data/<index_filename>`` so that it holds the position 0."""
    index_path = os.path.join('data', index_filename)

    with open(index_path, 'w') as index_file:
        index_file.write('0\n')
        

def get_index(index_filename):
    """Read the stored position from ``data/<index_filename>``.

    Returns the integer on the first line of the file, or 0 when the file
    does not exist.
    """
    index_path = os.path.join('data', index_filename)

    try:
        with open(index_path, 'r') as index_file:
            return int(index_file.readline())
    except FileNotFoundError:
        # no checkpoint yet: start from the beginning
        return 0


def prepare_data_file(filename, index, columns=None):
    """Create (or wipe) ``data/<filename>`` and write only the csv header row.

    Nothing happens unless ``index`` is 0, so a run that resumes from a stored
    position never loses the rows already downloaded.

    Parameters
    ----------
    filename : str
        Name of the csv file inside ``data/``.
    index : int
        Current resume position; the file is only (re)created when it is 0.
    columns : list of str, optional
        Header fields. When omitted, the module-level ``columns`` is used,
        which is what this function always did before the parameter existed.

    Raises
    ------
    ValueError
        If no column names are available. Raised before the file is opened,
        so an existing file is not truncated by a failed call.
    """
    if index != 0:
        return

    fieldnames = columns if columns is not None else globals().get('columns')
    if fieldnames is None:
        raise ValueError('No column names available: pass columns explicitly.')

    rel_path = os.path.join('data', filename)

    # utf-8 matches the encoding used when rows are appended to the file later
    with open(rel_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()


In [None]:
# will copy

def parse_steamspy_request(appid, name):
    """Fetch the steamspy 'appdetails' record of one app.

    ``name`` is accepted so the function matches the ``parser(appid, name)``
    call signature, but it is not used. Returns whatever ``get_request``
    returns for the request.
    """
    return get_request(
        "https://steamspy.com/api.php",
        {"request": "appdetails", "appid": appid},
    )

            
# get list of apps to retrieve
app_list = pd.read_csv('data/app_list.csv')

# set files and columns
steamspy_data = 'steamspy_data.csv'
steamspy_index = 'steamspy_index.txt'
steamspy_columns = ['appid', 'name', 'developer', 'publisher', 'score_rank', 'positive', 'negative', 'userscore', 'owners', 'average_forever', 'average_2weeks', 'median_forever', 'median_2weeks', 'price', 'initialprice', 'discount', 'languages', 'genre', 'ccu', 'tags']
            
# TESTING: temporarily overwrite last index every time
# reset_index(steamspy_index)

# get stored index (0 when no index file exists yet)
index = get_index(steamspy_index)

# WARNING: wipes data file if index is 0
# prepare_data_file(steamspy_data, index)

# retrieve and write data to file, starting from the stored index
process_chunks(
    parser=parse_steamspy_request,
    app_list=app_list,
    data_filename=steamspy_data,
    index_filename=steamspy_index,
    columns=steamspy_columns,
    begin=index,
    pause=0.3
)