# Data Extraction Process

In [80]:
# standard library imports
import csv
import datetime as dt
import json
import os
import statistics
import time

# third-party imports
import numpy as np
import pandas as pd
import requests

# customisations - ensure tables show all columns
# Use the fully-qualified option name: the short form "max_columns" is treated
# as a pattern and raises OptionError when it matches more than one option.
pd.set_option("display.max_columns", 100)

In [119]:
# Base endpoint of the SteamSpy API
url = "https://steamspy.com/api.php"

# Collect the first 10 pages of the "all" listing, one DataFrame per page.
# The games are stored in the SteamSpy API in order of popularity.
steam_spy_list = []

for p in range(10):
    parameters = {"request": "all", "page": p}
    response = requests.get(url=url, params=parameters)
    # Fail loudly on an HTTP error instead of trying to parse an error page as JSON
    response.raise_for_status()
    json_data = response.json()
    print("---------------------------------------------------")
    print(f"Data extracted from page {p}")
    # orient='index' turns every top-level key of the page into one row
    steam_spy_tmp = pd.DataFrame.from_dict(json_data, orient='index')
    steam_spy_list.append(steam_spy_tmp)

# Compile the pages together into a single dataframe with a fresh 0..n-1 index
steam_spy_all = pd.concat(steam_spy_list, ignore_index=True)
steam_spy_all

---------------------------------------------------
Data extracted from page 0
---------------------------------------------------
Data extracted from page 1
---------------------------------------------------
Data extracted from page 2
---------------------------------------------------
Data extracted from page 3
---------------------------------------------------
Data extracted from page 4
---------------------------------------------------
Data extracted from page 5
---------------------------------------------------
Data extracted from page 6
---------------------------------------------------
Data extracted from page 7
---------------------------------------------------
Data extracted from page 8
---------------------------------------------------
Data extracted from page 9


Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,ccu
0,570,Dota 2,Valve,Valve,,1231949,228624,0,"100,000,000 .. 200,000,000",34674,1535,1013,718,0,0,0,509412
1,730,Counter-Strike: Global Offensive,"Valve, Hidden Path Entertainment",Valve,,4698693,633657,0,"50,000,000 .. 100,000,000",29044,974,8180,407,0,0,0,1030379
2,440,Team Fortress 2,Valve,Valve,,713187,44917,0,"50,000,000 .. 100,000,000",8786,1208,430,362,0,0,0,104499
3,578080,PLAYERUNKNOWN'S BATTLEGROUNDS,"KRAFTON, Inc.","KRAFTON, Inc.",,901900,767516,0,"50,000,000 .. 100,000,000",25442,672,11647,245,2999,2999,0,227673
4,304930,Unturned,Smartly Dressed Games,Smartly Dressed Games,,384630,37019,0,"20,000,000 .. 50,000,000",5104,1817,340,840,0,0,0,25080
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,38450,MDK,Shiny Entertainment,Interplay Inc.,,192,55,0,"20,000 .. 50,000",1,0,1,0,999,999,0,0
9996,599000,Pizza Connection 2,Assemble Entertainment,Assemble Entertainment,,102,40,0,"20,000 .. 50,000",0,0,0,0,599,599,0,0
9997,827780,Margot's Word Brain,Slam Productions Ltd,Funbox Media Ltd,,1,3,0,"20,000 .. 50,000",0,0,0,0,299,299,0,0
9998,377070,Noctropolis,"Flashpoint Studios, Nightdive Studios",Nightdive Studios,,39,7,0,"20,000 .. 50,000",308,0,308,0,999,999,0,0


In [186]:
# Export the dataframe to the Data folder, then reload it from disk so that
# later cells work from the saved copy.
app_list = steam_spy_all
# Create the output folder if needed; to_csv does not create missing directories
os.makedirs('Data', exist_ok=True)
app_list.to_csv('Data/app_list.csv', index=False)
app_list = pd.read_csv('Data/app_list.csv')
app_list

Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,ccu
0,570,Dota 2,Valve,Valve,,1231949,228624,0,"100,000,000 .. 200,000,000",34674,1535,1013,718,0,0,0,509412
1,730,Counter-Strike: Global Offensive,"Valve, Hidden Path Entertainment",Valve,,4698693,633657,0,"50,000,000 .. 100,000,000",29044,974,8180,407,0,0,0,1030379
2,440,Team Fortress 2,Valve,Valve,,713187,44917,0,"50,000,000 .. 100,000,000",8786,1208,430,362,0,0,0,104499
3,578080,PLAYERUNKNOWN'S BATTLEGROUNDS,"KRAFTON, Inc.","KRAFTON, Inc.",,901900,767516,0,"50,000,000 .. 100,000,000",25442,672,11647,245,2999,2999,0,227673
4,304930,Unturned,Smartly Dressed Games,Smartly Dressed Games,,384630,37019,0,"20,000,000 .. 50,000,000",5104,1817,340,840,0,0,0,25080
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,38450,MDK,Shiny Entertainment,Interplay Inc.,,192,55,0,"20,000 .. 50,000",1,0,1,0,999,999,0,0
9996,599000,Pizza Connection 2,Assemble Entertainment,Assemble Entertainment,,102,40,0,"20,000 .. 50,000",0,0,0,0,599,599,0,0
9997,827780,Margot's Word Brain,Slam Productions Ltd,Funbox Media Ltd,,1,3,0,"20,000 .. 50,000",0,0,0,0,299,299,0,0
9998,377070,Noctropolis,"Flashpoint Studios, Nightdive Studios",Nightdive Studios,,39,7,0,"20,000 .. 50,000",308,0,308,0,999,999,0,0


In [81]:
# Create a function to continuously retrieve data from http://store.steampowered.com/api/appdetails/.
def get_request(url, parameters=None):
    """Return json-formatted response of a get request using optional parameters.

    The request is retried until it succeeds: an SSL error triggers a
    5-second countdown before the next attempt, and an unsuccessful response
    triggers a 10-second wait.

    Parameters
    ----------
    url : string
    parameters : {'parameter': 'value'}
        parameters to pass as part of get request

    Returns
    -------
    json_data
        json-formatted response (dict-like)
    """
    # Retry in a loop rather than recursively so a long outage cannot exhaust
    # the interpreter's recursion limit.
    while True:
        try:
            response = requests.get(url=url, params=parameters)
        except requests.exceptions.SSLError as s:
            print('SSL Error:', s)

            for i in range(5, 0, -1):
                print('\rWaiting... ({})'.format(i), end='')
                time.sleep(1)
            # pad with spaces so the countdown text is fully overwritten
            print('\rRetrying.' + ' '*10)
            continue

        # A requests Response is falsy when its status code signals an error
        if response:
            return response.json()

        # an unsuccessful response usually means too many requests. Wait and try again
        print('No response, waiting 10 seconds...')
        time.sleep(10)
        print('Retrying.')

In [165]:
def steam_request(appid, name):
    """Unique parser to handle data from Steam Store API.

    Requests the app details for ``appid`` and strips the fields that are not
    needed downstream. If the API reports the lookup as unsuccessful, a
    minimal record built from ``name`` and ``appid`` is returned instead.

    Parameters
    ----------
    appid : app identifier sent as the ``appids`` query parameter
    name : app name, used only for the fallback record

    Returns : json formatted data (dict-like)
    """
    # Fields dropped from every record before it is returned
    unused_fields = (
        'required_age',
        'controller_support',
        'dlc',
        'detailed_description',
        'about_the_game',
        'short_description',
        'fullgame',
        'supported_languages',
        'reviews',
        'header_image',
        'website',
        'pc_requirements',
        'mac_requirements',
        'linux_requirements',
        'legal_notice',
        'demos',
        'packages',
        'package_groups',
        'platforms',
        'metacritic',
        'screenshots',
        'movies',
        'background',
        'support_info',
        'content_descriptors',
    )

    url = "http://store.steampowered.com/api/appdetails/"
    parameters = {"appids": appid}

    json_data = get_request(url, parameters=parameters)
    # The response is keyed by the app id as a string
    json_app_data = json_data[str(appid)]

    if json_app_data['success']:
        data = json_app_data['data']
    else:
        data = {'name': name, 'steam_appid': appid}

    # pop with a default so fields missing from a record are simply skipped
    for field in unused_fields:
        data.pop(field, None)

    return data

In [180]:
app_data = []
print(len(app_list))

# Walk through every row of app_list, issuing one Steam Store request per app
for index, row in app_list.iterrows():
    print(f'Current index: {index}', end='\r')
    appid, name = row['appid'], row['name']

    # Fetch the trimmed store record for this app and collect it
    data = steam_request(appid, name)
    app_data.append(data)

    # Pause between requests
    time.sleep(1)

10000
No response, waiting 10 seconds...
Retrying.
No response, waiting 10 seconds...
Retrying.
No response, waiting 10 seconds...
Retrying.
Current index: 9999

In [187]:
# The collected records are not tabular, so persist them as a JSON file.
with open('Data/app_data.json', mode='w') as json_out:
    json_out.write(json.dumps(app_data))

In [193]:
# Reload the saved JSON file and inspect a single record as a sanity check
with open('Data/app_data.json') as saved:
    data = json.loads(saved.read())

data[9999]

{'type': 'game',
 'name': 'Construction Machines 2014',
 'steam_appid': 252050,
 'is_free': False,
 'developers': ['GameCask'],
 'publishers': ['GameCask'],
 'price_overview': {'currency': 'AUD',
  'initial': 995,
  'final': 995,
  'discount_percent': 0,
  'initial_formatted': '',
  'final_formatted': 'A$ 9.95'},
 'categories': [{'id': 2, 'description': 'Single-player'}],
 'genres': [{'id': '28', 'description': 'Simulation'}],
 'recommendations': {'total': 239},
 'release_date': {'coming_soon': False, 'date': '28 Mar, 2014'}}