In [1]:
import requests
import random
import re
import pandas as pd
import numpy as np
import time
from IPython.display import clear_output


In [2]:
def get_all_app_id(timeout=30):
    """Fetch the ids of all named apps from the Steam GetAppList endpoint.

    Args:
        timeout: seconds to wait for the API before the request is abandoned.

    Returns:
        list: the ``appid`` of every entry that has a non-empty name, in the
        order the API returned them. ``None`` when the request fails, the
        status code is not 200, or the body does not have the expected
        ``applist -> apps`` layout.
    """
    url = "https://api.steampowered.com/ISteamApps/GetAppList/v2/"

    try:
        # Without a timeout a stalled connection would block the notebook forever.
        req = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        print("Failed to get all games on steam.")
        print(e)
        return

    if req.status_code != 200:
        print("Failed to get all games on steam.")
        return

    try:
        apps_data = req.json()['applist']['apps']
    except (ValueError, KeyError, TypeError) as e:
        # ValueError: body is not JSON; KeyError/TypeError: unexpected layout.
        print(e)
        return

    # Entries with an empty or missing name are skipped.
    return [app['appid'] for app in apps_data if app.get('name')]

In [3]:
# Fetch every named app id and keep the list in ascending order; the scraping
# cells below work through it slice by slice.
app_id = sorted(get_all_app_id())
len(app_id)


221301

In [4]:
def clean_html(raw_html):
    """Strip HTML tags from a string and collapse runs of whitespace.

    Args:
        raw_html: text that may contain HTML markup.

    Returns:
        str: the text with everything from a '<' up to the next '>' removed
        and every run of whitespace replaced by a single space.
    """
    # '[^>]*' (unlike '.*?') also crosses line breaks, so a tag whose
    # attributes are wrapped over several lines is removed as well.
    without_tags = re.sub(r'<[^>]*>', '', raw_html)
    # Raw string: '\s' inside a plain literal is an invalid escape sequence.
    return re.sub(r'\s+', ' ', without_tags)

In [5]:
def error_list_concat(app_id, code):
    """Append one failed app id and its error code to 'error_list.csv'.

    The file is created when it does not exist yet (or is completely empty),
    so the first failure of a fresh run can be recorded as well.

    Args:
        app_id: id of the app that failed; stored as an integer.
        code: short description of the failure (an HTTP status code or a
            label such as 'json error'); stored as a string.
    """
    print('error func', app_id, code)

    row = pd.DataFrame(data=[[app_id, code]], columns=['app_id', 'error_code'])
    row = row.astype({'app_id': int, 'error_code': str})

    try:
        logged = pd.read_csv('error_list.csv')
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # Nothing recorded so far: the new row becomes the whole file.
        logged = None

    if logged is None:
        combined = row
    else:
        combined = pd.concat([logged, row], ignore_index=True)
    combined.to_csv('error_list.csv', index=False)

    return

In [6]:
def get_app_details(app_id, timeout=30):
    """Download the store details of one Steam app as a one-row DataFrame.

    Args:
        app_id: id of the app to look up.
        timeout: seconds to wait for the store API on each request.

    Returns:
        pandas.DataFrame: a single row with the columns app_id, name,
        about_game, short_description, detailed_description, genres and
        categories (the three text fields are passed through clean_html;
        genres and categories are lists of description strings).
        ``None`` when the details could not be retrieved or parsed.

    Failures that come from the response itself (status other than 200/429,
    invalid JSON, missing fields) are recorded through error_list_concat.
    A failed connection or a timeout is only printed and not recorded.
    """
    url = "https://store.steampowered.com/api/appdetails/"
    params = {
        "appids": app_id
    }

    # Retry rate-limited requests in a loop instead of by recursion, so a
    # long throttling period cannot exhaust the call stack.
    while True:
        try:
            req = requests.get(url, params=params, timeout=timeout)
        except requests.RequestException as e:
            print(f"Request failed for app_id: {app_id}")
            print('error', e)
            return None

        if req.status_code != 429:
            break
        print("Too many requests")
        time.sleep(10)

    if req.status_code != 200:
        print(f"Failed to get app details for app_id: {app_id}")
        error_list_concat(app_id, str(req.status_code))
        return None

    try:
        data = req.json()
    except Exception as e:
        print('error', e)
        error_list_concat(app_id, 'json error')
        return None

    try:
        # The payload is keyed by the app id as a string.
        details = data[str(app_id)]['data']
        name = details['name']
        about_game = clean_html(details['about_the_game'])
        short_description = clean_html(details['short_description'])
        detailed_description = clean_html(details['detailed_description'])
        genres = [i['description'] for i in details['genres']]
        categories = [i['description'] for i in details['categories']]
        df = pd.DataFrame(
            columns=['app_id', 'name', 'about_game', 'short_description', 'detailed_description', 'genres', 'categories'],
            data=[[app_id, name, about_game, short_description, detailed_description, genres, categories]]
        )
    except Exception as e:
        # Any missing or malformed field makes the whole app count as failed.
        error_list_concat(app_id, 'data error')
        print('error', e)
        return None

    return df

In [7]:
def scraping_list(app_id_list):
    """Scrape the details of the given app ids and store them in 'steam.csv'.

    Ids that already appear in 'steam.csv' or in 'error_list.csv' are
    skipped. Either file may be missing, in which case it is treated as
    empty. After every app the running progress is printed; 'steam.csv' is
    rewritten each time a new row has been added, so an interrupted run keeps
    what was collected so far.

    Args:
        app_id_list: iterable of app ids to process.
    """
    columns = ['app_id', 'name', 'about_game', 'short_description',
               'detailed_description', 'genres', 'categories']

    try:
        df = pd.read_csv('steam.csv')
    except (FileNotFoundError, pd.errors.EmptyDataError):
        df = pd.DataFrame(columns=columns)

    try:
        failed_ids = set(pd.read_csv('error_list.csv')['app_id'].tolist())
    except (FileNotFoundError, pd.errors.EmptyDataError):
        failed_ids = set()

    # One set lookup per id instead of scanning an array for every id.
    skip_ids = set(df['app_id'].tolist()) | failed_ids
    app_id_list = [app_id for app_id in app_id_list if app_id not in skip_ids]

    total = len(app_id_list)
    count = 0
    success = 0
    failed = 0
    for app_id in app_id_list:
        app_details = get_app_details(app_id)
        if app_details is not None:
            success += 1
            if df.empty:
                df = app_details
            else:
                df = pd.concat([df, app_details], ignore_index=True)
            # Only rewrite the file when there is actually a new row.
            df.to_csv('steam.csv', index=False)
        else:
            failed += 1
        clear_output(wait=True)
        count += 1
        print(f"Progress: {count}/{total}")
        print(f"Success: {success}")
        print(f"Failed: {failed}")

In [8]:
# Scrape the first 1000 entries of the sorted app id list.
scraping_list(app_id[:1000])

Progress: 200/377
Success: 41
Failed: 159
Too many requests
Too many requests
Too many requests
Too many requests
Too many requests
Too many requests
Too many requests
Too many requests
Too many requests
Too many requests
Too many requests
Too many requests
Too many requests
Too many requests


KeyboardInterrupt: 

In [None]:
# Continue with entries 1000 up to (not including) 10000 of the sorted app id list.
scraping_list(app_id[1000:10000])

In [None]:
# Continue with entries 10000 up to (not including) 100000 of the sorted app id list.
scraping_list(app_id[10000:100000])

In [None]:
# Process everything from entry 100000 to the end of the sorted app id list.
scraping_list(app_id[100000:])