## 1. Data Collection


### Import necessary libraries

In [21]:
import pandas as pd
import numpy as np
import re
import time
import asyncio
import json
import os
from typing import Optional, Dict, Any, List
import requests
import aiohttp  # Async HTTP requests
import nest_asyncio # Nested async loops in Jupyter


### 1.1 SteamSpy API Data collection and processing

Since the official Steam API returns all applications, not just games, and doesn't filter out spam apps, we will use SteamSpy as an alternative to get a quality list of games and information not available from the official API.


In [30]:
# Allow nested use of the asyncio event loop: Jupyter already runs one, and
# get_steamspy_data() below calls run_until_complete() from inside the notebook.
nest_asyncio.apply()

async def fetch_steamspy_page(session, page):
    """Download one page of SteamSpy's ``all`` listing.

    Args:
        session: HTTP client whose ``get(url, params=...)`` returns an async
            context manager yielding the response (an
            ``aiohttp.ClientSession`` in this notebook).
        page: Page index sent as the ``page`` query parameter.

    Returns:
        A list with the values of the JSON object returned by the API. An
        empty list is returned (and a message printed) when the HTTP status
        is not 200 or when any exception is raised along the way.
    """
    query = {'request': 'all', 'page': page}
    try:
        async with session.get('https://steamspy.com/api.php', params=query) as response:
            # Guard clause: anything but HTTP 200 is reported and skipped.
            if response.status != 200:
                print(f"SteamSpy API error on page {page}: HTTP {response.status}")
                return []
            payload = await response.json()
            # The payload is keyed by app id; only the per-game records are kept.
            return list(payload.values())
    except Exception as e:
        # Best effort: one failing page must not abort the whole gather().
        print(f"SteamSpy API exception on page {page}: {e}")
        return []

async def fetch_all_steamspy_data(max_pages):
    """Fetch SteamSpy pages ``0 .. max_pages - 1`` concurrently.

    Args:
        max_pages: Number of pages to request.

    Returns:
        A ``pandas.DataFrame`` with one row per game record, in page order.
        Pages that failed contribute no rows, because
        ``fetch_steamspy_page`` returns an empty list for them.
    """
    # NOTE(review): every page is requested at once; confirm this stays within
    # SteamSpy's allowed poll rate for ``all`` requests.
    async with aiohttp.ClientSession() as session:
        page_requests = [fetch_steamspy_page(session, page) for page in range(max_pages)]
        started = time.time()
        pages = await asyncio.gather(*page_requests)
        elapsed = time.time() - started

        # gather() keeps request order, so rows end up sorted by page.
        all_games = []
        for page_games in pages:
            all_games.extend(page_games)

        print(f"Fetched {len(all_games)} games from SteamSpy in {elapsed:.2f} seconds")
        return pd.DataFrame(all_games)

def get_steamspy_data(max_pages):
    """Synchronous entry point that runs ``fetch_all_steamspy_data``.

    Args:
        max_pages: Number of SteamSpy pages to fetch.

    Returns:
        The ``pandas.DataFrame`` produced by ``fetch_all_steamspy_data``.
    """
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        # No current event loop in this thread.
        loop = None
    # A loop that was closed earlier cannot run anything any more, so it is
    # replaced just like a missing one.
    if loop is None or loop.is_closed():
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
    return loop.run_until_complete(fetch_all_steamspy_data(max_pages))

# SteamSpy serves 1000 games per page, so 21 pages yield up to 21,000 games.
max_pages_steamspy = 21
steamspy_df = get_steamspy_data(max_pages_steamspy)
# to_csv() does not create missing folders, so make sure 'data/' exists first.
os.makedirs('data', exist_ok=True)
steamspy_df.to_csv('data/steamspy_data.csv', index=False)
print("Saved steamspy_data.csv")

Fetched 21000 games from SteamSpy in 6.00 seconds
Saved steamspy_data.csv


### 1.2 Steam API Data Collection and Processing

Since most Vietnamese ISPs block Steam's DNS, a VPN is required to scrape data from Steam.

In [28]:
def fetch_steam_store_app_details(app_id, timeout=10):
    """Fetch the Steam Store ``appdetails`` payload for a single app.

    Args:
        app_id: Steam application id to look up.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The ``data`` dict of the app's entry when the API reports
        ``success``, otherwise ``None``. Request, HTTP and parsing errors are
        printed and also result in ``None``.
    """
    url = f"https://store.steampowered.com/api/appdetails?appids={app_id}"
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        # TypeError covers a JSON body that is not a mapping (e.g. ``null``).
        entry = response.json()[str(app_id)]
    except (requests.exceptions.RequestException, KeyError, ValueError, TypeError) as e:
        print(f"Steam Store API error for app ID {app_id}: {e}")
        return None

    if isinstance(entry, dict) and entry.get('success') and 'data' in entry:
        return entry['data']
    return None

def _build_store_record(data):
    """Flatten one Steam Store ``appdetails`` payload into a CSV-friendly row."""
    metacritic = data.get('metacritic')
    genres = data.get('genres')
    dlc = data.get('dlc')
    return {
        'appid': data.get('steam_appid'),
        'name': data.get('name'),
        'languages': data.get('supported_languages'),
        # Keep only the score when metacritic is a dict; pass other values through.
        'metacritic': metacritic.get('score') if isinstance(metacritic, dict) else metacritic,
        'genres': ','.join(g['description'] for g in genres) if genres else None,
        # ``or {}`` also covers an explicit ``release_date: None``.
        'release_date': (data.get('release_date') or {}).get('date'),
        'required_age': data.get('required_age'),
        'dlc': ','.join(map(str, dlc)) if dlc else None,
    }


def _unique_store_frame(records):
    """Build a DataFrame from row dicts, keeping the last row per ``appid``."""
    df = pd.DataFrame(records)
    # With no records there is no 'appid' column to deduplicate on.
    if 'appid' in df.columns:
        df = df.drop_duplicates(subset=['appid'], keep='last').reset_index(drop=True)
    return df


def fetch_all_steam_store_data(app_ids, output_filename="data/steamstore_data.csv", save_interval=100):
    """Collect Steam Store details for ``app_ids``, resuming from a previous run.

    Rows already present in ``output_filename`` are loaded and their app ids
    skipped. The remaining ids are fetched one at a time, at most one request
    every 1.5 seconds, and progress is written back to ``output_filename``
    every ``save_interval`` apps and after the last one.

    Args:
        app_ids: Iterable of Steam app ids; duplicates are ignored.
        output_filename: CSV used both as checkpoint and as final output.
        save_interval: Number of processed apps between checkpoints.

    Returns:
        A ``pandas.DataFrame`` with one row per ``appid`` (existing rows plus
        newly fetched ones), matching what is saved to the CSV.
    """
    # A bare filename has no directory part, and os.makedirs('') would raise.
    output_dir = os.path.dirname(output_filename)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    unique_app_ids = set(app_ids)
    processed_app_ids = set()
    steamstore_data = []

    try:
        existing_df = pd.read_csv(output_filename)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # A missing or empty checkpoint both mean there is nothing to resume.
        print("Starting new data collection.")
    else:
        # Remove duplicates from existing data
        existing_df = existing_df.drop_duplicates(subset=['appid'], keep='last')
        processed_app_ids = set(existing_df['appid'].tolist())
        steamstore_data = existing_df.to_dict('records')
        print(f"Loaded {len(processed_app_ids)} unique apps from existing file.")

    # Calculate remaining apps to process
    remaining_app_ids = unique_app_ids - processed_app_ids
    total_remaining = len(remaining_app_ids)

    if total_remaining == 0:
        print("All apps already processed.")
        return _unique_store_frame(steamstore_data)

    print(f"Processing {total_remaining} remaining apps...")

    last_request_time = 0
    rate_limit_delay = 1.5

    for processed, app_id in enumerate(remaining_app_ids, start=1):
        # Rate limiting: wait out whatever is left of the delay window.
        wait = rate_limit_delay - (time.time() - last_request_time)
        if wait > 0:
            time.sleep(wait)

        data = fetch_steam_store_app_details(app_id)
        last_request_time = time.time()

        if data:
            steamstore_data.append(_build_store_record(data))

        # Save progress at intervals
        if processed % save_interval == 0 or processed == total_remaining:
            df = _unique_store_frame(steamstore_data)
            # Skip the write while nothing was collected: an empty CSV could
            # not be parsed by read_csv on the next run.
            if not df.empty:
                df.to_csv(output_filename, index=False)
            print(f"\rProcessed {processed}/{total_remaining} remaining games. Total unique games: {len(df)}", end="", flush=True)

    print("\nData collection completed.")
    return _unique_store_frame(steamstore_data)

# Usage: fetch Steam Store details for every app id collected from SteamSpy.
# The CSV doubles as a checkpoint, so re-running this cell only requests the
# app ids that are not in the file yet.
app_ids = steamspy_df['appid'].tolist()
steamstore_df = fetch_all_steam_store_data(app_ids, output_filename="data/steamstore_data.csv", save_interval=100)
print("Saved steamstore_data.csv")

Loaded 9141 unique apps from existing file.
Processing 10371 remaining apps...
Processed 4500/10371 remaining games. Total unique games: 13557Steam Store API error for app ID 2084940: 502 Server Error: Bad Gateway for url: https://store.steampowered.com/api/appdetails?appids=2084940
Steam Store API error for app ID 610380: 502 Server Error: Bad Gateway for url: https://store.steampowered.com/api/appdetails?appids=610380
Steam Store API error for app ID 446540: 502 Server Error: Bad Gateway for url: https://store.steampowered.com/api/appdetails?appids=446540
Steam Store API error for app ID 512080: 502 Server Error: Bad Gateway for url: https://store.steampowered.com/api/appdetails?appids=512080
Processed 10371/10371 remaining games. Total unique games: 19382
Data collection completed.
Saved steamstore_data.csv
