## 1. Data Collection


### Import necessary libraries

In [None]:
import asyncio
import json
import re
import time
from typing import List, Dict

import aiohttp  # Async HTTP requests
import nest_asyncio  # Nested async loops in Jupyter
import numpy as np
import pandas as pd
import requests


### 1.1 SteamSpy API Data Collection and Processing

Since the official Steam API returns all applications, not just games, and doesn't filter out spam apps, we will use SteamSpy as an alternative to get a quality list of games, along with information not found in the official API.


In [None]:
# Patch asyncio so that run_until_complete() can be called from inside
# Jupyter, where an event loop is already running.
nest_asyncio.apply()

async def fetch_steamspy_page(session, page):
    # Fetch data from a single SteamSpy page
    base_url = 'https://steamspy.com/api.php'
    params = {'request': 'all', 'page': page}
    try:
        async with session.get(base_url, params=params) as response:
            if response.status == 200:
                return list((await response.json()).values())
            else:
                print(f"SteamSpy API error on page {page}: HTTP {response.status}")
                return []
    except Exception as e:
        print(f"SteamSpy API exception on page {page}: {e}")
        return []

async def fetch_all_steamspy_data(max_pages):
    """Fetch pages ``0 .. max_pages - 1`` from SteamSpy and combine them.

    All page requests share one ``aiohttp.ClientSession`` and are awaited
    together with ``asyncio.gather``.

    Args:
        max_pages: Number of pages to request, starting at page 0.

    Returns:
        A ``pandas.DataFrame`` built from the concatenated per-page lists.
        Pages that failed contribute no rows.
    """
    # NOTE(review): every page is requested concurrently; confirm this stays
    # within SteamSpy's rate limits for the "all" request.
    async with aiohttp.ClientSession() as http:
        page_requests = [fetch_steamspy_page(http, idx) for idx in range(max_pages)]
        started = time.time()
        pages = await asyncio.gather(*page_requests)
        finished = time.time()

        # Flatten the per-page lists into one list of game records.
        games = []
        for page_rows in pages:
            games.extend(page_rows)

        print(f"Fetched {len(games)} games from SteamSpy in {finished - started:.2f} seconds")
        return pd.DataFrame(games)

def get_steamspy_data(max_pages):
    """Synchronous entry point: run the async SteamSpy fetch to completion.

    Args:
        max_pages: Number of SteamSpy pages to fetch; passed through to
            ``fetch_all_steamspy_data``.

    Returns:
        Whatever ``fetch_all_steamspy_data(max_pages)`` returns.
    """
    try:
        event_loop = asyncio.get_event_loop()
    except RuntimeError:
        # No event loop could be obtained: create one and make it current.
        event_loop = asyncio.new_event_loop()
        asyncio.set_event_loop(event_loop)

    pending = fetch_all_steamspy_data(max_pages)
    return event_loop.run_until_complete(pending)

# max_pages_steamspy = 10 # 1000 games per page
# steamspy_df = get_steamspy_data(max_pages_steamspy)
# steamspy_df.to_csv('data/steamspy_data.csv', index=False)
# print("Saved steamspy_data.csv")

### 1.2 Steam API Data Collection and Processing

Since most Vietnamese ISPs have blocked Steam's DNS, a VPN is required to scrape data from Steam.

In [None]:
def fetch_steam_store_app_details(app_id):
    """Fetch the details of one app from the Steam Store ``appdetails`` API.

    Args:
        app_id: Steam application ID; interpolated into the request URL and
            used (as a string) to look up the entry in the JSON response.

    Returns:
        The ``data`` dict of the app's entry when the entry reports
        ``success``. ``None`` is returned (after printing a message) when
        the request fails, the body is not valid JSON, or the response does
        not contain a successful entry for ``app_id``.
    """
    url = f"https://store.steampowered.com/api/appdetails?appids={app_id}"
    try:
        # Without a timeout, one stalled connection would hang the whole crawl.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        payload = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers invalid JSON on requests versions whose
        # JSONDecodeError is not a RequestException subclass.
        print(f"Steam Store API error for app ID {app_id}: {e}")
        return None

    # The payload may be null or lack the requested key; treat both as
    # "no data" instead of raising TypeError/KeyError.
    entry = payload.get(str(app_id)) if isinstance(payload, dict) else None
    if isinstance(entry, dict) and entry.get('success') and 'data' in entry:
        return entry['data']

    print(f"No data found for app ID {app_id} in Steam Store. Details: {entry}")
    return None

def fetch_all_steam_store_data(app_ids, request_delay=1.5):
    """Fetch Steam Store details for many apps, one request at a time.

    Args:
        app_ids: Sized iterable of Steam application IDs.
        request_delay: Seconds to wait between consecutive requests to
            respect the API rate limits. Defaults to 1.5.

    Returns:
        A ``pandas.DataFrame`` with one row per app whose details were
        fetched, with columns ``appid``, ``name``, ``languages``,
        ``metacritic`` and ``genres``. Apps with no data are skipped.
    """
    steamstore_data = []
    total_apps = len(app_ids)
    for count, app_id in enumerate(app_ids, start=1):
        # Call the helper by its actual name, fetch_steam_store_app_details;
        # the previous get_steam_store_app_details is not defined.
        data = fetch_steam_store_app_details(app_id)
        if data:
            steamstore_data.append({
                'appid': data.get('steam_appid'),
                'name': data.get('name'),
                'languages': data.get('supported_languages'),
                'metacritic': data.get('metacritic'),
                'genres': data.get('genres'),
            })
        # Report progress every 100 apps and once more at the very end.
        if count % 100 == 0 or count == total_apps:
            print(f"\rProcessed {count}/{total_apps} games from Steam Store", end="", flush=True)
        # Throttle between requests; no need to wait after the last one.
        if count < total_apps:
            time.sleep(request_delay)
    print()  # Finish the carriage-return progress line.
    return pd.DataFrame(steamstore_data)

#app_ids = steamspy_df['appid'].tolist()

# Because of Steam rate limits, 10000 games take 4 hours to fetch so function call is commented out

# steamstore_df = fetch_all_steam_store_data(app_ids)
# steamstore_df.to_csv("data/steamstore_data.csv", index=False)
# print("Saved steamstore_data.csv")