# Data Collection

### Import necessary libraries

In [21]:
import requests
import pandas as pd
import time
import aiohttp
import numpy as np
from typing import List, Dict
import asyncio
import nest_asyncio
import platform
import json
import os
import sys

## 1. SteamSpy API Data collection and processing

Since the official Steam API returns all applications, not just games, and doesn't filter out spam apps, we will use SteamSpy as an alternative to get a quality list of games, along with information not found in the official API.


In [23]:
# Patch asyncio so an event loop can be re-entered. fetch_steamspy() below
# calls loop.run_until_complete(); presumably the notebook kernel already has
# a loop running, which would otherwise reject that call -- confirm if this
# code is moved out of a notebook.
nest_asyncio.apply()

async def fetch_page(session: aiohttp.ClientSession, page: int) -> List[Dict]:
    """Download a single page of the SteamSpy ``all`` listing.

    Args:
        session: Open aiohttp client session used to issue the GET request.
        page: Page index sent to the API as the ``page`` query parameter.

    Returns:
        The values of the JSON object returned by the API, as a list. An
        empty list is returned (after printing a message) when the response
        status is not 200 or when any exception is raised while fetching or
        decoding the page.
    """
    endpoint = 'https://steamspy.com/api.php'
    query = {'request': 'all', 'page': page}
    try:
        async with session.get(endpoint, params=query) as resp:
            # Guard clause: anything other than 200 is reported and skipped.
            if resp.status != 200:
                print(f"Error on page {page}: HTTP {resp.status}")
                return []
            payload = await resp.json()
            # Only the mapping's values are kept; its keys are discarded.
            return list(payload.values())
    except Exception as exc:
        # Best effort: a failing page must not abort the other downloads.
        print(f"Exception on page {page}: {str(exc)}")
        return []

async def fetch_all_games(max_pages: int) -> pd.DataFrame:
    """Fetch pages ``0 .. max_pages - 1`` concurrently and merge them.

    Args:
        max_pages: Number of pages to request, starting at page 0.

    Returns:
        A DataFrame with one row per entry returned by ``fetch_page``.
        Pages for which ``fetch_page`` returned an empty list add no rows.
    """
    async with aiohttp.ClientSession() as session:
        # One coroutine per page; all of them share the same session.
        page_requests = [fetch_page(session, page_no) for page_no in range(max_pages)]

        started = time.time()
        pages = await asyncio.gather(*page_requests)
        elapsed = time.time() - started

        # Flatten the per-page lists, keeping page order.
        all_games = []
        for page_games in pages:
            all_games.extend(page_games)

        print(f"\nFetched {len(all_games)} games in {elapsed:.2f} seconds")
        return pd.DataFrame(all_games)

def fetch_steamspy(max_pages: int) -> pd.DataFrame:
    """Synchronous entry point that runs ``fetch_all_games`` to completion.

    Args:
        max_pages: Number of SteamSpy pages to download, starting at page 0.

    Returns:
        The DataFrame produced by ``fetch_all_games``. If anything raises
        along the way, the error is printed and an empty DataFrame is
        returned instead.
    """
    try:
        # Use the current event loop if there is one; otherwise create a new
        # loop and install it as the current one.
        try:
            event_loop = asyncio.get_event_loop()
        except RuntimeError:
            event_loop = asyncio.new_event_loop()
            asyncio.set_event_loop(event_loop)

        return event_loop.run_until_complete(fetch_all_games(max_pages))
    except Exception as exc:
        print(f"Error: {str(exc)}")
        return pd.DataFrame()


# Pull the first 10 SteamSpy pages (the recorded run returned 10000 games).
games_df = fetch_steamspy(10)

# fetch_steamspy() reports failure by returning an empty DataFrame. Only write
# the CSV when there is something to save; otherwise a previously saved
# steamspy_data.csv would be silently replaced by an empty file and the cell
# would still report success.
if games_df.empty:
    print("No games fetched from SteamSpy; steamspy_data.csv was not written.")
else:
    # Save to CSV
    games_df.to_csv('steamspy_data.csv', index=False)
    print(f"Saved steamspy_data.csv with {len(games_df)} games.")


Fetched 10000 games in 0.30 seconds
Saved steamspy_data.csv with 10000 games.


In [5]:
# Most popular games
#print(games_df.nlargest(10, 'estimated_owners')[['name', 'estimated_owners']])
# Games with longest median playtime
#print(games_df.nlargest(10, 'median_playtime')[['name', 'median_playtime']])

## 2. Steam API Data Collection and Processing

Because most Vietnamese ISPs block Steam's DNS, a VPN is required to scrape data from Steam.

In [24]:
def get_steam_app_details(app_id, timeout=10):
    """Fetch the Steam store ``appdetails`` entry for one app.

    Args:
        app_id: Steam application ID to look up.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the whole crawl.

    Returns:
        The object stored under ``str(app_id)`` in the JSON response, or
        ``None`` (after printing a message) when the request fails or the
        response does not have the expected shape.
    """
    url = f"https://store.steampowered.com/api/appdetails?appids={app_id}"
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise HTTPError for bad responses
        data = response.json()
        return data[str(app_id)]  # Return the app details for the given app ID
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data for app ID {app_id}: {e}")
        return None
    except (ValueError, KeyError, TypeError) as e:
        # ValueError: body was not valid JSON; TypeError: body decoded to a
        # non-mapping such as null; KeyError: the app ID is missing from the
        # response. None of these should abort a long-running crawl.
        print(f"Unexpected response for app ID {app_id}: {e!r}")
        return None

app_ids = games_df['appid'].tolist()

# Pause between requests to stay under the store API rate limit.
REQUEST_DELAY_SECONDS = 1.5

# Fetch and store app details
steamstore_data = []
total_apps = len(app_ids)
# True while an unterminated '\r' progress line is on screen, so the next
# message can start on a fresh line instead of being glued onto it.
progress_line_open = False

try:
    for i, app_id in enumerate(app_ids):
        app_details = get_steam_app_details(app_id)
        if app_details and 'data' in app_details:  # Check if 'data' key exists
            data = app_details['data']
            # Keep only the fields used later in the analysis.
            selected_data = {
                'appid': data.get('steam_appid'),
                'name': data.get('name'),
                'languages': data.get('supported_languages'),
                'metacritic': data.get('metacritic'),
                'genres': data.get('genres')
            }
            steamstore_data.append(selected_data)
        else:
            if progress_line_open:
                sys.stdout.write('\n')
                progress_line_open = False
            print(f"No 'data' found for app ID {app_id}. Details: {app_details}")

        if (i + 1) % 10 == 0:
            progress_message = f"Processed {i + 1} out of {total_apps} apps."
            sys.stdout.write('\r' + progress_message)
            sys.stdout.flush()
            progress_line_open = True
        time.sleep(REQUEST_DELAY_SECONDS)  # API rate limit
finally:
    # The crawl sleeps 1.5 s per app, so a full run takes hours. Saving in
    # `finally` keeps whatever was collected if the run is interrupted or an
    # unexpected error escapes; the error still propagates afterwards.
    if progress_line_open:
        sys.stdout.write('\n')

    # Save data to steamstore_data.csv
    if steamstore_data:
        df_store = pd.DataFrame(steamstore_data)
        df_store.to_csv("steamstore_data.csv", index=False)
        print("Data saved to steamstore_data.csv")
    else:
        print("No data fetched.")

Processed 10 out of 10000 apps.No 'data' found for app ID 1599340. Details: {'success': False}
No 'data' found for app ID 553850. Details: {'success': False}
Processed 30 out of 10000 apps.No 'data' found for app ID 238960. Details: {'success': False}
Processed 190 out of 10000 apps.No 'data' found for app ID 208090. Details: {'success': False}
Processed 240 out of 10000 apps.No 'data' found for app ID 39210. Details: {'success': False}
Processed 280 out of 10000 apps.No 'data' found for app ID 601510. Details: {'success': False}
Processed 380 out of 10000 apps.No 'data' found for app ID 905370. Details: {'success': False}
Processed 410 out of 10000 apps.No 'data' found for app ID 2322010. Details: {'success': False}
No 'data' found for app ID 235460. Details: {'success': False}
Processed 420 out of 10000 apps.No 'data' found for app ID 550900. Details: {'success': False}
Processed 440 out of 10000 apps.No 'data' found for app ID 286940. Details: {'success': False}
No 'data' found for 