**21127671 - Nguyễn Hoàng Phúc**

## **Steam Games Analysis Project**


### Import neccessary libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt
import time
import asyncio
import json
from typing import List, Dict
import requests
import aiohttp  # Async HTTP requests
import nest_asyncio # Nested async loops in Jupyter

Note: you may need to restart the kernel to use updated packages.


## 1. Data Collection

### 1.1 SteamSpy API Data collection and processing

Since the official steam API returns all applications, not just games, and don't filter for spam apps, we will use SteamSpy as an alternative to get a quality list of games and information not found on the official API.


In [140]:
nest_asyncio.apply()

async def fetch_steamspy_page(session, page):
    # Fetch data from a single SteamSpy page
    base_url = 'https://steamspy.com/api.php'
    params = {'request': 'all', 'page': page}
    try:
        async with session.get(base_url, params=params) as response:
            if response.status == 200:
                return list((await response.json()).values())
            else:
                print(f"SteamSpy API error on page {page}: HTTP {response.status}")
                return []
    except Exception as e:
        print(f"SteamSpy API exception on page {page}: {e}")
        return []

async def fetch_all_steamspy_data(max_pages):
    # Fetch data from multiple SteamSpy pages asynchronously for better runtime
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_steamspy_page(session, page) for page in range(max_pages)]
        start_time = time.time()
        results = await asyncio.gather(*tasks)
        end_time = time.time()
        all_games = [game for page_games in results for game in page_games]
        print(f"Fetched {len(all_games)} games from SteamSpy in {end_time - start_time:.2f} seconds")
        return pd.DataFrame(all_games)

def get_steamspy_data(max_pages):
    # Manage asyncio event loop and call the fetch all function
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
    return loop.run_until_complete(fetch_all_steamspy_data(max_pages))

max_pages_steamspy = 10 # 1000 games per page
steamspy_df = get_steamspy_data(max_pages_steamspy)
steamspy_df.to_csv('steamspy_data.csv', index=False)
print("Saved steamspy_data.csv")

### 1.2 Steam API Data Collection and Processing

Since most Vietnamese ISPs blocked Steam DNS, a VPN is required to scrape data from Steam.

In [141]:
def fetch_steam_store_app_details(app_id):
    # Fetch app details from Steam Store API
    url = f"https://store.steampowered.com/api/appdetails?appids={app_id}"
    try:
        response = requests.get(url)
        response.raise_for_status()
        data = response.json()[str(app_id)]
        if data.get('success') and 'data' in data:
            return data['data']
        else:
            print(f"No data found for app ID {app_id} in Steam Store. Details: {data}")
            return None
    except requests.exceptions.RequestException as e:
        print(f"Steam Store API error for app ID {app_id}: {e}")
        return None

def fetch_all_steam_store_data(app_ids):
    # Fetch data for multiple apps from Steam Store API
    steamstore_data = []
    total_apps = len(app_ids)
    for i, app_id in enumerate(app_ids):
        data = get_steam_store_app_details(app_id)
        if data:
            steamstore_data.append({
                'appid': data.get('steam_appid'),
                'name': data.get('name'),
                'languages': data.get('supported_languages'),
                'metacritic': data.get('metacritic'),
                'genres': data.get('genres')
            })
        if (i + 1) % 100 == 0 or (i + 1) == total_apps:
            print(f"\rProcessed {i + 1}/{total_apps} games from Steam Store", end="", flush=True)
        time.sleep(1.5)  # API rate limits
    print() 
    return pd.DataFrame(steamstore_data)

app_ids = steamspy_df['appid'].tolist()

# Because of Steam rate limits, 10000 games take 4 hours to fetch so function call is commented out

# steamstore_df = get_all_steam_store_data(app_ids)
# steamstore_df.to_csv("steamstore_data.csv", index=False)
# print("Saved steamstore_data.csv")

## 2. Data Preprocessing


In [142]:
steamspy_df = pd.read_csv('steamspy_data.csv')
steamstore_df = pd.read_csv('steamstore_data.csv')

print(steamspy_df.head())

     appid                              name         developer  \
0      570                            Dota 2             Valve   
1      730  Counter-Strike: Global Offensive             Valve   
2   578080               PUBG: BATTLEGROUNDS  PUBG Corporation   
3  1172470                      Apex Legends           Respawn   
4      440                   Team Fortress 2             Valve   

         publisher  score_rank  positive  negative  userscore  \
0            Valve         NaN   1922076    434033          0   
1            Valve         NaN   7300141   1096624          0   
2    KRAFTON, Inc.         NaN   1448418   1011202          0   
3  Electronic Arts         NaN    651799    311422          0   
4            Valve         NaN    995536    128698          0   

                       owners  average_forever  average_2weeks  \
0  200,000,000 .. 500,000,000            41092            1541   
1  100,000,000 .. 200,000,000            31730             755   
2  100,000,000

In [143]:
print(steamstore_df.head())

     appid                 name  \
0      570               Dota 2   
1      730     Counter-Strike 2   
2   578080  PUBG: BATTLEGROUNDS   
3  1172470        Apex Legends™   
4      440      Team Fortress 2   

                                           languages  \
0  Bulgarian, Czech, Danish, Dutch, English<stron...   
1  Czech, Danish, Dutch, English<strong>*</strong...   
2  English, Korean, Simplified Chinese, French, G...   
3  English<strong>*</strong>, French<strong>*</st...   
4  English<strong>*</strong>, Danish, Dutch, Finn...   

                                          metacritic  \
0  {'score': 90, 'url': 'https://www.metacritic.c...   
1                                                NaN   
2                                                NaN   
3  {'score': 88, 'url': 'https://www.metacritic.c...   
4  {'score': 92, 'url': 'https://www.metacritic.c...   

                                              genres  
0  [{'id': '1', 'description': 'Action'}, {'id': ...  
1  [{

In [144]:
merged_df = pd.merge(steamspy_df, steamstore_df, on=['appid', 'name'], how='inner')
print(merged_df.isnull().sum())
merged_df = merged_df.dropna(subset=['languages','metacritic','genres'])

appid                 0
name                  0
developer            29
publisher            38
score_rank         9050
positive              0
negative              0
userscore             0
owners                0
average_forever       0
average_2weeks        0
median_forever        0
median_2weeks         0
price                 0
initialprice          0
discount              0
ccu                   0
languages             2
metacritic         6643
genres               24
dtype: int64


In [145]:
# Function to extract the Metacritic score from json string
def extract_metacritic_score(metacritic):
    if pd.isnull(metacritic):
        return None
    try:
        if isinstance(metacritic, str):
            metacritic = eval(metacritic)
        return metacritic.get('score')
    except:
        return None

# Function to count the number of languages
def count_languages(languages_str):
    if pd.isnull(languages_str) or not isinstance(languages_str, str):
        return 0
    languages_str = re.sub('<[^<]+?>', '', languages_str)
    return len(languages_str.split(','))

# Extract genres into a list
def extract_genres(genre_list):
    if pd.isnull(genre_list):
        return []
    try:
        # Convert string representation to list if necessary
        if isinstance(genre_list, str):
            genre_list = eval(genre_list)
        return [genre['description'].lower() for genre in genre_list]
    except:
        return []
    
# Apply the functions
merged_df['metacritic'] = merged_df['metacritic'].apply(extract_metacritic_score)
merged_df['languages'] = merged_df['languages'].apply(count_languages)

merged_df['genre_list'] = merged_df['genres'].apply(extract_genres)
# List of top genres to create boolean columns for
top_genres = ['indie', 'action', 'casual', 'action-adventure', 'simulation',
              'rpg', 'strategy', 'sports', 'racing', 'massively multiplayer']
# Create boolean columns for each top genre
for genre in top_genres:
    merged_df[f'genre_{genre}'] = merged_df['genre_list'].apply(lambda x: int(genre in x))
    
selected_columns = ['appid', 'name', 'metacritic', 'median_forever', 'price', 'languages']
selected_columns += [f'genre_{genre}' for genre in top_genres]
# Create the final DataFrame
processed_df = merged_df[selected_columns]
print(processed_df.head())


    appid                name  metacritic  median_forever  price  languages  \
0     570              Dota 2          90             838      0         28   
2     440     Team Fortress 2          92             340      0         27   
6  271590  Grand Theft Auto V          96            6253   2430         13   
7     550       Left 4 Dead 2          89             492    200         28   
9  230410            Warframe          69             455      0         14   

   genre_indie  genre_action  genre_casual  genre_action-adventure  \
0            0             1             0                       0   
2            0             1             0                       0   
6            0             1             0                       0   
7            0             1             0                       0   
9            0             1             0                       0   

   genre_simulation  genre_rpg  genre_strategy  genre_sports  genre_racing  \
0                 0       