In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so the CSV
# files under /content/drive/MyDrive/... can be read by later cells.
# This only works when the notebook runs on Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the data
# The dataset folder is named once so the notebook can be pointed at another
# location without editing every read_csv call.
DATA_DIR = "/content/drive/MyDrive/Steam Game Dataset"

games = pd.read_csv(f"{DATA_DIR}/games.csv")

# Only the three columns used downstream are read, with compact dtypes, to
# keep the large recommendations table small in memory.
recommendations = pd.read_csv(
    f"{DATA_DIR}/recommendations.csv",
    usecols=["app_id", "user_id", "is_recommended"],
    dtype={"app_id": "int32", "user_id": "int32", "is_recommended": "bool"},
)

# NOTE(review): int16 assumes every products/reviews count fits in 16 bits —
# confirm against the data.
users = pd.read_csv(
    f"{DATA_DIR}/users.csv",
    dtype={"user_id": "int32", "products": "int16", "reviews": "int16"},
)

# Check loaded data
print("Games Shape:", games.shape)
print("Recommendations Shape (Full):", recommendations.shape)
print("Users Shape:", users.shape)

Games Shape: (50872, 13)
Recommendations Shape (Full): (41154794, 3)
Users Shape: (14306064, 3)


In [None]:
def optimize_memory(df):
    """Shrink a DataFrame's memory footprint by tightening column dtypes.

    Signed integer columns are downcast to the smallest signed integer dtype
    that holds their values, unsigned integer columns to the smallest unsigned
    dtype, float columns to the smallest float dtype (``float32`` at minimum),
    and ``object`` columns are converted to ``category``. Boolean and other
    column types are left untouched.

    The frame is modified in place and also returned, so both
    ``optimize_memory(df)`` and ``df = optimize_memory(df)`` work.

    Args:
        df: DataFrame to optimize.

    Returns:
        The same DataFrame object, with narrower dtypes.
    """
    for col in df.columns:
        dtype = df[col].dtype
        # Classify by dtype kind. Comparing the dtype to the string "int" only
        # matches the platform's default integer width, so e.g. int32 id
        # columns used to fall through to the float branch and were silently
        # converted to floats (ids then displayed as 601150.0).
        if pd.api.types.is_unsigned_integer_dtype(dtype):
            df[col] = pd.to_numeric(df[col], downcast="unsigned")
        elif pd.api.types.is_integer_dtype(dtype):
            df[col] = pd.to_numeric(df[col], downcast="integer")
        elif pd.api.types.is_float_dtype(dtype):
            df[col] = pd.to_numeric(df[col], downcast="float")
    for col in df.select_dtypes(include=["object"]).columns:
        df[col] = df[col].astype("category")
    return df

# Tighten the dtypes of all three tables.
games = optimize_memory(games)
recommendations = optimize_memory(recommendations)
users = optimize_memory(users)


def _size_in_mb(frame):
    """Return the deep memory usage of ``frame`` in MiB (bytes / 1024**2)."""
    return frame.memory_usage(deep=True).sum() / (1024**2)


# Report the footprint of each table after optimization.
print("Games Memory Usage (MB):", _size_in_mb(games))
print("Recommendations Memory Usage (MB):", _size_in_mb(recommendations))
print("Users Memory Usage (MB):", _size_in_mb(users))

Games Memory Usage (MB): 8.11091423034668
Recommendations Memory Usage (MB): 353.23455238342285
Users Memory Usage (MB): 109.146728515625


In [None]:
# Draw a reproducible 10% random sample from each of the two large tables.
# NOTE(review): the two samples are drawn independently, so an inner join of
# them on user_id keeps only about a tenth of the sampled recommendations —
# confirm this is intended.
SAMPLE_FRACTION = 0.1
SAMPLE_SEED = 42

sampled_recommendations = recommendations.sample(
    frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED
)
sampled_users = users.sample(frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED)

# Confirm the sample sizes.
print("Sampled Recommendations Shape:", sampled_recommendations.shape)
print("Sampled Users Shape:", sampled_users.shape)

Sampled Recommendations Shape: (4115479, 3)
Sampled Users Shape: (1430606, 3)


In [None]:
# Attach game metadata to each sampled recommendation (inner join on app_id).
games_recommendations = sampled_recommendations.merge(
    games, how="inner", on="app_id"
)

# Keep only the rows whose user is also present in the user sample
# (inner join on user_id), adding the user-level columns.
merged_data = games_recommendations.merge(
    sampled_users, how="inner", on="user_id"
)

# Show the size and first rows of the merged table.
print("Merged Data Shape:", merged_data.shape)
print(merged_data.head())

Merged Data Shape: (411465, 17)
     app_id  is_recommended     user_id                     title  \
0  601150.0            True   6991933.0           Devil May Cry 5   
1     400.0            True  11264359.0                    Portal   
2  870780.0           False  11757719.0  Control Ultimate Edition   
3  552100.0            True  14248104.0                Brick Rigs   
4   49520.0            True   5719610.0             Borderlands 2   

  date_release   win    mac  linux                   rating  positive_ratio  \
0   2019-03-07  True  False  False  Overwhelmingly Positive              95   
1   2007-10-10  True   True   True  Overwhelmingly Positive              98   
2   2020-08-27  True  False  False            Very Positive              88   
3   2023-07-14  True  False  False            Very Positive              93   
4   2012-09-17  True   True   True  Overwhelmingly Positive              95   

   user_reviews  price_final  price_original  discount  steam_deck  products  

In [None]:
# Write the merged sample to a CSV file in the current working directory,
# leaving out the DataFrame index.
output_csv = "merged_data_sampled.csv"
merged_data.to_csv(output_csv, index=False)
print("Merged dataset saved!")

Merged dataset saved!
