ETL OF Steam Datasets

In [1]:
import pandas as pd
from datasets import load_dataset
ds_huggingface = load_dataset('FronkonGames/steam-games-dataset', split='train')
df_huggingface = ds_huggingface.to_pandas()

# to csv
df_huggingface.to_csv('hugging_face.csv', index=False)

print('HuggingFace Steam Games:', getattr(df_huggingface, 'shape', None))
df_huggingface.head()

  from .autonotebook import tqdm as notebook_tqdm


HuggingFace Steam Games: (83560, 39)


Unnamed: 0,AppID,Name,Release date,Estimated owners,Peak CCU,Required age,Price,DLC count,About the game,Supported languages,...,Average playtime two weeks,Median playtime forever,Median playtime two weeks,Developers,Publishers,Categories,Genres,Tags,Screenshots,Movies
0,20200,Galactic Bowling,"Oct 21, 2008",0 - 20000,0,0,19.99,0,Galactic Bowling is an exaggerated and stylize...,['English'],...,0,0,0,Perpetual FX Creative,Perpetual FX Creative,"Single-player,Multi-player,Steam Achievements,...","Casual,Indie,Sports","Indie,Casual,Sports,Bowling",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
1,655370,Train Bandit,"Oct 12, 2017",0 - 20000,0,0,0.99,0,THE LAW!! Looks to be a showdown atop a train....,"['English', 'French', 'Italian', 'German', 'Sp...",...,0,0,0,Rusty Moyher,Wild Rooster,"Single-player,Steam Achievements,Full controll...","Action,Indie","Indie,Action,Pixel Graphics,2D,Retro,Arcade,Sc...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
2,1732930,Jolt Project,"Nov 17, 2021",0 - 20000,0,0,4.99,0,Jolt Project: The army now has a new robotics ...,"['English', 'Portuguese - Brazil']",...,0,0,0,Campião Games,Campião Games,Single-player,"Action,Adventure,Indie,Strategy",,https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
3,1355720,Henosis™,"Jul 23, 2020",0 - 20000,0,0,5.99,0,HENOSIS™ is a mysterious 2D Platform Puzzler w...,"['English', 'French', 'Italian', 'German', 'Sp...",...,0,0,0,Odd Critter Games,Odd Critter Games,"Single-player,Full controller support","Adventure,Casual,Indie","2D Platformer,Atmospheric,Surreal,Mystery,Puzz...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
4,1139950,Two Weeks in Painland,"Feb 3, 2020",0 - 20000,0,0,0.0,0,ABOUT THE GAME Play as a hacker who has arrang...,"['English', 'Spanish - Spain']",...,0,0,0,Unusual Games,Unusual Games,"Single-player,Steam Achievements","Adventure,Indie","Indie,Adventure,Nudity,Violent,Sexual Content,...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...


In [2]:
df_steam_trends = pd.read_csv("https://docs.google.com/spreadsheets/d/1D5MErWbFJ2Gsde9QxJ_HNMltKfF6fHCYdv4OQpXdnZ4/export?format=csv&gid=352042934")

# to csv
df_steam_trends.to_csv('steam_trends_2023.csv', index=False)

print("Steam Trends 2023:", df_steam_trends.shape)
df_steam_trends.head()

Steam Trends 2023: (65111, 14)


Unnamed: 0,App ID,Title,Reviews Total,Reviews Score Fancy,Release Date,Reviews D7,Reviews D30,Reviews D90,Launch Price,Tags,name_slug,Revenue Estimated,Modified Tags,Steam Page
0,730,Counter-Strike: Global Offensive,7382695,88%,2012-08-21,,,,"$14,99","FPS, Shooter, Multiplayer, Competitive, Action...",,"$110 666 598,05","FPS_, Shooter_, Multiplayer_, Competitive_, Ac...",https://store.steampowered.com/app/730
1,578080,PUBG: BATTLEGROUNDS,2201296,57%,2017-12-21,,,,"$29,99","Survival, Shooter, Battle Royale, Multiplayer,...",,"$66 016 867,04","Survival_, Shooter_, Battle Royale_, Multiplay...",https://store.steampowered.com/app/578080
2,570,Dota 2,2017009,82%,2013-07-09,,,,"$29,99","Free to Play, MOBA, Multiplayer, Strategy, eSp...",,"$60 490 099,91","Free to Play_, MOBA_, Multiplayer_, Strategy_,...",https://store.steampowered.com/app/570
3,271590,Grand Theft Auto V,1322782,"89,85%",2015-04-13,,,,"$29,99","Open World, Action, Multiplayer, Crime, Automo...",,"$39 670 232,18","Open World_, Action_, Multiplayer_, Crime_, Au...",https://store.steampowered.com/app/271590
4,359550,Tom Clancy's Rainbow Six® Siege,978762,86%,2015-12-01,,,,"$59,99","FPS, PvP, eSports, Shooter, Multiplayer, Tacti...",,"$58 715 932,38","FPS_, PvP_, eSports_, Shooter_, Multiplayer_, ...",https://store.steampowered.com/app/359550


Modified web scraper code from https://github.com/lundkvistbenjamin/steam-sales-scraper/blob/main/steam_scraper.py

In [3]:
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime
from IPython.display import display, clear_output

titles, ratings, reviews, discounts, prices, original_prices, release_dates = [], [], [], [], [], [], []
win, lin, osx, fetch_times = [], [], [], []

rating_system = ["Overwhelmingly Negative", "Very Negative", "Negative", "Mostly Negative",
                 "Mixed", "Mostly Positive", "Positive", "Very Positive", "Overwhelmingly Positive"]

def parse_price(price_str):
    if price_str:
        clean_price = re.sub(r"[^\d.,]", "", price_str)
        return float(clean_price.replace(",", ""))
    return None

current_time = datetime.now().strftime("%Y-%m-%d %H:%M")

for page_number in range(1, 400):
    res = requests.get(f"https://store.steampowered.com/search/?supportedlang=english&specials=1&page={page_number}&ndl=1")
    soup = BeautifulSoup(res.text, "html.parser")
    game_containers = soup.find_all("div", {"class": "responsive_search_name_combined"})
    
    for game in game_containers:
        # Title
        title = game.find("span", {"class": "title"})
        titles.append(title.text if title else None)
        
        # Rating & Reviews
        rating_span = game.find("span", {"class": "search_review_summary"})
        if rating_span:
            data_tooltip = rating_span.get("data-tooltip-html", "")
            rating = data_tooltip.split("<br>")[0]
            ratings.append(rating_system.index(rating) if rating in rating_system else None)
            reviews.append(data_tooltip.split("<br>")[1].split(" ")[3] if "<br>" in data_tooltip else None)
        else:
            ratings.append(None)
            reviews.append(None)
        
        # Discount
        discount = game.find("div", {"class": "discount_pct"})
        discounts.append(int(discount.text.strip("%")) if discount else None)
        
        # Price
        price = game.find("div", {"class": "discount_final_price"})
        prices.append(parse_price(price.text) if price else None)
        
        # Original Price
        original_price = game.find("div", {"class": "discount_original_price"})
        original_prices.append(parse_price(original_price.text) if original_price else None)
        
        # Release Date
        release = game.find("div", {"class": "search_released"})
        release_dates.append(release.text.strip() if release and len(release.text) > 2 else None)
        
        # Platforms
        platforms = [platform["class"][1] for platform in game.find_all("span", {"class": "platform_img"})]
        win.append(1 if "win" in platforms else 0)
        lin.append(1 if "linux" in platforms else 0)
        osx.append(1 if "mac" in platforms else 0)
        
        # Fetch Time
        fetch_times.append(current_time)
    
    clear_output(wait=True)
    display(f"Loading page {page_number}...")

data = {
    "Game Name": titles,
    "Rating": ratings,
    "#Reviews": reviews,
    "Discount%": discounts,
    "Price (Php)": prices,
    "Original Price (Php)": original_prices,
    "Release Date": release_dates,
    "Windows": win,
    "Linux": lin,
    "MacOS": osx,
    "Fetched At": fetch_times
}

df_steam_sales = pd.DataFrame(data)
print("Steam Sales Data:", df_steam_sales.shape)
df_steam_sales.to_csv('steam_sales.csv', index=False)

print(df_steam_sales.head())

'Loading page 399...'

Steam Sales Data: (9975, 11)
            Game Name  Rating #Reviews  Discount%  Price (Php)  \
0            Dispatch     7.0    3,378      -10.0       764.10   
1  Escape From Duckov     8.0    1,934      -12.0       650.69   
2                Rust     7.0  501,576      -40.0       540.00   
3       RV There Yet?     7.0    1,567      -10.0       247.45   
4    Dead by Daylight     5.0  236,863      -60.0       295.99   

   Original Price (Php)  Release Date  Windows  Linux  MacOS        Fetched At  
0                849.00  22 Oct, 2025        1      0      0  2025-10-24 16:41  
1                739.43  16 Oct, 2025        1      0      1  2025-10-24 16:41  
2                900.00   8 Feb, 2018        1      0      1  2025-10-24 16:41  
3                274.95  21 Oct, 2025        1      0      0  2025-10-24 16:41  
4                739.99  14 Jun, 2016        1      0      0  2025-10-24 16:41  


In [4]:
df_steam_sales.head()

Unnamed: 0,Game Name,Rating,#Reviews,Discount%,Price (Php),Original Price (Php),Release Date,Windows,Linux,MacOS,Fetched At
0,Dispatch,7.0,3378,-10.0,764.1,849.0,"22 Oct, 2025",1,0,0,2025-10-24 16:41
1,Escape From Duckov,8.0,1934,-12.0,650.69,739.43,"16 Oct, 2025",1,0,1,2025-10-24 16:41
2,Rust,7.0,501576,-40.0,540.0,900.0,"8 Feb, 2018",1,0,1,2025-10-24 16:41
3,RV There Yet?,7.0,1567,-10.0,247.45,274.95,"21 Oct, 2025",1,0,0,2025-10-24 16:41
4,Dead by Daylight,5.0,236863,-60.0,295.99,739.99,"14 Jun, 2016",1,0,0,2025-10-24 16:41


Clean hugging_face steam games data

In [5]:
import numpy as np

df_huggingface = pd.read_csv('hugging_face.csv')

print('Original shape:', df_huggingface.shape)
print('Columns count:', len(df_huggingface.columns))

# Drop not needed columns
patterns = [
    'required age','dlc count','about the game', 'reviews','genres',
    'supported languages','full audio languages','header image', 'price',
    'header email','support email','support url','email','website',
    'metacritic url','score rank','achievements','recommendations',
    'developers','publishers','categories','screenshots','movies', 'notes'
]
drop_cols = [c for c in df_huggingface.columns if any(p in c.lower() for p in patterns)]
if drop_cols:
    df_huggingface = df_huggingface.drop(columns=drop_cols)
print('Dropped columns:', drop_cols)
print('Shape after drop:', df_huggingface.shape)

release_cols = [c for c in df_huggingface.columns if 'release' in c.lower()]
for c in release_cols:
    df_huggingface[c] = pd.to_datetime(df_huggingface[c], errors='coerce')
    df_huggingface[c] = df_huggingface[c].dt.strftime('%Y-%m-%d')
    df_huggingface[c] = df_huggingface[c].fillna('')

# Drop rows missing important vals
cols_for_drop = ['appid','name','release date']
app_cols = [c for c in df_huggingface.columns if any(tok in c.lower().replace(' ', '') for tok in cols_for_drop)]
app_cols = list(dict.fromkeys(app_cols))
if app_cols:
    df_huggingface[app_cols] = df_huggingface[app_cols].replace({'': pd.NA, 'nan': pd.NA})
    before_rows = df_huggingface.shape[0]
    df_huggingface = df_huggingface.dropna(subset=app_cols, how='all')
    after_rows = df_huggingface.shape[0]
    print(f'Dropped {before_rows - after_rows} rows with null (checked cols: {app_cols})')

# Handle missing values: numeric -> median, text -> empty string
num_cols = df_huggingface.select_dtypes(include=['number']).columns.tolist()
obj_cols = [c for c in df_huggingface.columns if c not in num_cols]
for c in num_cols:
    median = df_huggingface[c].median()
    df_huggingface[c] = df_huggingface[c].fillna(median)
for c in obj_cols:
    df_huggingface[c] = df_huggingface[c].fillna('')

# Drop exact duplicates and reset index
before = df_huggingface.shape[0]
df_huggingface = df_huggingface.drop_duplicates().reset_index(drop=True)
after = df_huggingface.shape[0]
print(f'Dropped {before - after} duplicate rows')

# Save cleaned dataset
df_huggingface.to_csv('hugging_face_cleaned.csv', index=False)
print('Cleaned shape:', df_huggingface.shape)

df_huggingface.head()
print(df_huggingface.dtypes)

Original shape: (83560, 39)
Columns count: 39
Dropped columns: ['Required age', 'Price', 'DLC count', 'About the game', 'Supported languages', 'Full audio languages', 'Reviews', 'Header image', 'Website', 'Support url', 'Support email', 'Metacritic url', 'Score rank', 'Achievements', 'Recommendations', 'Notes', 'Developers', 'Publishers', 'Categories', 'Genres', 'Screenshots', 'Movies']
Shape after drop: (83560, 17)
Dropped 0 rows with null (checked cols: ['AppID', 'Name'])
Dropped 0 duplicate rows
Cleaned shape: (83560, 17)
AppID                          int64
Name                          object
Release date                  object
Estimated owners              object
Peak CCU                       int64
Windows                         bool
Mac                             bool
Linux                           bool
Metacritic score               int64
User score                     int64
Positive                       int64
Negative                       int64
Average playtime forever 

Clean Steam Games 2023

In [6]:
df_steam_trends = pd.read_csv('steam_trends_2023.csv')

print('Original shape:', getattr(df_steam_trends, 'shape', None))
print('Columns:', df_steam_trends.columns.tolist())

def _norm(s):
    return re.sub(r'[^a-z0-9]', '', str(s).lower())

# Drop unneeded cols
drop_patterns = [
    'reviews d7', 'reviews d30', 'reviews d90', 'name_slug',
    'revenue estimated', 'modified tags', 'steam page'
]
norm_drop = [_norm(p) for p in drop_patterns]
drop_cols = [c for c in df_steam_trends.columns if any(p in _norm(c) for p in norm_drop)]
if drop_cols:
    df_steam_trends = df_steam_trends.drop(columns=drop_cols, errors='ignore')
print('Dropped columns (if present):', drop_cols)
print('Shape after drop:', df_steam_trends.shape)

release_cols = [c for c in df_steam_trends.columns if 'release' in c.lower()]
for c in release_cols:
    df_steam_trends[c] = pd.to_datetime(df_steam_trends[c], errors='coerce')
    df_steam_trends[c] = df_steam_trends[c].dt.strftime('%Y-%m-%d')
    df_steam_trends[c] = df_steam_trends[c].fillna('')

# Convert reviews score fancy values from % to float decimal
df_steam_trends['Reviews Score Fancy'] = df_steam_trends['Reviews Score Fancy'].str.replace(',', '.', regex=False)
df_steam_trends['Reviews Score Fancy'] = df_steam_trends['Reviews Score Fancy'].str.rstrip('%').astype(float) / 100.0
# Convert launch price to float. Replace ',' with '.', remove non-numeric chars
df_steam_trends['Launch Price'] = df_steam_trends['Launch Price'].str.replace(',', '.', regex=False)
df_steam_trends['Launch Price'] = df_steam_trends['Launch Price'].str.replace('$', '', regex=False)
df_steam_trends['Launch Price'] = pd.to_numeric(df_steam_trends['Launch Price'], errors='coerce')

# Row reduction: drop rows missing important vals
id_tokens = ['app id','title','release date']
id_cols = [c for c in df_steam_trends.columns if any(tok in _norm(c) for tok in [t.replace('_','') for t in id_tokens])]
if id_cols:
    df_steam_trends[id_cols] = df_steam_trends[id_cols].replace({'': pd.NA, 'nan': pd.NA})
    before_rows = df_steam_trends.shape[0]
    df_steam_trends = df_steam_trends.dropna(subset=id_cols, how='all')
    after_rows = df_steam_trends.shape[0]
    print(f'Dropped {before_rows - after_rows} rows with null (checked cols: {id_cols})')

# Handle missing values: numeric -> median, text -> empty string
num_cols = df_steam_trends.select_dtypes(include=['number']).columns.tolist()
obj_cols = [c for c in df_steam_trends.columns if c not in num_cols]
for c in num_cols:
    median = df_steam_trends[c].median()
    df_steam_trends[c] = df_steam_trends[c].fillna(median)
for c in obj_cols:
    df_steam_trends[c] = df_steam_trends[c].fillna('')

before = df_steam_trends.shape[0]
df_steam_trends = df_steam_trends.drop_duplicates().reset_index(drop=True)
after = df_steam_trends.shape[0]
print(f'Dropped {before - after} duplicate rows')
df_steam_trends.to_csv('steam_trends_cleaned.csv', index=False)
print('Cleaned shape:', df_steam_trends.shape)

df_steam_trends.head()
#print(df_steam_trends.dtypes)

Original shape: (65111, 14)
Columns: ['App ID', 'Title', 'Reviews Total', 'Reviews Score Fancy', 'Release Date', 'Reviews D7', 'Reviews D30', 'Reviews D90', 'Launch Price', 'Tags', 'name_slug', 'Revenue Estimated', 'Modified Tags', 'Steam Page']
Dropped columns (if present): ['Reviews D7', 'Reviews D30', 'Reviews D90', 'name_slug', 'Revenue Estimated', 'Modified Tags', 'Steam Page']
Shape after drop: (65111, 7)
Dropped 0 rows with null (checked cols: ['Title'])
Dropped 0 duplicate rows
Cleaned shape: (65111, 7)


Unnamed: 0,App ID,Title,Reviews Total,Reviews Score Fancy,Release Date,Launch Price,Tags
0,730,Counter-Strike: Global Offensive,7382695,0.88,2012-08-21,14.99,"FPS, Shooter, Multiplayer, Competitive, Action..."
1,578080,PUBG: BATTLEGROUNDS,2201296,0.57,2017-12-21,29.99,"Survival, Shooter, Battle Royale, Multiplayer,..."
2,570,Dota 2,2017009,0.82,2013-07-09,29.99,"Free to Play, MOBA, Multiplayer, Strategy, eSp..."
3,271590,Grand Theft Auto V,1322782,0.8985,2015-04-13,29.99,"Open World, Action, Multiplayer, Crime, Automo..."
4,359550,Tom Clancy's Rainbow Six® Siege,978762,0.86,2015-12-01,59.99,"FPS, PvP, eSports, Shooter, Multiplayer, Tacti..."


Clean steam data sales

In [7]:
df_steam_sales = pd.read_csv('steam_sales.csv')

print('Original shape:', getattr(df_steam_sales, 'shape', None))
print('Columns:', df_steam_sales.columns.tolist())

def _norm(s):
    return re.sub(r'[^a-z0-9]', '', str(s).lower())

# Drop pnot needed cols
drop_patterns = ['discount%', 'rating', 'fetched at']
norm_drop = [re.sub(r'[^a-z0-9]', '', p.lower()) for p in drop_patterns]
drop_cols = [c for c in df_steam_sales.columns if any(p in _norm(c) for p in norm_drop)]
if drop_cols:
    df_steam_sales = df_steam_sales.drop(columns=drop_cols, errors='ignore')
print('Dropped columns (if present):', drop_cols)
print('Shape after drop:', df_steam_sales.shape)

# Format release date
release_cols = [c for c in df_steam_sales.columns if 'release' in c.lower()]
for c in release_cols:
    df_steam_sales[c] = pd.to_datetime(df_steam_sales[c], errors='coerce')
    df_steam_sales[c] = df_steam_sales[c].dt.strftime('%Y-%m-%d')
    df_steam_sales[c] = df_steam_sales[c].fillna('')
print('Formatted release columns:', release_cols)

# drop rows missing important vals
name_tokens = ['game name', 'release date', 'price']
name_cols = [c for c in df_steam_sales.columns if any(tok.replace(' ', '') in _norm(c) for tok in name_tokens)]
if name_cols:
    df_steam_sales[name_cols] = df_steam_sales[name_cols].replace({'': pd.NA, 'nan': pd.NA})
    before_rows = df_steam_sales.shape[0]
    df_steam_sales = df_steam_sales.dropna(subset=name_cols, how='all')
    after_rows = df_steam_sales.shape[0]
    print(f'Dropped {before_rows - after_rows} rows with null important vals (checked cols: {name_cols})')

# Php to $, limit to 2 decimals
df_steam_sales['Price (Php)'] = df_steam_sales['Price (Php)'] * 0.018
df_steam_sales['Original Price (Php)'] = df_steam_sales['Original Price (Php)'] * 0.018
df_steam_sales['Price (Php)'] = df_steam_sales['Price (Php)'].round(2)
df_steam_sales['Original Price (Php)'] = df_steam_sales['Original Price (Php)'].round(2)
df_steam_sales.rename(columns={'Price (Php)': 'Price', 'Original Price (Php)': 'Original Price'}, inplace=True)

# Handle missing values: numeric -> median, text -> empty string
num_cols = df_steam_sales.select_dtypes(include=['number']).columns.tolist()
obj_cols = [c for c in df_steam_sales.columns if c not in num_cols]
for c in num_cols:
    median = df_steam_sales[c].median()
    df_steam_sales[c] = df_steam_sales[c].fillna(median)
for c in obj_cols:
    df_steam_sales[c] = df_steam_sales[c].fillna('')

# Drop exact duplicates and reset index
before = df_steam_sales.shape[0]
df_steam_sales = df_steam_sales.drop_duplicates().reset_index(drop=True)
after = df_steam_sales.shape[0]
print(f'Dropped {before - after} duplicate rows')

# Save cleaned dataset
df_steam_sales.to_csv('steam_sales_cleaned.csv', index=False)
print('Cleaned shape:', df_steam_sales.shape)

df_steam_sales.head()
print(df_steam_sales.dtypes)

Original shape: (9975, 11)
Columns: ['Game Name', 'Rating', '#Reviews', 'Discount%', 'Price (Php)', 'Original Price (Php)', 'Release Date', 'Windows', 'Linux', 'MacOS', 'Fetched At']
Dropped columns (if present): ['Rating', 'Discount%', 'Fetched At']
Shape after drop: (9975, 8)
Formatted release columns: ['Release Date']
Dropped 0 rows with null important vals (checked cols: ['Game Name', 'Price (Php)', 'Original Price (Php)', 'Release Date'])
Dropped 140 duplicate rows
Cleaned shape: (9835, 8)
Game Name          object
#Reviews           object
Price             float64
Original Price    float64
Release Date       object
Windows             int64
Linux               int64
MacOS               int64
dtype: object


Combining the datasets

In [8]:
# Load all cleaned datasets
df_hf = pd.read_csv('hugging_face_cleaned.csv')
df_sales = pd.read_csv('steam_sales_cleaned.csv')
df_trends = pd.read_csv('steam_trends_cleaned.csv')

# Rename game names/titles
df_sales = df_sales.rename(columns={'Game Name': 'Name'})
df_trends = df_trends.rename(columns={'Title': 'Name'})

# Normalize game names for better matching
def normalize_name(name):
    if not isinstance(name, str):
        return ''
    return (name.lower()
            .replace('®', '')
            .replace('™', '')
            .replace('  ', ' ')
            .strip())

# Create normalized name columns
df_hf['name_norm'] = df_hf['Name'].apply(normalize_name)
df_trends['name_norm'] = df_trends['Name'].apply(normalize_name)
df_sales['name_norm'] = df_sales['Name'].apply(normalize_name)

# Handle duplicates by keeping only the first occurrence of each game
print("Before handling duplicates:")
print(f"Hugging Face rows: {len(df_hf)}")
print(f"Unique names in Hugging Face: {df_hf['name_norm'].nunique()}")
print(f"Trends rows: {len(df_trends)}")
print(f"Unique names in Trends: {df_trends['name_norm'].nunique()}")

# Keep only the first occurrence of each game in both dataframes
df_hf_deduped = df_hf.drop_duplicates(subset=['name_norm'], keep='first')
df_trends_deduped = df_trends.drop_duplicates(subset=['name_norm'], keep='first')
df_sales_deduped = df_sales.drop_duplicates(subset=['name_norm'], keep='first')

print("\nAfter removing duplicates:")
print(f"Hugging Face rows: {len(df_hf_deduped)}")
print(f"Trends rows: {len(df_trends_deduped)}")
print(f"Sales rows: {len(df_sales_deduped)}")

# Find common names after deduplication (intersection of all three)
common_names = set(df_hf_deduped['name_norm']) & set(df_trends_deduped['name_norm']) & set(df_sales_deduped['name_norm'])
print(f"\nNumber of common game names after deduplication: {len(common_names)}")

# Filter all three dataframes to only common names
df_hf_common = df_hf_deduped[df_hf_deduped['name_norm'].isin(common_names)]
df_trends_common = df_trends_deduped[df_trends_deduped['name_norm'].isin(common_names)]
df_sales_common = df_sales_deduped[df_sales_deduped['name_norm'].isin(common_names)]

# Merge all three on name_norm
merged_df = df_hf_common.merge(
    df_trends_common.drop(columns=['Name']),
    on='name_norm',
    how='inner',
    suffixes=('_hf', '_trends')
).merge(
    df_sales_common.drop(columns=['Name']),
    on='name_norm',
    how='inner',
    suffixes=('', '_sales')
)

print("\nAfter merging (should match common names count):")
print(f"Merged dataframe shape: {merged_df.shape}")
print("\nSample of merged data (first 5 rows):")
display(merged_df.head())

# Show duplicate names if any
print("\nChecking for any remaining duplicates in the merged data:")
duplicate_names = merged_df[merged_df.duplicated('name_norm', keep=False)]
if len(duplicate_names) > 0:
    print(f"Found {len(duplicate_names)} rows with duplicate names:")
    display(duplicate_names[['name_norm', 'Name']].sort_values('name_norm'))
else:
    print("No duplicate names found in the merged data.")

# Handle release dates - prefer the first dataset's date if both exist
merged_df['Release date'] = merged_df['Release date'].fillna(merged_df['Release Date'])

# Combine tags from both datasets and remove duplicates
def combine_tags(tags_hf, tags_trends):
    if pd.isna(tags_hf) and pd.isna(tags_trends):
        return ''
    tags = set()
    if pd.notna(tags_hf):
        tags.update(str(tags_hf).split(','))
    if pd.notna(tags_trends):
        tags.update(str(tags_trends).split(','))
    return ','.join(sorted(tags)) if tags else ''

merged_df['Tags'] = merged_df.apply(
    lambda x: combine_tags(x['Tags_hf'], x['Tags_trends']), 
    axis=1
)

# Standardize price column names
merged_df = merged_df.rename(columns={'Price': 'Discounted Price'})

# Clean up - drop date, app id, tags columns
merged_df = merged_df.drop(columns=[
    'Release Date', 
    'Tags_hf',
    'Tags_trends',
    'App ID',
    'AppID'
])

print("\nUpdated columns after combining:")
print(merged_df.columns.tolist())
print("\nSample of combined data with new fields:")
display(merged_df.head())

# to csv
merged_df.to_csv('df_merged_hf_trends.csv', index=False)


Before handling duplicates:
Hugging Face rows: 83560
Unique names in Hugging Face: 82628
Trends rows: 65111
Unique names in Trends: 64597

After removing duplicates:
Hugging Face rows: 82628
Trends rows: 64597
Sales rows: 9811

Number of common game names after deduplication: 2684

After merging (should match common names count):
Merged dataframe shape: (2684, 31)

Sample of merged data (first 5 rows):


Unnamed: 0,AppID,Name,Release date,Estimated owners,Peak CCU,Windows,Mac,Linux,Metacritic score,User score,...,Release Date,Launch Price,Tags_trends,#Reviews,Price,Original Price,Release Date_sales,Windows_sales,Linux_sales,MacOS
0,655370,Train Bandit,2017-10-12,0 - 20000,0,True,True,False,0,0,...,2017-10-12,0.99,"Indie, Action, Pixel Graphics, 2D, Retro, Arca...",70,0.32,0.65,2017-10-12,1,0,1
1,575760,Project: R.E.B.O.O.T 2,2017-01-27,20000 - 50000,10,True,False,False,0,0,...,2017-01-27,2.99,"Action, Adventure, Casual, Indie, Platformer, ...",57,0.32,0.65,2017-01-27,1,0,0
2,1557480,Project MIKHAIL: A Muv-Luv War Story,2021-11-01,20000 - 50000,16,True,False,False,0,0,...,2021-11-01,19.99,"Anime, Mechs, Hack and Slash, Strategy, Sci fi...",689,10.46,14.94,2024-04-26,1,0,0
3,768060,The Warrior Of Treasures,2018-01-03,0 - 20000,0,True,False,False,0,0,...,2018-01-03,0.99,"Action, Adventure, Indie",24,0.32,0.65,2018-01-03,1,0,0
4,726020,Mission Ammunition,2017-10-27,0 - 20000,0,True,True,True,0,0,...,2017-10-27,4.99,"Action, Indie, Pixel Graphics, Platformer, Sid...",98,0.77,2.56,2017-10-27,1,1,1



Checking for any remaining duplicates in the merged data:
No duplicate names found in the merged data.

Updated columns after combining:
['Name', 'Release date', 'Estimated owners', 'Peak CCU', 'Windows', 'Mac', 'Linux', 'Metacritic score', 'User score', 'Positive', 'Negative', 'Average playtime forever', 'Average playtime two weeks', 'Median playtime forever', 'Median playtime two weeks', 'name_norm', 'Reviews Total', 'Reviews Score Fancy', 'Launch Price', '#Reviews', 'Discounted Price', 'Original Price', 'Release Date_sales', 'Windows_sales', 'Linux_sales', 'MacOS', 'Tags']

Sample of combined data with new fields:


Unnamed: 0,Name,Release date,Estimated owners,Peak CCU,Windows,Mac,Linux,Metacritic score,User score,Positive,...,Reviews Score Fancy,Launch Price,#Reviews,Discounted Price,Original Price,Release Date_sales,Windows_sales,Linux_sales,MacOS,Tags
0,Train Bandit,2017-10-12,0 - 20000,0,True,True,False,0,0,53,...,0.86,0.99,70,0.32,0.65,2017-10-12,1,0,1,"2D, Action, Arcade, Blood, Casual, Comedy, Co..."
1,Project: R.E.B.O.O.T 2,2017-01-27,20000 - 50000,10,True,False,False,0,0,29,...,0.55,2.99,57,0.32,0.65,2017-01-27,1,0,0,"2D, 2D Platformer, Adventure, Casual, Indie, ..."
2,Project MIKHAIL: A Muv-Luv War Story,2021-11-01,20000 - 50000,16,True,False,False,0,0,328,...,0.56,19.99,689,10.46,14.94,2024-04-26,1,0,0,"Action, Action Adventure, Action RPG, Adventu..."
3,The Warrior Of Treasures,2018-01-03,0 - 20000,0,True,False,False,0,0,31,...,0.77,0.99,24,0.32,0.65,2018-01-03,1,0,0,"Adventure, Indie,Action,Adventure,Indie"
4,Mission Ammunition,2017-10-27,0 - 20000,0,True,True,True,0,0,61,...,0.79,4.99,98,0.77,2.56,2017-10-27,1,1,1,"2D, Difficult, Indie, Pixel Graphics, Platfor..."


In [9]:
print(merged_df.columns.tolist())

['Name', 'Release date', 'Estimated owners', 'Peak CCU', 'Windows', 'Mac', 'Linux', 'Metacritic score', 'User score', 'Positive', 'Negative', 'Average playtime forever', 'Average playtime two weeks', 'Median playtime forever', 'Median playtime two weeks', 'name_norm', 'Reviews Total', 'Reviews Score Fancy', 'Launch Price', '#Reviews', 'Discounted Price', 'Original Price', 'Release Date_sales', 'Windows_sales', 'Linux_sales', 'MacOS', 'Tags']


In [10]:
# Check and print unique tags
def print_unique_tags(df, col_name='Tags'):
    df['tags_list'] = df[col_name].str.split(',')
    tags_exploded = df.explode('tags_list')
    tags_exploded['tags_list'] = tags_exploded['tags_list'].str.strip()
    unique_tags = tags_exploded['tags_list'].dropna().unique()
    print("Total unique tags:", len(unique_tags))
    print("All unique tags:")
    for tag in sorted(unique_tags):
        print(tag)
    return tags_exploded

tags_exploded = print_unique_tags(merged_df)

# Get top 50 most common tags
top_50_series = tags_exploded['tags_list'].value_counts().head(50)
top_50_tags = top_50_series.index.tolist()
print("\nTop 50 most common tags:")
for tag, count in top_50_series.items():
    print(f"{tag}: {count}")

merged_df['tags_list'] = merged_df['Tags'].str.split(',')
merged_df['filtered_tags'] = merged_df['tags_list'].apply(
    lambda tags: [tag.strip() for tag in tags if tag.strip() in top_50_tags]
)
merged_df['filtered_tags'] = merged_df['filtered_tags'].apply(lambda tags: ', '.join(tags))
merged_df['Tags'] = merged_df['filtered_tags']

print_unique_tags(merged_df)
merged_df = merged_df.drop(columns=['tags_list', 'filtered_tags'])

print("\nSample of data after filtering tags to top 50:")
display(merged_df.head())


Total unique tags: 468
All unique tags:
1980s
1990's
1990s
2.5D
2D
2D Fighter
2D Platformer
360 Video
3D
3D Fighter
3D Platformer
3D Vision
4 Player Local
4X
6DOF
8 bit Music
ATV
Abstract
Action
Action Adventure
Action RPG
Action RTS
Action Roguelike
Action-Adventure
Addictive
Adventure
Agriculture
Aliens
Alternate History
Ambient
America
Animation & Modeling
Anime
Arcade
Archery
Arena Shooter
Artificial Intelligence
Assassin
Asymmetric VR
Asynchronous Multiplayer
Atmospheric
Audio Production
Auto Battler
Automation
Automobile Sim
BMX
Base Building
Base-Building
Baseball
Based On A Novel
Basketball
Battle Royale
Beat 'em up
Beat em up
Beautiful
Bikes
Blood
Board Game
Boss Rush
Bowling
Boxing
Building
Bullet Hell
Bullet Time
CRPG
Capitalism
Card Battler
Card Game
Cartoon
Cartoony
Casual
Cats
Character Action Game
Character Customization
Chess
Choices Matter
Choose Your Own Adventure
Cinematic
City Builder
Class Based
Class-Based
Classic
Clicker
Co op
Co op Campaign
Co-op
Co-op Campaign


Unnamed: 0,Name,Release date,Estimated owners,Peak CCU,Windows,Mac,Linux,Metacritic score,User score,Positive,...,Reviews Score Fancy,Launch Price,#Reviews,Discounted Price,Original Price,Release Date_sales,Windows_sales,Linux_sales,MacOS,Tags
0,Train Bandit,2017-10-12,0 - 20000,0,True,True,False,0,0,53,...,0.86,0.99,70,0.32,0.65,2017-10-12,1,0,1,"2D, Action, Arcade, Casual, Comedy, Controller..."
1,Project: R.E.B.O.O.T 2,2017-01-27,20000 - 50000,10,True,False,False,0,0,29,...,0.55,2.99,57,0.32,0.65,2017-01-27,1,0,0,"2D, Adventure, Casual, Indie, Platformer, Sing..."
2,Project MIKHAIL: A Muv-Luv War Story,2021-11-01,20000 - 50000,16,True,False,False,0,0,328,...,0.56,19.99,689,10.46,14.94,2024-04-26,1,0,0,"Action, Adventure, Early Access, RPG, Simulati..."
3,The Warrior Of Treasures,2018-01-03,0 - 20000,0,True,False,False,0,0,31,...,0.77,0.99,24,0.32,0.65,2018-01-03,1,0,0,"Adventure, Indie, Action, Adventure, Indie"
4,Mission Ammunition,2017-10-27,0 - 20000,0,True,True,True,0,0,61,...,0.79,4.99,98,0.77,2.56,2017-10-27,1,1,1,"2D, Difficult, Indie, Pixel Graphics, Platform..."


In [11]:
#Remove unnecessary columns

merged_df = merged_df.drop(columns=[
    'name_norm', 
    'Reviews Total', 
    'Reviews Score Fancy', 
    '#Reviews', 
    'Original Price', 
    'Release Date_sales', 
    'Windows_sales', 
    'Linux_sales', 
    'MacOS'
])





In [12]:
# check price columns
print("\nPrice columns info:")
print(merged_df[['Discounted Price', 'Launch Price']])


Price columns info:
      Discounted Price  Launch Price
0                 0.32          0.99
1                 0.32          2.99
2                10.46         19.99
3                 0.32          0.99
4                 0.77          4.99
...                ...           ...
2679             12.95         29.99
2680              0.51          6.99
2681             15.75         59.99
2682              5.83         59.99
2683              7.02         19.99

[2684 rows x 2 columns]


In [13]:
merged_df = merged_df.rename(columns={'Price (Php)': 'Discounted Price'})

print(merged_df.columns.tolist())

['Name', 'Release date', 'Estimated owners', 'Peak CCU', 'Windows', 'Mac', 'Linux', 'Metacritic score', 'User score', 'Positive', 'Negative', 'Average playtime forever', 'Average playtime two weeks', 'Median playtime forever', 'Median playtime two weeks', 'Launch Price', 'Discounted Price', 'Tags']


In [14]:
merged_df.dtypes

Name                           object
Release date                   object
Estimated owners               object
Peak CCU                        int64
Windows                          bool
Mac                              bool
Linux                            bool
Metacritic score                int64
User score                      int64
Positive                        int64
Negative                        int64
Average playtime forever        int64
Average playtime two weeks      int64
Median playtime forever         int64
Median playtime two weeks       int64
Launch Price                  float64
Discounted Price              float64
Tags                           object
dtype: object

In [15]:
# Add ID columns based on row index
merged_df = merged_df.reset_index(drop=True)
merged_df['ReleaseDateID'] = merged_df.index + 1
merged_df['SalesID'] = merged_df.index + 1
merged_df['PlatformsID'] = merged_df.index + 1
merged_df['TagsID'] = merged_df.index + 1
merged_df['PlaytimeID'] = merged_df.index + 1
merged_df['ReviewsID'] = merged_df.index + 1
merged_df['AppID'] = merged_df.index + 1  # Primary key for Games

display(merged_df.head())

Unnamed: 0,Name,Release date,Estimated owners,Peak CCU,Windows,Mac,Linux,Metacritic score,User score,Positive,...,Launch Price,Discounted Price,Tags,ReleaseDateID,SalesID,PlatformsID,TagsID,PlaytimeID,ReviewsID,AppID
0,Train Bandit,2017-10-12,0 - 20000,0,True,True,False,0,0,53,...,0.99,0.32,"2D, Action, Arcade, Casual, Comedy, Controller...",1,1,1,1,1,1,1
1,Project: R.E.B.O.O.T 2,2017-01-27,20000 - 50000,10,True,False,False,0,0,29,...,2.99,0.32,"2D, Adventure, Casual, Indie, Platformer, Sing...",2,2,2,2,2,2,2
2,Project MIKHAIL: A Muv-Luv War Story,2021-11-01,20000 - 50000,16,True,False,False,0,0,328,...,19.99,10.46,"Action, Adventure, Early Access, RPG, Simulati...",3,3,3,3,3,3,3
3,The Warrior Of Treasures,2018-01-03,0 - 20000,0,True,False,False,0,0,31,...,0.99,0.32,"Adventure, Indie, Action, Adventure, Indie",4,4,4,4,4,4,4
4,Mission Ammunition,2017-10-27,0 - 20000,0,True,True,True,0,0,61,...,4.99,0.77,"2D, Difficult, Indie, Pixel Graphics, Platform...",5,5,5,5,5,5,5


In [16]:
# Convert string date column to datetime
merged_df['Release date'] = pd.to_datetime(merged_df['Release date'], errors='coerce')

# Extract year, month, day as integers
merged_df['Year'] = merged_df['Release date'].dt.year.astype('Int64')
merged_df['Month'] = merged_df['Release date'].dt.month.astype('Int64')
merged_df['Day'] = merged_df['Release date'].dt.day.astype('Int64')

# Drop the original string date column
merged_df.drop(columns=['Release date'], inplace=True)

display(merged_df.head())

Unnamed: 0,Name,Estimated owners,Peak CCU,Windows,Mac,Linux,Metacritic score,User score,Positive,Negative,...,ReleaseDateID,SalesID,PlatformsID,TagsID,PlaytimeID,ReviewsID,AppID,Year,Month,Day
0,Train Bandit,0 - 20000,0,True,True,False,0,0,53,5,...,1,1,1,1,1,1,1,2017,10,12
1,Project: R.E.B.O.O.T 2,20000 - 50000,10,True,False,False,0,0,29,28,...,2,2,2,2,2,2,2,2017,1,27
2,Project MIKHAIL: A Muv-Luv War Story,20000 - 50000,16,True,False,False,0,0,328,253,...,3,3,3,3,3,3,3,2021,11,1
3,The Warrior Of Treasures,0 - 20000,0,True,False,False,0,0,31,13,...,4,4,4,4,4,4,4,2018,1,3
4,Mission Ammunition,0 - 20000,0,True,True,True,0,0,61,16,...,5,5,5,5,5,5,5,2017,10,27


In [17]:
# Make sure the column exists
if 'Estimated owners' in merged_df.columns:
    # Split on '-' and remove extra spaces
    owners_split = merged_df['Estimated owners'].str.split('-', expand=True)
    
    # Convert both sides to numeric (handle commas, spaces, etc.)
    owners_split = owners_split.apply(lambda x: x.str.replace(',', '').str.strip().astype(float))
    
    # Compute the average of the two values
    merged_df['Estimated owners'] = owners_split.mean(axis=1).round().astype('Int64')
else:
    print("Column 'Estimated owners' not found.")

display(merged_df.head())

Unnamed: 0,Name,Estimated owners,Peak CCU,Windows,Mac,Linux,Metacritic score,User score,Positive,Negative,...,ReleaseDateID,SalesID,PlatformsID,TagsID,PlaytimeID,ReviewsID,AppID,Year,Month,Day
0,Train Bandit,10000,0,True,True,False,0,0,53,5,...,1,1,1,1,1,1,1,2017,10,12
1,Project: R.E.B.O.O.T 2,35000,10,True,False,False,0,0,29,28,...,2,2,2,2,2,2,2,2017,1,27
2,Project MIKHAIL: A Muv-Luv War Story,35000,16,True,False,False,0,0,328,253,...,3,3,3,3,3,3,3,2021,11,1
3,The Warrior Of Treasures,10000,0,True,False,False,0,0,31,13,...,4,4,4,4,4,4,4,2018,1,3
4,Mission Ammunition,10000,0,True,True,True,0,0,61,16,...,5,5,5,5,5,5,5,2017,10,27


In [18]:
merged_df.dtypes

Name                           object
Estimated owners                Int64
Peak CCU                        int64
Windows                          bool
Mac                              bool
Linux                            bool
Metacritic score                int64
User score                      int64
Positive                        int64
Negative                        int64
Average playtime forever        int64
Average playtime two weeks      int64
Median playtime forever         int64
Median playtime two weeks       int64
Launch Price                  float64
Discounted Price              float64
Tags                           object
ReleaseDateID                   int64
SalesID                         int64
PlatformsID                     int64
TagsID                          int64
PlaytimeID                      int64
ReviewsID                       int64
AppID                           int64
Year                            Int64
Month                           Int64
Day         

In [19]:
# Create boolean columns for each tag in top_50_tags
for tag in top_50_tags:
    col_name = f"tag_{tag.replace(' ', '_').replace('-', '_')}"
    merged_df[col_name] = merged_df['Tags'].apply(lambda tags: int(tag in tags.split(',')))

print(merged_df[[f"tag_{tag.replace(' ', '_').replace('-', '_')}" for tag in top_50_tags]].head())

   tag_Singleplayer  tag_Indie  tag_Adventure  tag_Casual  tag_Action  tag_2D  \
0                 0          0              0           0           0       1   
1                 0          0              0           0           0       1   
2                 0          0              0           0           1       0   
3                 0          0              1           0           0       0   
4                 0          0              0           0           0       1   

   tag_Atmospheric  tag_Simulation  tag_Strategy  tag_Puzzle  ...  tag_Horror  \
0                0               0             0           0  ...           0   
1                0               0             0           0  ...           0   
2                0               0             0           0  ...           0   
3                0               0             0           0  ...           0   
4                0               0             0           0  ...           0   

   tag_Choices_Matter  tag

In [20]:
merged_df.dtypes

Name                 object
Estimated owners      Int64
Peak CCU              int64
Windows                bool
Mac                    bool
                      ...  
tag_JRPG              int64
tag_Management        int64
tag_Building          int64
tag_Side_Scroller     int64
tag_Physics           int64
Length: 77, dtype: object

In [21]:
merged_df.rename(columns={
    'Peak CCU': 'Peak_CCU',
    'Average playtime forever': 'AvgPlaytimeForever',
    'Average playtime two weeks': 'AvgPlaytimeTwoWeeks',
    'Median playtime forever': 'MedianPlaytimeForever',
    'Median playtime two weeks': 'MedianPlaytimeTwoWeeks'
}, inplace=True)

id_cols = ['ReleaseDateID', 'SalesID', 'PlatformsID', 'TagsID', 'PlaytimeID', 'ReviewsID', 'AppID']
merged_df[id_cols] = merged_df[id_cols].astype(int)

# Games table
games_df = merged_df[['AppID', 'Name', 'ReleaseDateID', 'SalesID', 'PlatformsID', 'TagsID', 'PlaytimeID', 'ReviewsID']].copy()
games_df.rename(columns={'Name': 'AppName'}, inplace=True)

# ReleaseDate table
releasedate_df = merged_df[['ReleaseDateID', 'Year', 'Month', 'Day']].copy()
releasedate_df.rename(columns={'Year': 'ReleaseYear', 'Month': 'ReleaseMonth', 'Day': 'ReleaseDay'}, inplace=True)

# Sales table
sales_df = merged_df[['SalesID', 'Launch Price', 'Discounted Price', 'Estimated owners']].copy()
sales_df.rename(columns={
    'Launch Price': 'Launch_Price',
    'Discounted Price': 'Discounted_Price',
    'Estimated owners': 'Estimated_Owners'
}, inplace=True)

# Platforms table
platforms_df = merged_df[['PlatformsID', 'Windows', 'Linux', 'Mac']].copy()

# Reviews table
reviews_df = merged_df[['ReviewsID', 'Metacritic score', 'User score', 'Positive', 'Negative']].copy()
reviews_df.rename(columns={
    'Metacritic score': 'Metacritic_Score',
    'User score': 'User_Score'
}, inplace=True)

# Playtime table
playtime_df = merged_df[['PlaytimeID', 'Peak_CCU', 'AvgPlaytimeForever', 'AvgPlaytimeTwoWeeks',
                         'MedianPlaytimeForever', 'MedianPlaytimeTwoWeeks']].copy()

# Tags table - tagsID and boolean columns for each top tag
tags_df = merged_df[['TagsID']].copy()
for tag in top_50_tags:
    col_name = f"tag_{tag.replace(' ', '_').replace('-', '_')}"
    tags_df[col_name] = merged_df[col_name]

print("✅ GAMES TABLE")
display(games_df.head())
print(games_df.dtypes, "\n")

print("✅ RELEASEDATE TABLE")
display(releasedate_df.head())
print(releasedate_df.dtypes, "\n")

print("✅ SALES TABLE")
display(sales_df.head())
print(sales_df.dtypes, "\n")

print("✅ PLATFORMS TABLE")
display(platforms_df.head())
print(platforms_df.dtypes, "\n")

print("✅ REVIEWS TABLE")
display(reviews_df.head())
print(reviews_df.dtypes, "\n")

print("✅ PLAYTIME TABLE")
display(playtime_df.head())
print(playtime_df.dtypes, "\n")

print("✅ TAGS TABLE")
display(tags_df.head())
print(tags_df.dtypes, "\n")

✅ GAMES TABLE


Unnamed: 0,AppID,AppName,ReleaseDateID,SalesID,PlatformsID,TagsID,PlaytimeID,ReviewsID
0,1,Train Bandit,1,1,1,1,1,1
1,2,Project: R.E.B.O.O.T 2,2,2,2,2,2,2
2,3,Project MIKHAIL: A Muv-Luv War Story,3,3,3,3,3,3
3,4,The Warrior Of Treasures,4,4,4,4,4,4
4,5,Mission Ammunition,5,5,5,5,5,5


AppID             int64
AppName          object
ReleaseDateID     int64
SalesID           int64
PlatformsID       int64
TagsID            int64
PlaytimeID        int64
ReviewsID         int64
dtype: object 

✅ RELEASEDATE TABLE


Unnamed: 0,ReleaseDateID,ReleaseYear,ReleaseMonth,ReleaseDay
0,1,2017,10,12
1,2,2017,1,27
2,3,2021,11,1
3,4,2018,1,3
4,5,2017,10,27


ReleaseDateID    int64
ReleaseYear      Int64
ReleaseMonth     Int64
ReleaseDay       Int64
dtype: object 

✅ SALES TABLE


Unnamed: 0,SalesID,Launch_Price,Discounted_Price,Estimated_Owners
0,1,0.99,0.32,10000
1,2,2.99,0.32,35000
2,3,19.99,10.46,35000
3,4,0.99,0.32,10000
4,5,4.99,0.77,10000


SalesID               int64
Launch_Price        float64
Discounted_Price    float64
Estimated_Owners      Int64
dtype: object 

✅ PLATFORMS TABLE


Unnamed: 0,PlatformsID,Windows,Linux,Mac
0,1,True,False,True
1,2,True,False,False
2,3,True,False,False
3,4,True,False,False
4,5,True,True,True


PlatformsID    int64
Windows         bool
Linux           bool
Mac             bool
dtype: object 

✅ REVIEWS TABLE


Unnamed: 0,ReviewsID,Metacritic_Score,User_Score,Positive,Negative
0,1,0,0,53,5
1,2,0,0,29,28
2,3,0,0,328,253
3,4,0,0,31,13
4,5,0,0,61,16


ReviewsID           int64
Metacritic_Score    int64
User_Score          int64
Positive            int64
Negative            int64
dtype: object 

✅ PLAYTIME TABLE


Unnamed: 0,PlaytimeID,Peak_CCU,AvgPlaytimeForever,AvgPlaytimeTwoWeeks,MedianPlaytimeForever,MedianPlaytimeTwoWeeks
0,1,0,0,0,0,0
1,2,10,337,0,289,0
2,3,16,670,0,670,0
3,4,0,0,0,0,0
4,5,0,0,0,0,0


PlaytimeID                int64
Peak_CCU                  int64
AvgPlaytimeForever        int64
AvgPlaytimeTwoWeeks       int64
MedianPlaytimeForever     int64
MedianPlaytimeTwoWeeks    int64
dtype: object 

✅ TAGS TABLE


Unnamed: 0,TagsID,tag_Singleplayer,tag_Indie,tag_Adventure,tag_Casual,tag_Action,tag_2D,tag_Atmospheric,tag_Simulation,tag_Strategy,...,tag_Horror,tag_Choices_Matter,tag_Sandbox,tag_Minimalist,tag_Visual_Novel,tag_JRPG,tag_Management,tag_Building,tag_Side_Scroller,tag_Physics
0,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


TagsID                    int64
tag_Singleplayer          int64
tag_Indie                 int64
tag_Adventure             int64
tag_Casual                int64
tag_Action                int64
tag_2D                    int64
tag_Atmospheric           int64
tag_Simulation            int64
tag_Strategy              int64
tag_Puzzle                int64
tag_RPG                   int64
tag_Colorful              int64
tag_Story_Rich            int64
tag_Cute                  int64
tag_Pixel_Graphics        int64
tag_Family_Friendly       int64
tag_Arcade                int64
tag_Relaxing              int64
tag_Multiplayer           int64
tag_Exploration           int64
tag_Anime                 int64
tag_Funny                 int64
tag_3D                    int64
tag_Female_Protagonist    int64
tag_Difficult             int64
tag_Fantasy               int64
tag_Retro                 int64
tag_Great_Soundtrack      int64
tag_Platformer            int64
tag_Shooter               int64
tag_Open

In [22]:
# from sqlalchemy import create_engine

# # Replace credentials and database name as needed
# engine = create_engine("mysql+mysqlconnector://root:yourpassword@localhost/your_database")

# # Write tables — if they exist, replace contents
# games_df.to_sql('Games', con=engine, if_exists='replace', index=False)
# releasedate_df.to_sql('ReleaseDate', con=engine, if_exists='replace', index=False)
# sales_df.to_sql('Sales', con=engine, if_exists='replace', index=False)
# platforms_df.to_sql('Platforms', con=engine, if_exists='replace', index=False)
# reviews_df.to_sql('Reviews', con=engine, if_exists='replace', index=False)
# playtime_df.to_sql('Playtime', con=engine, if_exists='replace', index=False)

In [24]:
from sqlalchemy import create_engine, text
import pandas as pd

# 1️⃣ Create engine
engine = create_engine(
    "mysql+mysqlconnector://root:1234@localhost/steamdb",
    pool_pre_ping=True
)

# 2️⃣ Test connection
with engine.connect() as conn:
    conn.execute(text("SELECT 1"))
    print("✅ Connection OK")

# 3️⃣ Clean column names (spaces, hyphens, slashes can break table creation)
for df in [games_df, releasedate_df, sales_df, platforms_df, reviews_df, playtime_df, tags_df]:
    df.columns = (
        df.columns.str.strip()
                  .str.replace(" ", "_")
                  .str.replace("-", "_")
                  .str.replace("/", "_")
    )

# 4️⃣ Drop any existing tables manually (avoids reflection)
with engine.connect() as conn:
    conn.execute(text("DROP TABLE IF EXISTS Games, ReleaseDate, Sales, Platforms, Reviews, Playtime, Tags;"))
    print("🧹 Old tables dropped (if any).")

# 5️⃣ Manually create empty tables
with engine.connect() as conn:
    for name, df in {
        "Games": games_df,
        "ReleaseDate": releasedate_df,
        "Sales": sales_df,
        "Platforms": platforms_df,
        "Reviews": reviews_df,
        "Playtime": playtime_df,
        "Tags": tags_df
    }.items():
        cols = ", ".join(f"`{c}` TEXT" for c in df.columns)
        conn.execute(text(f"CREATE TABLE {name} ({cols});"))
        print(f"🧱 Created empty table: {name}")

# 6️⃣ Insert data without reflection
for name, df in {
    "Games": games_df,
    "ReleaseDate": releasedate_df,
    "Sales": sales_df,
    "Platforms": platforms_df,
    "Reviews": reviews_df,
    "Playtime": playtime_df,
    "Tags": tags_df
}.items():
    df.to_sql(name, con=engine, if_exists='append', index=False, method='multi', chunksize=1000)
    print(f"✅ Inserted data into: {name}")

print("🎉 All tables successfully created and loaded into MySQL!")

✅ Connection OK
🧹 Old tables dropped (if any).
🧱 Created empty table: Games
🧱 Created empty table: ReleaseDate
🧱 Created empty table: Sales
🧱 Created empty table: Platforms
🧱 Created empty table: Reviews
🧱 Created empty table: Playtime
🧱 Created empty table: Tags


  df.to_sql(name, con=engine, if_exists='append', index=False, method='multi', chunksize=1000)


✅ Inserted data into: Games


  df.to_sql(name, con=engine, if_exists='append', index=False, method='multi', chunksize=1000)


✅ Inserted data into: ReleaseDate


  df.to_sql(name, con=engine, if_exists='append', index=False, method='multi', chunksize=1000)


✅ Inserted data into: Sales


  df.to_sql(name, con=engine, if_exists='append', index=False, method='multi', chunksize=1000)


✅ Inserted data into: Platforms


  df.to_sql(name, con=engine, if_exists='append', index=False, method='multi', chunksize=1000)


✅ Inserted data into: Reviews


  df.to_sql(name, con=engine, if_exists='append', index=False, method='multi', chunksize=1000)


✅ Inserted data into: Playtime
✅ Inserted data into: Tags
🎉 All tables successfully created and loaded into MySQL!


  df.to_sql(name, con=engine, if_exists='append', index=False, method='multi', chunksize=1000)


* code written with assistance from Copilot