ETL OF Steam Datasets

In [1]:
import pandas as pd
from datasets import load_dataset
ds_huggingface = load_dataset('FronkonGames/steam-games-dataset', split='train')
df_huggingface = ds_huggingface.to_pandas()

# to csv
df_huggingface.to_csv('hugging_face.csv', index=False)

print('HuggingFace Steam Games:', getattr(df_huggingface, 'shape', None))
df_huggingface.head()

  from .autonotebook import tqdm as notebook_tqdm


HuggingFace Steam Games: (83560, 39)


Unnamed: 0,AppID,Name,Release date,Estimated owners,Peak CCU,Required age,Price,DLC count,About the game,Supported languages,...,Average playtime two weeks,Median playtime forever,Median playtime two weeks,Developers,Publishers,Categories,Genres,Tags,Screenshots,Movies
0,20200,Galactic Bowling,"Oct 21, 2008",0 - 20000,0,0,19.99,0,Galactic Bowling is an exaggerated and stylize...,['English'],...,0,0,0,Perpetual FX Creative,Perpetual FX Creative,"Single-player,Multi-player,Steam Achievements,...","Casual,Indie,Sports","Indie,Casual,Sports,Bowling",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
1,655370,Train Bandit,"Oct 12, 2017",0 - 20000,0,0,0.99,0,THE LAW!! Looks to be a showdown atop a train....,"['English', 'French', 'Italian', 'German', 'Sp...",...,0,0,0,Rusty Moyher,Wild Rooster,"Single-player,Steam Achievements,Full controll...","Action,Indie","Indie,Action,Pixel Graphics,2D,Retro,Arcade,Sc...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
2,1732930,Jolt Project,"Nov 17, 2021",0 - 20000,0,0,4.99,0,Jolt Project: The army now has a new robotics ...,"['English', 'Portuguese - Brazil']",...,0,0,0,Campião Games,Campião Games,Single-player,"Action,Adventure,Indie,Strategy",,https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
3,1355720,Henosis™,"Jul 23, 2020",0 - 20000,0,0,5.99,0,HENOSIS™ is a mysterious 2D Platform Puzzler w...,"['English', 'French', 'Italian', 'German', 'Sp...",...,0,0,0,Odd Critter Games,Odd Critter Games,"Single-player,Full controller support","Adventure,Casual,Indie","2D Platformer,Atmospheric,Surreal,Mystery,Puzz...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
4,1139950,Two Weeks in Painland,"Feb 3, 2020",0 - 20000,0,0,0.0,0,ABOUT THE GAME Play as a hacker who has arrang...,"['English', 'Spanish - Spain']",...,0,0,0,Unusual Games,Unusual Games,"Single-player,Steam Achievements","Adventure,Indie","Indie,Adventure,Nudity,Violent,Sexual Content,...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...


In [2]:
df_steam_trends = pd.read_csv("https://docs.google.com/spreadsheets/d/1D5MErWbFJ2Gsde9QxJ_HNMltKfF6fHCYdv4OQpXdnZ4/export?format=csv&gid=352042934")

# to csv
df_steam_trends.to_csv('steam_trends_2023.csv', index=False)

print("Steam Trends 2023:", df_steam_trends.shape)
df_steam_trends.head()

Steam Trends 2023: (65111, 14)


Unnamed: 0,App ID,Title,Reviews Total,Reviews Score Fancy,Release Date,Reviews D7,Reviews D30,Reviews D90,Launch Price,Tags,name_slug,Revenue Estimated,Modified Tags,Steam Page
0,730,Counter-Strike: Global Offensive,7382695,88%,2012-08-21,,,,"$14,99","FPS, Shooter, Multiplayer, Competitive, Action...",,"$110 666 598,05","FPS_, Shooter_, Multiplayer_, Competitive_, Ac...",https://store.steampowered.com/app/730
1,578080,PUBG: BATTLEGROUNDS,2201296,57%,2017-12-21,,,,"$29,99","Survival, Shooter, Battle Royale, Multiplayer,...",,"$66 016 867,04","Survival_, Shooter_, Battle Royale_, Multiplay...",https://store.steampowered.com/app/578080
2,570,Dota 2,2017009,82%,2013-07-09,,,,"$29,99","Free to Play, MOBA, Multiplayer, Strategy, eSp...",,"$60 490 099,91","Free to Play_, MOBA_, Multiplayer_, Strategy_,...",https://store.steampowered.com/app/570
3,271590,Grand Theft Auto V,1322782,"89,85%",2015-04-13,,,,"$29,99","Open World, Action, Multiplayer, Crime, Automo...",,"$39 670 232,18","Open World_, Action_, Multiplayer_, Crime_, Au...",https://store.steampowered.com/app/271590
4,359550,Tom Clancy's Rainbow Six® Siege,978762,86%,2015-12-01,,,,"$59,99","FPS, PvP, eSports, Shooter, Multiplayer, Tacti...",,"$58 715 932,38","FPS_, PvP_, eSports_, Shooter_, Multiplayer_, ...",https://store.steampowered.com/app/359550


Modified web scraper code from https://github.com/lundkvistbenjamin/steam-sales-scraper/blob/main/steam_scraper.py

In [3]:
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime

pages_html = ""
for page_number in range(1, 5):
    res = requests.get(f"https://store.steampowered.com/search/?supportedlang=english&specials=1&page={page_number}&ndl=1")
    pages_html += res.text

soup = BeautifulSoup(pages_html, "html.parser")
game_containers = soup.find_all("div", {"class": "responsive_search_name_combined"})

titles = [
    game.find("span", {"class": "title"}).text if game.find("span", {"class": "title"}) else None
    for game in game_containers
]

rating_system = ["Overwhelmingly Negative", "Very Negative", "Negative", "Mostly Negative",
                 "Mixed", "Mostly Positive", "Positive", "Very Positive", "Overwhelmingly Positive"]

ratings, reviews = [], []
for game in game_containers:
    rating_span = game.find("span", {"class": "search_review_summary"})
    if rating_span:
        data_tooltip = rating_span["data-tooltip-html"]
        rating = data_tooltip.split("<br>")[0]
        ratings.append(rating_system.index(rating))
        reviews.append(data_tooltip.split("<br>")[1].split(" ")[3])
    else:
        ratings.append(None)
        reviews.append(None)

# modified
def parse_price(price_str):
    if price_str:
        clean_price = re.sub(r"[^\d.,]", "", price_str)
        return float(clean_price.replace(",", ""))
    return None

discounts = [
    int(game.find("div", {"class": "discount_pct"}).text.strip("%"))
    if game.find("div", {"class": "discount_pct"}) else None
    for game in game_containers
]

prices = [
    parse_price(game.find("div", {"class": "discount_final_price"}).text)
    if game.find("div", {"class": "discount_final_price"}) else None
    for game in game_containers
]

original_prices = [
    parse_price(game.find("div", {"class": "discount_original_price"}).text)
    if game.find("div", {"class": "discount_original_price"}) else None
    for game in game_containers
]

release_dates = [
    game.find("div", {"class": "search_released"}).text.strip()
    if len(game.find("div", {"class": "search_released"}).text) > 2 else None
    for game in game_containers
]

win, lin, osx = [], [], []
for game in game_containers:
    platforms = [platform["class"][1] for platform in game.find_all("span", {"class": "platform_img"})]
    win.append(1 if "win" in platforms else 0)
    lin.append(1 if "linux" in platforms else 0)
    osx.append(1 if "mac" in platforms else 0)

current_time = datetime.now().strftime("%Y-%m-%d %H:%M")
fetch_times = [current_time for _ in game_containers]

data = {
    "Game Name": titles,
    "Rating": ratings,
    "#Reviews": reviews,
    "Discount%": discounts,
    "Price (€)": prices,
    "Original Price (€)": original_prices,
    "Release Date": release_dates,
    "Windows": win,
    "Linux": lin,
    "MacOS": osx,
    "Fetched At": fetch_times
}

df_steam_sales = pd.DataFrame(data)
print("Steam Sales Data:", df_steam_sales.shape)
df_steam_sales.head()
# to csv
df_steam_sales.to_csv('steam_sales.csv', index=False)

Steam Sales Data: (75, 11)


Clean hugging_face steam games data

In [4]:
import numpy as np

df_huggingface = pd.read_csv('hugging_face.csv')

print('Original shape:', df_huggingface.shape)
print('Columns count:', len(df_huggingface.columns))

# Drop not needed columns
patterns = [
    'required age','dlc count','about the game', 'reviews','genres',
    'supported languages','full audio languages','header image', 'price',
    'header email','support email','support url','email','website',
    'metacritic url','score rank','achievements','recommendations',
    'developers','publishers','categories','screenshots','movies', 'notes'
]
drop_cols = [c for c in df_huggingface.columns if any(p in c.lower() for p in patterns)]
if drop_cols:
    df_huggingface = df_huggingface.drop(columns=drop_cols)
print('Dropped columns:', drop_cols)
print('Shape after drop:', df_huggingface.shape)

release_cols = [c for c in df_huggingface.columns if 'release' in c.lower()]
for c in release_cols:
    df_huggingface[c] = pd.to_datetime(df_huggingface[c], errors='coerce')
    df_huggingface[c] = df_huggingface[c].dt.strftime('%Y-%m-%d')
    df_huggingface[c] = df_huggingface[c].fillna('')

# Drop rows missing important vals
cols_for_drop = ['appid','name','release date']
app_cols = [c for c in df_huggingface.columns if any(tok in c.lower().replace(' ', '') for tok in cols_for_drop)]
app_cols = list(dict.fromkeys(app_cols))
if app_cols:
    df_huggingface[app_cols] = df_huggingface[app_cols].replace({'': pd.NA, 'nan': pd.NA})
    before_rows = df_huggingface.shape[0]
    df_huggingface = df_huggingface.dropna(subset=app_cols, how='all')
    after_rows = df_huggingface.shape[0]
    print(f'Dropped {before_rows - after_rows} rows with null (checked cols: {app_cols})')

# Handle missing values: numeric -> median, text -> empty string
num_cols = df_huggingface.select_dtypes(include=['number']).columns.tolist()
obj_cols = [c for c in df_huggingface.columns if c not in num_cols]
for c in num_cols:
    median = df_huggingface[c].median()
    df_huggingface[c] = df_huggingface[c].fillna(median)
for c in obj_cols:
    df_huggingface[c] = df_huggingface[c].fillna('')

# Drop exact duplicates and reset index
before = df_huggingface.shape[0]
df_huggingface = df_huggingface.drop_duplicates().reset_index(drop=True)
after = df_huggingface.shape[0]
print(f'Dropped {before - after} duplicate rows')

# Save cleaned dataset
df_huggingface.to_csv('hugging_face_cleaned.csv', index=False)
print('Cleaned shape:', df_huggingface.shape)

df_huggingface.head()

Original shape: (83560, 39)
Columns count: 39
Dropped columns: ['Required age', 'Price', 'DLC count', 'About the game', 'Supported languages', 'Full audio languages', 'Reviews', 'Header image', 'Website', 'Support url', 'Support email', 'Metacritic url', 'Score rank', 'Achievements', 'Recommendations', 'Notes', 'Developers', 'Publishers', 'Categories', 'Genres', 'Screenshots', 'Movies']
Shape after drop: (83560, 17)
Dropped columns: ['Required age', 'Price', 'DLC count', 'About the game', 'Supported languages', 'Full audio languages', 'Reviews', 'Header image', 'Website', 'Support url', 'Support email', 'Metacritic url', 'Score rank', 'Achievements', 'Recommendations', 'Notes', 'Developers', 'Publishers', 'Categories', 'Genres', 'Screenshots', 'Movies']
Shape after drop: (83560, 17)
Dropped 0 rows with null (checked cols: ['AppID', 'Name'])
Dropped 0 rows with null (checked cols: ['AppID', 'Name'])
Dropped 0 duplicate rows
Dropped 0 duplicate rows
Cleaned shape: (83560, 17)
Cleaned sha

Unnamed: 0,AppID,Name,Release date,Estimated owners,Peak CCU,Windows,Mac,Linux,Metacritic score,User score,Positive,Negative,Average playtime forever,Average playtime two weeks,Median playtime forever,Median playtime two weeks,Tags
0,20200,Galactic Bowling,2008-10-21,0 - 20000,0,True,False,False,0,0,6,11,0,0,0,0,"Indie,Casual,Sports,Bowling"
1,655370,Train Bandit,2017-10-12,0 - 20000,0,True,True,False,0,0,53,5,0,0,0,0,"Indie,Action,Pixel Graphics,2D,Retro,Arcade,Sc..."
2,1732930,Jolt Project,2021-11-17,0 - 20000,0,True,False,False,0,0,0,0,0,0,0,0,
3,1355720,Henosis™,2020-07-23,0 - 20000,0,True,True,True,0,0,3,0,0,0,0,0,"2D Platformer,Atmospheric,Surreal,Mystery,Puzz..."
4,1139950,Two Weeks in Painland,2020-02-03,0 - 20000,0,True,True,False,0,0,50,8,0,0,0,0,"Indie,Adventure,Nudity,Violent,Sexual Content,..."


Clean Steam Games 2023

In [5]:
df_steam_trends = pd.read_csv('steam_trends_2023.csv')

print('Original shape:', getattr(df_steam_trends, 'shape', None))
print('Columns:', df_steam_trends.columns.tolist())

def _norm(s):
    return re.sub(r'[^a-z0-9]', '', str(s).lower())

# Drop unneeded cols
drop_patterns = [
    'reviews d7', 'reviews d30', 'reviews d90', 'name_slug',
    'revenue estimated', 'modified tags', 'steam page'
]
norm_drop = [_norm(p) for p in drop_patterns]
drop_cols = [c for c in df_steam_trends.columns if any(p in _norm(c) for p in norm_drop)]
if drop_cols:
    df_steam_trends = df_steam_trends.drop(columns=drop_cols, errors='ignore')
print('Dropped columns (if present):', drop_cols)
print('Shape after drop:', df_steam_trends.shape)

release_cols = [c for c in df_steam_trends.columns if 'release' in c.lower()]
for c in release_cols:
    df_steam_trends[c] = pd.to_datetime(df_steam_trends[c], errors='coerce')
    df_steam_trends[c] = df_steam_trends[c].dt.strftime('%Y-%m-%d')
    df_steam_trends[c] = df_steam_trends[c].fillna('')

# Convert object columns that look numeric into numeric dtype
for c in df_steam_trends.columns:
    if df_steam_trends[c].dtype == 'object':
        cleaned = df_steam_trends[c].astype(str).str.replace(r'[^0-9.\-]', '', regex=True)
        converted = pd.to_numeric(cleaned, errors='coerce')
        non_null = converted.notna().sum()
        if non_null >= max(1, 0.1 * len(df_steam_trends)):
            df_steam_trends[c] = converted
            print(f'Converted to numeric: {c} (non-null after conversion: {non_null})')

# Row reduction: drop rows missing important vals
id_tokens = ['app id','title','release date']
id_cols = [c for c in df_steam_trends.columns if any(tok in _norm(c) for tok in [t.replace('_','') for t in id_tokens])]
if id_cols:
    df_steam_trends[id_cols] = df_steam_trends[id_cols].replace({'': pd.NA, 'nan': pd.NA})
    before_rows = df_steam_trends.shape[0]
    df_steam_trends = df_steam_trends.dropna(subset=id_cols, how='all')
    after_rows = df_steam_trends.shape[0]
    print(f'Dropped {before_rows - after_rows} rows with null (checked cols: {id_cols})')

# Handle missing values: numeric -> median, text -> empty string
num_cols = df_steam_trends.select_dtypes(include=['number']).columns.tolist()
obj_cols = [c for c in df_steam_trends.columns if c not in num_cols]
for c in num_cols:
    median = df_steam_trends[c].median()
    df_steam_trends[c] = df_steam_trends[c].fillna(median)
for c in obj_cols:
    df_steam_trends[c] = df_steam_trends[c].fillna('')

before = df_steam_trends.shape[0]
df_steam_trends = df_steam_trends.drop_duplicates().reset_index(drop=True)
after = df_steam_trends.shape[0]
print(f'Dropped {before - after} duplicate rows')
df_steam_trends.to_csv('steam_trends_cleaned.csv', index=False)
print('Cleaned shape:', df_steam_trends.shape)

df_steam_trends.head()

Original shape: (65111, 14)
Columns: ['App ID', 'Title', 'Reviews Total', 'Reviews Score Fancy', 'Release Date', 'Reviews D7', 'Reviews D30', 'Reviews D90', 'Launch Price', 'Tags', 'name_slug', 'Revenue Estimated', 'Modified Tags', 'Steam Page']
Dropped columns (if present): ['Reviews D7', 'Reviews D30', 'Reviews D90', 'name_slug', 'Revenue Estimated', 'Modified Tags', 'Steam Page']
Shape after drop: (65111, 7)
Converted to numeric: Reviews Score Fancy (non-null after conversion: 65111)
Converted to numeric: Launch Price (non-null after conversion: 65111)
Converted to numeric: Tags (non-null after conversion: 34710)
Dropped 0 rows with null (checked cols: ['Title'])
Dropped 0 duplicate rows
Cleaned shape: (65111, 7)


Unnamed: 0,App ID,Title,Reviews Total,Reviews Score Fancy,Release Date,Launch Price,Tags
0,730,Counter-Strike: Global Offensive,7382695,88,2012-08-21,1499,3.0
1,578080,PUBG: BATTLEGROUNDS,2201296,57,2017-12-21,2999,3.0
2,570,Dota 2,2017009,82,2013-07-09,2999,3.0
3,271590,Grand Theft Auto V,1322782,8985,2015-04-13,2999,3.0
4,359550,Tom Clancy's Rainbow Six® Siege,978762,86,2015-12-01,5999,3.0


Clean steam data sales

In [6]:
df_steam_sales = pd.read_csv('steam_sales.csv')

print('Original shape:', getattr(df_steam_sales, 'shape', None))
print('Columns:', df_steam_sales.columns.tolist())

def _norm(s):
    return re.sub(r'[^a-z0-9]', '', str(s).lower())

# Drop pnot needed cols
drop_patterns = ['discount%', 'rating', 'fetched at']
norm_drop = [re.sub(r'[^a-z0-9]', '', p.lower()) for p in drop_patterns]
drop_cols = [c for c in df_steam_sales.columns if any(p in _norm(c) for p in norm_drop)]
if drop_cols:
    df_steam_sales = df_steam_sales.drop(columns=drop_cols, errors='ignore')
print('Dropped columns (if present):', drop_cols)
print('Shape after drop:', df_steam_sales.shape)

# Format release date
release_cols = [c for c in df_steam_sales.columns if 'release' in c.lower()]
for c in release_cols:
    df_steam_sales[c] = pd.to_datetime(df_steam_sales[c], errors='coerce')
    df_steam_sales[c] = df_steam_sales[c].dt.strftime('%Y-%m-%d')
    df_steam_sales[c] = df_steam_sales[c].fillna('')
print('Formatted release columns:', release_cols)

# drop rows missing important vals
name_tokens = ['game name', 'release date', 'price']
name_cols = [c for c in df_steam_sales.columns if any(tok.replace(' ', '') in _norm(c) for tok in name_tokens)]
if name_cols:
    df_steam_sales[name_cols] = df_steam_sales[name_cols].replace({'': pd.NA, 'nan': pd.NA})
    before_rows = df_steam_sales.shape[0]
    df_steam_sales = df_steam_sales.dropna(subset=name_cols, how='all')
    after_rows = df_steam_sales.shape[0]
    print(f'Dropped {before_rows - after_rows} rows with null important vals (checked cols: {name_cols})')

# Handle missing values: numeric -> median, text -> empty string
num_cols = df_steam_sales.select_dtypes(include=['number']).columns.tolist()
obj_cols = [c for c in df_steam_sales.columns if c not in num_cols]
for c in num_cols:
    median = df_steam_sales[c].median()
    df_steam_sales[c] = df_steam_sales[c].fillna(median)
for c in obj_cols:
    df_steam_sales[c] = df_steam_sales[c].fillna('')

# Drop exact duplicates and reset index
before = df_steam_sales.shape[0]
df_steam_sales = df_steam_sales.drop_duplicates().reset_index(drop=True)
after = df_steam_sales.shape[0]
print(f'Dropped {before - after} duplicate rows')

# Save cleaned dataset
df_steam_sales.to_csv('steam_sales_cleaned.csv', index=False)
print('Cleaned shape:', df_steam_sales.shape)

df_steam_sales.head()

Original shape: (75, 11)
Columns: ['Game Name', 'Rating', '#Reviews', 'Discount%', 'Price (€)', 'Original Price (€)', 'Release Date', 'Windows', 'Linux', 'MacOS', 'Fetched At']
Dropped columns (if present): ['Rating', 'Discount%', 'Fetched At']
Shape after drop: (75, 8)
Formatted release columns: ['Release Date']
Dropped 0 rows with null important vals (checked cols: ['Game Name', 'Price (€)', 'Original Price (€)', 'Release Date'])
Dropped 0 duplicate rows
Cleaned shape: (75, 8)


Unnamed: 0,Game Name,#Reviews,Price (€),Original Price (€),Release Date,Windows,Linux,MacOS
0,Grand Theft Auto V Enhanced,31994,849.5,1699.0,2025-03-04,1,0,0
1,Euro Truck Simulator 2,131366,161.25,645.0,2012-10-12,1,1,1
2,CloverPit,4048,301.5,335.0,2025-09-26,1,0,0
3,Little Nightmares III Pre-Order Bundle,303,1625.64,1050.0,,1,0,0
4,American Truck Simulator,86040,161.25,645.0,2016-02-02,1,1,1


Combining the datasets

In [7]:
# Load all cleaned datasets
df_hf = pd.read_csv('hugging_face_cleaned.csv')
df_sales = pd.read_csv('steam_sales_cleaned.csv')
df_trends = pd.read_csv('steam_trends_cleaned.csv')

# Rename game names/titles
df_sales = df_sales.rename(columns={'Game Name': 'Name'})
df_trends = df_trends.rename(columns={'Title': 'Name'})

# Normalize game names for better matching
def normalize_name(name):
    if not isinstance(name, str):
        return ''
    return (name.lower()
            .replace('®', '')
            .replace('™', '')
            .replace('  ', ' ')
            .strip())

# Create normalized name columns
df_hf['name_norm'] = df_hf['Name'].apply(normalize_name)
df_trends['name_norm'] = df_trends['Name'].apply(normalize_name)

# Handle duplicates by keeping only the first occurrence of each game
print("Before handling duplicates:")
print(f"Hugging Face rows: {len(df_hf)}")
print(f"Unique names in Hugging Face: {df_hf['name_norm'].nunique()}")
print(f"Trends rows: {len(df_trends)}")
print(f"Unique names in Trends: {df_trends['name_norm'].nunique()}")

# Keep only the first occurrence of each game in both dataframes
df_hf_deduped = df_hf.drop_duplicates(subset=['name_norm'], keep='first')
df_trends_deduped = df_trends.drop_duplicates(subset=['name_norm'], keep='first')

print("\nAfter removing duplicates:")
print(f"Hugging Face rows: {len(df_hf_deduped)}")
print(f"Trends rows: {len(df_trends_deduped)}")

# Find common names after deduplication
common_names = set(df_hf_deduped['name_norm']).intersection(
    set(df_trends_deduped['name_norm'])
)
print(f"\nNumber of common game names after deduplication: {len(common_names)}")

# Perform the merge on deduplicated data
merged_df = pd.merge(
    df_hf_deduped,
    df_trends_deduped.drop(columns=['Name']),
    on='name_norm',
    how='inner',
    suffixes=('_hf', '_trends')
)

print("\nAfter merging (should match common names count):")
print(f"Merged dataframe shape: {merged_df.shape}")
print("\nSample of merged data (first 5 rows):")
display(merged_df.head())

# Show duplicate names if any
print("\nChecking for any remaining duplicates in the merged data:")
duplicate_names = merged_df[merged_df.duplicated('name_norm', keep=False)]
if len(duplicate_names) > 0:
    print(f"Found {len(duplicate_names)} rows with duplicate names:")
    display(duplicate_names[['name_norm', 'Name']].sort_values('name_norm'))
else:
    print("No duplicate names found in the merged data.")

# Handle release dates - prefer the first dataset's date if both exist
merged_df['Release date'] = merged_df['Release date'].fillna(merged_df['Release Date'])

# Combine tags from both datasets and remove duplicates
def combine_tags(tags_hf, tags_trends):
    if pd.isna(tags_hf) and pd.isna(tags_trends):
        return ''
    tags = set()
    if pd.notna(tags_hf):
        tags.update(str(tags_hf).split(','))
    if pd.notna(tags_trends):
        tags.update(str(tags_trends).split(','))
    return ','.join(sorted(tags)) if tags else ''

merged_df['Tags'] = merged_df.apply(
    lambda x: combine_tags(x['Tags_hf'], x['Tags_trends']), 
    axis=1
)

# Standardize price column names
merged_df = merged_df.rename(columns={'Price': 'Discounted Price'})

# Clean up - drop date, app id, tags columns
merged_df = merged_df.drop(columns=[
    'Release Date', 
    'Tags_hf',
    'Tags_trends',
    'App ID',
    'AppID'
])

print("\nUpdated columns after combining:")
print(merged_df.columns.tolist())
print("\nSample of combined data with new fields:")
display(merged_df.head())

# to csv
merged_df.to_csv('df_merged_hf_trends.csv', index=False)


Before handling duplicates:
Hugging Face rows: 83560
Unique names in Hugging Face: 82628
Trends rows: 65111
Unique names in Trends: 64597

After removing duplicates:
Hugging Face rows: 82628
Trends rows: 64597

Number of common game names after deduplication: 63202

After merging (should match common names count):
Merged dataframe shape: (63202, 24)

Sample of merged data (first 5 rows):

After merging (should match common names count):
Merged dataframe shape: (63202, 24)

Sample of merged data (first 5 rows):


Unnamed: 0,AppID,Name,Release date,Estimated owners,Peak CCU,Windows,Mac,Linux,Metacritic score,User score,...,Median playtime forever,Median playtime two weeks,Tags_hf,name_norm,App ID,Reviews Total,Reviews Score Fancy,Release Date,Launch Price,Tags_trends
0,20200,Galactic Bowling,2008-10-21,0 - 20000,0,True,False,False,0,0,...,0,0,"Indie,Casual,Sports,Bowling",galactic bowling,20200,12,33,2008-10-21,1999,3.0
1,655370,Train Bandit,2017-10-12,0 - 20000,0,True,True,False,0,0,...,0,0,"Indie,Action,Pixel Graphics,2D,Retro,Arcade,Sc...",train bandit,655370,53,86,2017-10-12,99,2.0
2,1732930,Jolt Project,2021-11-17,0 - 20000,0,True,False,False,0,0,...,0,0,,jolt project,1732930,0,0,2021-11-17,499,3.0
3,1355720,Henosis™,2020-07-23,0 - 20000,0,True,True,True,0,0,...,0,0,"2D Platformer,Atmospheric,Surreal,Mystery,Puzz...",henosis,1355720,0,0,2020-07-23,999,22.0
4,1139950,Two Weeks in Painland,2020-02-03,0 - 20000,0,True,True,False,0,0,...,0,0,"Indie,Adventure,Nudity,Violent,Sexual Content,...",two weeks in painland,1139950,59,89,2020-02-04,999,3.0



Checking for any remaining duplicates in the merged data:
No duplicate names found in the merged data.

Updated columns after combining:
['Name', 'Release date', 'Estimated owners', 'Peak CCU', 'Windows', 'Mac', 'Linux', 'Metacritic score', 'User score', 'Positive', 'Negative', 'Average playtime forever', 'Average playtime two weeks', 'Median playtime forever', 'Median playtime two weeks', 'name_norm', 'Reviews Total', 'Reviews Score Fancy', 'Launch Price', 'Tags']

Sample of combined data with new fields:

Updated columns after combining:
['Name', 'Release date', 'Estimated owners', 'Peak CCU', 'Windows', 'Mac', 'Linux', 'Metacritic score', 'User score', 'Positive', 'Negative', 'Average playtime forever', 'Average playtime two weeks', 'Median playtime forever', 'Median playtime two weeks', 'name_norm', 'Reviews Total', 'Reviews Score Fancy', 'Launch Price', 'Tags']

Sample of combined data with new fields:


Unnamed: 0,Name,Release date,Estimated owners,Peak CCU,Windows,Mac,Linux,Metacritic score,User score,Positive,Negative,Average playtime forever,Average playtime two weeks,Median playtime forever,Median playtime two weeks,name_norm,Reviews Total,Reviews Score Fancy,Launch Price,Tags
0,Galactic Bowling,2008-10-21,0 - 20000,0,True,False,False,0,0,6,11,0,0,0,0,galactic bowling,12,33,1999,"3.0,Bowling,Casual,Indie,Sports"
1,Train Bandit,2017-10-12,0 - 20000,0,True,True,False,0,0,53,5,0,0,0,0,train bandit,53,86,99,"2.0,2D,Action,Arcade,Blood,Casual,Comedy,Contr..."
2,Jolt Project,2021-11-17,0 - 20000,0,True,False,False,0,0,0,0,0,0,0,0,jolt project,0,0,499,3.0
3,Henosis™,2020-07-23,0 - 20000,0,True,True,True,0,0,3,0,0,0,0,0,henosis,0,0,999,"22.0,2D,2D Platformer,Adventure,Atmospheric,Ca..."
4,Two Weeks in Painland,2020-02-03,0 - 20000,0,True,True,False,0,0,50,8,0,0,0,0,two weeks in painland,59,89,999,"3.0,Adventure,Indie,Nudity,Sexual Content,Stor..."


- Combine & match merged data with steam sales dataset (use more pages for more data)
- remove appid, then make new appid for index
- make sql from final merged data

* code written with assistance from Copilot