# Steam 200k Cleaning

**Objective:** Clean the `steam_200k` dataset so that its `game_title` values
match with the values in the `name` column of the `games` dataset.

## Setup

In [1]:
# Import modules
from pathlib import Path
import re
import pandas as pd
import numpy as np
from fuzzywuzzy import process
from tqdm import tqdm
from utils import display_df_info

## Load Data

In [2]:
# Setup paths
data_dir = Path.cwd().parent / 'data'
assert data_dir.exists(), f"Unable to locate directory: '{data_dir}'"

steam_200k_path = data_dir / 'steam-200k.csv'
games_path = data_dir / 'games.csv'

In [3]:
# Load Steam 200k Dataset
steam_200k_cols = ['user_id', 'game_title', 'behavior_name', 'hours', 'extra']
steam_200k = pd.read_csv(steam_200k_path, names=steam_200k_cols)
display_df_info(steam_200k, 'Steam 200k', nulls=False)

### Steam 200k

Unnamed: 0,user_id,game_title,behavior_name,hours,extra
0,151603712,The Elder Scrolls V Skyrim,purchase,1.0,0
1,151603712,The Elder Scrolls V Skyrim,play,273.0,0
2,151603712,Fallout 4,purchase,1.0,0
3,151603712,Fallout 4,play,87.0,0
4,151603712,Spore,purchase,1.0,0


In [4]:
# Load Games Dataset
games_raw = pd.read_csv(games_path, escapechar='\\')
display_df_info(games_raw, 'Games', nulls=False)

### Games

Unnamed: 0,app_id,name,release_date,is_free,price_overview,languages,type
0,10,Counter-Strike,2000-11-01,0,"{""final"": 819, ""initial"": 819, ""currency"": ""EU...","English<strong>*</strong>, French<strong>*</st...",game
1,20,Team Fortress Classic,1999-04-01,0,"{""final"": 499, ""initial"": 499, ""currency"": ""EU...","English, French, German, Italian, Spanish - Sp...",game
2,30,Day of Defeat,2003-05-01,0,"{""final"": 499, ""initial"": 499, ""currency"": ""EU...","English, French, German, Italian, Spanish - Spain",game
3,40,Deathmatch Classic,2001-06-01,0,"{""final"": 499, ""initial"": 499, ""currency"": ""EU...","English, French, German, Italian, Spanish - Sp...",game
4,50,Half-Life: Opposing Force,1999-11-01,0,"{""final"": 499, ""initial"": 499, ""currency"": ""EU...","English, French, German, Korean",game


## Initial Data Cleaning

In [5]:
# Drop unused columns
games = games_raw[['app_id', 'name']].rename(columns={'name': 'game_title'})
hours = (
    steam_200k
    .query(f"behavior_name == 'play'")
    .groupby(['user_id', 'game_title'], as_index=False)
    [['hours']].sum()
)
display_df_info(games, 'games', nulls=False)
display_df_info(hours, 'hours', nulls=False)

### games

Unnamed: 0,app_id,game_title
0,10,Counter-Strike
1,20,Team Fortress Classic
2,30,Day of Defeat
3,40,Deathmatch Classic
4,50,Half-Life: Opposing Force


### hours

Unnamed: 0,user_id,game_title,hours
0,5250,Alien Swarm,4.9
1,5250,Cities Skylines,144.0
2,5250,Deus Ex Human Revolution,62.0
3,5250,Dota 2,0.2
4,5250,Portal 2,13.6


## Initial Matches

In [6]:
df1 = hours.merge(games, how='left')
display_df_info(df1, 'Merged Data', head=False)

### Merged Data

Unnamed: 0,Dtype,Null Count,Total,% Null
user_id,int64,0,70745,0.0%
game_title,object,0,70745,0.0%
hours,float64,0,70745,0.0%
app_id,float64,34940,70745,49.4%


## Normalize the Title for Additional Matches

In [12]:
def normalize_title(title: str) -> str:
    title = title.lower()
    title = title.strip()
    title = re.sub(r'[^\w\s]', '', title)
    title = re.sub(r'\s+', '', title)
    return title


hours_norm = hours.copy()
hours_norm['normalized_title'] = hours_norm['game_title'].map(normalize_title)

games_norm = games.copy()
games_norm['normalized_title'] = games['game_title'].map(normalize_title)
games_norm = games_norm.drop('game_title', axis=1)

df = hours_norm.merge(games_norm, how='left', on='normalized_title')
display_df_info(df, 'Merged Data', head=False)

### Merged Data

Unnamed: 0,Dtype,Null Count,Total,% Null
user_id,int64,0,71065,0.0%
game_title,object,0,71065,0.0%
hours,float64,0,71065,0.0%
normalized_title,object,0,71065,0.0%
app_id,float64,18151,71065,25.5%


## Apply Nearest Neighbors

In [None]:
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Filter for unmatched data
unmatched = df[df['app_id'].isnull()].copy()
display_df_info(unmatched, 'Unmatched Data', nulls=False)

# Get all game titles
game_titles = games_norm['normalized_title'].unique()

In [21]:
# Vectorize both sets of titles
vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(3, 5))
tfidf_game_ids = vectorizer.fit_transform(game_titles)
tfidf_unmatched = vectorizer.transform(steam_unmatched['normalized_title'])

In [22]:
nn_model = NearestNeighbors(n_neighbors=1, metric='cosine', algorithm='brute')
nn_model.fit(tfidf_game_ids)

In [23]:
# Query nearest neighbor for each unmatched game
distances, indices = nn_model.kneighbors(tfidf_unmatched)

In [24]:
# Convert distances to similarity scores (1 - distance)
similarities = 1 - distances.flatten()
best_matches = indices.flatten()

In [25]:
# Assign match if similarity exceeds threshold
threshold = 0.8
valid_matches = similarities >= threshold
matched_names = np.where(valid_matches, game_titles[best_matches], None)
matched_scores = np.where(valid_matches, similarities, None)

In [31]:
# Get map of name-to-appID
name_to_app_id = games_norm[['normalized_title', 'app_id']] \
                 .set_index('normalized_title') \
                 .iloc[:, 0].to_dict()
list(name_to_app_id.items())[:5]

[('counterstrike', 10),
 ('teamfortressclassic', 20),
 ('dayofdefeat', 30),
 ('deathmatchclassic', 40),
 ('halflifeopposingforce', 50)]

In [None]:
# Add results to dataframe
steam_unmatched['fuzzy_match'] = matched_names
steam_unmatched['fuzzy_score'] = matched_scores
steam_unmatched['fuzzy_app_id'] = steam_unmatched['fuzzy_match'] \
                                  .map(name_to_app_id)