Step 1: Load the Data

In [110]:
import pandas as pd

# Load game details and metadata
games = pd.read_csv('../data/games.csv')
metadata = pd.read_json('../data/games_metadata.json', lines=True)

# Preview the data
print("Games shape:", games.shape)
print("Metadata shape:", metadata.shape)
games.head()


Games shape: (50872, 13)
Metadata shape: (50872, 3)


Unnamed: 0,app_id,title,date_release,win,mac,linux,rating,positive_ratio,user_reviews,price_final,price_original,discount,steam_deck
0,13500,Prince of Persia: Warrior Within™,2008-11-21,True,False,False,Very Positive,84,2199,9.99,9.99,0.0,True
1,22364,BRINK: Agents of Change,2011-08-03,True,False,False,Positive,85,21,2.99,2.99,0.0,True
2,113020,Monaco: What's Yours Is Mine,2013-04-24,True,True,True,Very Positive,92,3722,14.99,14.99,0.0,True
3,226560,Escape Dead Island,2014-11-18,True,False,False,Mixed,61,873,14.99,14.99,0.0,True
4,249050,Dungeon of the ENDLESS™,2014-10-27,True,True,False,Very Positive,88,8784,11.99,11.99,0.0,True


In [111]:
print(games.columns.tolist())


['app_id', 'title', 'date_release', 'win', 'mac', 'linux', 'rating', 'positive_ratio', 'user_reviews', 'price_final', 'price_original', 'discount', 'steam_deck']


Merge Datasets on app_id

In [112]:
# Merge base game info and metadata
merged_df = pd.merge(games, metadata, on='app_id')

# Now merge extra platform/price/date info from games.csv
columns_to_add = ['app_id', 'date_release', 'win', 'mac', 'linux', 'price_final', 'user_reviews']
merged_df = pd.merge(merged_df, games[columns_to_add], on='app_id', how='left')


In [None]:
print(merged_df.columns.tolist())

['app_id', 'title', 'date_release_x', 'win_x', 'mac_x', 'linux_x', 'rating', 'positive_ratio', 'user_reviews_x', 'price_final_x', 'price_original', 'discount', 'steam_deck', 'description', 'tags', 'date_release_y', 'win_y', 'mac_y', 'linux_y', 'price_final_y', 'user_reviews_y']


Create combined_features for TF-IDF

In [114]:
# Fill NaN values
merged_df['tags'] = merged_df['tags'].fillna('').astype(str)
merged_df['description'] = merged_df['description'].fillna('').astype(str)

# Combine into a single text column
merged_df['combined_features'] = merged_df['tags'] + ' ' + merged_df['description']

# Assign moods to each game based on tags/description
def assign_mood(row):
    if 'horror' in row['tags'].lower() or 'zombies' in row['description'].lower():
        return 'horror'
    elif 'adventure' in row['tags'].lower():
        return 'fun'
    elif 'puzzle' in row['tags'].lower() or 'logic' in row['description'].lower():
        return 'brainy'
    elif 'survival' in row['tags'].lower() or 'intense' in row['description'].lower():
        return 'intense'
    elif 'relax' in row['description'].lower():
        return 'relaxing'
    else:
        return 'general'

# Apply mood column
merged_df['mood'] = merged_df.apply(assign_mood, axis=1)

# Preview
merged_df[['title', 'mood', 'combined_features']].head()


Unnamed: 0,title,mood,combined_features
0,Prince of Persia: Warrior Within™,fun,"['Action', 'Adventure', 'Parkour', 'Third Pers..."
1,BRINK: Agents of Change,general,['Action']
2,Monaco: What's Yours Is Mine,fun,"['Co-op', 'Stealth', 'Indie', 'Heist', 'Local ..."
3,Escape Dead Island,horror,"['Zombies', 'Adventure', 'Survival', 'Action',..."
4,Dungeon of the ENDLESS™,fun,"['Roguelike', 'Strategy', 'Tower Defense', 'Pi..."


Phase 3: TF-IDF + KNN Model Training

Step 1: Import Libraries

In [115]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import pickle


Step 2: Vectorize Combined Features with TF-IDF

In [116]:
# Vectorize combined_features
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(merged_df['combined_features'])

print("TF-IDF matrix shape:", tfidf_matrix.shape)


TF-IDF matrix shape: (50872, 50289)


 Step 3: Train KNN Model

In [117]:
# Train NearestNeighbors model
knn = NearestNeighbors(n_neighbors=6, metric='cosine')
knn.fit(tfidf_matrix)


Step 4: Save Your Model and Matrix as .pkl

In [118]:
import os

# Make sure 'models/' folder exists
os.makedirs('../models', exist_ok=True)


In [119]:
# Save model and matrix
with open('../models/knn_model.pkl', 'wb') as f:
    pickle.dump(knn, f)

with open('../models/tfidf_matrix.pkl', 'wb') as f:
    pickle.dump(tfidf_matrix, f)


In [120]:
# Export cleaned and combined dataset for the UI
merged_df.to_csv("../data/cleaned_games.csv", index=False)


Adding Mood Filtering


In [121]:
# Add a simple mood label manually (just a demo — real model can expand later)
def assign_mood(row):
    if 'horror' in row['tags'].lower() or 'zombies' in row['description'].lower():
        return 'horror'
    elif 'adventure' in row['tags'].lower():
        return 'fun'
    elif 'puzzle' in row['tags'].lower() or 'logic' in row['description'].lower():
        return 'brainy'
    elif 'survival' in row['tags'].lower() or 'intense' in row['description'].lower():
        return 'intense'
    elif 'relax' in row['description'].lower():
        return 'relaxing'
    else:
        return 'general'

# Apply mood labeling
merged_df['mood'] = merged_df.apply(assign_mood, axis=1)

# Preview
merged_df[['title', 'tags', 'mood']].head()


Unnamed: 0,title,tags,mood
0,Prince of Persia: Warrior Within™,"['Action', 'Adventure', 'Parkour', 'Third Pers...",fun
1,BRINK: Agents of Change,['Action'],general
2,Monaco: What's Yours Is Mine,"['Co-op', 'Stealth', 'Indie', 'Heist', 'Local ...",fun
3,Escape Dead Island,"['Zombies', 'Adventure', 'Survival', 'Action',...",horror
4,Dungeon of the ENDLESS™,"['Roguelike', 'Strategy', 'Tower Defense', 'Pi...",fun


In [122]:
merged_df.rename(columns={
    'price_final_x': 'price',
    'date_release_x': 'release_date',
    'win_x': 'win',
    'mac_x': 'mac',
    'linux_x': 'linux',
    'user_reviews_x': 'user_reviews'
}, inplace=True)


In [123]:
merged_df.rename(columns={
    'price_final': 'price',
    'date_release': 'release_date',
    'win': 'win',
    'mac': 'mac',
    'linux': 'linux',
    'user_reviews': 'user_reviews',
    'playtime_forever': 'hours'  # use this if your hours column is named something else
}, inplace=True)


In [124]:
print(merged_df.columns.tolist())


['app_id', 'title', 'release_date', 'win', 'mac', 'linux', 'rating', 'positive_ratio', 'user_reviews', 'price', 'price_original', 'discount', 'steam_deck', 'description', 'tags', 'date_release_y', 'win_y', 'mac_y', 'linux_y', 'price_final_y', 'user_reviews_y', 'combined_features', 'mood']


Save updated CSV again

In [125]:
merged_df.to_csv("../data/cleaned_games.csv", index=False)


In [127]:
merged_df.rename(columns={
    'price_final_x': 'price',
    'date_release_x': 'release_date',
    'win_x': 'win',
    'mac_x': 'mac',
    'linux_x': 'linux',
    'user_reviews_x': 'user_reviews'
}, inplace=True)
