In [None]:
import pandas as pd

# Load your original games.csv
df = pd.read_csv('games.csv')

# Load your metadata with lines=True
metadata = pd.read_json('games_metadata.json', lines=True)

# Merge the two files on app_id
merged_df = pd.merge(df, metadata, on='app_id')

# Preview
merged_df.head()

Unnamed: 0,app_id,title,date_release,win,mac,linux,rating,positive_ratio,user_reviews,price_final,price_original,discount,steam_deck,description,tags
0,13500,Prince of Persia: Warrior Within™,2008-11-21,True,False,False,Very Positive,84,2199,9.99,9.99,0.0,True,Enter the dark underworld of Prince of Persia ...,"[Action, Adventure, Parkour, Third Person, Gre..."
1,22364,BRINK: Agents of Change,2011-08-03,True,False,False,Positive,85,21,2.99,2.99,0.0,True,,[Action]
2,113020,Monaco: What's Yours Is Mine,2013-04-24,True,True,True,Very Positive,92,3722,14.99,14.99,0.0,True,Monaco: What's Yours Is Mine is a single playe...,"[Co-op, Stealth, Indie, Heist, Local Co-Op, St..."
3,226560,Escape Dead Island,2014-11-18,True,False,False,Mixed,61,873,14.99,14.99,0.0,True,Escape Dead Island is a Survival-Mystery adven...,"[Zombies, Adventure, Survival, Action, Third P..."
4,249050,Dungeon of the ENDLESS™,2014-10-27,True,True,False,Very Positive,88,8784,11.99,11.99,0.0,True,Dungeon of the Endless is a Rogue-Like Dungeon...,"[Roguelike, Strategy, Tower Defense, Pixel Gra..."


In [None]:
import re
import nltk
from nltk.corpus import stopwords

# Download stopwords if you haven't already
nltk.download('stopwords')

# Set of English stopwords
stop_words = set(stopwords.words('english'))

# Function to clean text
def clean_text(text):
    if isinstance(text, str):
        text = text.lower()  # Lowercase text
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove punctuation
        text = ' '.join([word for word in text.split() if word not in stop_words])  # Remove stopwords
        return text
    else:
        return ""

# Apply cleaning to the description column
merged_df['clean_description'] = merged_df['description'].apply(clean_text)

# Show first few cleaned descriptions
merged_df[['title', 'clean_description']].head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\loste\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,title,clean_description
0,Prince of Persia: Warrior Within™,enter dark underworld prince persia warrior wi...
1,BRINK: Agents of Change,
2,Monaco: What's Yours Is Mine,monaco whats mine single player coop heist gam...
3,Escape Dead Island,escape dead island survivalmystery adventure l...
4,Dungeon of the ENDLESS™,dungeon endless roguelike dungeondefense game ...


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000)  # You can adjust max_features if needed

# Fit and transform the clean descriptions
tfidf_matrix = tfidf.fit_transform(merged_df['clean_description'])

# Check the shape of the TF-IDF matrix
tfidf_matrix.shape


(50872, 5000)