In [1]:
import pandas as pd

# Load our original games.csv
df = pd.read_csv('games.csv')

# Load our metadata with lines=True
metadata = pd.read_json('games_metadata.json', lines=True)

# Merge the two files on app_id
merged_df = pd.merge(df, metadata, on='app_id')

# Preview
merged_df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'games.csv'

In [None]:
import re
import nltk
from nltk.corpus import stopwords

# Download stopwords 
nltk.download('stopwords')

# Set of English stopwords
stop_words = set(stopwords.words('english'))

# Function to clean text
def clean_text(text):
    if isinstance(text, str):
        text = text.lower()  # Lowercase text
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove punctuation
        text = ' '.join([word for word in text.split() if word not in stop_words])  # Remove stopwords
        return text
    else:
        return ""

# Apply cleaning to the description column
merged_df['clean_description'] = merged_df['description'].apply(clean_text)

# Show first few cleaned descriptions
merged_df[['title', 'clean_description']].head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\loste\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,title,clean_description
0,Prince of Persia: Warrior Within™,enter dark underworld prince persia warrior wi...
1,BRINK: Agents of Change,
2,Monaco: What's Yours Is Mine,monaco whats mine single player coop heist gam...
3,Escape Dead Island,escape dead island survivalmystery adventure l...
4,Dungeon of the ENDLESS™,dungeon endless roguelike dungeondefense game ...


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000)  

# Fit and transform the clean descriptions
tfidf_matrix = tfidf.fit_transform(merged_df['clean_description'])

# Check the shape of the TF-IDF matrix
tfidf_matrix.shape


(50872, 5000)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Sample ~2000 games for memory-friendly computation
sampled_df = merged_df.sample(n=2000, random_state=42)

# Re-run TF-IDF on the smaller sample
tfidf = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf.fit_transform(sampled_df['clean_description'])

# Now compute cosine similarity safely
cosine_sim = cosine_similarity(tfidf_matrix)

# Check the shape of our similarity matrix
cosine_sim.shape


(2000, 2000)

In [None]:
# Function to get game recommendations
def recommend_games(title, top_n=5):
    # Reset index to make sure we can look things up by row
    sampled_df.reset_index(drop=True, inplace=True)

    # Find the index of the given game title
    if title not in sampled_df['title'].values:
        return f"'{title}' not found in sample set."

    idx = sampled_df[sampled_df['title'] == title].index[0]

    # Get pairwise similarity scores for that game
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort scores in descending order, skip the first (it's the game itself)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]

    # Get the indexes of the most similar games
    game_indices = [i[0] for i in sim_scores]

    # Return the top N most similar games
    return sampled_df.iloc[game_indices][['title', 'genres', 'price']]


In [None]:
recommend_games("Prince of Persia: Warrior Within™", top_n=5)


"'Prince of Persia: Warrior Within™' not found in sample set."

In [None]:
sampled_df['title'].sort_values().reset_index(drop=True).head(50)


0                                         #SelfieTennis
1                                         //TODO: today
2                                   100 hidden eternals
3                                                  100$
4     100% Orange Juice - Sora & Sham (Cuties) Chara...
5     100% Orange Juice - Tsih & Tequila Character Pack
6                        100% Orange Juice - Witch Pack
7                                                 1000$
8                       1001 Jigsaw: Earth Chronicles 2
9                          180 Files: The Aegis Project
10                                    1943 Berlin Blitz
11                                                 1982
12                                               1HEART
13                         2022生存指南 2022 SURVIVAL GUIDE
14                        22 Racing Series | RTS-Racing
15                                    25 Cadre of Death
16                                                2Dark
17                             2MD: VR Football 

In [None]:
sampled_df.columns


Index(['app_id', 'title', 'date_release', 'win', 'mac', 'linux', 'rating',
       'positive_ratio', 'user_reviews', 'price_final', 'price_original',
       'discount', 'steam_deck', 'description', 'tags', 'clean_description'],
      dtype='object')

In [None]:
def recommend_games(title, top_n=5):
    sampled_df.reset_index(drop=True, inplace=True)

    if title not in sampled_df['title'].values:
        return f"'{title}' not found in sample set."

    idx = sampled_df[sampled_df['title'] == title].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]
    game_indices = [i[0] for i in sim_scores]

    # Return relevant info you actually have
    return sampled_df.iloc[game_indices][['title', 'tags', 'price_final']]


In [None]:
recommend_games("ASTROKILL", top_n=5)


Unnamed: 0,title,tags,price_final
319,Iron Fisticle,"[Action, Indie, Local Co-Op, 2D, Twin Stick Sh...",8.99
325,Zombie Arena,"[Early Access, Wargame, FPS, PvE, PvP, Shooter...",0.37
950,Warhammer 40000: Dawn of War III,"[Warhammer 40K, Strategy, RTS, Sci-fi, Multipl...",39.99
176,FIREGROUND Modern War,"[Exploration, Arena Shooter, Spectacle fighter...",0.49
1478,Sky Battles,"[Action, Indie, Strategy, Flight, Simulation, ...",3.99


In [None]:
# Define keyword groups for different moods
mood_keywords = {
    'Relaxed': ['relaxing', 'casual', 'chill', 'peaceful', 'cozy'],
    'Intense': ['intense', 'adrenaline', 'combat', 'fast-paced', 'chaotic'],
    'Emotional': ['emotional', 'story', 'journey', 'heartfelt', 'narrative'],
    'Scary': ['horror', 'dark', 'scary', 'fear', 'terrifying'],
    'Strategic': ['Strategy', 'tactical', 'puzzle', 'planning'],
    'Funny': ['funny', 'humor', 'comedy', 'wacky'],
}


In [None]:
def assign_mood(description):
    if not isinstance(description, str):
        return 'Unknown'
    
    description = description.lower()
    for mood, keywords in mood_keywords.items():
        for word in keywords:
            if word in description:
                return mood
    return 'Unknown'


In [None]:
# Add mood column to sampled_df
sampled_df['mood'] = sampled_df['description'].apply(assign_mood)

# Preview
sampled_df[['title', 'mood']].head(10)


Unnamed: 0,title,mood
0,Super Blackjack Battle 2 Turbo Edition - The C...,Unknown
1,Galactic Dominion,Emotional
2,Watching Delusion,Strategic
3,Long Road,Unknown
4,The Mutational,Unknown
5,Flying Neko Delivery,Unknown
6,Breach & Clear: Deadline Rebirth (2016),Strategic
7,Forza Horizon 5 2014 SafariZ 370Z,Unknown
8,Heroes Of Avranche,Unknown
9,Formata,Unknown
