In [30]:
import pandas as pd
import numpy as np
from surprise import Reader, Dataset, SVD
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dot, Flatten, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
interaction_cols = ['UserID', 'GameName', 'Action', 'HoursPlayed', 'Dummy']

df_interactions = pd.read_csv('/kaggle/input/steam-video-games/steam-200k.csv', header=None, names=interaction_cols)
df_interactions.drop('Dummy', axis=1, inplace=True)

In [4]:
df_interactions.head()

Unnamed: 0,UserID,GameName,Action,HoursPlayed
0,151603712,The Elder Scrolls V Skyrim,purchase,1.0
1,151603712,The Elder Scrolls V Skyrim,play,273.0
2,151603712,Fallout 4,purchase,1.0
3,151603712,Fallout 4,play,87.0
4,151603712,Spore,purchase,1.0


In [5]:
df_play = df_interactions[df_interactions['Action'] == 'play'].copy()

In [6]:
df_play.drop('Action', axis=1, inplace=True)

In [7]:
df_play.head()

Unnamed: 0,UserID,GameName,HoursPlayed
1,151603712,The Elder Scrolls V Skyrim,273.0
3,151603712,Fallout 4,87.0
5,151603712,Spore,14.9
7,151603712,Fallout New Vegas,12.1
9,151603712,Left 4 Dead 2,8.9


In [8]:
print(len(df_play))

70489


In [9]:
df_metadata = pd.read_csv("/kaggle/input/steam-store-games/steam.csv")

In [10]:
df_metadata.head()

Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price
0,10,Counter-Strike,2000-11-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,124534,3339,17612,317,10000000-20000000,7.19
1,20,Team Fortress Classic,1999-04-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,3318,633,277,62,5000000-10000000,3.99
2,30,Day of Defeat,2003-05-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Valve Anti-Cheat enabled,Action,FPS;World War II;Multiplayer,0,3416,398,187,34,5000000-10000000,3.99
3,40,Deathmatch Classic,2001-06-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,1273,267,258,184,5000000-10000000,3.99
4,50,Half-Life: Opposing Force,1999-11-01,1,Gearbox Software,Valve,windows;mac;linux,0,Single-player;Multi-player;Valve Anti-Cheat en...,Action,FPS;Action;Sci-fi,0,5250,288,624,415,5000000-10000000,3.99


In [11]:
# Assuming the metadata is in df_metadata
# Select and rename columns for clarity
df_game_info = df_metadata[['appid', 'name']].copy()
df_game_info.rename(columns={'name': 'GameName'}, inplace=True)

In [12]:
df_merged = pd.merge(df_play, df_game_info, on='GameName', how='inner')

In [13]:
df_merged.head()

Unnamed: 0,UserID,GameName,HoursPlayed,appid
0,151603712,Fallout 4,87.0,377160
1,151603712,Left 4 Dead 2,8.9,550
2,151603712,HuniePop,8.5,339800
3,151603712,Path of Exile,8.1,238960
4,151603712,Poly Bridge,7.5,367450


In [14]:
df_merged['ImplicitRating'] = np.log1p(df_merged['HoursPlayed'])

In [15]:
df_merged.head()

Unnamed: 0,UserID,GameName,HoursPlayed,appid,ImplicitRating
0,151603712,Fallout 4,87.0,377160,4.477337
1,151603712,Left 4 Dead 2,8.9,550,2.292535
2,151603712,HuniePop,8.5,339800,2.251292
3,151603712,Path of Exile,8.1,238960,2.208274
4,151603712,Poly Bridge,7.5,367450,2.140066


In [None]:
df_merged.drop('GameName',axis=1, inplace=True)

In [18]:
df_merged.drop('HoursPlayed',axis=1, inplace=True)

In [19]:
df_merged.head()

Unnamed: 0,UserID,appid,ImplicitRating
0,151603712,377160,4.477337
1,151603712,550,2.292535
2,151603712,339800,2.251292
3,151603712,238960,2.208274
4,151603712,367450,2.140066


# --- 2. Encode Users and Games ---

In [20]:
user_ids = df_merged["UserID"].unique().tolist()
user_to_encoded = {x: i for i, x in enumerate(user_ids)}
encoded_to_user = {i: x for i, x in enumerate(user_ids)}
df_merged["user_encoded"] = df_merged["UserID"].map(user_to_encoded)

In [21]:
game_ids = df_merged["appid"].unique().tolist()
game_to_encoded = {x: i for i, x in enumerate(game_ids)}
encoded_to_game = {i: x for i, x in enumerate(game_ids)}
df_merged["game_encoded"] = df_merged["appid"].map(game_to_encoded)

In [22]:
n_users = len(user_to_encoded)
n_games = len(game_to_encoded)

In [23]:
print(f"Number of Users: {n_users}")
print(f"Number of Games: {n_games}")

Number of Users: 9248
Number of Games: 1724


# --- 3. Normalize Ratings ---


In [24]:
df_merged["rating_normalized"] = df_merged["ImplicitRating"].apply(
    lambda x: (x - min(df_merged["ImplicitRating"])) / (max(df_merged["ImplicitRating"]) - min(df_merged["ImplicitRating"]))
).values

In [27]:
df_merged.head()

Unnamed: 0,UserID,appid,ImplicitRating,user_encoded,game_encoded,rating_normalized
0,151603712,377160,4.477337,0,0,0.478472
1,151603712,550,2.292535,0,1,0.239914
2,151603712,339800,2.251292,0,2,0.235411
3,151603712,238960,2.208274,0,3,0.230714
4,151603712,367450,2.140066,0,4,0.223266


# --- 4. Create Training and Testing Sets ---


In [25]:
X = df_merged[["user_encoded", "game_encoded"]].values
y = df_merged["rating_normalized"].values

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("\nData preprocessing complete.")
print(f"Train samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")


Data preprocessing complete.
Train samples: 29041
Test samples: 7261


In [36]:
def RecommenderNet(n_users, n_games, embedding_dim=128):
    """
    Defines the matrix factorization model using Keras.
    """
    user_input = Input(shape=[1], name="user_input")
    user_embedding = Embedding(n_users, embedding_dim, name="user_embedding")(user_input)
    user_vec = Flatten(name="flatten_user")(user_embedding)

    game_input = Input(shape=[1], name="game_input")
    game_embedding = Embedding(n_games, embedding_dim, name="game_embedding")(game_input)
    game_vec = Flatten(name="flatten_game")(game_embedding)

    dot_product = Dot(axes=1, name="dot_product")([user_vec, game_vec])

    output = Dense(1, activation="sigmoid", name="output")(dot_product)

    model = Model(inputs=[user_input, game_input], outputs=output)
    return model

In [37]:
model = RecommenderNet(n_users, n_games, embedding_dim=128)
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['mae']  # Mean Absolute Error
)

model.summary()

In [38]:
history = model.fit(
    x=[X_train[:, 0], X_train[:, 1]],
    y=y_train,
    batch_size=64,
    epochs=20,  # We can start with 10 epochs and see how it performs
    verbose=1,
    validation_data=([X_test[:, 0], X_test[:, 1]], y_test)
)

Epoch 1/20
[1m454/454[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - loss: 0.6639 - mae: 0.2867 - val_loss: 0.5894 - val_mae: 0.2279
Epoch 2/20
[1m454/454[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.5447 - mae: 0.1801 - val_loss: 0.5239 - val_mae: 0.1595
Epoch 3/20
[1m454/454[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.4602 - mae: 0.0844 - val_loss: 0.5160 - val_mae: 0.1509
Epoch 4/20
[1m454/454[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.4346 - mae: 0.0466 - val_loss: 0.5162 - val_mae: 0.1506
Epoch 5/20
[1m454/454[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.4261 - mae: 0.0310 - val_loss: 0.5162 - val_mae: 0.1502
Epoch 6/20
[1m454/454[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.4258 - mae: 0.0248 - val_loss: 0.5165 - val_mae: 0.1502
Epoch 7/20
[1m454/454[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - 

In [39]:
def find_similar_games(name, n=10):
    """
    Finds the top N most similar games to a given game name.
    """
    # 1. Extract the learned game weights (embeddings) from the model
    game_embedding_weights = model.get_layer('game_embedding').get_weights()[0]

    # Normalize the embeddings for consistent similarity scores
    game_embedding_weights = game_embedding_weights / np.linalg.norm(game_embedding_weights, axis=1).reshape((-1, 1))

    # 2. Get the encoded ID for the input game name
    try:
        game_id = df_game_info[df_game_info['GameName'] == name]['appid'].values[0]
        encoded_id = game_to_encoded[game_id]
    except (IndexError, KeyError):
        print(f"Game '{name}' not found in the dataset.")
        return None

    # 3. Calculate the dot product (similarity) with all other games
    dists = np.dot(game_embedding_weights, game_embedding_weights[encoded_id])
    sorted_dists = np.argsort(dists)

    # 4. Get the top N closest game IDs (excluding the game itself)
    closest = sorted_dists[-n-1:-1]
    
    # 5. Get the game names and return them
    similar_games = []
    for c in reversed(closest):
        game_name = df_game_info[df_game_info['appid'] == encoded_to_game[c]]['GameName'].values[0]
        similar_games.append(game_name)
        
    return pd.DataFrame(similar_games, columns=['Similar Games'])

In [40]:
recommendations = find_similar_games("Counter-Strike")
print(recommendations)

                        Similar Games
0                           BlazeRush
1                         Enemy Front
2                     Divine Divinity
3                        Gunman Clive
4                Planets Under Attack
5      Shantae and the Pirate's Curse
6                 Another Perspective
7  Wasteland 1 - The Original Classic
8                           Coniclysm
9                  Planet of the Eyes


In [41]:
def find_similar_users(user_id, n=10):
    """
    Finds the top N most similar users to a given user ID.
    """
    # Extract user embeddings and normalize them
    user_embedding_weights = model.get_layer('user_embedding').get_weights()[0]
    user_embedding_weights = user_embedding_weights / np.linalg.norm(user_embedding_weights, axis=1).reshape((-1, 1))

    # Get the encoded ID for the input user
    try:
        encoded_id = user_to_encoded[user_id]
    except KeyError:
        print(f"User '{user_id}' not found.")
        return None

    # Calculate similarity with all other users
    dists = np.dot(user_embedding_weights, user_embedding_weights[encoded_id])
    sorted_dists = np.argsort(dists)
    
    # Get the top N closest user IDs
    closest = sorted_dists[-n-1:-1]
    similar_users = [encoded_to_user[c] for c in reversed(closest)]
    
    return pd.DataFrame(similar_users, columns=['Similar Users'])

In [42]:
def get_user_preferences(user_id):
    """
    Gets a DataFrame of games a user has rated highly.
    """
    games_played = df_merged[df_merged["UserID"] == user_id]
    
    # Get the user's 75th percentile rating to define "high rating"
    rating_percentile = np.percentile(games_played.rating_normalized, 75)
    
    # Filter for games rated at or above the percentile
    top_games = games_played[games_played["rating_normalized"] >= rating_percentile]
    top_games = top_games.sort_values("rating_normalized", ascending=False)
    
    # Join with metadata to get game names
    top_games_with_names = top_games.merge(df_game_info[['appid', 'GameName']], on='appid')
    
    return top_games_with_names

In [43]:
def get_user_recommendations(user_id, n=10):
    """
    Generates top N game recommendations for a user.
    """
    # Find similar users
    similar_users_df = find_similar_users(user_id)
    if similar_users_df is None:
        return None
    
    similar_users = similar_users_df['Similar Users'].tolist()

    # Get the preferences of those similar users
    recommended_games = []
    for other_user in similar_users:
        top_games = get_user_preferences(other_user)
        recommended_games.extend(top_games['GameName'].tolist())

    # Get games the original user has already played to filter them out
    user_played_games = get_user_preferences(user_id)['GameName'].tolist()

    # Count the recommendations and filter out played games
    rec_counts = pd.Series(recommended_games).value_counts()
    rec_counts = rec_counts[~rec_counts.index.isin(user_played_games)]
    
    return pd.DataFrame(rec_counts.head(n)).reset_index().rename(columns={'index': 'Recommended Game', 0: 'Recommendation Score'})


In [44]:
test_user_id = 151603712 
recommendations = get_user_recommendations(test_user_id)

print(f"Top 10 Recommendations for User {test_user_id}:")
print(recommendations)

Top 10 Recommendations for User 151603712:
    Recommended Game  count
0        Garry's Mod      4
1    Team Fortress 2      2
2   This War of Mine      1
3  Company of Heroes      1
4          Far Cry 3      1
5         L.A. Noire      1
6        Democracy 3      1
7          Tropico 5      1
8           Terraria      1
9      Killing Floor      1


In [45]:
def hybrid_recommendation(user_id, n=10, user_weight=1.0, content_weight=0.5):
    """
    Generates a hybrid recommendation by combining user-based and content-based models.
    """
    # 1. Get user-based collaborative filtering recommendations
    user_recs_df = get_user_recommendations(user_id, n=20) # Get a larger list to start
    if user_recs_df is None:
        return "User not found."
    user_recs = user_recs_df['Recommended Game'].tolist()
    
    # 2. Get content-based recommendations
    # Find games similar to the user's top preferences
    user_top_games = get_user_preferences(user_id)['GameName'].tolist()
    content_recs = []
    for game_name in user_top_games[:5]: # Use top 5 games as seed
        similar_games = find_similar_games(game_name)
        if similar_games is not None:
            content_recs.extend(similar_games['Similar Games'].tolist())

    # 3. Combine and re-rank the recommendations
    combined_scores = {}

    # Add scores from user-based recommendations
    for i, game in enumerate(user_recs):
        score = user_weight * (1 / (i + 1)) # Score based on rank
        combined_scores[game] = combined_scores.get(game, 0) + score
    
    # Add scores from content-based recommendations
    for i, game in enumerate(content_recs):
        score = content_weight * (1 / (i + 1))
        combined_scores[game] = combined_scores.get(game, 0) + score

    # Sort games by their combined score
    sorted_recs = sorted(combined_scores.items(), key=lambda item: item[1], reverse=True)
    
    # Filter out games the user has already played
    final_recs = [(rec, score) for rec, score in sorted_recs if rec not in user_top_games]
    
    return pd.DataFrame(final_recs[:n], columns=['Recommended Game', 'Hybrid Score'])

In [46]:
test_user_id = 151603712
hybrid_recommendations = hybrid_recommendation(test_user_id)

print(f"Hybrid Recommendations for User {test_user_id}:")
print(hybrid_recommendations)

Hybrid Recommendations for User 151603712:
    Recommended Game  Hybrid Score
0        Garry's Mod      1.000000
1    Team Fortress 2      0.500000
2    Light of Altair      0.500000
3   This War of Mine      0.333333
4  Company of Heroes      0.250000
5      Patrician III      0.250000
6          Far Cry 3      0.200000
7         L.A. Noire      0.166667
8          Avernum 4      0.166667
9        Democracy 3      0.142857
