https://www.kaggle.com/datasets/tamber/steam-video-games?resource=download

In [86]:
import pandas as pd
import torch

df = pd.read_csv("data/steam-200k.csv")
df.columns = ['user_id', 'title', 'action', 'hours', 'x']

# del last col
df = df.drop('x', axis=1)

# remove all purchase actions
df = df.drop(df[df['action'] == 'purchase'].index)

In [87]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(device)

mps


In [88]:
df

Unnamed: 0,user_id,title,action,hours
0,151603712,The Elder Scrolls V Skyrim,play,273.0
2,151603712,Fallout 4,play,87.0
4,151603712,Spore,play,14.9
6,151603712,Fallout New Vegas,play,12.1
8,151603712,Left 4 Dead 2,play,8.9
...,...,...,...,...
199990,128470551,Fallen Earth,play,2.4
199992,128470551,Magic Duels,play,2.2
199994,128470551,Titan Souls,play,1.5
199996,128470551,Grand Theft Auto Vice City,play,1.5


In [89]:
df.hours.value_counts()

hours
0.2       3016
0.3       2517
0.4       2129
0.5       1813
0.1       1787
          ... 
724.0        1
737.0        1
3503.0       1
1397.0       1
1310.0       1
Name: count, Length: 1593, dtype: int64

In [90]:
n_users = df['user_id'].nunique()
n_items = df['title'].nunique()


print("Number of unique users is:", n_users)
print("Number of unique games is:", n_items)
print("Number of ratings", len(df))
print("Matrix size:", n_users*n_items)
print("Percent of matrix that is filled:", len(df) / (n_users*n_items) * 100, "%")

Number of unique users is: 11350
Number of unique games is: 3600
Number of ratings 70489
Matrix size: 40860000
Percent of matrix that is filled: 0.17251346059716105 %


In [91]:
import torch
import numpy as np
#import mathplotlib.pyplot as plt
from sklearn import preprocessing, model_selection
from torch.utils.data import DataLoader, Dataset


class GameDataset(Dataset):
    def __init__(self, users, items, hours):
        self.users = users
        self.items = items
        self.hours = hours
        
    # len(movie_dataset)
    def __len__(self): # Number of Users
        return len(self.users)

    # movie_dataset[1]
    def __getitem__(self, idx):

        users = self.users[idx]
        items = self.items[idx]
        hours = self.hours[idx]

        return {
            "user_id" : torch.tensor(users, dtype=torch.long),
            "title" : torch.tensor(items, dtype=torch.long),
            "hours" : torch.tensor(hours, dtype=torch.float)
        }

In [92]:
class RecSysModel(torch.nn.Module):
    def __init__(self, n_users, n_items, embedding_size=256, hidden_dim=256, dropout_rate=0.2):
        super().__init__()

        #create embeddings
        self.user_embed = torch.nn.Embedding(num_embeddings=n_users, embedding_dim=embedding_size)
        self.item_embed = torch.nn.Embedding(num_embeddings=n_items, embedding_dim=embedding_size)


        # hidden layers
        self.fc1 = torch.nn.Linear(2 * embedding_size, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, 1)

        self.dropout = torch.nn.Dropout(p=dropout_rate)

        self.relu = torch.nn.ReLU()
        
    def forward(self, users, items, hours=None):
        user_embeds = self.user_embed(users)
        item_embeds = self.item_embed(items)
        
        output = torch.cat([user_embeds, item_embeds], dim=1)

        x = self.relu(self.fc1(output))
        x = self.dropout(x)
        output = self.fc2(x)
        return output

In [93]:
lbl_user = preprocessing.LabelEncoder()
lbl_item = preprocessing.LabelEncoder()
df['user_id'] = lbl_user.fit_transform(df['user_id'].values)
df['title'] = lbl_item.fit_transform(df['title'].values)

df_train, df_valid = model_selection.train_test_split(
    df, test_size=0.1, random_state=3
)

train_dataset = GameDataset(
    users = df_train['user_id'].values,
    items = df_train['title'].values,
    hours = df_train['hours'].values,
    
)

valid_dataset = GameDataset(
    users = df_valid['user_id'].values,
    items = df_valid['title'].values,
    hours = df_valid['hours'].values,
)

In [94]:
BATCH_SIZE = 32

train_loader = DataLoader(dataset=train_dataset,
                          batch_size=BATCH_SIZE,
                          shuffle=True
)

valid_loader = DataLoader(dataset=valid_dataset,
                          batch_size=BATCH_SIZE,
                          shuffle=True
)

In [95]:

recommendation_model = RecSysModel(n_users, n_items, embedding_size=64, hidden_dim=128, dropout_rate=0.1).to(device)

optimizer = torch.optim.Adam(recommendation_model.parameters()) #gradient descent aka adjust to yield smallest error
sch = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.7)

loss_fn = torch.nn.MSELoss()

In [96]:
losses = []

def train():
    epochs = 4
    total_loss = 0
    log_step = 100
    
    print(f'Training on size: {len(train_dataset)}')
    recommendation_model.train()
    
    for epoch_i in range(epochs):
        step_count = 0
        for i, train_data in enumerate(train_loader):
            users = train_data["user_id"].to(device)
            items = train_data["title"].to(device)
    
            output = recommendation_model(users, items)
            output = output.squeeze()
            
            hours = train_data["hours"].to(torch.float32).to(device)
    
            
    
            loss = loss_fn(output, hours)
            total_loss += loss.sum().item()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    
            step_count += len(train_data["user_id"])
    
            if (step_count % log_step == 0 or i == len(train_loader) - 1):
                avg_loss = (total_loss / log_step)
                print(f"epoch {epoch_i} loss at step {step_count} is {avg_loss}")
                losses.append(avg_loss)
                total_loss = 0

if __name__ == '__main__':
    train()

Training on size: 63440
epoch 0 loss at step 800 is 7290.598701171875
epoch 0 loss at step 1600 is 5022.93665222168
epoch 0 loss at step 2400 is 11360.008327789306
epoch 0 loss at step 3200 is 11255.827570037842
epoch 0 loss at step 4000 is 27058.720556945802
epoch 0 loss at step 4800 is 12488.279372558594
epoch 0 loss at step 5600 is 13190.327606277466
epoch 0 loss at step 6400 is 4659.6195458984375
epoch 0 loss at step 7200 is 10582.276041259765
epoch 0 loss at step 8000 is 12286.337452392578
epoch 0 loss at step 8800 is 12890.308620605469
epoch 0 loss at step 9600 is 21748.31109466553
epoch 0 loss at step 10400 is 10554.670635375976
epoch 0 loss at step 11200 is 7806.033176269531
epoch 0 loss at step 12000 is 16737.047279052735
epoch 0 loss at step 12800 is 8705.542124023437
epoch 0 loss at step 13600 is 28647.48538696289
epoch 0 loss at step 14400 is 9877.084731445313
epoch 0 loss at step 15200 is 24846.773060302734
epoch 0 loss at step 16000 is 16537.170620117187
epoch 0 loss at s

In [97]:
  # Root Mean Squared Error
from sklearn.metrics import root_mean_squared_error

y_true = []
y_pred = []

recommendation_model.eval()

with torch.no_grad():
    for i, valid_data in enumerate(valid_loader):
        model_output = recommendation_model(valid_data['user_id'].to(device), valid_data['title'].to(device))

        hours = valid_data['hours'].to(device)
        y_true.extend(hours.cpu().numpy()) 
        y_pred.extend(model_output.cpu().numpy())


# actually calc RMSE
rmse = root_mean_squared_error(y_true, y_pred)
print(f"RMSE: {rmse:.4f}")

RMSE: 198.7710


In [98]:
from collections import defaultdict


def calculate_precision_recall(user_hours, k, threshold):
    user_hours.sort(key=lambda x: x[0], reverse=True)
    n_rel = sum(true_r >= threshold for _, true_r in user_hours)
    n_rec_k = sum(est >= threshold for est, _ in user_hours[:k])
    n_rel_and_rec_k = sum(
        (true_r >= threshold) and (est >= threshold) for est, true_r in user_hours[:k]
    )

    precision = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1
    recall = n_rel_and_rec_k / n_rel if n_rel != 0 else 1
    return precision, recall


user_hours_comparison = defaultdict(list)

with torch.no_grad():
    for valid_data in valid_loader:
        users = valid_data["user_id"].to(device)
        titles = valid_data["title"].to(device)
        hours = valid_data["hours"].to(device)
        output = recommendation_model(users, titles)

        for user, pred, true in zip(users, output, hours):
            user_hours_comparison[user.item()].append((pred[0].item(), true.item()))

user_precisions = dict()
user_based_recalls = dict()

k = 50
threshold = 3

for user_id, user_hours in user_hours_comparison.items():
    precision, recall = calculate_precision_recall(user_hours, k, threshold)
    user_precisions[user_id] = precision
    user_based_recalls[user_id] = recall


    average_precision = sum(prec for prec in user_precisions.values()) / len(
    user_precisions
)
average_recall = sum(rec for rec in user_based_recalls.values()) / len(
    user_based_recalls
)

print(f"precision @ {k}: {average_precision:.4f}")
print(f"recall @ {k}: {average_recall:.4f}")

precision @ 50: 0.6815
recall @ 50: 0.8595


In [104]:
def top_recommendations(user_id, all_titles, k=5, batch_size=100):
    recommendation_model.eval()


    
    played_titles = set(df[df['user_id'] == user_id]['title'].tolist())
    unplayed_titles = [m for m in all_titles if m not in played_titles]
    # fill unwatched movies
    # for m in all_movies:
    #     if m not in watched_movies:
    #         unwatched_movies.append(m)

    prediction = []
    top_k_recommendations = []

    with torch.no_grad():
        for i in range(0, len(unplayed_titles), batch_size):
            batched_unwatched = unplayed_titles[i:i+batch_size]
            title_tensor = torch.tensor(batched_unwatched).to(device)
            user_tensor = torch.tensor([user_id] * len(batched_unwatched)).to(device)
            prediction_model = recommendation_model(user_tensor, title_tensor).view(-1).tolist()
            prediction.extend(zip(batched_unwatched, prediction_model))

    prediction.sort(key=lambda x: x[1], reverse=True)

    for (m_id, _) in prediction[:k]:
        top_k_recommendations.append(m_id)

    # Convert this encoded movieId's back to their original ids
    top_k_recommendations = lbl_item.inverse_transform(top_k_recommendations)
    
    return top_k_recommendations

# ---------------

all_games = df['title'].unique().tolist()
user_id = 1

recommendations = top_recommendations(user_id, all_games, k=5)
print(f"Recommendations for user {user_id}: {recommendations}")

# for i in recommendations:
#     print(movies_dict[i])

user_id = 4
recommendations = top_recommendations(user_id, all_games, k=5)
print(f"Recommendations for user {user_id}: {recommendations}")


Recommendations for user 1: ['Football Manager 2012' 'Football Manager 2014' 'Football Manager 2013'
 'Football Manager 2015' 'Football Manager 2011']
Recommendations for user 4: ['Football Manager 2012' 'Football Manager 2014' 'Football Manager 2013'
 'Football Manager 2015' 'Counter-Strike Global Offensive']


In [54]:
print(df[['user_id', 'title']].head())


   user_id  title
0     5088   3067
2     5088   1162
4     5088   2813
6     5088   1163
8     5088   1733


In [55]:
df

Unnamed: 0,user_id,title,action,hours
0,5088,3067,play,273.0
2,5088,1162,play,87.0
4,5088,2813,play,14.9
6,5088,1163,play,12.1
8,5088,1733,play,8.9
...,...,...,...,...
199990,4056,1155,play,2.4
199992,4056,1833,play,2.2
199994,4056,3220,play,1.5
199996,4056,1375,play,1.5


In [58]:
df['user_id'].nunique

<bound method IndexOpsMixin.nunique of 0         5088
2         5088
4         5088
6         5088
8         5088
          ... 
199990    4056
199992    4056
199994    4056
199996    4056
199998    4056
Name: user_id, Length: 70489, dtype: int64>