Importing the necessary libraries:

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
from scipy.stats.mstats import winsorize



Loading data:

In [2]:
dataset_foldername = "~/OneDrive - Università degli Studi di Milano-Bicocca/Magistrale/AI/cleaned_datasets_students"

In [3]:
train_interactions = pd.read_csv(f"{dataset_foldername}/train_interactions.csv")
#bundles = pd.read_csv(f"{dataset_foldername}/bundles.csv")
#extended_games = pd.read_csv(f"{dataset_foldername}/extended_games.csv")
games = pd.read_csv(f"{dataset_foldername}/games.csv")
#item_reviews = pd.read_csv(f"{dataset_foldername}/item_reviews.csv")
#user_reviews = pd.read_csv(f"{dataset_foldername}/user_reviews.csv")
test_interactions_in = pd.read_csv(f"{dataset_foldername}/test_interactions_in.csv")

STUDY OF THE DATASETS

Brief overview of the datasets

In [4]:
train_interactions.head()
#it describes the relation beetween the users and the games they played 

Unnamed: 0,user_id,item_id,item_name,playtime
0,0,0,Counter-Strike,6
1,0,2555,Day of Defeat,7
2,0,2556,Day of Defeat: Source,4733
3,0,1043,Counter-Strike: Source,1853
4,0,5335,Psychonauts,333


In [5]:
games.head()
#it contains informations about the games

Unnamed: 0,item_id,item_name,publisher,genres,url,tags,sentiment,metascore,specs,price,release_date
0,0,Counter-Strike,Valve,['Action'],http://store.steampowered.com/app/10/CounterSt...,"['Action', 'FPS', 'Multiplayer', 'Shooter', 'C...",Overwhelmingly Positive,88.0,"['Multi-player', 'Valve Anti-Cheat enabled']",9.99,2000-11-01
1,1,Rag Doll Kung Fu,Mark Healey,['Indie'],http://store.steampowered.com/app/1002/Rag_Dol...,"['Indie', 'Fighting', 'Multiplayer']",Mixed,69.0,"['Single-player', 'Multi-player']",9.99,2005-10-12
2,2,Silo 2,Nevercenter Ltd. Co.,['Animation &amp; Modeling'],http://store.steampowered.com/app/100400/Silo_2/,"['Animation & Modeling', 'Software']",Mostly Positive,,,99.99,2012-12-19
3,3,Call of Duty: World at War,Activision,['Action'],http://store.steampowered.com/app/10090/Call_o...,"['Zombies', 'World War II', 'FPS', 'Action', '...",Very Positive,83.0,"['Single-player', 'Multi-player', 'Co-op']",19.99,2008-11-18
4,4,3D-Coat V4.8,Pilgway,['Animation &amp; Modeling'],http://store.steampowered.com/app/100980/3DCoa...,['Animation & Modeling'],Very Positive,,['Steam Cloud'],99.99,2012-10-02


In [6]:
# study of the types of variables of the datasets 
train_interactions.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2293985 entries, 0 to 2293984
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   user_id    2293985 non-null  int64 
 1   item_id    2293985 non-null  int64 
 2   item_name  2293985 non-null  object
 3   playtime   2293985 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 70.0+ MB


Context of the datasets

Preprocessing

In [7]:
print(games['genres'].head())
print("")
print(games['tags'].head())

0                      ['Action']
1                       ['Indie']
2    ['Animation &amp; Modeling']
3                      ['Action']
4    ['Animation &amp; Modeling']
Name: genres, dtype: object

0    ['Action', 'FPS', 'Multiplayer', 'Shooter', 'C...
1                 ['Indie', 'Fighting', 'Multiplayer']
2                 ['Animation & Modeling', 'Software']
3    ['Zombies', 'World War II', 'FPS', 'Action', '...
4                             ['Animation & Modeling']
Name: tags, dtype: object


## games.csv processing

In [8]:
# Checking for NA tags
games[games['tags'].isna()]

Unnamed: 0,item_id,item_name,publisher,genres,url,tags,sentiment,metascore,specs,price,release_date
113,113,LUMINES™ Advance Pack,Q Entertainment Inc.,['Casual'],http://store.steampowered.com/app/11920/LUMINE...,,9 user reviews,,['Single-player'],7.99,2008-04-18
5029,5029,Wedding Dash® 2: Rings Around the World,PlayFirst,['Simulation'],http://store.steampowered.com/app/37280/Weddin...,,Positive,,['Single-player'],9.99,2009-08-12
5033,5033,Zenerchi®,PlayFirst,['Casual'],http://store.steampowered.com/app/37290/Zenerchi/,,5 user reviews,,['Single-player'],9.99,2009-08-12
5060,5060,Mahjong Roadshow™,PlayFirst,['Casual'],http://store.steampowered.com/app/37360/Mahjon...,,5 user reviews,,['Single-player'],6.99,2007-10-19


In [9]:
# Filling the NA value of the tags with the corresponding genres value
# Then setting the tags as my new genres.
games['tags'] = games['tags'].fillna("genres")
games['genres'] = games['tags']

# Processing the genres list and exploding the item genres (conversion to long format)
games_long = games.copy()
games_long['genres'] = games_long['genres'].str.replace(r"[\[\]']", '', regex=True)
games_long['genres'] = games_long['genres'].str.split(r',\s*')
games_long = games_long.explode('genres').reset_index(drop=True)

games_long

Unnamed: 0,item_id,item_name,publisher,genres,url,tags,sentiment,metascore,specs,price,release_date
0,0,Counter-Strike,Valve,Action,http://store.steampowered.com/app/10/CounterSt...,"['Action', 'FPS', 'Multiplayer', 'Shooter', 'C...",Overwhelmingly Positive,88.0,"['Multi-player', 'Valve Anti-Cheat enabled']",9.99,2000-11-01
1,0,Counter-Strike,Valve,FPS,http://store.steampowered.com/app/10/CounterSt...,"['Action', 'FPS', 'Multiplayer', 'Shooter', 'C...",Overwhelmingly Positive,88.0,"['Multi-player', 'Valve Anti-Cheat enabled']",9.99,2000-11-01
2,0,Counter-Strike,Valve,Multiplayer,http://store.steampowered.com/app/10/CounterSt...,"['Action', 'FPS', 'Multiplayer', 'Shooter', 'C...",Overwhelmingly Positive,88.0,"['Multi-player', 'Valve Anti-Cheat enabled']",9.99,2000-11-01
3,0,Counter-Strike,Valve,Shooter,http://store.steampowered.com/app/10/CounterSt...,"['Action', 'FPS', 'Multiplayer', 'Shooter', 'C...",Overwhelmingly Positive,88.0,"['Multi-player', 'Valve Anti-Cheat enabled']",9.99,2000-11-01
4,0,Counter-Strike,Valve,Classic,http://store.steampowered.com/app/10/CounterSt...,"['Action', 'FPS', 'Multiplayer', 'Shooter', 'C...",Overwhelmingly Positive,88.0,"['Multi-player', 'Valve Anti-Cheat enabled']",9.99,2000-11-01
...,...,...,...,...,...,...,...,...,...,...,...
75278,8522,Puzzle Pirates,Three Rings,Adventure,http://store.steampowered.com/app/99910/Puzzle...,"['Free to Play', 'Massively Multiplayer', 'Puz...",Very Positive,,"['Single-player', 'Multi-player', 'MMO', 'Co-o...",Free to Play,2011-08-31
75279,8522,Puzzle Pirates,Three Rings,Strategy,http://store.steampowered.com/app/99910/Puzzle...,"['Free to Play', 'Massively Multiplayer', 'Puz...",Very Positive,,"['Single-player', 'Multi-player', 'MMO', 'Co-o...",Free to Play,2011-08-31
75280,8522,Puzzle Pirates,Three Rings,Multiplayer,http://store.steampowered.com/app/99910/Puzzle...,"['Free to Play', 'Massively Multiplayer', 'Puz...",Very Positive,,"['Single-player', 'Multi-player', 'MMO', 'Co-o...",Free to Play,2011-08-31
75281,8522,Puzzle Pirates,Three Rings,Co-op,http://store.steampowered.com/app/99910/Puzzle...,"['Free to Play', 'Massively Multiplayer', 'Puz...",Very Positive,,"['Single-player', 'Multi-player', 'MMO', 'Co-o...",Free to Play,2011-08-31


## train_interaction.csv processing

In [10]:
# Sorting the interactions based on user_id and playtime and removing any duplicate entries
train_interactions = (
    train_interactions
    .sort_values(by = ['user_id', 'playtime'], ascending = [True, False])
    .drop_duplicates(subset = ['user_id', 'item_id'])
)

train_interactions.head()

Unnamed: 0,user_id,item_id,item_name,playtime
100,0,8363,Counter-Strike: Global Offensive,23532
85,0,4356,Rising Storm/Red Orchestra 2 Multiplayer,14194
60,0,8429,Sid Meier's Civilization V,10345
28,0,140,Killing Floor,10006
161,0,884,Killing Floor 2,6494


In [11]:
# Left joining the user interactions with the games
user_genre_playtime = train_interactions.merge(games_long, on = 'item_id', how = 'left')

# Winsorizing the playtime interactions of each item_id to help reduce the amount of outliers
# Then calculating the mean playtime of each item_id
mean_item_playtime = (
    user_genre_playtime
    .assign(playtime_winsorized = user_genre_playtime.groupby('item_id')['playtime']
            .transform(lambda x: winsorize(x, limits = (0.05, 0.05))))
    .groupby('item_id', as_index = False)
    .agg(playtime=('playtime_winsorized', 'mean'))
)

del(user_genre_playtime)

mean_item_playtime.head()

Unnamed: 0,item_id,playtime
0,0,1238.309648
1,1,37.631579
2,2,505.909091
3,3,1792.74392
4,4,607.6875


In [12]:
# Merging the games with the mean playtime of each item
games_long = games_long.merge(mean_item_playtime, on = "item_id", how = "left").sort_values('playtime', ascending = False)
games_long.head()

Unnamed: 0,item_id,item_name,publisher,genres,url,tags,sentiment,metascore,specs,price,release_date,playtime
14803,1257,MovieWriterPro,The Writer Zone,Utilities,http://store.steampowered.com/app/250360/Movie...,"['Utilities', 'Video Production']",Mixed,,,29.99,2013-09-28,76564.0
14804,1257,MovieWriterPro,The Writer Zone,Video Production,http://store.steampowered.com/app/250360/Movie...,"['Utilities', 'Video Production']",Mixed,,,29.99,2013-09-28,76564.0
37005,3681,Elastrix,Hyper Hippo Games,Puzzle,http://store.steampowered.com/app/336700/Elast...,"['Casual', 'Indie', 'Puzzle', 'Physics']",Positive,,"['Single-player', 'Steam Achievements', 'Steam...",2.99,2014-12-18,37226.5
37006,3681,Elastrix,Hyper Hippo Games,Physics,http://store.steampowered.com/app/336700/Elast...,"['Casual', 'Indie', 'Puzzle', 'Physics']",Positive,,"['Single-player', 'Steam Achievements', 'Steam...",2.99,2014-12-18,37226.5
37004,3681,Elastrix,Hyper Hippo Games,Indie,http://store.steampowered.com/app/336700/Elast...,"['Casual', 'Indie', 'Puzzle', 'Physics']",Positive,,"['Single-player', 'Steam Achievements', 'Steam...",2.99,2014-12-18,37226.5


In [13]:
# Extracting the unique item_id with the corresponding playtime
games_unique = games_long.groupby('item_id', as_index = False).first()
games_unique.head()

Unnamed: 0,item_id,item_name,publisher,genres,url,tags,sentiment,metascore,specs,price,release_date,playtime
0,0,Counter-Strike,Valve,FPS,http://store.steampowered.com/app/10/CounterSt...,"['Action', 'FPS', 'Multiplayer', 'Shooter', 'C...",Overwhelmingly Positive,88.0,"['Multi-player', 'Valve Anti-Cheat enabled']",9.99,2000-11-01,1238.309648
1,1,Rag Doll Kung Fu,Mark Healey,Multiplayer,http://store.steampowered.com/app/1002/Rag_Dol...,"['Indie', 'Fighting', 'Multiplayer']",Mixed,69.0,"['Single-player', 'Multi-player']",9.99,2005-10-12,37.631579
2,2,Silo 2,Nevercenter Ltd. Co.,Animation & Modeling,http://store.steampowered.com/app/100400/Silo_2/,"['Animation & Modeling', 'Software']",Mostly Positive,,,99.99,2012-12-19,505.909091
3,3,Call of Duty: World at War,Activision,Moddable,http://store.steampowered.com/app/10090/Call_o...,"['Zombies', 'World War II', 'FPS', 'Action', '...",Very Positive,83.0,"['Single-player', 'Multi-player', 'Co-op']",19.99,2008-11-18,1792.74392
4,4,3D-Coat V4.8,Pilgway,Animation & Modeling,http://store.steampowered.com/app/100980/3DCoa...,['Animation & Modeling'],Very Positive,,['Steam Cloud'],99.99,2012-10-02,607.6875


In [14]:
# Converting the interactions data from long to a wide format
train_interactions_wide = train_interactions.pivot(index = 'user_id', 
                                               columns = 'item_id', 
                                               values = 'playtime')

# Applying log transformation to the data to reduce the effect of larger playtime values
# Filling the missing interactions with 0
train_interactions_wide = np.log1p(train_interactions_wide)
train_interactions_wide = train_interactions_wide.fillna(0)
train_interactions_wide = train_interactions_wide.reindex(columns = games['item_id'].unique(), 
                                                      fill_value = 0)

train_interactions_wide.head()

item_id,0,1,2,3,4,5,6,7,8,9,...,8513,8514,8515,8516,8517,8518,8519,8520,8521,8522
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.94591,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.258097,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5.4161,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.469682,0.0


## Recommendation

In [15]:

from Components.my_cosine_similarity import my_cosine_similarity
import scipy as sp
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
# Specifying the user ids that are going to receive recommendations
user_ids = test_interactions_in['user_id'].unique()

# Specifying number of Top-n similar users
n_similar_users = [1, 5, 10, 20]

X_sparse = csr_matrix(train_interactions_wide.values)
# check whether similarities match expected output

#S = my_cosine_similarity(X_sparse)


In [16]:

S = my_cosine_similarity(X_sparse)

In [19]:
#from Components.multiVAE import train_VAE
from Components.multiVAE import MultiVAE
import torch
import torch.nn as nn
import torch.nn.functional as F
num_users, num_items = X_sparse.shape

# CONVERSIONE UNA VOLTA SOLA (niente toarray nei batch)
X_dense = torch.FloatTensor(X_sparse.toarray())

# ===== CREA IL MODELLO =====
# modello PIÙ PICCOLO = PIÙ VELOCE
p_dims = [300, 100, num_items]
model = MultiVAE(p_dims)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# ===== TRAIN =====
epochs = 30
batch_size = 2000      # molto più veloce
beta = 1.0

for epoch in range(epochs):
    perm = torch.randperm(num_users)
    epoch_loss = 0

    for start in range(0, num_users, batch_size):
        end = start + batch_size
        batch_idx = perm[start:end]

        batch = X_dense[batch_idx]

        logits, mu, logvar = model(batch)
        loss, _, _ = model.loss_function(logits, batch, mu, logvar, beta)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs} - Loss: {epoch_loss:.4f}")

Epoch 1/30 - Loss: 51588.9442
Epoch 2/30 - Loss: 42473.9940
Epoch 3/30 - Loss: 41717.3953
Epoch 4/30 - Loss: 41373.1057
Epoch 5/30 - Loss: 41632.5328
Epoch 6/30 - Loss: 41443.7450
Epoch 7/30 - Loss: 41183.1539
Epoch 8/30 - Loss: 41548.0759


KeyboardInterrupt: 