Importing the necessary libraries:

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
from scipy.stats.mstats import winsorize



Loading data:

In [2]:
dataset_foldername = "~/OneDrive - Università degli Studi di Milano-Bicocca/Magistrale/AI/cleaned_datasets_students"

In [3]:
train_interactions = pd.read_csv(f"{dataset_foldername}/train_interactions.csv")
games = pd.read_csv(f"{dataset_foldername}/games.csv")
test_interactions_in = pd.read_csv(f"{dataset_foldername}/test_interactions_in.csv")

STUDY OF THE DATASETS

Brief overview of the datasets

In [4]:
train_interactions.head()
#it describes the relation beetween the users and the games they played 

Unnamed: 0,user_id,item_id,item_name,playtime
0,0,0,Counter-Strike,6
1,0,2555,Day of Defeat,7
2,0,2556,Day of Defeat: Source,4733
3,0,1043,Counter-Strike: Source,1853
4,0,5335,Psychonauts,333


In [5]:
games.head()
#it contains informations about the games

Unnamed: 0,item_id,item_name,publisher,genres,url,tags,sentiment,metascore,specs,price,release_date
0,0,Counter-Strike,Valve,['Action'],http://store.steampowered.com/app/10/CounterSt...,"['Action', 'FPS', 'Multiplayer', 'Shooter', 'C...",Overwhelmingly Positive,88.0,"['Multi-player', 'Valve Anti-Cheat enabled']",9.99,2000-11-01
1,1,Rag Doll Kung Fu,Mark Healey,['Indie'],http://store.steampowered.com/app/1002/Rag_Dol...,"['Indie', 'Fighting', 'Multiplayer']",Mixed,69.0,"['Single-player', 'Multi-player']",9.99,2005-10-12
2,2,Silo 2,Nevercenter Ltd. Co.,['Animation &amp; Modeling'],http://store.steampowered.com/app/100400/Silo_2/,"['Animation & Modeling', 'Software']",Mostly Positive,,,99.99,2012-12-19
3,3,Call of Duty: World at War,Activision,['Action'],http://store.steampowered.com/app/10090/Call_o...,"['Zombies', 'World War II', 'FPS', 'Action', '...",Very Positive,83.0,"['Single-player', 'Multi-player', 'Co-op']",19.99,2008-11-18
4,4,3D-Coat V4.8,Pilgway,['Animation &amp; Modeling'],http://store.steampowered.com/app/100980/3DCoa...,['Animation & Modeling'],Very Positive,,['Steam Cloud'],99.99,2012-10-02


In [6]:
# study of the types of variables of the datasets 
train_interactions.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2293985 entries, 0 to 2293984
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   user_id    2293985 non-null  int64 
 1   item_id    2293985 non-null  int64 
 2   item_name  2293985 non-null  object
 3   playtime   2293985 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 70.0+ MB


Context of the datasets

Preprocessing

In [7]:
print(games['genres'].head())
print("")
print(games['tags'].head())

0                      ['Action']
1                       ['Indie']
2    ['Animation &amp; Modeling']
3                      ['Action']
4    ['Animation &amp; Modeling']
Name: genres, dtype: object

0    ['Action', 'FPS', 'Multiplayer', 'Shooter', 'C...
1                 ['Indie', 'Fighting', 'Multiplayer']
2                 ['Animation & Modeling', 'Software']
3    ['Zombies', 'World War II', 'FPS', 'Action', '...
4                             ['Animation & Modeling']
Name: tags, dtype: object


## games.csv processing

In [8]:
# Checking for NA tags
games[games['tags'].isna()]

Unnamed: 0,item_id,item_name,publisher,genres,url,tags,sentiment,metascore,specs,price,release_date
113,113,LUMINES™ Advance Pack,Q Entertainment Inc.,['Casual'],http://store.steampowered.com/app/11920/LUMINE...,,9 user reviews,,['Single-player'],7.99,2008-04-18
5029,5029,Wedding Dash® 2: Rings Around the World,PlayFirst,['Simulation'],http://store.steampowered.com/app/37280/Weddin...,,Positive,,['Single-player'],9.99,2009-08-12
5033,5033,Zenerchi®,PlayFirst,['Casual'],http://store.steampowered.com/app/37290/Zenerchi/,,5 user reviews,,['Single-player'],9.99,2009-08-12
5060,5060,Mahjong Roadshow™,PlayFirst,['Casual'],http://store.steampowered.com/app/37360/Mahjon...,,5 user reviews,,['Single-player'],6.99,2007-10-19


In [9]:
# Filling the NA value of the tags with the corresponding genres value
# Then setting the tags as my new genres.
games['tags'] = games['tags'].fillna("genres")
games['genres'] = games['tags']

# Processing the genres list and exploding the item genres (conversion to long format)
games_long = games.copy()
games_long['genres'] = games_long['genres'].str.replace(r"[\[\]']", '', regex=True)
games_long['genres'] = games_long['genres'].str.split(r',\s*')
games_long = games_long.explode('genres').reset_index(drop=True)

games_long

Unnamed: 0,item_id,item_name,publisher,genres,url,tags,sentiment,metascore,specs,price,release_date
0,0,Counter-Strike,Valve,Action,http://store.steampowered.com/app/10/CounterSt...,"['Action', 'FPS', 'Multiplayer', 'Shooter', 'C...",Overwhelmingly Positive,88.0,"['Multi-player', 'Valve Anti-Cheat enabled']",9.99,2000-11-01
1,0,Counter-Strike,Valve,FPS,http://store.steampowered.com/app/10/CounterSt...,"['Action', 'FPS', 'Multiplayer', 'Shooter', 'C...",Overwhelmingly Positive,88.0,"['Multi-player', 'Valve Anti-Cheat enabled']",9.99,2000-11-01
2,0,Counter-Strike,Valve,Multiplayer,http://store.steampowered.com/app/10/CounterSt...,"['Action', 'FPS', 'Multiplayer', 'Shooter', 'C...",Overwhelmingly Positive,88.0,"['Multi-player', 'Valve Anti-Cheat enabled']",9.99,2000-11-01
3,0,Counter-Strike,Valve,Shooter,http://store.steampowered.com/app/10/CounterSt...,"['Action', 'FPS', 'Multiplayer', 'Shooter', 'C...",Overwhelmingly Positive,88.0,"['Multi-player', 'Valve Anti-Cheat enabled']",9.99,2000-11-01
4,0,Counter-Strike,Valve,Classic,http://store.steampowered.com/app/10/CounterSt...,"['Action', 'FPS', 'Multiplayer', 'Shooter', 'C...",Overwhelmingly Positive,88.0,"['Multi-player', 'Valve Anti-Cheat enabled']",9.99,2000-11-01
...,...,...,...,...,...,...,...,...,...,...,...
75278,8522,Puzzle Pirates,Three Rings,Adventure,http://store.steampowered.com/app/99910/Puzzle...,"['Free to Play', 'Massively Multiplayer', 'Puz...",Very Positive,,"['Single-player', 'Multi-player', 'MMO', 'Co-o...",Free to Play,2011-08-31
75279,8522,Puzzle Pirates,Three Rings,Strategy,http://store.steampowered.com/app/99910/Puzzle...,"['Free to Play', 'Massively Multiplayer', 'Puz...",Very Positive,,"['Single-player', 'Multi-player', 'MMO', 'Co-o...",Free to Play,2011-08-31
75280,8522,Puzzle Pirates,Three Rings,Multiplayer,http://store.steampowered.com/app/99910/Puzzle...,"['Free to Play', 'Massively Multiplayer', 'Puz...",Very Positive,,"['Single-player', 'Multi-player', 'MMO', 'Co-o...",Free to Play,2011-08-31
75281,8522,Puzzle Pirates,Three Rings,Co-op,http://store.steampowered.com/app/99910/Puzzle...,"['Free to Play', 'Massively Multiplayer', 'Puz...",Very Positive,,"['Single-player', 'Multi-player', 'MMO', 'Co-o...",Free to Play,2011-08-31


## interaction.csv processing

In [10]:
# Merging the 2 interactions dataframe into a single interactions file
all_interactions = pd.concat([test_interactions_in, train_interactions])

# Sorting the interactions based on user_id and playtime and removing any duplicate entries
all_interactions = (
    all_interactions
    .sort_values(by = ['user_id', 'playtime'], ascending = [True, False])
    .drop_duplicates(subset = ['user_id', 'item_id'])
)

all_interactions.head()

Unnamed: 0,user_id,item_id,item_name,playtime
100,0,8363,Counter-Strike: Global Offensive,23532
85,0,4356,Rising Storm/Red Orchestra 2 Multiplayer,14194
60,0,8429,Sid Meier's Civilization V,10345
28,0,140,Killing Floor,10006
161,0,884,Killing Floor 2,6494


In [11]:
# Left joining the user interactions with the games
user_genre_playtime = all_interactions.merge(games_long, on = 'item_id', how = 'left')

# Winsorizing the playtime interactions of each item_id to help reduce the amount of outliers
# Then calculating the mean playtime of each item_id
mean_item_playtime = (
    user_genre_playtime
    .assign(playtime_winsorized = user_genre_playtime.groupby('item_id')['playtime']
            .transform(lambda x: winsorize(x, limits = (0.05, 0.05))))
    .groupby('item_id', as_index = False)
    .agg(playtime=('playtime_winsorized', 'mean'))
)

del(user_genre_playtime)

mean_item_playtime.head()

Unnamed: 0,item_id,playtime
0,0,1108.850488
1,1,33.695652
2,2,543.083333
3,3,1775.757712
4,4,668.823529


In [12]:
# Merging the games with the mean playtime of each item
games_long = games_long.merge(mean_item_playtime, on = "item_id", how = "left").sort_values('playtime', ascending = False)
games_long.head()

Unnamed: 0,item_id,item_name,publisher,genres,url,tags,sentiment,metascore,specs,price,release_date,playtime
14803,1257,MovieWriterPro,The Writer Zone,Utilities,http://store.steampowered.com/app/250360/Movie...,"['Utilities', 'Video Production']",Mixed,,,29.99,2013-09-28,38283.0
14804,1257,MovieWriterPro,The Writer Zone,Video Production,http://store.steampowered.com/app/250360/Movie...,"['Utilities', 'Video Production']",Mixed,,,29.99,2013-09-28,38283.0
68437,7664,Major\Minor - Complete Edition,Tall Tail Studios,Casual,http://store.steampowered.com/app/475490/Major...,"['Visual Novel', 'Indie', 'Adventure', 'RPG', ...",Very Positive,,"['Single-player', 'Steam Achievements', 'Steam...",19.99,2016-10-11,33759.0
68433,7664,Major\Minor - Complete Edition,Tall Tail Studios,RPG,http://store.steampowered.com/app/475490/Major...,"['Visual Novel', 'Indie', 'Adventure', 'RPG', ...",Very Positive,,"['Single-player', 'Steam Achievements', 'Steam...",19.99,2016-10-11,33759.0
68435,7664,Major\Minor - Complete Edition,Tall Tail Studios,Singleplayer,http://store.steampowered.com/app/475490/Major...,"['Visual Novel', 'Indie', 'Adventure', 'RPG', ...",Very Positive,,"['Single-player', 'Steam Achievements', 'Steam...",19.99,2016-10-11,33759.0


In [13]:
# Extracting the unique item_id with the corresponding playtime
games_unique = games_long.groupby('item_id', as_index = False).first()
games_unique.head()

Unnamed: 0,item_id,item_name,publisher,genres,url,tags,sentiment,metascore,specs,price,release_date,playtime
0,0,Counter-Strike,Valve,Action,http://store.steampowered.com/app/10/CounterSt...,"['Action', 'FPS', 'Multiplayer', 'Shooter', 'C...",Overwhelmingly Positive,88.0,"['Multi-player', 'Valve Anti-Cheat enabled']",9.99,2000-11-01,1108.850488
1,1,Rag Doll Kung Fu,Mark Healey,Fighting,http://store.steampowered.com/app/1002/Rag_Dol...,"['Indie', 'Fighting', 'Multiplayer']",Mixed,69.0,"['Single-player', 'Multi-player']",9.99,2005-10-12,33.695652
2,2,Silo 2,Nevercenter Ltd. Co.,Animation & Modeling,http://store.steampowered.com/app/100400/Silo_2/,"['Animation & Modeling', 'Software']",Mostly Positive,,,99.99,2012-12-19,543.083333
3,3,Call of Duty: World at War,Activision,Survival,http://store.steampowered.com/app/10090/Call_o...,"['Zombies', 'World War II', 'FPS', 'Action', '...",Very Positive,83.0,"['Single-player', 'Multi-player', 'Co-op']",19.99,2008-11-18,1775.757712
4,4,3D-Coat V4.8,Pilgway,Animation & Modeling,http://store.steampowered.com/app/100980/3DCoa...,['Animation & Modeling'],Very Positive,,['Steam Cloud'],99.99,2012-10-02,668.823529


In [14]:
# Converting the interactions data from long to a wide format
all_interactions_wide = all_interactions.pivot(index = 'user_id', 
                                               columns = 'item_id', 
                                               values = 'playtime')

# Applying log transformation to the data to reduce the effect of larger playtime values
# Filling the missing interactions with 0
all_interactions_wide = np.log1p(all_interactions_wide)
all_interactions_wide = all_interactions_wide.fillna(0)
all_interactions_wide = all_interactions_wide.reindex(columns = games['item_id'].unique(), 
                                                      fill_value = 0)

all_interactions_wide.head()

item_id,0,1,2,3,4,5,6,7,8,9,...,8513,8514,8515,8516,8517,8518,8519,8520,8521,8522
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.94591,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.258097,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5.4161,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,6.993933,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.828641,0.0


## Recommendation

In [15]:
import importlib
import Components.item_knn as gr
importlib.reload(gr)

<module 'Components.item_knn' from 'c:\\Users\\matte\\Desktop\\AIProject\\Components\\item_knn.py'>

In [None]:

from Components.my_cosine_similarity import my_cosine_similarity
import scipy.sparse as sp
from scipy.sparse import csr_matrix
import torch
from Components.item_knn import item_knn_scores, scores2recommendations, save_user_item

train_unique = train_interactions.drop_duplicates(["user_id", "item_id"])
test_in_unique = test_interactions_in.drop_duplicates(["user_id", "item_id"])

#user with >=5 interactions
train_unique = train_unique.groupby("user_id").filter(lambda x: len(x) >= 5)

# items with >=2 interaction
item_freq = train_unique.groupby("item_id").size()
valid_items = item_freq[item_freq >= 2].index

train_unique = train_unique[train_unique["item_id"].isin(valid_items)]

test_in_unique = test_in_unique[test_in_unique["item_id"].isin(valid_items)]

train_unique = train_unique.groupby("user_id").filter(lambda x: len(x) > 0)

# --- USERS AND ITEMS ---
train_users = train_unique["user_id"].unique()
test_users = test_in_unique["user_id"].unique()
all_users = np.unique(np.concatenate([train_users, test_users]))

user_to_index = {u: i for i, u in enumerate(all_users)}
index_to_user = {i: u for i, u in enumerate(all_users)}

train_unique["uid"] = train_unique["user_id"].map(user_to_index)

all_items = train_unique["item_id"].unique()
n_items = all_items.max() + 1

n_users_train = len(all_users) 



X_train_binary = sp.csr_matrix(
    (np.ones(len(train_unique)),
     (train_unique["uid"], train_unique["item_id"])),
    shape=(n_users_train, n_items)
)

X_dense_train = torch.FloatTensor(X_train_binary.toarray())
row_sums = X_dense_train.sum(dim=1, keepdim=True)
X_dense_train = X_dense_train / torch.clamp(row_sums, min=1.0)


# Specifying number of Top-n similar users
n_similar_users = [1, 5, 10, 20]

#X_sparse = csr_matrix(all_interactions_wide.values)
# check whether similarities match expected output

#S = my_cosine_similarity(X_sparse)
# scores = item_knn_scores(X_train_binary, X_test_in_binary, 50)
# df_recos = scores2recommendations(scores, X_test_in_binary, 20)
# save_user_item(df_recos, "submission_itemknn.csv")


In [27]:
import importlib
import Components.generate_recommendations as gr
importlib.reload(gr)


<module 'Components.generate_recommendations' from 'c:\\Users\\matte\\Desktop\\AIProject\\Components\\generate_recommendations.py'>

In [None]:
import numpy as np
import scipy.sparse as sp
import torch
from Components.multiVAE import MultiVAE

# ============================================================
# INITIALIZE MODEL
# ============================================================

p_dims = [600, 200, n_items]
model = MultiVAE(p_dims, dropout=0.5)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# ============================================================
# TRAINING LOOP
# ============================================================

epochs = 30
batch_size = 2000

total_anneal_steps = 200000   # recommended by the paper
anneal_cap = 1.0              # max value for beta
update_count = 0              # global step counter

for epoch in range(epochs):
    perm = torch.randperm(n_users_train)
    epoch_loss = 0.0

    for start in range(0, n_users_train, batch_size):
        end = start + batch_size
        batch_idx = perm[start:end]
        batch = X_dense_train[batch_idx]

        # ===== KL annealing =====
        if total_anneal_steps > 0:
            beta = min(anneal_cap, update_count / total_anneal_steps)
        else:
            beta = anneal_cap

        logits, mu, logvar = model(batch)
        loss, _, _ = model.loss_function(logits, batch, mu, logvar, beta)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        update_count += 1

    print(f"Epoch {epoch+1}/{epochs} - Loss: {epoch_loss:.4f} - Beta: {beta:.4f}")


Epoch 1/30 - Loss: 182.5444 - Beta: 0.0001
Epoch 2/30 - Loss: 155.6674 - Beta: 0.0003
Epoch 3/30 - Loss: 155.0555 - Beta: 0.0005
Epoch 4/30 - Loss: 154.9071 - Beta: 0.0006
Epoch 5/30 - Loss: 154.6854 - Beta: 0.0008
Epoch 6/30 - Loss: 153.6220 - Beta: 0.0009
Epoch 7/30 - Loss: 151.5659 - Beta: 0.0011
Epoch 8/30 - Loss: 151.2145 - Beta: 0.0012
Epoch 9/30 - Loss: 150.8330 - Beta: 0.0014
Epoch 10/30 - Loss: 150.8365 - Beta: 0.0015
Epoch 11/30 - Loss: 150.7717 - Beta: 0.0017
Epoch 12/30 - Loss: 150.8652 - Beta: 0.0019
Epoch 13/30 - Loss: 150.3333 - Beta: 0.0020
Epoch 14/30 - Loss: 149.6371 - Beta: 0.0022
Epoch 15/30 - Loss: 149.2683 - Beta: 0.0023
Epoch 16/30 - Loss: 148.4306 - Beta: 0.0025
Epoch 17/30 - Loss: 148.2970 - Beta: 0.0026
Epoch 18/30 - Loss: 148.0488 - Beta: 0.0028
Epoch 19/30 - Loss: 147.4566 - Beta: 0.0029
Epoch 20/30 - Loss: 147.2756 - Beta: 0.0031
Epoch 21/30 - Loss: 147.1206 - Beta: 0.0032
Epoch 22/30 - Loss: 146.9550 - Beta: 0.0034
Epoch 23/30 - Loss: 146.4714 - Beta: 0.00

In [45]:
import importlib
import Components.generate_recommendations as gr
importlib.reload(gr)


<module 'Components.generate_recommendations' from 'c:\\Users\\matte\\Desktop\\AIProject\\Components\\generate_recommendations.py'>

In [49]:
from Components.generate_recommendations import multivae_recommend, save_submission

test_in_unique["uid"] = test_in_unique["user_id"].map(user_to_index)

X_test_in_binary = sp.csr_matrix(
    (np.ones(len(test_in_unique)),
     (test_in_unique["uid"], test_in_unique["item_id"])),
    shape=(n_users_train, n_items)
)

X_dense_test_in = torch.FloatTensor(X_test_in_binary.toarray())

# normalization (same as train)
row_sums_test = X_dense_test_in.sum(1, keepdim=True)
X_dense_test_in = X_dense_test_in / torch.clamp(row_sums_test, min=1.0)

# ============================================================
# 6) BUILD known_items DICTIONARY (train + test_in)
# ============================================================

known_items = {}


# --- train users ---
for row in train_unique.itertuples():
    u = row.uid
    known_items.setdefault(u, set()).add(row.item_id)

# --- test users ---
for row in test_in_unique.itertuples():
    u = row.uid
    known_items.setdefault(u, set()).add(row.item_id)

# convert sets → lists
known_items = {u: list(items) for u, items in known_items.items()}


# ============================================================
# 7) RECOMMENDATIONS WITH MULTIVAE
# ============================================================

rec_df = multivae_recommend(
    model=model,
    X_dense_test_in=X_dense_test_in, 
    index_to_user=index_to_user,
    known_items=known_items,
    top_k=20
)


# ============================================================
# 8) SAVE SUBMISSION FILE
# ============================================================

save_submission(rec_df, "submission_multivae.csv")
print("MultiVAE recommendations saved to submission_multivae.csv")


File saved to submission_multivae.csv
MultiVAE recommendations saved to submission_multivae.csv
