Importing the necessary libraries:

In [71]:
import pandas as pd
import seaborn as sns
import numpy as np
from scipy.stats.mstats import winsorize



Loading data:

In [72]:
import os

# Folder that holds the CSV files read by the cells below.
# It can be overridden without editing the notebook by setting the
# DATASET_FOLDERNAME environment variable; when the variable is unset the
# original location is used, so existing runs behave exactly as before.
dataset_foldername = os.environ.get(
    "DATASET_FOLDERNAME",
    "~/OneDrive - Università degli Studi di Milano-Bicocca/Magistrale/AI/cleaned_datasets_students",
)

In [73]:
# Read the three CSV files that sit side by side in the dataset folder.
# The generator yields one DataFrame per file name, in the order they are unpacked.
train_interactions, games, test_interactions_in = (
    pd.read_csv(f"{dataset_foldername}/{csv_name}")
    for csv_name in ("train_interactions.csv", "games.csv", "test_interactions_in.csv")
)

STUDY OF THE DATASETS

Brief overview of the datasets

In [102]:
# Summary statistics (count, mean, std, quartiles, ...) of the numeric columns of the
# test interactions, i.e. the relation between the users and the games they played.
test_interactions_in.describe()

Unnamed: 0,user_id,item_id,playtime
count,448211.0,448211.0,448211.0
mean,25832.526716,2992.618144,1539.262254
std,17466.683483,2879.200191,6709.528824
min,4.0,0.0,1.0
25%,11060.0,677.0,45.0
50%,23878.0,1637.0,205.0
75%,37628.0,5238.0,796.0
max,68400.0,8522.0,501498.0


Context of the datasets

Preprocessing

## Interaction data preprocessing (train_interactions.csv and test_interactions_in.csv)

In [75]:
# Keep a single row per (user, item) pair.
train_filtered = train_interactions.drop_duplicates(["user_id", "item_id"])

# Users with >= 5 interactions.
# A vectorised group-size mask keeps exactly the rows that
# groupby("user_id").filter(lambda x: len(x) >= 5) would keep, without calling a
# Python function once per user.
user_sizes = train_filtered.groupby("user_id")["user_id"].transform("size")
train_filtered = train_filtered[user_sizes >= 5]

# Items with >= 2 interactions (counted after the user filter above).
# NOTE(review): the two filters are applied once, not iterated, so removing rare
# items can leave some users with fewer than 5 interactions.
item_freq = train_filtered.groupby("item_id").size()
valid_items = item_freq[item_freq >= 2].index
train_filtered = train_filtered[train_filtered["item_id"].isin(valid_items)]

# Tag every row with the partition it came from so the two sets can be separated
# again after the shared re-indexing. `.assign` returns a new frame, which avoids
# the SettingWithCopyWarning that assigning a column to a filtered slice can raise.
train_filtered = train_filtered.assign(split="train")
test_interactions_in["split"] = "test"

# Training rows come first, so they are the first ids handed out below.
all_interactions = pd.concat([train_filtered, test_interactions_in], ignore_index=True)

# Keep the original identifiers under "old_*" names; the new contiguous ids take
# over the "user_id" / "item_id" column names.
all_interactions = all_interactions.rename(
    columns={
        "user_id": "old_user_id",
        "item_id": "old_item_id"
    }
)

# --- USERS ---
# Contiguous ids 0..n-1 in order of first appearance, plus the inverse lookup used
# to translate recommendations back to the original ids.
user_id_mapping = {old_id: new_id for new_id, old_id in enumerate(all_interactions['old_user_id'].unique())}
all_interactions['user_id'] = all_interactions['old_user_id'].map(user_id_mapping)
new_to_old_user_id_mapping = {v: k for k, v in user_id_mapping.items()}

# --- ITEMS ---
# Same scheme for the items.
item_id_mapping = {old_id: new_id for new_id, old_id in enumerate(all_interactions['old_item_id'].unique())}
all_interactions['item_id'] = all_interactions['old_item_id'].map(item_id_mapping)
new_to_old_item_id_mapping = {v: k for k, v in item_id_mapping.items()}


In [76]:
# Split the re-indexed interactions back into their original partitions.
train_mapped = all_interactions[all_interactions["split"] == "train"].copy()

# Only the training rows were de-duplicated upstream. Repeated (user, item) pairs in
# the test rows would be summed when the sparse matrices are built, giving entries
# greater than 1 in matrices that are meant to be binary, so drop them here.
test_mapped = all_interactions[all_interactions["split"] == "test"].drop_duplicates(
    ["user_id", "item_id"]
)

# Items that appear at least twice in the training partition.
item_freq = train_mapped.groupby("item_id").size()
valid_items = set(item_freq[item_freq >= 2].index)

# Restrict both partitions to those items.
train_mapped = train_mapped[train_mapped["item_id"].isin(valid_items)]
test_mapped = test_mapped[test_mapped["item_id"].isin(valid_items)]


In [77]:
# Number of distinct users that still have training interactions after filtering.
num_users = train_mapped.loc[:, "user_id"].nunique()
print(num_users)

46713


## Recommendation

In [78]:
# Import the local item-kNN module and reload it, so that edits made to its source
# file are picked up without restarting the kernel.
# NOTE(review): the alias `gr` is rebound to Components.generate_recommendations
# by a later cell, so it does not refer to this module for long.
import importlib
import Components.item_knn as gr
importlib.reload(gr)

<module 'Components.item_knn' from 'c:\\Users\\matte\\Desktop\\AIProject\\Components\\item_knn.py'>

In [None]:

from Components.my_cosine_similarity import my_cosine_similarity
import scipy.sparse as sp
from scipy.sparse import csr_matrix
import torch
from Components.item_knn import item_knn_scores, scores2recommendations, save_user_item

def _binary_interaction_matrix(interactions, shape):
    """Build a user x item CSR matrix with 1.0 for every observed (user_id, item_id) pair.

    Parameters
    ----------
    interactions : DataFrame with "user_id" and "item_id" columns holding the
        row and column indices of the observed pairs.
    shape : (n_rows, n_cols) of the resulting matrix.

    Returns
    -------
    scipy.sparse.csr_matrix of the requested shape.

    Duplicate pairs are dropped first: the CSR constructor sums repeated
    coordinates, which would otherwise produce entries greater than 1.
    """
    pairs = interactions[["user_id", "item_id"]].drop_duplicates()
    return sp.csr_matrix(
        (np.ones(len(pairs)), (pairs["user_id"].values, pairs["item_id"].values)),
        shape=shape,
    )


# One row per user id handed out across train and test.
num_users = all_interactions["user_id"].nunique()

# Item ids were assigned over *all* interactions, so the valid ids are not guaranteed
# to be exactly 0..len(valid_items)-1. Sizing the column axis by the largest valid id
# gives the same value whenever they are contiguous and still fits every index when
# they are not.
num_items = int(max(valid_items)) + 1 if len(valid_items) else 0

X_train_binary = _binary_interaction_matrix(train_mapped, (num_users, num_items))

X_test_in_binary = _binary_interaction_matrix(test_mapped, (num_users, num_items))





# scores = item_knn_scores(X_train_binary, X_test_in_binary, 50)
# df_recos = scores2recommendations(scores, X_test_in_binary, 20)
# df_recos["user_id"] = df_recos["user_id"].map(new_to_old_user_id_mapping)
# df_recos["item_id"] = df_recos["item_id"].map(new_to_old_item_id_mapping)

#save_user_item(df_recos, "submission_itemknn.csv")


  self._set_arrayXarray(i, j, x)


In [18]:
# Import the local recommendation helpers and reload them, so that edits made to
# the module's source file are picked up without restarting the kernel.
import importlib
import Components.generate_recommendations as gr
importlib.reload(gr)


<module 'Components.generate_recommendations' from 'c:\\Users\\matte\\Desktop\\AIProject\\Components\\generate_recommendations.py'>

In [None]:
import numpy as np
import scipy.sparse as sp
import torch
from Components.multiVAE import MultiVAE

# ============================================================
# INITIALIZE MODEL
# ============================================================
# Fix the torch RNG so torch-based randomness (e.g. the randperm shuffling below)
# is repeatable from one run to the next.
SEED = 42
torch.manual_seed(SEED)

n_items = X_train_binary.shape[1]
train_user_ids = train_mapped["user_id"].unique()
n_users_train = len(train_user_ids)
# The rows of X_train_binary are indexed by user id, so batches must be drawn from
# the training users' own rows. Indexing with range(n_users_train) directly is only
# correct while the training users happen to own the ids 0..n_users_train-1.
train_rows = torch.as_tensor(np.asarray(train_user_ids, dtype=np.int64))
X_train_dense = torch.FloatTensor(X_train_binary.toarray())

# Scale each row by its number of interactions; the clamp keeps all-zero rows from
# being divided by zero.
row_sums = X_train_dense.sum(1, keepdim=True)
X_train_dense = X_train_dense / torch.clamp(row_sums, min=1.0)

p_dims = [600, 200, n_items]
model = MultiVAE(p_dims, dropout=0.5)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# ============================================================
# TRAINING LOOP
# ============================================================

epochs = 30
batch_size = 2000

# NOTE(review): with batch_size = 2000 the whole run performs far fewer updates than
# total_anneal_steps, so beta stays far below anneal_cap (the recorded run shows
# beta = 0.0026 after 22 epochs) and the KL term is almost switched off.
# Confirm that this is intended.
total_anneal_steps = 200000   # recommended by the paper
anneal_cap = 1.0              # max value for beta
update_count = 0              # global step counter
beta = 0.0                    # defined up front so the epoch report never hits an unset name

for epoch in range(epochs):
    # Visit the training users in a fresh random order every epoch.
    perm = torch.randperm(n_users_train)
    epoch_loss = 0.0

    for start in range(0, n_users_train, batch_size):
        end = start + batch_size
        # Translate shuffled positions into the matrix rows of the training users.
        batch_idx = train_rows[perm[start:end]]
        batch = X_train_dense[batch_idx]

        # ===== KL annealing =====
        # beta grows linearly with the number of updates until it reaches anneal_cap.
        if total_anneal_steps > 0:
            beta = min(anneal_cap, update_count / total_anneal_steps)
        else:
            beta = anneal_cap

        logits, mu, logvar = model(batch)
        loss, _, _ = model.loss_function(logits, batch, mu, logvar, beta)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # epoch_loss is the sum of the batch losses, not their mean.
        epoch_loss += loss.item()
        update_count += 1

    # Report beta as well: the recorded output of this cell includes it, so the
    # statement and its output agree again.
    print(f"Epoch {epoch+1}/{epochs} - Loss: {epoch_loss:.4f} - Beta: {beta:.4f}")


Epoch 1/30 - Loss: 190.7376 - Beta: 0.0001
Epoch 2/30 - Loss: 155.9342 - Beta: 0.0002
Epoch 3/30 - Loss: 154.8302 - Beta: 0.0004
Epoch 4/30 - Loss: 154.6940 - Beta: 0.0005
Epoch 5/30 - Loss: 154.5784 - Beta: 0.0006
Epoch 6/30 - Loss: 154.3436 - Beta: 0.0007
Epoch 7/30 - Loss: 154.0062 - Beta: 0.0008
Epoch 8/30 - Loss: 153.9642 - Beta: 0.0010
Epoch 9/30 - Loss: 153.7860 - Beta: 0.0011
Epoch 10/30 - Loss: 153.6013 - Beta: 0.0012
Epoch 11/30 - Loss: 153.0719 - Beta: 0.0013
Epoch 12/30 - Loss: 151.9877 - Beta: 0.0014
Epoch 13/30 - Loss: 151.1547 - Beta: 0.0016
Epoch 14/30 - Loss: 150.7140 - Beta: 0.0017
Epoch 15/30 - Loss: 150.5086 - Beta: 0.0018
Epoch 16/30 - Loss: 150.4418 - Beta: 0.0019
Epoch 17/30 - Loss: 150.3630 - Beta: 0.0020
Epoch 18/30 - Loss: 150.3190 - Beta: 0.0022
Epoch 19/30 - Loss: 150.2542 - Beta: 0.0023
Epoch 20/30 - Loss: 150.1953 - Beta: 0.0024
Epoch 21/30 - Loss: 150.0869 - Beta: 0.0025
Epoch 22/30 - Loss: 149.9371 - Beta: 0.0026
Epoch 23/30 - Loss: 149.6850 - Beta: 0.00

In [45]:
# Reload the local recommendation helpers again so that the most recent edits to the
# module's source file are the ones in use.
# NOTE(review): this cell repeats an earlier, identical reload cell.
import importlib
import Components.generate_recommendations as gr
importlib.reload(gr)


<module 'Components.generate_recommendations' from 'c:\\Users\\matte\\Desktop\\AIProject\\Components\\generate_recommendations.py'>

In [101]:
# Sorted array of the distinct user ids present in the test interactions,
# followed by how many there are.
test_users = np.unique(test_mapped["user_id"].to_numpy())
print(test_users.size)

13579


In [99]:
from Components.generate_recommendations import multivae_recommend, save_submission



# Test users get their own compact row numbering (0..n_test_users-1) so that the
# dense input matrix only contains the users we need recommendations for.
test_users = np.sort(test_mapped["user_id"].unique())
n_test_users = len(test_users)

user_to_row = {u: i for i, u in enumerate(test_users)}
index_to_user = {i: u for i, u in enumerate(test_users)}

# Drop repeated (user, item) pairs first: the CSR constructor sums repeated
# coordinates, which would give entries greater than 1 in a matrix meant to be binary.
test_pairs = test_mapped[["user_id", "item_id"]].drop_duplicates()
rows = test_pairs["user_id"].map(user_to_row).values
cols = test_pairs["item_id"].values
data = np.ones(len(test_pairs))

X_test_in_binaryMV = sp.csr_matrix(
    (data, (rows, cols)),
    shape=(n_test_users, num_items)
)

X_dense_test_in = torch.FloatTensor(X_test_in_binaryMV.toarray())

# Scale each row by its number of interactions; the clamp keeps all-zero rows from
# being divided by zero.
row_sums_test = X_dense_test_in.sum(1, keepdim=True)
X_dense_test_in = X_dense_test_in / torch.clamp(row_sums_test, min=1.0)

# known_items: matrix row -> list of the item ids that user already interacted with,
# in the training data (test users only) or in the test input. Built with one
# vectorised groupby in place of two Python loops over every row.
seen_pairs = pd.concat(
    [
        train_mapped.loc[train_mapped["user_id"].isin(test_users), ["user_id", "item_id"]],
        test_mapped[["user_id", "item_id"]],
    ],
    ignore_index=True,
).drop_duplicates()
seen_rows = seen_pairs["user_id"].map(user_to_row)
known_items = seen_pairs.groupby(seen_rows)["item_id"].apply(list).to_dict()

rec_df = multivae_recommend(
    model=model,
    X_dense_test_in=X_dense_test_in,
    index_to_user=index_to_user,
    known_items=known_items,
    top_k=20
)

# user_id: mapped -> old
rec_df["user_id"] = rec_df["user_id"].map(new_to_old_user_id_mapping)

# item_id: mapped -> old
rec_df["item_id"] = rec_df["item_id"].map(new_to_old_item_id_mapping)

In [97]:
# Write the recommendations to disk, then confirm the file that was written.
submission_filename = "submission_multivae.csv"
save_submission(rec_df, submission_filename)
print("MultiVAE recommendations saved to submission_multivae.csv")

File saved to submission_multivae.csv
MultiVAE recommendations saved to submission_multivae.csv
