In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch

from evaluation.evaluator import Evaluator
from models.base_model import GMF
from models.training import GMFTrainer

In [2]:
# Load the training interactions and the track catalogue (JSON Lines files).
sessions_train_df = pd.read_json('data_files/train_sessions.jsonl', lines=True)
tracks_df = pd.read_json('data_files/tracks.jsonl', lines=True)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One tensor per column, created up front on the target device so the
# DataLoader below slices batches from tensors that already live there.
train_user_ids = torch.LongTensor(sessions_train_df['user_id'].values).to(device)
train_track_ids = torch.LongTensor(sessions_train_df['track_id'].values).to(device)
train_scores = torch.FloatTensor(sessions_train_df['score'].values).to(device)

dataset = torch.utils.data.TensorDataset(train_user_ids, train_track_ids, train_scores)

# Mini-batches of 128 (user, track, score) triples, reshuffled every epoch.
train_loader = torch.utils.data.DataLoader(dataset, batch_size=128, shuffle=True)

# Counts used later to size the model.
# NOTE(review): nunique() equals "largest id + 1" only if user ids are
# contiguous integers; confirm against the data before using it as a table size.
unique_users_count = sessions_train_df['user_id'].nunique()
unique_tracks_count = tracks_df.shape[0]

In [3]:
# Build the model from the counts computed in the data-loading cell.
# NOTE(review): arguments are presumably (user rows, item rows, embedding
# size), with the "+ 1" leaving one spare row per table; confirm against
# models.base_model.GMF.
gmf = GMF(
    unique_users_count + 1,
    unique_tracks_count + 1,
    32,
)

In [None]:
# Wrap the model, the training batches and the target device in the project
# trainer, then run the weight-decay training routine for three epochs.
trainer = GMFTrainer(gmf, train_loader, device)
trainer.train_with_weight_decay(epochs=3, learning_rate=0.01, weight_decay=0.15)

Epoch 1/3 - Average Loss: 1.6307
Epoch 2/3 - Average Loss: 0.8296


In [None]:
# Sanity check on the learned embeddings: take the norm of every row
# (dim=1) of each embedding table, then average those norms per table.
user_norms = gmf.user_embedding.weight.norm(dim=1)
avg_user_norm = user_norms.mean()

item_norms = gmf.item_embedding.weight.norm(dim=1)
avg_item_norm = item_norms.mean()

# .item() turns the 0-dim tensors into plain Python floats for printing.
print("Średnia norma user embeddings:", avg_user_norm.item())
print("Średnia norma item embeddings:", avg_item_norm.item())

In [None]:
# Number of users included in the evaluation (ids 0 .. EVAL_USER_COUNT - 1).
# The ground-truth filter and the scored user range below must use the same
# bound, so it is defined once here instead of being repeated as a literal.
EVAL_USER_COUNT = 1000

# Validation interactions, restricted to the evaluated users.
ground_truth_df = pd.read_json('data_files/val_sessions.jsonl', lines=True)
ground_truth_df = ground_truth_df[ground_truth_df['user_id'] < EVAL_USER_COUNT]

# Every (user, track) combination to score: one row per pair, with column 0
# holding the user id and column 1 the track id.
# NOTE(review): this materialises EVAL_USER_COUNT * unique_tracks_count rows
# on `device`; lower EVAL_USER_COUNT if that does not fit in memory.
all_users = torch.arange(EVAL_USER_COUNT).to(device)
all_items = torch.arange(unique_tracks_count).to(device)
all_pairs = torch.cartesian_prod(all_users, all_items)
user_input = all_pairs[:, 0]
item_input = all_pairs[:, 1]

In [None]:
# Score every (user, track) pair in fixed-size batches so the full grid never
# has to go through the model in a single forward pass.
predictions = []
batch_size = 10000
gmf.eval()  # switch the module to evaluation mode before inference

# Gradients are not needed for inference; disabling them once around the
# whole loop avoids re-entering the context manager for every batch.
with torch.no_grad():
    for i in range(0, len(all_pairs), batch_size):
        batch_pairs = all_pairs[i:i + batch_size]
        batch_user_input = batch_pairs[:, 0].to(device)
        batch_item_input = batch_pairs[:, 1].to(device)

        # Call the module itself rather than .forward(): __call__ also runs
        # any registered hooks, which a direct .forward() call skips.
        batch_predictions = gmf(batch_user_input, batch_item_input)
        predictions.append(batch_predictions)

# One tensor of scores, in the same row order as all_pairs.
all_predictions = torch.cat(predictions)

In [None]:
# Bring the pair grid and the model scores back to host memory as NumPy arrays.
all_pairs_np = all_pairs.cpu().numpy()
all_predictions_np = all_predictions.cpu().numpy()

# Transposing the two-column pair array lets it be unpacked into its
# user-id column and its track-id column.
user_column, track_column = all_pairs_np.T

# Long-format table with one row per scored (user, track) pair.
recommendations_df = pd.DataFrame(
    dict(user_id=user_column, track_id=track_column, score=all_predictions_np)
)

In [None]:
# Evaluator configured with the list [5, 10, 15, 20, 25] and the value 1.3.
# NOTE(review): presumably these are the top-k cut-offs and the relevance
# threshold on the score; confirm against evaluation.evaluator.Evaluator.
evaluator2 = Evaluator(list(range(5, 26, 5)), 1.3)

In [None]:
# Compare the model's scored (user, track) pairs with the validation
# interactions; the result is iterated with .items() in a later cell.
metrics = evaluator2.evaluate(
    recommendations_df,
    ground_truth_df,
)

In [None]:
# Print every metric on its own line, scaled by 100 and suffixed with '%'.
# NOTE(review): assumes the evaluator returns fractions in [0, 1]; confirm.
for metric, value in metrics.items():
    print(f'{metric}: {value * 100}%')

In [None]:
# Histogram of the model's predicted scores over all scored (user, track) pairs.
# The explicit Axes object keeps every call tied to this figure, and the
# title and axis labels let the plot stand on its own.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=recommendations_df, x='score', bins=50, ax=ax)
ax.set(
    title='Distribution of predicted scores',
    xlabel='Predicted score',
    ylabel='Number of (user, track) pairs',
)
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
# Histogram of the ground-truth scores in the (filtered) validation sessions.
# The explicit Axes object keeps every call tied to this figure, and the
# title and axis labels let the plot stand on its own.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=ground_truth_df, x='score', bins=50, ax=ax)
ax.set(
    title='Distribution of ground-truth scores',
    xlabel='Ground-truth score',
    ylabel='Number of validation interactions',
)
ax.grid(True, alpha=0.3)

# A major x tick every 0.5 score units and a y tick every 2000 rows up to 58000.
ax.xaxis.set_major_locator(plt.MultipleLocator(0.5))
ax.set_yticks(range(0, 60000, 2000))
plt.show()

In [None]:
# Right join: keep every ground-truth (user, track) row and attach the model's
# score to it. Both frames have a 'score' column, so the prediction becomes
# 'score_pred' while the ground-truth score keeps the plain name 'score'.
merged_df = pd.merge(
    recommendations_df,
    ground_truth_df,
    how='right',
    on=['user_id', 'track_id'],
    suffixes=('_pred', ''),
)
# NOTE(review): this fills the ground-truth 'score' column, which a right join
# does not blank out; confirm whether 'score_pred' was the intended target.
merged_df = merged_df.fillna({'score': 0.0})

# Fine-grained histogram (300 bins) of the ground-truth score after the merge.
plt.figure(figsize=(10, 6))
sns.histplot(data=merged_df, x='score', bins=300)
plt.grid(True, alpha=0.3)
plt.show()


# Per-user summary of the ground-truth scores: how many items score above 1.3,
# how many items there are in total, and the share above 1.3 as a percentage.
relevant_stats = merged_df.groupby('user_id').agg({
    'score': [
        ('relevant_items_count', lambda s: s.gt(1.3).sum()),
        ('total_items', 'count'),
        ('relevant_percentage', lambda s: s.gt(1.3).mean() * 100),
    ]
})

print(relevant_stats)
# Averages of the per-user statistics across all users in the table.
print('relevant_percentage_mean: ', relevant_stats['score', 'relevant_percentage'].mean(), sep='')
print('total_items_mean: ', relevant_stats['score', 'total_items'].mean(), sep='')