In [2]:
import torch
import matplotlib.pyplot as plt

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
%load_ext autoreload
%autoreload 2

## Preparing the Test data

In [3]:
# from torch.utils.data import DataLoader, TensorDataset, random_split
# import pandas as pd

# # URL for MovieLens 1M dataset (you can change the URL for other versions)
# url = '../data/ml-1m/ratings.dat'

# # Load the dataset directly into a pandas DataFrame
# df = pd.read_csv(url, sep='::', header=None, names=['user_id', 'item_id', 'rating', 'timestamp'], engine='python')
# # Display the first few rows to confirm the dataset loaded correctly
# print(df.head())
from torch.utils.data import DataLoader, TensorDataset, random_split
import pandas as pd

# URL for MovieLens 100k dataset (you can change the URL for other versions)
url = 'https://files.grouplens.org/datasets/movielens/ml-100k/u.data'

# Load the dataset directly into a pandas DataFrame
df = pd.read_csv(url, sep='\t', header=None, names=['user_id', 'item_id', 'rating', 'timestamp'])

print(df.head())

   user_id  item_id  rating  timestamp
0      196      242       3  881250949
1      186      302       3  891717742
2       22      377       1  878887116
3      244       51       2  880606923
4      166      346       1  886397596


## Mapping the value

In [4]:
# Take the unique value and mapping it
user_mapping = {user_id: idx for idx, user_id in enumerate(df['user_id'].unique())} 
item_mapping = {item_id: idx for idx, item_id in enumerate(df['item_id'].unique())}

# Applying the mappings
df['user_id'] = df['user_id'].map(user_mapping)
df['item_id'] = df['item_id'].map(item_mapping)

print(df.head())

   user_id  item_id  rating  timestamp
0        0        0       3  881250949
1        1        1       3  891717742
2        2        2       1  878887116
3        3        3       2  880606923
4        4        4       1  886397596


## Split the Data

In [5]:
from BPRDataset import BPRDataset

# Convert from DataFrame into PyTorch tensor
user_tensor = torch.tensor(df['user_id'].values,dtype=torch.long)
item_tensor = torch.tensor(df['item_id'].values,dtype=torch.long)
rating_tensor = torch.tensor(df['rating'].values,dtype=torch.float)

num_items = df['item_id'].max() + 1

# Create a dataset
dataset = BPRDataset(df, num_items= num_items, min_rating=2.0)

# Split the dataset into 70% training, 15% validation, and 15% test
train_size = int(0.7 * len(dataset))
val_size = int(0.15 * len(dataset))
test_size = len(dataset) - train_size - val_size

train_data, val_data, test_data = random_split(dataset, [train_size, val_size, test_size])

# Create a dataLoader for training, validation, and testing
batch_size = 100
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

## Train the model

In [6]:
from bpr_mf import MF

model = MF(num_users=len(user_mapping)
           , num_items= len(item_mapping)
           , num_factors=30
           , device=device)

model.to(device)

train_losses, val_losses = model.train_model_bpr(train_loader, val_loader,
                                         num_epochs=10,num_step=10,lr=0.002, reg=1e-5)


Epoch 1/10, Train Loss: 0.4842, Val Loss: 0.3268, Time: 7.59s
Epoch 2/10, Train Loss: 0.2971, Val Loss: 0.2881, Time: 7.33s
Epoch 3/10, Train Loss: 0.2564, Val Loss: 0.2521, Time: 7.86s
Epoch 4/10, Train Loss: 0.2262, Val Loss: 0.2312, Time: 8.49s
Epoch 5/10, Train Loss: 0.2069, Val Loss: 0.2225, Time: 7.84s
Epoch 6/10, Train Loss: 0.1975, Val Loss: 0.2195, Time: 8.74s
Epoch 7/10, Train Loss: 0.1881, Val Loss: 0.2136, Time: 7.99s
Epoch 8/10, Train Loss: 0.1746, Val Loss: 0.2083, Time: 8.97s
Epoch 9/10, Train Loss: 0.1663, Val Loss: 0.2028, Time: 8.57s
Epoch 10/10, Train Loss: 0.1594, Val Loss: 0.1983, Time: 9.85s


## Evaluate by AUC

In [12]:
from BPRDataset import BPREvalDataset
from eval import evaluate_auc

# Recover test DataFrame using the indices from random_split:
test_df = df.iloc[test_data.indices]

# Instantiate evaluation dataset with multiple negatives
eval_dataset = BPREvalDataset(test_df, num_items=num_items, num_negatives=100, min_rating=2.0)

# # Convert evaluation dataset into a list of tuples (user, pos_item, neg_item)
eval_list = [
    (user.item(), pos_item.item(), [neg.item() for neg in neg_items])
    for user, pos_item, neg_items in eval_dataset
]

# First 3 tuples
print(eval_list[:3])

auc_score = evaluate_auc(model, eval_list)
print(auc_score)


[(739, 1225, [771, 262, 364, 595, 662, 228, 767, 957, 56, 1511, 1229, 1263, 1254, 983, 1353, 807, 1371, 559, 1677, 425, 605, 444, 712, 776, 1192, 803, 466, 273, 250, 641, 1469, 4, 624, 40, 78, 32, 1669, 488, 77, 137, 426, 33, 487, 569, 1649, 809, 1359, 1115, 892, 346, 1558, 162, 1671, 1007, 567, 658, 569, 936, 1064, 1304, 1094, 752, 1216, 1543, 1208, 89, 1539, 641, 196, 461, 1197, 1044, 1348, 1678, 95, 177, 1183, 1109, 1032, 113, 793, 1442, 1617, 203, 1195, 274, 124, 468, 19, 715, 370, 1157, 216, 267, 1108, 267, 1109, 744, 935, 437]), (713, 420, [558, 159, 1064, 1338, 946, 256, 149, 1048, 289, 20, 325, 660, 368, 167, 486, 95, 800, 960, 55, 1052, 301, 190, 1622, 1303, 1134, 102, 40, 195, 1645, 1065, 381, 711, 37, 1337, 708, 1631, 1549, 953, 944, 935, 1421, 994, 1368, 598, 1563, 1217, 456, 865, 598, 632, 340, 490, 1213, 804, 474, 656, 1656, 232, 1651, 1514, 1119, 803, 145, 645, 1256, 488, 858, 508, 1157, 725, 798, 1117, 443, 1183, 503, 1088, 97, 1453, 482, 534, 820, 532, 807, 761, 1441, 