In [1]:
import os
import sys
import math
import torch

import numpy as np
import pandas as pd
import torch.nn as nn
acho
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.preprocessing import LabelEncoder

# Summary

The most basic Neural Matrix Factorization model possible based on Figure 3 from the original [NCF paper](https://arxiv.org/pdf/1708.05031).

This model is trained on the Movielense dataset where the objective is to predict ratings different users give to movies.

This ultra-simplistic version uses only the user and movie IDs to create learnable embeddings, no additional user or item features are used.

In [14]:
"""
Define the general parameters that will be used.
Seeds are set for reproducibility.
Device is set to select if we will run the model on a GPU or on CPU.
All other parameters are related to the model itseld, it's optimizer and training schema.
"""


np.random.seed(42)
torch.manual_seed(42)

device = 'cuda'

learning_rate = 0.001
weight_decay = 1e-6
batch_size = 2048
epochs = 10
hidden_layers = [32, 32]
embedding_size = 32

In [15]:
"""
Select which dataset to use.
The files in each dataset are stored differently mostly due to data size but also as its structure evolved over the years.
"""

dataset = 'ml-20m'

if dataset == 'ml-100k':
    data = pd.read_csv(f'../data/{dataset}/u.data', sep="\t", header=None)
    data.columns = ['user id', 'movie id', 'rating', 'timestamp']
elif dataset == 'ml-20m':
    data = pd.read_csv(f'../data/{dataset}/ratings.csv',on_bad_lines='skip')
else:
    print(f'{dataset} does nto exist or was incorrectly written')

In [4]:
display(data)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580
...,...,...,...,...
20000258,138493,68954,4.5,1258126920
20000259,138493,69526,4.5,1259865108
20000260,138493,69644,3.0,1260209457
20000261,138493,70286,5.0,1258126944


In [5]:
"""
The embedding layers expect indices to start at 0.
Given both user and movie IDs start at 1, we apply label encoder to fix it. 
Subtracting 1 would also solve the problem however that would be problem specific and not a general solution.
"""

user_enc = LabelEncoder()
item_enc = LabelEncoder()
data['user_id_enc'] = user_enc.fit_transform(data['userId'])
data['movie_id_enc'] = item_enc.fit_transform(data['movieId'])

In [6]:
class MLDataset(Dataset):
    """
    A PyTorch Dataset class for loading user-item interaction data for the NCF model.

    This class is designed to handle datasets containing user-item pairs and their associated ratings.
    It converts the input data into a format suitable for PyTorch's data loading utilities.

    Attributes:
    ----------
    user_item_pairs : np.ndarray
        A numpy array containing user-item pairs, where each pair consists of encoded user IDs and movie IDs.
    
    ratings : np.ndarray
        A numpy array containing the normalized ratings (scaled between 0 and 1) corresponding to each user-item pair.
    
    pair_max_ids : np.ndarray
        A numpy array containing the maximum user and movie IDs in the dataset, used for indexing purposes.

    Methods:
    -------
    __getitem__(index):
        Retrieves the user ID, movie ID, and corresponding rating for a specified index.
    
    __len__():
        Returns the total number of ratings in the dataset.

    Parameters:
    ----------
    data : pd.DataFrame
        A pandas DataFrame containing the dataset with at least the following columns 'user_id_enc', 'movie_id_enc', and 'rating'.
        - 'user_id_enc': Encoded user IDs.
        - 'movie_id_enc': Encoded movie IDs.
        - 'rating': Ratings given by users to the movies, expected to be in the range [0, 5].

    Example:
    --------
    >>> dataset = MLDataset(data)
    >>> user_id, movie_id, rating = dataset[0]
    >>> dataset_length = len(dataset)
    """
    
    def __init__(self, data):
        self.user_item_pairs = data[['user_id_enc', 'movie_id_enc']].to_numpy().astype(np.int32)
        self.ratings = (data[['rating']]/5.).to_numpy().astype(np.float32)
        self.pair_max_ids = np.max(self.user_item_pairs, axis=0)

    def __getitem__(self, index):
        return self.user_item_pairs[index][0], self.user_item_pairs[index][1], self.ratings[index]

    def __len__(self):
        return self.ratings.shape[0]

In [7]:
"""
Create the train, validation and test set partitions.
Pytorch's random_split works similarly to scikit train_test_split, but allows for 2 additional important things:
 - Can partition instanciated pytorch Datasets.
 - Can split 3 ways, instead of only 2.
"""


dataset = MLDataset(data)

train_length = int(len(dataset) * 0.7)
valid_length = int(len(dataset) * 0.2)
test_length = len(dataset) - train_length - valid_length

train_dataset, valid_dataset, test_dataset = random_split(dataset, (train_length, valid_length, test_length))

train_data_loader = DataLoader(train_dataset, batch_size=batch_size, num_workers=7)
valid_data_loader = DataLoader(valid_dataset, batch_size=batch_size, num_workers=7)
test_data_loader = DataLoader(test_dataset, batch_size=1, num_workers=1)

In [8]:
class NCF(nn.Module):
    """
    Neural Collaborative Filtering (NCF) model.

    This class implements a neural network model for collaborative filtering, combining user and item embeddings 
    with a multi-layer perceptron (MLP) and a Generalized Matrix Factorization (GMF) to predict user-item interactions.
    The model uses embeddings to capture the latent factors of users and items, and then applies both a nonlinear transformation
    through the MLP, and a linear transformation through the GMF.

    Attributes:
    ----------
    user_embedding : nn.Embedding
        An embedding layer for user IDs, mapping each user to a dense vector representation of specified size.
    
    item_embedding : nn.Embedding
        An embedding layer for item IDs, mapping each item to a dense vector representation of specified size.
    
    mlp : nn.ModuleList
        A list of sequential layers forming the multi-layer perceptron, which processes the concatenated embeddings 
        to learn non-linear interactions between users and items.
    
    output : nn.Linear
        A linear layer that produces the final prediction score for user-item interactions.

    Parameters:
    ----------
    num_users : int
        The total number of unique users in the dataset.
    
    num_items : int
        The total number of unique items in the dataset.
    
    embedding_size : int, optional
        The size of the user and item embeddings (default is 32).
    
    hidden_layers : list of int, optional
        A list specifying the number of neurons in each hidden layer of the MLP (default is [32, 32]).

    Methods:
    -------
    forward(user_input, item_input):
        Defines the forward pass of the model, computing the prediction for the given user and item inputs.

    Example:
    --------
    >>> model = NCF(num_users=1000, num_items=500, embedding_size=32, hidden_layers=[64, 32])
    >>> user_input = torch.tensor([0, 1, 2])
    >>> item_input = torch.tensor([5, 6, 7])
    >>> predictions = model(user_input, item_input)
    >>> print(predictions.shape)  # Output: torch.Size([3, 1])
    """
    def __init__(self, num_users, num_items, embedding_size=32, hidden_layers=[32,32]):
        super(NCF, self).__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_size)
        self.item_embedding = nn.Embedding(num_items, embedding_size)

        input_dim = 2 * embedding_size
        self.mlp = nn.ModuleList()
        for hidden_dim in hidden_layers:
            self.mlp.append(
                nn.Sequential(
                    nn.Linear(input_dim, hidden_dim),
                    nn.BatchNorm1d(hidden_dim),
                    nn.ReLU(),
                    nn.Dropout(p=0.2)
                )
            )
            input_dim = hidden_dim

        self.output = nn.Linear(2 * embedding_size, 1)
        
    def forward(self, user_input, item_input):
        user_embedded = self.user_embedding(user_input)
        item_embedded = self.item_embedding(item_input)
        gmf = user_embedded * item_embedded
        x = torch.cat([user_embedded, item_embedded], dim=-1)
        for mlp_layer in self.mlp:
            x = mlp_layer(x)
        x = torch.cat([gmf, x], dim=-1)
        prediction = self.output(x)
        return prediction

In [9]:
def train(model, optimizer, data_loader, criterion, device, log_interval=100):
    """
    Train the given model using the provided data loader and optimization parameters.

    This function performs one epoch of training for the specified model, iterating over the data loader to 
    retrieve user-item interactions and their corresponding ratings. It computes the loss using the specified 
    criterion, performs backpropagation, and updates the model parameters using the optimizer.

    Parameters:
    ----------
    model : nn.Module
        The neural network model to be trained.

    optimizer : torch.optim.Optimizer
        The optimizer used.

    data_loader : DataLoader
        A PyTorch DataLoader that provides batches of user-item interactions and ratings for training.

    criterion : callable
        A loss function.

    device : torch.device
        The device on which the model and data should be processed.

    log_interval : int, optional
        The number of steps after which to log the average loss.

    Returns:
    -------
    float
        The average training loss over the epoch.

    Example:
    --------
    >>> avg_loss = train(model, optimizer, train_loader, criterion, device)
    >>> print(f"Average Training Loss: {avg_loss:.4f}")
    """
    model.train()
    total_loss = 0
    av_loss = []
    train_pbar = tqdm(data_loader, smoothing=0, mininterval=1.0)
    for i, (user, item, ratings) in enumerate(train_pbar):
        user, item, ratings = user.to(device), item.to(device), ratings.to(device)
        y = model(user, item)
        loss = criterion(y, ratings.float())
        model.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        if (i + 1) % log_interval == 0:
            loss = total_loss / log_interval
            av_loss.append(loss)
            train_pbar.set_postfix(loss=loss)
            total_loss = 0
    return np.mean(av_loss) 

In [10]:
def test(model, data_loader, device):
    """
    Evaluate the performance of the given model.

    This function sets the model to evaluation mode and computes the mean squared error over the provided dataloader.

    Parameters:
    ----------
    model : nn.Module
        The neural network model to be evaluated.

    data_loader : DataLoader
        A PyTorch DataLoader.

    device : torch.device
        The device (CPU or GPU) on which the model and data should be processed.

    Returns:
    -------
    float
        The mean squared error (MSE) between the predicted ratings and the actual ratings, scaled by a factor of 5.

    Example:
    --------
    >>> mse = test(model, test_loader, device)
    >>> print(f"Mean Squared Error: {mse:.4f}")
    """
    model.eval()
    targets, predicts = list(), list()
    with torch.no_grad():
        for user, item, ratings in tqdm(data_loader, smoothing=0, mininterval=1.0):
            user, item, ratings = user.to(device), item.to(device), ratings.to(device)
            y = model(user,item)
            targets.extend(ratings.tolist())
            predicts.extend(y.tolist())
    return 5.* mean_squared_error(targets, predicts)

In [11]:
"""
Here we instantiate the model, as well as it's optimizer and loss function.
"""

model = NCF(dataset.pair_max_ids[0]+1, dataset.pair_max_ids[1]+1, embedding_size, hidden_layers).to(device)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

In [12]:
# Helper to calculate the number of training steps, usefull for smaller datasets like ml-100k

n_steps = math.ceil(train_length / batch_size)
n_steps

6837

In [13]:
"""
Here we train and evaluate the model.
"""

metric_values = []
loss_values = []
for epoch_i in range(epochs):
    loss = train(model, optimizer, train_data_loader, criterion, device, log_interval=100)
    loss_values.append((epoch_i, loss))
    metric_train = test(model, train_data_loader, device)
    metric_valid = test(model, valid_data_loader, device)
    print('epoch:', epoch_i + 1, 'train: mae:', metric_train)
    print('epoch:', epoch_i + 1, 'validation: mae:', metric_valid)
    
    
    metric_values.append((epoch_i, metric_train, metric_valid))

100%|██████████████████████████| 6837/6837 [00:37<00:00, 184.48it/s, loss=0.032]
100%|██████████████████████████████████████| 6837/6837 [00:36<00:00, 188.78it/s]
100%|██████████████████████████████████████| 1954/1954 [00:10<00:00, 189.44it/s]


epoch: 1 train: mae: 0.15454255308172904
epoch: 1 validation: mae: 0.1567457369901873


100%|█████████████████████████| 6837/6837 [00:35<00:00, 194.78it/s, loss=0.0299]
100%|██████████████████████████████████████| 6837/6837 [00:33<00:00, 205.62it/s]
100%|██████████████████████████████████████| 1954/1954 [00:10<00:00, 181.55it/s]


epoch: 2 train: mae: 0.14163317444487977
epoch: 2 validation: mae: 0.1472045720218037


100%|██████████████████████████| 6837/6837 [00:35<00:00, 194.78it/s, loss=0.028]
100%|██████████████████████████████████████| 6837/6837 [00:34<00:00, 199.81it/s]
100%|██████████████████████████████████████| 1954/1954 [00:08<00:00, 225.33it/s]


epoch: 3 train: mae: 0.13164767591907592
epoch: 3 validation: mae: 0.13986598320870733


100%|█████████████████████████| 6837/6837 [00:33<00:00, 202.52it/s, loss=0.0266]
100%|██████████████████████████████████████| 6837/6837 [00:34<00:00, 198.35it/s]
100%|██████████████████████████████████████| 1954/1954 [00:08<00:00, 242.47it/s]


epoch: 4 train: mae: 0.12261401002467788
epoch: 4 validation: mae: 0.13347205832157105


100%|█████████████████████████| 6837/6837 [00:34<00:00, 198.76it/s, loss=0.0258]
100%|██████████████████████████████████████| 6837/6837 [00:34<00:00, 198.02it/s]
100%|██████████████████████████████████████| 1954/1954 [00:08<00:00, 226.15it/s]


epoch: 5 train: mae: 0.117443961597783
epoch: 5 validation: mae: 0.13040092285196567


100%|█████████████████████████| 6837/6837 [00:34<00:00, 196.67it/s, loss=0.0255]
100%|██████████████████████████████████████| 6837/6837 [00:32<00:00, 212.93it/s]
100%|██████████████████████████████████████| 1954/1954 [00:09<00:00, 214.14it/s]


epoch: 6 train: mae: 0.11447731381725743
epoch: 6 validation: mae: 0.12885508561116038


100%|█████████████████████████| 6837/6837 [00:34<00:00, 197.23it/s, loss=0.0253]
100%|██████████████████████████████████████| 6837/6837 [00:33<00:00, 201.80it/s]
100%|██████████████████████████████████████| 1954/1954 [00:11<00:00, 176.75it/s]


epoch: 7 train: mae: 0.1132725117678936
epoch: 7 validation: mae: 0.12839608234650082


100%|█████████████████████████| 6837/6837 [00:34<00:00, 196.70it/s, loss=0.0252]
100%|██████████████████████████████████████| 6837/6837 [00:33<00:00, 201.71it/s]
100%|██████████████████████████████████████| 1954/1954 [00:08<00:00, 223.90it/s]


epoch: 8 train: mae: 0.11283024448968164
epoch: 8 validation: mae: 0.128229469639282


100%|█████████████████████████| 6837/6837 [00:33<00:00, 203.84it/s, loss=0.0252]
100%|██████████████████████████████████████| 6837/6837 [00:32<00:00, 209.17it/s]
100%|██████████████████████████████████████| 1954/1954 [00:13<00:00, 141.35it/s]


epoch: 9 train: mae: 0.11260593912765203
epoch: 9 validation: mae: 0.1281538413914149


100%|█████████████████████████| 6837/6837 [00:33<00:00, 201.89it/s, loss=0.0252]
100%|██████████████████████████████████████| 6837/6837 [00:34<00:00, 198.59it/s]
100%|██████████████████████████████████████| 1954/1954 [00:09<00:00, 208.66it/s]


epoch: 10 train: mae: 0.11240976340451683
epoch: 10 validation: mae: 0.12803944921589847
