# Projet IA Frameworks 2023 - Partie 3
@nestorhabibi @julien-blanchon @XuanMinhVuongNGUYEN

## Partie 0 : Librairies, Données

In [21]:
# deep learning
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import torchvision.transforms as transforms

# data
import pandas as pd
import numpy as np

# random
import random

# os
import os

# typing
from typing_extensions import override

In [22]:
def seed_everything(seed_value: int):
    random.seed(seed_value) # Python
    np.random.seed(seed_value) # Numpy
    torch.manual_seed(seed_value) # PyTorch
    
    if torch.cuda.is_available(): 
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

seed_everything(42)

In [23]:
# chargement des données
interactions_test = pd.read_csv('../data/interactions_test.csv')
interactions_train = pd.read_csv('../data/interactions_train.csv')
RAW_interactions = pd.read_csv('../data/RAW_interactions.csv')
RAW_recipes = pd.read_csv('../data/RAW_recipes.csv')
test_script = pd.read_csv('../data/test_script.csv')

In [24]:
# get user id and ratings
df_interactions_train = interactions_train[['u', 'i', 'rating']].to_numpy(dtype=np.int64)

# split train into train and validation
# df_interactions_train, df_interactions_val = torch.utils.data.random_split(df_interactions_train, [0.80, 0.20])
df_interactions_train, df_interactions_val = torch.utils.data.random_split(df_interactions_train, [0.99, 0.01])

In [25]:
# No overlap :(, the test set is broke
# # Get the interactions_test with the u and i present in interactions_train
# interactions_test_split = interactions_test[interactions_test["u"].isin(interactions_train["u"].values) & interactions_test["i"].isin(interactions_train["i"].values)]

# df_interactions_test = interactions_test_split[['u', 'i', 'rating']].to_numpy(dtype=np.int64)

In [26]:
# create dataloaders
trainloader = torch.utils.data.DataLoader(
    df_interactions_train, 
    batch_size=512*4, 
    shuffle=True, 
    num_workers=2
)

valloader = torch.utils.data.DataLoader(
    df_interactions_val, 
    batch_size=512*4, 
    num_workers=2
)

# testloader = torch.utils.data.DataLoader(
#     df_interactions_test, 
#     batch_size=64*4, 
#     num_workers=2
# )

## Partie 1 : Import de la classe NCF

In [30]:
class NCF(nn.Module):
    """Neural Collaborative Filtering (NCF)

    Reference: 
    ----------
    @article{he2017neural,
        title     = {Neural Collaborative Filtering},
        author    = {Xiangnan He and Lizi Liao and Hanwang Zhang and Liqiang Nie and Xia Hu and Tat-Seng Chua},
        journal   = {The Web Conference},
        year      = {2017},
        doi       = {10.1145/3038912.3052569},
        bibSource = {Semantic Scholar https://www.semanticscholar.org/paper/ad42c33c299ef1c53dfd4697e3f7f98ed0ca31dd}
    }
    """
    def __init__(self, n_users: int, n_items: int, n_factors: int = 8, dropout: float = 0.20) -> None:
        """Neural Collaborative Filtering (NCF)

        Parameters
        ----------
        n_users : int
            Number of users for the embeddings layer.
        n_items : int
            Number of items for the embeddings layer.
        n_factors : int, optional
            Embeddings layer size, by default 8
        dropout : float, optional
            Dropout rate, by default 0.20
        """
        super().__init__()
        # Embedding layers
        self.user_embeddings = torch.nn.Embedding(n_users, n_factors)
        self.item_embeddings = torch.nn.Embedding(n_items, n_factors)

        # MLP layers
        self.predictor = torch.nn.Sequential(
            nn.Linear(in_features=n_factors*2 , out_features=64),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(in_features=64, out_features=32),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(in_features=32, out_features=1),
            nn.Sigmoid()
        )

    @override
    def forward(self, user: torch.tensor, item: torch.tensor) -> torch.Tensor:
        """Forward pass

        Parameters
        ----------
        user : torch.tensor
            User ids
        item : torch.tensor
            Item ids

        Returns
        -------
        torch.Tensor
            Predictions
        """ 
        # Pass through embedding layers
        user_emb = self.user_embeddings(user)
        item_emb = self.item_embeddings(item)

        # Concat the two embeddings
        z = torch.cat([user_emb, item_emb], dim=-1)

        # Pass through MLP
        y = self.predictor(z)
        return y

## Partie 2 : Entraînement de NCF sur les données

In [28]:
# Set device
if ((int(torch.__version__.split(".")[0]) >= 2) or (int(torch.__version__.split(".")[1]) >= 13)) and torch.has_mps:
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(f"Device set to: {device}")

Device set to: cuda


In [29]:
from tqdm import tqdm
import torch
import torch.nn as nn

def train(
        model: NCF, 
        optimizer: torch.optim.Optimizer, 
        trainloader: torch.utils.data.DataLoader, 
        valloader: torch.utils.data.DataLoader,
        epochs: int = 30
    ) -> None:
    """Train the model

    Parameters
    ----------
    model : NCF
        The NCF model to train
    optimizer : torch.optim.Optimizer
        The optimizer to use
    trainloader : torch.utils.data.DataLoader
        The train dataloader
    valloader : torch.utils.data.DataLoader
        The validation dataloader
    epochs : int, optional
        Number of epochs to train, by default 30
    """ 
    criterion_train = nn.MSELoss().to(device)
    criterion_val = nn.L1Loss(reduction='mean').to(device)

    for epoch in range(epochs):
        model.train()
        # initialize metrics
        train_loss = []

        for data in (pbar := tqdm(trainloader, unit=" batch", desc=f"Train {epoch:03}")):
            data = data.to(device)
            # get the data
            users = data[:, 0]
            items = data[:, 1]
            ratings = data[:, 2]
            # normalize the ratings
            ratings = (ratings / 5)

            # zero the parameter gradients
            optimizer.zero_grad()
            
            # forward pass
            y_hat = model(users, items)

            # compute loss
            loss = criterion_train(y_hat.flatten(), ratings)

            # backward pass + optimize
            loss.backward()
            optimizer.step()

            # update metrics
            train_loss.append(loss.item())

            # update progress bar
            pbar.set_postfix_str(f"MSE train {5*loss.item():.3f}")

        # Evaluate the model on the val set
        
        model.eval() 
        valid_loss = []
        for data in (pbar := tqdm(valloader, unit=" batch", desc=f"Valid {epoch:03}")):
            # get the data
            users = data[:, 0].to(torch.int).to(device)
            items = data[:, 1].to(torch.int).to(device)
            ratings = data[:, 2].to(torch.int).to(device)

            # normalize the ratings
            ratings = (ratings / 5)
            with torch.no_grad():
                y_hat = model(users, items)
                # compute loss
                loss = criterion_val(y_hat.flatten(), ratings)
                valid_loss.append(loss.item())
            
            # update pbar
            pbar.set_postfix_str(f"MAE valid {5*loss.item():.3f}")
            
    print("Final validation MAE:", np.mean(valid_loss)*5)


In [14]:
# get number of unique user ids and ratings
n_user = interactions_train['u'].max()+2
n_items = interactions_train['i'].max()+2

# define model
model = NCF(n_user, n_items, n_factors=16).to(device)

# define optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [15]:
# train the model
train(
    model=model, 
    optimizer=optimizer, 
    trainloader=trainloader, 
    valloader=valloader,
    epochs=5
)

Train 000: 100%|██████████| 338/338 [00:01<00:00, 170.19 batch/s, MSE train 0.167]
Valid 000: 100%|██████████| 4/4 [00:00<00:00, 21.80 batch/s, MAE valid 0.553]
Train 001: 100%|██████████| 338/338 [00:01<00:00, 246.93 batch/s, MSE train 0.183]
Valid 001: 100%|██████████| 4/4 [00:00<00:00, 20.62 batch/s, MAE valid 0.554]
Train 002: 100%|██████████| 338/338 [00:01<00:00, 241.03 batch/s, MSE train 0.176]
Valid 002: 100%|██████████| 4/4 [00:00<00:00, 21.06 batch/s, MAE valid 0.553]
Train 003: 100%|██████████| 338/338 [00:01<00:00, 252.44 batch/s, MSE train 0.171]
Valid 003: 100%|██████████| 4/4 [00:00<00:00, 20.49 batch/s, MAE valid 0.545]
Train 004: 100%|██████████| 338/338 [00:01<00:00, 246.21 batch/s, MSE train 0.160]
Valid 004: 100%|██████████| 4/4 [00:00<00:00, 21.01 batch/s, MAE valid 0.534]

Final validation MAE: 0.5583174712955952





In [92]:
# def test(
#         model: NCF, 
#         testloader: DataLoader
#     ):
#     model.eval()
#     running_mae = 0
#     with torch.no_grad():
#         corrects = 0
#         total = 0
#         for data in (pbar := tqdm(testloader, total=len(testloader), unit=" batch", desc=f"Test")):
#             # get the data
#             users = data[:, 0].to(torch.int).to(device)
#             items = data[:, 1].to(torch.int).to(device)
#             r = data[:, 2].to(torch.int).to(device)

#             r = (r / 5)
#             y_hat = model(users, items).flatten()
#             error = torch.abs(y_hat - r).sum().data
            
#             running_mae += error
#             total += r.size(0)
    
#     mae = running_mae/total
#     return (mae * 5).item()

In [93]:
# test(model, testloader)

In [16]:
# save weights of the model
torch.save(model.state_dict(), './weights.pth')

In [17]:
# Load the model
model = NCF(n_users=25077, n_items=178264, n_factors=16).to(device)
model.load_state_dict(torch.load('./weights.pth'))

<All keys matched successfully>

In [18]:
X_u = torch.from_numpy(interactions_train['u'].to_numpy()).to(device)
X_i = torch.from_numpy(interactions_train['i'].to_numpy()).to(device)
y = torch.from_numpy(interactions_train['rating'].to_numpy()).to(device)

In [19]:
y_predict = model(X_u, X_i)*5

In [20]:
# Compute the MAE between the predicted ratings and the true ratings on the 15k first samples
mae = torch.abs(y_predict[:15000] - y[:15000]).mean()
mae

tensor(0.6760, device='cuda:0', dtype=torch.float64, grad_fn=<MeanBackward0>)