In [1]:
import sys
sys.path.append("../")

from utils.models_all import GMF, MLP, NeuFM
from utils.dataset import RatingsDatasetAllData
from torch.utils.data import DataLoader

from tqdm.notebook import tqdm

import math

import os

import pandas as pd

import numpy as np

In [3]:
# Sizes handed to the dataset and to the model constructors
# (presumably the number of distinct user / movie ids — confirm against the CSVs).
users = 3974
movies = 3564

# Genre name -> contiguous integer index.
genres_map = {
    "Action": 0,
    "Adventure": 1,
    "Animation": 2,
    "Children's": 3,
    "Comedy": 4,
    "Crime": 5,
    "Documentary": 6,
    "Drama": 7,
    "Fantasy": 8,
    "Film-Noir": 9,
    "Horror": 10,
    "Musical": 11,
    "Mystery": 12,
    "Romance": 13,
    "Sci-Fi": 14,
    "Thriller": 15,
    "War": 16,
    "Western": 17,
}

# Gender code -> integer index.
sex_map = {
    "M": 0,
    "F": 1,
}

# Raw age code -> contiguous integer index
# (the keys look like MovieLens-1M age-bucket codes — TODO confirm).
age_map = {
    1: 0,
    18: 1,
    25: 2,
    35: 3,
    45: 4,
    50: 5,
    56: 6,
}

# Occupation codes 0..20 are already contiguous, so this is an identity map.
occupation_map = {
    o: o for o in range(21)
}


def _ratings_dataset(ratings_csv):
    """Build a RatingsDatasetAllData for `ratings_csv` using the shared metadata files and maps."""
    return RatingsDatasetAllData(
        users,
        movies,
        ratings_csv,
        "../train_data/movies_data.csv",
        "../train_data/users_data.csv",
        "user_id",
        "movie_id",
        "rating",
        "genres",
        genres_map,
        "gender",
        sex_map,
        "age",
        age_map,
        "occupation",
        occupation_map,
    )


train_dataloader = DataLoader(
    _ratings_dataset("../train_data/train.csv"),
    batch_size=256,
    num_workers=8,
    shuffle=True,
)
val_dataloader = DataLoader(
    _ratings_dataset("../train_data/validation.csv"),
    batch_size=1024,
    num_workers=4,
    # Validation is evaluation only: keep the batch order fixed so the
    # per-batch-averaged metric is comparable from epoch to epoch.
    shuffle=False,
)

In [4]:
import torch
import torch.nn as nn
import torch.optim as opt

In [5]:
def define_model(k_gmf, k_mlp, layer_sizes, n_genres=18, n_ages=7, n_occupations=21, alpha=0.5):
    """Build a NeuFM model out of a GMF branch and an MLP branch and move it to CUDA.

    Args:
        k_gmf: Forwarded to ``GMF`` as its last positional argument
            (presumably the GMF latent size — confirm in utils.models_all).
        k_mlp: Forwarded to ``MLP`` (presumably the MLP embedding size — confirm).
        layer_sizes: Forwarded to ``MLP`` as its last positional argument.
        n_genres: Forwarded to both branches; default 18.
        n_ages: Forwarded to both branches; default 7.
        n_occupations: Forwarded to both branches; default 21.
        alpha: Forwarded to ``NeuFM`` as the ``alpha`` keyword.

    Returns:
        The ``NeuFM`` instance after ``.cuda()``.

    Note:
        Reads the notebook-level globals ``users`` and ``movies``.
    """
    # Branches are built in the same order as before (GMF first, then MLP).
    gmf_branch = GMF(users, movies, n_genres, n_ages, n_occupations, k_gmf)
    mlp_branch = MLP(users, movies, n_genres, n_ages, n_occupations, k_mlp, layer_sizes)
    combined = NeuFM(gmf_branch, mlp_branch, alpha=alpha)
    return combined.cuda()

In [6]:
def train(model, lr, epochs, train_dataloader, val_dataloader=None, show_loss=True):
    """Train `model` with Adam on an MSE objective and keep the best-epoch weights.

    The reported loss for an epoch is the mean over batches of sqrt(batch MSE)
    (i.e. an average of per-batch RMSE values, not the RMSE over all samples).

    Args:
        model: Module called as ``model(users, items, genres, sexes, ages, occupations)``.
            Batches are moved to whatever device the model's parameters live on.
        lr: Learning rate for Adam.
        epochs: Number of passes over ``train_dataloader``.
        train_dataloader: Iterable with ``len()`` yielding 7-tuples
            ``(users, items, ratings, genres, sexes, ages, occupations)``.
        val_dataloader: Optional iterable of the same 7-tuples. When ``None``,
            the training loss is used to select the best epoch.
        show_loss: Print the losses after every epoch.

    Returns:
        ``(best_epoch, best_loss, model_state, avg_losses)`` where ``best_epoch`` is
        zero-based, ``model_state`` is an independent copy of the state dict taken at
        the best epoch (``None`` if no epoch ever improved on infinity), and
        ``avg_losses`` holds ``[train]`` or ``[train, val]`` per epoch.
    """
    import copy  # local import: only needed for the best-state snapshot below

    # Follow the model's device instead of hard-coding CUDA; for a model built
    # with `.cuda()` this is exactly the previous behaviour.
    device = next(model.parameters()).device

    criterion = nn.MSELoss().to(device)
    optimizer = opt.Adam(model.parameters(), lr=lr)

    avg_losses = []

    model_state = None
    best_epoch = 0
    prev_val_loss = math.inf
    for epoch in tqdm(range(epochs)):
        n_batches = len(train_dataloader)
        avg_loss = 0
        val_loss = 0

        # Train step
        model.train()  # validation below switches to eval mode; switch back every epoch
        for vus, vis, rs, gs, sxs, ags, ocs in train_dataloader:
            vus = vus.to(device)
            vis = vis.to(device)
            rs = rs.to(device)
            gs = gs.to(device)
            sxs = sxs.to(device)
            ags = ags.to(device)
            ocs = ocs.to(device)

            optimizer.zero_grad()
            y_hat = model(vus, vis, gs, sxs, ags, ocs)

            loss = criterion(y_hat, rs)
            loss.backward()
            optimizer.step()

            avg_loss += math.sqrt(loss.item())

        avg_loss /= n_batches

        # Val step
        if val_dataloader is not None:
            model.eval()  # disable dropout / use running batch-norm stats, if the model has any
            with torch.no_grad():
                for val_vus, val_vis, val_rs, val_gs, val_sxs, val_ags, val_ocs in val_dataloader:
                    val_vus = val_vus.to(device)
                    val_vis = val_vis.to(device)
                    val_rs = val_rs.to(device)
                    val_gs = val_gs.to(device)
                    val_sxs = val_sxs.to(device)
                    val_ags = val_ags.to(device)
                    val_ocs = val_ocs.to(device)

                    val_pred = model(val_vus, val_vis, val_gs, val_sxs, val_ags, val_ocs)
                    val_loss += math.sqrt(criterion(val_pred, val_rs).item())

            val_loss /= len(val_dataloader)
            if show_loss:
                print(f"epoch: {epoch+1}, train_loss: {avg_loss}, val_loss: {val_loss}")

            avg_losses.append([avg_loss, val_loss])
        else:
            # No validation data: select the best epoch by training loss.
            val_loss = avg_loss
            if show_loss:
                print(f"epoch: {epoch+1}, train_loss: {avg_loss}")

            avg_losses.append([avg_loss])

        if val_loss < prev_val_loss:
            prev_val_loss = val_loss
            # state_dict() returns references to the live parameter tensors, which
            # later optimizer steps overwrite in place. Deep-copy so the snapshot
            # really holds the weights of the best epoch.
            model_state = copy.deepcopy(model.state_dict())
            best_epoch = epoch

    return best_epoch, prev_val_loss, model_state, avg_losses

In [7]:
def fit(
    train_dataloader, val_dataloader,
    k_gmf, k_mlp, layer_sizes, alpha=0.5, lr=0.0005, epochs=40, 
    weight_path="/home/", run_number=1, random_state=None, show_loss=True
):
    """Build a fresh model, train it, and save the selected weights to disk.

    Args:
        train_dataloader: Training batches, forwarded to ``train``.
        val_dataloader: Validation batches or ``None``, forwarded to ``train``.
        k_gmf, k_mlp, layer_sizes, alpha: Forwarded to ``define_model``.
        lr, epochs, show_loss: Forwarded to ``train``.
        weight_path: Base directory; weights go into ``{weight_path}/run_{run_number}``.
        run_number: Identifier used in the run directory name.
        random_state: If not ``None``, seeds torch's RNG before the model is built.

    Returns:
        The per-epoch losses reported by ``train`` as a NumPy array.

    Side effects:
        Creates the run directory if needed and writes one ``.pt`` file named
        ``{best_epoch+1}-{loss}.pt`` (prefixed with ``final-`` when there is no
        validation loader).
    """
    if random_state is not None:
        torch.manual_seed(random_state)

    network = define_model(k_gmf, k_mlp, layer_sizes, alpha=alpha)
    best_epoch, val_loss, model_state, losses = train(
        network, lr, epochs, train_dataloader, val_dataloader, show_loss
    )

    run_path = f"{weight_path}/run_{run_number}"
    os.makedirs(run_path, exist_ok=True)

    # Runs without validation data are marked as "final" in the file name.
    checkpoint_file = (
        f"{run_path}/final-{best_epoch+1}-{val_loss}.pt"
        if val_dataloader is None
        else f"{run_path}/{best_epoch+1}-{val_loss}.pt"
    )
    torch.save(model_state, checkpoint_file)

    return np.asarray(losses)

In [8]:
def predict(
    weight_path, k_gmf, k_mlp, layer_sizes, alpha, out_path, out_name,
    test_ids_path="/home/nubol23/Desktop/Codes/USP/SCC5966/kaggle/data/test_data.csv",
):
    """Load saved weights, predict ratings for the test set, and write a submission CSV.

    Args:
        weight_path: Path of the ``.pt`` state dict to load.
        k_gmf, k_mlp, layer_sizes, alpha: Forwarded to ``define_model``; they must
            match the configuration the weights were trained with.
        out_path: Directory the CSV is written to.
        out_name: File name of the CSV, without the ``.csv`` extension.
        test_ids_path: CSV whose ``id`` column labels the predictions. Defaults to
            the path that used to be hard-coded here.

    Side effects:
        Writes ``{out_path}/{out_name}.csv`` with the columns ``id`` and ``rating``.
        Predictions are clipped to the range [1, 5].
    """
    trained_model = define_model(k_gmf, k_mlp, layer_sizes, alpha=alpha)

    trained_model.load_state_dict(torch.load(weight_path))
    trained_model.cuda().eval()

    # No shuffle: predictions must stay aligned with the row order of the ids file.
    test_dataloader = DataLoader(
        RatingsDatasetAllData(
            users,
            movies,
            "../train_data/test.csv",
            "../train_data/movies_data.csv",
            "../train_data/users_data.csv",
            "user_id",
            "movie_id",
            None,
            "genres",
            genres_map,
            "gender",
            sex_map,
            "age",
            age_map,
            "occupation",
            occupation_map,
        ), 
        batch_size=1024,
        num_workers=12
    )

    test_predictions = []

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for vus, vis, gs, sxs, ags, ocs in test_dataloader:
            vus = vus.cuda()
            vis = vis.cuda()
            gs = gs.cuda()
            sxs = sxs.cuda()
            ags = ags.cuda()
            ocs = ocs.cuda()

            pred = torch.clip(trained_model(vus, vis, gs, sxs, ags, ocs), 1, 5)
            test_predictions.extend(pred.cpu().ravel().tolist())

    test_csv = pd.read_csv(test_ids_path)

    out_df = pd.DataFrame.from_dict(
        {
            "id": list(test_csv["id"]),
            "rating": test_predictions
        }
    )

    # The former trailing `out_df.head()` was a no-op inside a function (its value
    # was discarded), so it has been dropped; read the CSV back to inspect it.
    out_df.to_csv(f"{out_path}/{out_name}.csv", index=False)

## Hyperparameter search (NeuFM: GMF + MLP)

In [8]:
losses_fit = []

# One tuple per run: (k_gmf, k_mlp, layer_sizes, lr, epochs).
search_configs = [
    (8, 16, [16, 8], 0.0001, 75),
    (8, 16, [16, 8], 0.0006, 50),
    (8, 16, [16, 8], 0.0008, 50),
    (16, 16, [16, 8], 0.001, 50),
    (16, 32, [32, 16, 8], 0.001, 50),
    (32, 16, [16, 8], 0.0008, 30),
    (32, 16, [16, 8], 0.005, 50), #*
    (32, 16, [16, 8], 0.01, 50),
    (32, 32, [32, 16, 8], 0.01, 50),
    (32, 32, [32, 16, 8], 0.01, 100),
]

# Wrap the list (not the enumerate object) so tqdm knows the total and can show
# real progress instead of "0it".
for run_number, (k_gmf, k_mlp, layers, lr, epochs) in enumerate(tqdm(search_configs)):
    print(f"{k_gmf=} {k_mlp=} {layers=} {lr=} {epochs=}")
    losses_fit.append(
        fit(
            train_dataloader, val_dataloader,
            k_gmf,
            k_mlp,
            layers,
            0.5,
            lr,
            epochs,
            weight_path="/home/nubol23/Documents/NCF_all_data_2",
            run_number=run_number,
            random_state=2,
            show_loss=False,
        )
    )

0it [00:00, ?it/s]

k_gmf=8 k_mlp=16 layers=[16, 8] lr=0.0001 epochs=75


  0%|          | 0/75 [00:00<?, ?it/s]

k_gmf=8 k_mlp=16 layers=[16, 8] lr=0.0006 epochs=50


  0%|          | 0/50 [00:00<?, ?it/s]

k_gmf=8 k_mlp=16 layers=[16, 8] lr=0.0008 epochs=50


  0%|          | 0/50 [00:00<?, ?it/s]

k_gmf=16 k_mlp=16 layers=[16, 8] lr=0.001 epochs=50


  0%|          | 0/50 [00:00<?, ?it/s]

k_gmf=16 k_mlp=32 layers=[32, 16, 8] lr=0.001 epochs=50


  0%|          | 0/50 [00:00<?, ?it/s]

k_gmf=32 k_mlp=16 layers=[16, 8] lr=0.0008 epochs=30


  0%|          | 0/30 [00:00<?, ?it/s]

k_gmf=32 k_mlp=16 layers=[16, 8] lr=0.005 epochs=50


  0%|          | 0/50 [00:00<?, ?it/s]

k_gmf=32 k_mlp=16 layers=[16, 8] lr=0.01 epochs=50


  0%|          | 0/50 [00:00<?, ?it/s]

k_gmf=32 k_mlp=32 layers=[32, 16, 8] lr=0.01 epochs=50


  0%|          | 0/50 [00:00<?, ?it/s]

k_gmf=32 k_mlp=32 layers=[32, 16, 8] lr=0.01 epochs=100


  0%|          | 0/100 [00:00<?, ?it/s]

## Total fit

In [9]:
# Dataset built from total.csv, with the same metadata files and category maps
# as the train/validation loaders.
total_dataset = RatingsDatasetAllData(
    users,
    movies,
    "../train_data/total.csv",
    "../train_data/movies_data.csv",
    "../train_data/users_data.csv",
    "user_id",
    "movie_id",
    "rating",
    "genres",
    genres_map,
    "gender",
    sex_map,
    "age",
    age_map,
    "occupation",
    occupation_map,
)

total_dataloader = DataLoader(
    total_dataset,
    batch_size=1024,
    num_workers=12,
    shuffle=True,
)

In [10]:
# Configuration for the final run on the full data. These names are reused by
# the prediction cell further down.
k_gmf = 32
k_mlp = 16
layers = [16, 8]
lr = 0.001
epochs = 14

# No validation loader: fit() selects the epoch by training loss and saves the
# weights with a "final-" prefix.
fit(
    total_dataloader,
    None,
    k_gmf,
    k_mlp,
    layers,
    alpha=0.5,
    lr=lr,
    epochs=epochs,
    weight_path="/home/nubol23/Documents/NCF_all_data_1",
    run_number=1,
    random_state=2,
    show_loss=True,
)

  0%|          | 0/14 [00:00<?, ?it/s]

epoch: 1, train_loss: 1.4154207664015694
epoch: 2, train_loss: 0.9083913680289559
epoch: 3, train_loss: 0.9011854833715894
epoch: 4, train_loss: 0.897998414668992
epoch: 5, train_loss: 0.8935672515907395
epoch: 6, train_loss: 0.8871945773943901
epoch: 7, train_loss: 0.8788507777301758
epoch: 8, train_loss: 0.8695382638882881
epoch: 9, train_loss: 0.8601135758327658
epoch: 10, train_loss: 0.8503731537141881
epoch: 11, train_loss: 0.8411606731953286
epoch: 12, train_loss: 0.8317585425890213
epoch: 13, train_loss: 0.8226145927534729
epoch: 14, train_loss: 0.8138608906741234


array([[1.41542077],
       [0.90839137],
       [0.90118548],
       [0.89799841],
       [0.89356725],
       [0.88719458],
       [0.87885078],
       [0.86953826],
       [0.86011358],
       [0.85037315],
       [0.84116067],
       [0.83175854],
       [0.82261459],
       [0.81386089]])

In [11]:
out_path = "/home/nubol23/Desktop/Codes/USP/SCC5966/kaggle/notebooks/Project-NeuMF/NotebooksContent/outputs"
out_name = "neumf_extra_4"

# Checkpoint written by the final fit above; the file name embeds the epoch and
# the loss, so it has to be updated whenever that run changes.
final_weights = "/home/nubol23/Documents/NCF_all_data_1/run_1/final-14-0.8138608906741234.pt"

predict(
    weight_path=final_weights,
    k_gmf=k_gmf,
    k_mlp=k_mlp,
    layer_sizes=layers,
    alpha=0.5,
    out_path=out_path,
    out_name=out_name,
)

In [12]:
# Read the submission file back to inspect what was written.
pd.read_csv(os.path.join(out_path, f"{out_name}.csv"))

Unnamed: 0,id,rating
0,0,3.154346
1,1,3.406947
2,2,2.884623
3,3,3.544275
4,4,3.179825
...,...,...
3965,3965,2.132975
3966,3966,4.596799
3967,3967,5.000000
3968,3968,4.621464
