In [1]:
import sys
sys.path.append("../")

from utils.models_genre import GMF, MLP, NeuFM
from utils.dataset import RatingsDatasetGenre
from torch.utils.data import DataLoader

from tqdm.notebook import tqdm

import math

import os

import pandas as pd

import numpy as np

In [2]:
# Sizes handed to the dataset and to the GMF / MLP constructors
# (presumably the number of distinct user and movie ids -- confirm against the data).
users = 3974
movies = 3564

# Genre name -> integer index, numbered 0..17 in the order listed.
genres_map = {
    genre: index
    for index, genre in enumerate(
        [
            "Action",
            "Adventure",
            "Animation",
            "Children's",
            "Comedy",
            "Crime",
            "Documentary",
            "Drama",
            "Fantasy",
            "Film-Noir",
            "Horror",
            "Musical",
            "Mystery",
            "Romance",
            "Sci-Fi",
            "Thriller",
            "War",
            "Western",
        ]
    )
}


def _genre_loader(ratings_csv, batch_size, num_workers):
    """Wrap a ratings CSV in a shuffled DataLoader over RatingsDatasetGenre.

    Every loader shares the same movies file, column names and genre mapping;
    only the ratings file, batch size and worker count vary.
    """
    dataset = RatingsDatasetGenre(
        users,
        movies,
        ratings_csv,
        "../train_data/movies_data.csv",
        "user_id",
        "movie_id",
        "rating",
        "genres",
        genres_map,
    )
    return DataLoader(
        dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=True,
    )


train_dataloader = _genre_loader("../train_data/train.csv", batch_size=256, num_workers=8)
val_dataloader = _genre_loader("../train_data/validation.csv", batch_size=1024, num_workers=4)

In [3]:
import torch
import torch.nn as nn
import torch.optim as opt

In [4]:
def define_model(k_gmf, k_mlp, layer_sizes, n_genres=18, alpha=0.5):
    """Assemble a NeuFM model from a new GMF branch and a new MLP branch, on CUDA.

    Both branches are sized with the module-level ``users`` and ``movies``.

    Args:
        k_gmf: forwarded to ``GMF`` as its last positional argument
            (presumably the embedding size -- confirm in utils.models_genre).
        k_mlp: forwarded to ``MLP`` ahead of ``layer_sizes``
            (presumably the embedding size -- confirm in utils.models_genre).
        layer_sizes: forwarded to ``MLP`` as its last positional argument.
        n_genres: forwarded to both branches; defaults to 18, the number of
            entries in ``genres_map``.
        alpha: forwarded to ``NeuFM`` as the ``alpha`` keyword.

    Returns:
        The ``NeuFM`` instance after ``.cuda()``.
    """
    # Build GMF before MLP, matching the original argument evaluation order.
    gmf_branch = GMF(users, movies, n_genres, k_gmf)
    mlp_branch = MLP(users, movies, n_genres, k_mlp, layer_sizes)
    combined = NeuFM(gmf_branch, mlp_branch, alpha=alpha)
    return combined.cuda()

In [5]:
def train(model, lr, epochs, train_dataloader, val_dataloader=None, show_loss=True):
    """Optimise ``model`` with Adam on an MSE objective and keep the best weights.

    Every batch must unpack as ``(users, items, ratings, genres)``; the model is
    called as ``model(users, items, genres)``. Batches are moved to the device
    the model's parameters live on, so the caller decides CPU vs. GPU by where
    it places the model.

    Args:
        model: ``nn.Module`` to train in place.
        lr: Adam learning rate.
        epochs: number of passes over ``train_dataloader``.
        train_dataloader: sized iterable of training batches.
        val_dataloader: optional sized iterable of validation batches. When it
            is ``None`` the training loss is used to select the best epoch.
        show_loss: print the losses after every epoch when true.

    Returns:
        ``(best_epoch, best_loss, model_state, avg_losses)``:
        ``best_epoch`` is zero-based; ``best_loss`` is the lowest selection loss
        seen (``inf`` if none improved); ``model_state`` is an independent copy
        of the ``state_dict`` taken at ``best_epoch`` (``None`` if no epoch
        improved on ``inf``); ``avg_losses`` holds ``[train]`` or
        ``[train, val]`` for each epoch.

    Raises:
        ZeroDivisionError: if a dataloader reports a length of zero.

    Note:
        The reported losses are the mean of per-batch RMSE values, which is not
        the same number as the RMSE over the whole dataset.
    """
    import copy

    criterion = nn.MSELoss()
    optimizer = opt.Adam(model.parameters(), lr=lr)
    # Follow the model's device instead of hard-coding CUDA.
    device = next(model.parameters()).device

    n_batches = len(train_dataloader)
    avg_losses = []

    model_state = None
    best_epoch = 0
    prev_val_loss = math.inf
    for epoch in tqdm(range(epochs)):
        avg_loss = 0
        val_loss = 0

        # Train step
        model.train()
        for vus, vis, rs, gs in train_dataloader:
            vus = vus.to(device)
            vis = vis.to(device)
            rs = rs.to(device)
            gs = gs.to(device)

            optimizer.zero_grad()
            y_hat = model(vus, vis, gs)

            loss = criterion(y_hat, rs)
            loss.backward()
            optimizer.step()

            avg_loss += math.sqrt(loss.item())

        avg_loss /= n_batches

        # Val step
        if val_dataloader is not None:
            # Evaluation mode, so layers such as dropout behave deterministically.
            model.eval()
            with torch.no_grad():
                for val_vus, val_vis, val_rs, val_gs in val_dataloader:
                    val_vus = val_vus.to(device)
                    val_vis = val_vis.to(device)
                    val_rs = val_rs.to(device)
                    val_gs = val_gs.to(device)

                    val_pred = model(val_vus, val_vis, val_gs)
                    val_loss += math.sqrt(criterion(val_pred, val_rs).item())

            val_loss /= len(val_dataloader)
            if show_loss:
                print(f"epoch: {epoch+1}, train_loss: {avg_loss}, val_loss: {val_loss}")

            avg_losses.append([avg_loss, val_loss])
        else:
            val_loss = avg_loss
            if show_loss:
                print(f"epoch: {epoch+1}, train_loss: {avg_loss}")

            avg_losses.append([avg_loss])

        if val_loss < prev_val_loss:
            prev_val_loss = val_loss
            # state_dict() returns references to the live tensors; without a deep
            # copy the "best" snapshot would be overwritten by later epochs.
            model_state = copy.deepcopy(model.state_dict())
            best_epoch = epoch

    return best_epoch, prev_val_loss, model_state, avg_losses

In [6]:
def fit(
    train_dataloader,
    val_dataloader,
    k_gmf,
    k_mlp,
    layer_sizes,
    alpha=0.5,
    lr=0.0005,
    epochs=40,
    weight_path="/home/",
    run_number=1,
    random_state=None,
    show_loss=True,
):
    """Build a model, train it, and save the selected weights for one run.

    Args:
        train_dataloader: training batches, forwarded to ``train``.
        val_dataloader: validation batches or ``None``, forwarded to ``train``.
        k_gmf, k_mlp, layer_sizes, alpha: forwarded to ``define_model``.
        lr, epochs, show_loss: forwarded to ``train``.
        weight_path: directory under which ``run_{run_number}`` is created.
        run_number: suffix of the per-run output directory.
        random_state: when not ``None``, seeds torch before the model is built.

    Returns:
        The per-epoch losses reported by ``train`` as a NumPy array.

    Side effects:
        Writes ``{best_epoch+1}-{loss}.pt`` into the run directory; the file
        name gets a ``final-`` prefix when no validation loader was given.
    """
    if random_state is not None:
        torch.manual_seed(random_state)

    network = define_model(k_gmf, k_mlp, layer_sizes, alpha=alpha)
    best_epoch, val_loss, model_state, losses = train(
        network, lr, epochs, train_dataloader, val_dataloader, show_loss
    )

    run_path = f"{weight_path}/run_{run_number}"
    os.makedirs(run_path, exist_ok=True)

    # Runs without a validation set are tagged "final-" in the file name.
    checkpoint_file = (
        f"{run_path}/final-{best_epoch+1}-{val_loss}.pt"
        if val_dataloader is None
        else f"{run_path}/{best_epoch+1}-{val_loss}.pt"
    )
    torch.save(model_state, checkpoint_file)

    return np.asarray(losses)

In [7]:
def predict(weight_path, k_gmf, k_mlp, layer_sizes, alpha, out_path, out_name):
    """Load saved weights, score the test set and write the predictions to CSV.

    Builds a NeuFM model, loads the state dict stored at ``weight_path``,
    iterates over ``train_data/test.csv`` in batches, clips each prediction to
    the range [1, 5] and writes an ``id`` / ``rating`` table to
    ``{out_path}/{out_name}.csv``.

    NOTE(review): this function looks stale relative to the genre-aware code in
    the rest of the notebook and probably cannot run as written -- see the
    inline notes below before relying on it.

    Args:
        weight_path: file passed to ``torch.load``.
        k_gmf: forwarded to ``GMF``.
        k_mlp: forwarded to ``MLP``.
        layer_sizes: forwarded to ``MLP``.
        alpha: forwarded positionally to ``NeuFM``.
        out_path: directory of the output CSV (must already exist; it is not
            created here).
        out_name: output file name without the ``.csv`` extension.

    Returns:
        None.
    """
    # NOTE(review): define_model passes n_genres as the third argument to both
    # GMF and MLP; these calls omit it -- confirm against utils.models_genre.
    trained_model = NeuFM(
        GMF(users, movies, k_gmf),
        MLP(users, movies, k_mlp, layer_sizes),
        alpha,
    ).cuda()
    
    trained_model.load_state_dict(torch.load(weight_path))
    # The model is already on CUDA from the constructor call above; eval()
    # switches it to inference mode.
    trained_model.cuda().eval()
    
    # NOTE(review): RatingsDataset is not imported in the visible cells (only
    # RatingsDatasetGenre is). The path is also "train_data/..." here, whereas
    # the training cells use "../train_data/..." -- confirm which is intended.
    test_dataloader = DataLoader(
        RatingsDataset(
            "train_data/test.csv",
            "user_id",
            "movie_id",
        ), 
        batch_size=1024,
        num_workers=12
    )
    
    test_predictions = []

    # NOTE(review): train() calls the model with three inputs (users, items,
    # genres); only two are unpacked and passed here. The loop is also not
    # wrapped in torch.no_grad().
    for vus, vis in test_dataloader:
        vus = vus.cuda()
        vis = vis.cuda()

        # Clip to [1, 5], then flatten the batch into a plain Python list.
        pred = torch.clip(trained_model(vus, vis), 1, 5).cpu().ravel().tolist()
        test_predictions += pred
        
    # Ids come from a separate CSV; this assumes its rows are in the same order
    # as the rows served by test_dataloader -- TODO confirm.
    test_csv = pd.read_csv("../../data/test_data.csv")
    
    out_df = pd.DataFrame.from_dict(
        {
            "id": list(test_csv["id"]),
            "rating": test_predictions
        }
    )

    out_df.to_csv(f"{out_path}/{out_name}.csv", index=False)
    # NOTE(review): the result of head() is discarded -- inside a function it is
    # neither displayed nor returned.
    out_df.head()

## Train NeuFM (GMF + MLP with genre features)

In [9]:
# One entry per run: (k_gmf, k_mlp, MLP layer sizes, learning rate, epochs).
# Commented-out rows are disabled configurations.
run_configs = [
#     (8, 16, [16, 8], 0.0001, 75),
#     (8, 16, [16, 8], 0.0006, 50),
#     (8, 16, [16, 8], 0.0008, 50),
#     (16, 16, [16, 8], 0.001, 50),
#     (16, 32, [32, 16, 8], 0.001, 50),
    (32, 16, [16, 8], 0.001, 50),
#     (32, 16, [16, 8], 0.005, 50), #*
#     (32, 32, [32, 16, 8], 0.01, 50),
#     (32, 32, [32, 16, 8], 0.01, 100),
]

# Loss history of every run, in run order.
losses_fit = []

# Wrap the list itself (not the enumerate object) so tqdm can read its length
# and show a total instead of an open-ended "0it" counter.
for run_number, (k_gmf, k_mlp, layers, lr, epochs) in enumerate(tqdm(run_configs)):
    print(f"{k_gmf=} {k_mlp=} {layers=} {lr=} {epochs=}")
    losses_fit.append(
        fit(
            train_dataloader, val_dataloader,
            k_gmf,
            k_mlp,
            layers,
            alpha=0.5,
            lr=lr,
            epochs=epochs,
            # NOTE(review): absolute, machine-specific path; consider a
            # configurable output directory.
            weight_path="/home/nubol23/Documents/NCF_genre",
            run_number=run_number,
            random_state=2,
            show_loss=True,
        )
    )

0it [00:00, ?it/s]

k_gmf=32 k_mlp=16 layers=[16, 8] lr=0.001 epochs=50


  0%|          | 0/50 [00:00<?, ?it/s]

epoch: 1, train_loss: 1.1919899234102522, val_loss: 0.9129713214695309
epoch: 2, train_loss: 0.8988403176847582, val_loss: 0.8982515985895396
epoch: 3, train_loss: 0.8753390395439855, val_loss: 0.8848790494354406
epoch: 4, train_loss: 0.8512184624635842, val_loss: 0.8755307099589033
epoch: 5, train_loss: 0.8284663373536446, val_loss: 0.8700310496176783
epoch: 6, train_loss: 0.8077858233372341, val_loss: 0.8679663377991647
epoch: 7, train_loss: 0.7896982224549364, val_loss: 0.8674825342371189
epoch: 8, train_loss: 0.7729312788253893, val_loss: 0.8681311316086033
epoch: 9, train_loss: 0.7569469410198857, val_loss: 0.869443993671719
epoch: 10, train_loss: 0.7414536130576833, val_loss: 0.8715773150330449
epoch: 11, train_loss: 0.726365367280368, val_loss: 0.8751085514911844
epoch: 12, train_loss: 0.711410951698249, val_loss: 0.8789127212516631


KeyboardInterrupt: 