In [None]:
!pip install skorch

Collecting skorch
  Downloading skorch-0.15.0-py3-none-any.whl (239 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m239.3/239.3 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: skorch
Successfully installed skorch-0.15.0


In [None]:
import datetime
import itertools
import numpy as np
import os
import pandas as pd
import patsy
import time

import sklearn
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from torch import optim
from torch.autograd import Variable

from skorch import NeuralNet
from skorch.helper import predefined_split, SliceDataset
from skorch.callbacks import BatchScoring, Checkpoint, EarlyStopping, EpochScoring, LRScheduler, TensorBoard, ProgressBar

import tensorflow as tf
from tensorflow import summary
#%load_ext tensorboard

In [None]:
# Torch parameters
identifier = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(identifier)
print(device)

cpu


In [None]:
if not os.path.exists('ml-100k'):
    !wget http://files.grouplens.org/datasets/movielens/ml-100k.zip
    !unzip -o ml-100k.zip

--2024-01-04 11:41:43--  http://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip’


2024-01-04 11:41:44 (16.5 MB/s) - ‘ml-100k.zip’ saved [4924029/4924029]

Archive:  ml-100k.zip
   creating: ml-100k/
  inflating: ml-100k/allbut.pl       
  inflating: ml-100k/mku.sh          
  inflating: ml-100k/README          
  inflating: ml-100k/u.data          
  inflating: ml-100k/u.genre         
  inflating: ml-100k/u.info          
  inflating: ml-100k/u.item          
  inflating: ml-100k/u.occupation    
  inflating: ml-100k/u.user          
  inflating: ml-100k/u1.base         
  inflating: ml-100k/u1.test         
  inflating: ml-100k/u2.base         
  inflating: ml-100k/u2.test         
  inflating: ml-100k/u3.base    

### Dataset

In [None]:
genre_cols = [
    "genre_unknown", "Action", "Adventure", "Animation", "Children", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "FilmNoir", "Horror",
    "Musical", "Mystery", "Romance", "SciFi", "Thriller", "War", "Western"
]

In [None]:
class rsdataset(Dataset):
    def __init__(self, usersfile, moviesfile, ratingsfile, nrows=None):

        # Read files
        self.movies = pd.read_csv(moviesfile, sep='|', names=['MovieID', 'Title', 'date', 'video_rl_date', 'link']+genre_cols, engine='python', encoding='latin-1')
        self.users = pd.read_csv(usersfile, sep='|', names=['UserID', 'Age', 'Gender', 'Occupation', 'Zipcode'], engine='python', encoding='latin-1')
        self.ratings = pd.read_csv(ratingsfile, sep='\t', names=['UserID', 'MovieID', 'Rating', 'Timestamp'], engine='python', nrows=nrows, encoding='latin-1')

        df2 = self.movies[genre_cols]
        df2['Genre'] = df2.apply(lambda row: ', '.join(row.index[row == 1]), axis=1)
        self.movies['Genre'] = df2['Genre']
        self.movies = self.movies.drop(genre_cols, axis = 1)
        # self.movies['Genre'] = self.movies['Genre'].map(genre_dict)
        bins = [0, 18, 25, 35, 45, 50, 56, 100]
        labels = [1, 18, 25, 35, 45, 50, 56]
        self.users['Age'] = pd.cut(self.users['Age'], bins=bins, labels=labels, right=False)
        assert self.users['UserID'].nunique() >= self.ratings['UserID'].nunique(), 'UserID with unknown information'
        assert self.movies['MovieID'].nunique() >= self.ratings['MovieID'].nunique(), 'Movies with unknown information'

        self.users_emb_columns = []
        self.users_ohe_columns = []
        self.movies_emb_columns = []
        self.movies_ohe_columns = []
        self.interact_columns = []

        self.movies = self.movies.drop(['date', 'video_rl_date', 'link'], axis= 1)
        self.nusers = self.ratings['UserID'].nunique()
        self.nmovies = self.ratings['MovieID'].nunique()

        self.y_range = (self.ratings['Rating'].min(), self.ratings['Rating'].max())

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):

        return (((self.users_emb[idx])),
                ((self.users_ohe[idx])),
                ((self.movies_emb[idx])),
                ((self.movies_ohe[idx])),
                ((self.interact[idx]))), (self.y[idx])

    def to_tensor(self):
        self.users_emb = torch.from_numpy(self.ratings[self.users_emb_columns].values)
        self.users_ohe = torch.tensor(self.ratings[self.users_ohe_columns].values, dtype=torch.float)
        self.movies_emb = torch.from_numpy(self.ratings[self.movies_emb_columns].values)
        self.movies_ohe = torch.tensor(self.ratings[self.movies_ohe_columns].values, dtype=torch.float)
        self.interact = torch.from_numpy(self.ratings[self.interact_columns].values)
        self.y = torch.tensor(self.y.values, dtype=torch.float)

In [None]:
train = rsdataset('ml-100k/u.user', 'ml-100k/u.item', 'ml-100k/u.data', nrows=None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['Genre'] = df2.apply(lambda row: ', '.join(row.index[row == 1]), axis=1)


### Preprocessing of dataset

In [None]:
train.ratings = train.ratings.merge(train.movies, left_on='MovieID', right_on='MovieID')
train.movies = train.ratings[train.movies.columns]

train.ratings = train.ratings.merge(train.users, left_on='UserID', right_on='UserID')
train.users = train.ratings[train.users.columns]

train.y = train.ratings['Rating']

In [None]:
# Label Encode users
columns = ['UserID', 'Gender', 'Age', 'Occupation']
train.ratings[columns] = train.ratings[columns].apply(preprocessing.LabelEncoder().fit_transform)
train.users_emb_columns = train.users_emb_columns + columns

In [None]:
# Label Encode movies
columns = ['MovieID']
train.ratings[columns] = train.ratings[columns].apply(preprocessing.LabelEncoder().fit_transform)
train.movies_emb_columns = train.movies_emb_columns + columns

In [None]:
# One Hot Encode users
columns = ['Gender', 'Age', 'Occupation']
ohe = preprocessing.OneHotEncoder(categories='auto', sparse=False, dtype='uint8')
ohe.fit(train.ratings[columns])
train.ratings = pd.concat([train.ratings, pd.DataFrame(data=ohe.transform(train.ratings[columns]), columns=ohe.get_feature_names_out(columns))], axis=1)
train.users_ohe_columns = ohe.get_feature_names_out(columns)

assert train.ratings[train.users_ohe_columns].max().max()<=1, 'Error with ohe columns'

In [None]:
# One Hot Encode movies
genres = ["genre_unknown", "Action", "Adventure", "Animation", "Children", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "FilmNoir", "Horror",
    "Musical", "Mystery", "Romance", "SciFi", "Thriller", "War", "Western"]

for genre in genres:
    genre = genre.replace('-', '')
    column = str(genre)
    train.ratings[column] = train.ratings['Genre'].apply(lambda x: 1 if genre in x else 0)
    train.movies_ohe_columns.append(column)

assert train.ratings[train.movies_ohe_columns].max().max()<=1, 'Error with ohe columns'

In [None]:
int_genres_gender = ""
for genre in train.movies_ohe_columns:
    int_genres_gender = int_genres_gender + '+' +genre + ':Gender'

int_genres_age = ""
for genre in train.movies_ohe_columns:
    int_genres_age = int_genres_age + '+' + genre + ':Age'

interact = patsy.dmatrix("0 + Gender:Age + Gender:Occupation + Age:Occupation"+int_genres_gender+int_genres_age, data=train.ratings.astype('object'), return_type='dataframe').astype('int8')
interact = interact.astype('uint8')
train.ratings = pd.concat([train.ratings, interact], axis=1)
train.interact_columns = interact.columns

In [None]:
interact


Unnamed: 0,Gender[0]:Age[0],Gender[1]:Age[0],Gender[0]:Age[1],Gender[1]:Age[1],Gender[0]:Age[2],Gender[1]:Age[2],Gender[0]:Age[3],Gender[1]:Age[3],Gender[0]:Age[4],Gender[1]:Age[4],...,War[T.1]:Age[T.3],War[T.1]:Age[T.4],War[T.1]:Age[T.5],War[T.1]:Age[T.6],Western[T.1]:Age[T.1],Western[T.1]:Age[T.2],Western[T.1]:Age[T.3],Western[T.1]:Age[T.4],Western[T.1]:Age[T.5],Western[T.1]:Age[T.6]
0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
99996,0,0,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
99997,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
99998,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Drop unused columns
train.movies.drop(['Title', 'Genre'], inplace=True, axis=1)
train.ratings.drop(['Title', 'Genre', 'Zipcode'], inplace=True, axis=1)

In [None]:
train.to_tensor()

### DataLoaders

In [None]:
# Split
train_size = int(0.8 * len(train))
test_size = len(train) - train_size
train_dataset, valid_dataset = torch.utils.data.random_split(train, [train_size, test_size])

# Create dataloaders
dataloaders = {}
dataloaders['train'] = torch.utils.data.DataLoader(train_dataset, batch_size=4096, shuffle=True)
dataloaders['valid'] = torch.utils.data.DataLoader(valid_dataset, batch_size=4096, shuffle=True)

### Define Pytorch models

In [None]:
class deepnwide(nn.Module):

    def __init__(self, users_emb, movies_emb, users_ohe, movies_ohe, interact, size_emb, y_range, dropout, linear_size= 500):
        super().__init__()

        self.name = 'deepnwide'
        self.y_range = y_range

        # wide part

        # deep
        self.emb_UserID = nn.Embedding(len(torch.unique(users_emb[:, 0])), size_emb)
        self.emb_UserID.weight.data.uniform_(-.01, .01)
        self.emb_Gender = nn.Embedding(len(torch.unique(users_emb[:, 1])), size_emb)
        self.emb_Gender.weight.data.uniform_(-.01, .01)
        self.emb_Age = nn.Embedding(len(torch.unique(users_emb[:, 2])), size_emb)
        self.emb_Age.weight.data.uniform_(-.01, .01)
        self.emb_Occupation = nn.Embedding(len(torch.unique(users_emb[:, 3])), size_emb)
        self.emb_Occupation.weight.data.uniform_(-.01, .01)
        self.emb_MovieID = nn.Embedding(len(torch.unique(movies_emb[:, 0])), size_emb)
        self.emb_MovieID.weight.data.uniform_(-.01, .01)

        # hidden layers
        self.h1 = nn.Linear(5 * size_emb, linear_size)
        self.h2 = nn.Linear(linear_size, linear_size)
        self.h3 = nn.Linear(linear_size, linear_size)

        # Dropout layers
        self.dropout1 = nn.Dropout(p=dropout)
        self.dropout2 = nn.Dropout(p=dropout)
        self.dropout3 = nn.Dropout(p=dropout)

        # final dense layer
        self.last_layer = nn.Linear((interact.shape[1]) + (movies_ohe.shape[1]) + (linear_size), 1)


    def forward(self, X):
        # Assign data
        user_emb = X[0]
        user_ohe = X[1]
        movie_emb = X[2]
        movie_ohe = X[3]
        interact = X[4]

        UserID = user_emb[:, 0]
        Gender = user_emb[:, 1]
        Age = user_emb[:, 2]
        Occupation = user_emb[:, 3]
        MovieID = movie_emb[:, 0]

        UserID = self.emb_UserID(UserID)
        Gender = self.emb_Gender(Gender)
        Age = self.emb_Age(Age)
        Occupation = self.emb_Occupation(Occupation)
        MovieID = self.emb_MovieID(MovieID)

        emb = torch.cat([UserID,
                         Age,
                         Gender,
                         Occupation,
                         MovieID],
                         dim=1)

        emb = F.relu(self.dropout1(self.h1(emb)))
        emb = F.relu(self.dropout2(self.h2(emb)))
        emb = F.relu(self.dropout3(self.h3(emb)))

        result = self.last_layer(torch.cat([interact.float(), movie_ohe.float(), emb.float()], dim=1))

        return (torch.sigmoid(result) * (self.y_range[1]-self.y_range[0]) + self.y_range[0]).squeeze()


model = deepnwide(train.users_emb, train.movies_emb, train.users_ohe, train.movies_ohe, train.interact, 60, train.y_range, 0.5, 100)
model.to(device)
print(model)

deepnwide(
  (emb_UserID): Embedding(943, 60)
  (emb_Gender): Embedding(2, 60)
  (emb_Age): Embedding(7, 60)
  (emb_Occupation): Embedding(21, 60)
  (emb_MovieID): Embedding(1682, 60)
  (h1): Linear(in_features=300, out_features=100, bias=True)
  (h2): Linear(in_features=100, out_features=100, bias=True)
  (h3): Linear(in_features=100, out_features=100, bias=True)
  (dropout1): Dropout(p=0.5, inplace=False)
  (dropout2): Dropout(p=0.5, inplace=False)
  (dropout3): Dropout(p=0.5, inplace=False)
  (last_layer): Linear(in_features=445, out_features=1, bias=True)
)


In [27]:
class twoembeds(torch.nn.Module):

    def __init__(self, size_emb, y_range):
        super().__init__()

        # set name of model
        self.name = 'twoembeds'
        self.y_range = y_range

        # User and movie embeddings
        self.emb_UserID = nn.Embedding(train.nusers, size_emb)
        self.emb_MovieID = nn.Embedding(train.nmovies, size_emb)
        self.emb_UserID.weight.data.uniform_(-.01, .01)
        self.emb_MovieID.weight.data.uniform_(-.01, .01)

        # User and movie embeddings weights
        self.emb_UserID_b = nn.Embedding(train.nusers, 1)
        self.emb_MovieID_b = nn.Embedding(train.nmovies, 1)
        self.emb_UserID_b.weight.data.uniform_(-.01, .01)
        self.emb_MovieID_b.weight.data.uniform_(-.01, .01)


    def forward(self, X):
        user_emb = X[0]
        user_ohe = X[1]
        movie_emb = X[2]
        movie_ohe = X[3]
        interact = X[4]

        UserID = user_emb[:, 0]
        MovieID = movie_emb[:, 0]

        user_emb = self.emb_UserID(UserID)
        movie_emb = self.emb_MovieID(MovieID)

        mult = (user_emb * movie_emb).sum(1)

        # add bias
        multb = mult + self.emb_UserID_b(UserID).squeeze() + self.emb_MovieID_b(MovieID).squeeze()

        multb = multb.float()

        return (torch.sigmoid(multb) * (self.y_range[1]-self.y_range[0]) + self.y_range[0]).squeeze()

        return multb


model = twoembeds(15, train.y_range)
model.to(device)
print(model)

twoembeds(
  (emb_UserID): Embedding(943, 15)
  (emb_MovieID): Embedding(1682, 15)
  (emb_UserID_b): Embedding(943, 1)
  (emb_MovieID_b): Embedding(1682, 1)
)


In [None]:
class ncf(torch.nn.Module):

    def __init__(self, users_emb, movies_emb, users_ohe, movies_ohe, interact, size_emb, dropout, linear_size, y_range):
        super().__init__()

        # set name of model
        self.name = 'ncf'
        self.y_range = y_range

        ### GMF part
        # user embeddings
        self.gmf_embuserid = nn.Embedding(len(torch.unique(users_emb[:, 0])), size_emb)
        self.gmf_embuserid.weight.data.uniform_(-.01, .01)
        self.gmf_embgender = nn.Embedding(len(torch.unique(users_emb[:, 1])), size_emb)
        self.gmf_embgender.weight.data.uniform_(-.01, .01)
        self.gmf_embage = nn.Embedding(len(torch.unique(users_emb[:, 2])), size_emb)
        self.gmf_embage.weight.data.uniform_(-.01, .01)
        self.gmf_embocc = nn.Embedding(len(torch.unique(users_emb[:, 3])), size_emb)
        self.gmf_embocc.weight.data.uniform_(-.01, .01)
        # movie embeddings
        self.gmf_embmovieid = nn.Embedding(len(torch.unique(movies_emb[:, 0])), size_emb*4-len(train.movies_ohe_columns))
        self.gmf_embmovieid.weight.data.uniform_(-.01, .01)


        ### MLP part
        # user embeddings
        self.mlp_embuserid = nn.Embedding(len(torch.unique(users_emb[:, 0])), size_emb)
        self.mlp_embuserid.weight.data.uniform_(-.01, .01)
        self.mlp_embgender = nn.Embedding(len(torch.unique(users_emb[:, 1])), size_emb)
        self.mlp_embgender.weight.data.uniform_(-.01, .01)
        self.mlp_embage = nn.Embedding(len(torch.unique(users_emb[:, 2])), size_emb)
        self.mlp_embage.weight.data.uniform_(-.01, .01)
        self.mlp_embocc = nn.Embedding(len(torch.unique(users_emb[:, 3])), size_emb)
        self.mlp_embocc.weight.data.uniform_(-.01, .01)
        # movie embeddings
        self.mlp_embmovieid = nn.Embedding(len(torch.unique(movies_emb[:, 0])), size_emb)
        self.mlp_embmovieid.weight.data.uniform_(-.01, .01)
        # hidden layers
        self.h1 = nn.Linear(5*size_emb+len(train.movies_ohe_columns), linear_size)
        self.h2 = nn.Linear(linear_size, int(linear_size/2))
        #self.h3 = nn.Linear(linear_size, linear_size)
        # Dropout layers
        self.dropout1 = nn.Dropout(p=dropout)
        self.dropout2 = nn.Dropout(p=dropout)
        #self.dropout3 = nn.Dropout(p=dropout)

        # final dense layer
        self.last_layer = nn.Linear(size_emb*4+int(linear_size/2), 1)

    def forward(self, X):
        user_emb = X[0]
        user_ohe = X[1]
        movie_emb = X[2]
        movie_ohe = X[3]
        interact = X[4]

        UserID = user_emb[:, 0]
        Gender = user_emb[:, 1]
        Age = user_emb[:, 2]
        Occupation = user_emb[:, 3]
        MovieID = movie_emb[:, 0]

        # GMF part
        gmf_embuserid = self.gmf_embuserid(UserID)
        gmf_embgender = self.gmf_embgender(Gender)
        gmf_embage = self.gmf_embage(Age)
        gmf_embocc = self.gmf_embocc(Occupation)
        gmf_embmovieid = self.gmf_embmovieid(MovieID)

        gmf_user_vector = torch.cat([gmf_embuserid,
                                    gmf_embgender,
                                    gmf_embage,
                                    gmf_embocc],
                                    dim=1)

        gmf_movie_vector = torch.cat([gmf_embmovieid, movie_ohe], 1)

        gmf_vector = (gmf_user_vector * gmf_movie_vector)


        # MLP part
        mlp_embuserid = self.mlp_embuserid(UserID)
        mlp_embgender = self.mlp_embgender(Gender)
        mlp_embage = self.mlp_embage(Age)
        mlp_embocc = self.mlp_embocc(Occupation)
        mlp_movieid = self.mlp_embmovieid(MovieID)

        mlp_vector = torch.cat([mlp_embuserid,
                                mlp_embgender,
                                mlp_embage,
                                mlp_embocc,
                                mlp_movieid,
                                movie_ohe],
                                dim=1)
        mlp_vector = F.relu(self.dropout1(self.h1(mlp_vector)))
        mlp_vector = F.relu(self.dropout2(self.h2(mlp_vector)))
        #mlp_vector = F.relu(self.dropout3(self.h3(mlp_vector)))

        # Fusion
        result = torch.cat([gmf_vector, mlp_vector], dim=1)
        result = self.last_layer(result)

        #return (torch.sigmoid(result) * (5-1) + 1).squeeze
        return (torch.sigmoid(result) * (self.y_range[1]-self.y_range[0]) + self.y_range[0]).squeeze()


model = ncf(train.users_emb, train.movies_emb, train.users_ohe, train.movies_ohe, train.interact, 60, 0.5, 200, train.y_range)
model.to(device)
print(model)

ncf(
  (gmf_embuserid): Embedding(943, 60)
  (gmf_embgender): Embedding(2, 60)
  (gmf_embage): Embedding(7, 60)
  (gmf_embocc): Embedding(21, 60)
  (gmf_embmovieid): Embedding(1682, 221)
  (mlp_embuserid): Embedding(943, 60)
  (mlp_embgender): Embedding(2, 60)
  (mlp_embage): Embedding(7, 60)
  (mlp_embocc): Embedding(21, 60)
  (mlp_embmovieid): Embedding(1682, 60)
  (h1): Linear(in_features=319, out_features=200, bias=True)
  (h2): Linear(in_features=200, out_features=100, bias=True)
  (dropout1): Dropout(p=0.5, inplace=False)
  (dropout2): Dropout(p=0.5, inplace=False)
  (last_layer): Linear(in_features=340, out_features=1, bias=True)
)


### Skorch callbacks

In [None]:
# Earlystopping callback
earlystopping = EarlyStopping(monitor='valid_loss', patience=10, threshold=0.001)

In [None]:
# RMSE callback
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

def rmseloss(y_true, y_pred):
    #return f1_score(y_true, y_pred)
    return np.sqrt(mean_squared_error(y_true, y_pred))

def precision(y_true, y_pred):
    y_true_binary = (y_true >= 4).astype(int)
    y_pred_binary = (y_pred >= 4).astype(int)

    # Calculate Precision and Recall
    precision_scoree = precision_score(y_true_binary, y_pred_binary)
    return precision_scoree

def recall(y_true, y_pred):
    y_true_binary = (y_true >= 4).astype(int)
    y_pred_binary = (y_pred >= 4).astype(int)

    # Calculate Precision and Recall
    recall_scoree = recall_score(y_true_binary, y_pred_binary)
    return recall_scoree

def f1(y_true, y_pred):
    y_true_binary = (y_true >= 4).astype(int)
    y_pred_binary = (y_pred >= 4).astype(int)

    # Calculate F1 score
    f1_scoree = f1_score(y_true_binary, y_pred_binary)
    return f1_scoree

rmse_scorer = make_scorer(rmseloss)
precision_scorer = make_scorer(precision)
recall_scorer = make_scorer(recall)
f1_scorer = make_scorer(f1)

epoch_rmse = EpochScoring(rmse_scorer, name='rmse_score', lower_is_better=True)
epoch_precision = EpochScoring(precision_scorer, name='precision', lower_is_better= False)
epoch_recall = EpochScoring(recall_scorer, name='recall', lower_is_better= False)
epoch_f1 = EpochScoring(f1_scorer, name='f1', lower_is_better= False)


In [None]:
# Checkpoint callback
checkpoint = Checkpoint(monitor='rmse_score_best', f_params='params.pt', f_optimizer='optimizer.pt', f_history='history.json', f_pickle='model')

In [None]:
# Learning rate scheduler callback
lr_scheduler = LRScheduler(policy="StepLR", step_size=7, gamma=0.1)

### Neural Collaborative Filtering

#### Manually specify hyperparamers

In [None]:
#86.57
ncfnet = NeuralNet(
    ncf,
    module__users_emb=train.users_emb,
    module__movies_emb=train.movies_emb,
    module__users_ohe=train.users_ohe,
    module__movies_ohe=train.movies_ohe,
    module__interact=train.interact,
    module__size_emb=500,
    module__dropout=0.5,
    module__linear_size=1200,
    module__y_range=train.y_range,#### Manually specify hyperparamers
    max_epochs=50,
    lr=0.001,
    optimizer=torch.optim.Adam,
    criterion=torch.nn.MSELoss,
    device=device,
    iterator_train__batch_size=1024,
    iterator_train__num_workers=0,
    iterator_train__shuffle=True,
    iterator_valid__batch_size=1024,
    train_split=predefined_split(valid_dataset),
    callbacks=[
               earlystopping,
               epoch_rmse,
               epoch_precision,
               epoch_recall,
               epoch_f1,
               checkpoint,
               lr_scheduler,
               #TensorBoard(writer),
               #progressbar
               ]
)

In [None]:
ncfnet.fit(train_dataset)

  epoch      f1    precision    recall    rmse_score    train_loss    valid_loss    cp      lr     dur
-------  ------  -----------  --------  ------------  ------------  ------------  ----  ------  ------
      1  [36m0.4843[0m       [32m0.8280[0m    [35m0.3423[0m        [31m0.9446[0m        [94m1.0215[0m        [36m0.8923[0m     +  0.0010  3.7076
      2  0.4783       [32m0.8512[0m    0.3326        [31m0.9252[0m        [94m0.8626[0m        [36m0.8559[0m     +  0.0010  4.0770
      3  [36m0.5348[0m       0.8506    [35m0.3900[0m        [31m0.9142[0m        [94m0.7350[0m        [36m0.8357[0m     +  0.0010  4.0622
      4  [36m0.5927[0m       0.8345    [35m0.4595[0m        [31m0.9130[0m        [94m0.5401[0m        [36m0.8336[0m     +  0.0010  4.2478
      5  [36m0.5978[0m       0.8247    [35m0.4688[0m        0.9233        [94m0.3581[0m        0.8525        0.0010  4.1074
      6  [36m0.6014[0m       0.8183    [35m0.4754[0m        0.9312  

<class 'skorch.net.NeuralNet'>[initialized](
  module_=ncf(
    (gmf_embuserid): Embedding(943, 500)
    (gmf_embgender): Embedding(2, 500)
    (gmf_embage): Embedding(7, 500)
    (gmf_embocc): Embedding(21, 500)
    (gmf_embmovieid): Embedding(1682, 1981)
    (mlp_embuserid): Embedding(943, 500)
    (mlp_embgender): Embedding(2, 500)
    (mlp_embage): Embedding(7, 500)
    (mlp_embocc): Embedding(21, 500)
    (mlp_embmovieid): Embedding(1682, 500)
    (h1): Linear(in_features=2519, out_features=1200, bias=True)
    (h2): Linear(in_features=1200, out_features=600, bias=True)
    (dropout1): Dropout(p=0.5, inplace=False)
    (dropout2): Dropout(p=0.5, inplace=False)
    (last_layer): Linear(in_features=2600, out_features=1, bias=True)
  ),
)

#### GridSearchCV

In [None]:
params = {
    'lr': [0.01],
    'module__size_emb': [30, 60, 120],
    'module__dropout': [0.3],
    'module__linear_size': [200, 400]
}
gs = GridSearchCV(ncfnet,
                  params,
                  verbose=50,
                  refit=False,
                  pre_dispatch=2,
                  n_jobs=1,
                  cv=3,
                  scoring='neg_mean_squared_error')

X_ds = SliceDataset(train, idx=0)
y_ds = SliceDataset(train, idx=1)
gs.fit(X_ds, y_ds)

print(gs.best_score_, gs.best_params_)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV 1/3; 1/6] START lr=0.01, module__dropout=0.3, module__linear_size=200, module__size_emb=30
  epoch      f1    precision    recall    rmse_score    train_loss    valid_loss    cp      lr     dur
-------  ------  -----------  --------  ------------  ------------  ------------  ----  ------  ------
      1  [36m0.4183[0m       [32m0.8544[0m    [35m0.2770[0m        [31m0.9731[0m        [94m0.9965[0m        [36m0.9469[0m     +  0.0100  7.2017
      2  [36m0.6149[0m       [32m0.9057[0m    [35m0.4654[0m        [31m0.8364[0m        [94m0.7648[0m        [36m0.6995[0m     +  0.0100  7.4854
      3  [36m0.6850[0m       [32m0.9193[0m    [35m0.5458[0m        [31m0.7758[0m        [94m0.4897[0m        [36m0.6018[0m     +  0.0100  5.8225
      4  [36m0.7077[0m       [32m0.9300[0m    [35m0.5711[0m        [31m0.7475[0m        [94m0.3499[0m        [36m0.5587[0m     +  0.0100  7.6369
      5  [

### Deep and Wide

#### Manually specify hyperparamers

In [None]:
deepnwidenet = NeuralNet(
    deepnwide,
    module__users_emb=train.users_emb,
    module__movies_emb=train.movies_emb,
    module__users_ohe=train.users_ohe,
    module__movies_ohe=train.movies_ohe,
    #### Manually specify hyperparamers e=train.movies_ohe,
    module__interact=train.interact,
    module__size_emb=30,
    module__y_range=train.y_range,
    module__dropout=0.2,
    max_epochs=30,
    lr=0.001,
    optimizer=torch.optim.Adam,
    criterion=torch.nn.MSELoss,
    device=device,
    iterator_train__batch_size=1024,
    iterator_train__num_workers=0,
    iterator_train__shuffle=True,
    iterator_valid__batch_size=4096,
    train_split=predefined_split(valid_dataset),
    callbacks=[
               earlystopping,
               epoch_rmse,
               epoch_precision,
               epoch_recall,
               epoch_f1,
               #checkpoint,
               lr_scheduler,
               #TensorBoard(writer),
               #progressbar
               ]
)

In [None]:
deepnwidenet.fit(train_dataset)

  epoch      f1    precision    recall    rmse_score    train_loss    valid_loss      lr     dur
-------  ------  -----------  --------  ------------  ------------  ------------  ------  ------
      1  [36m0.5067[0m       [32m0.8408[0m    [35m0.3626[0m        [31m0.9640[0m        [94m1.0904[0m        [36m0.9294[0m  0.0010  7.7631
      2  [36m0.5385[0m       0.8312    [35m0.3982[0m        [31m0.9474[0m        [94m0.8823[0m        [36m0.8976[0m  0.0010  2.9171
      3  0.4556       [32m0.8532[0m    0.3107        [31m0.9453[0m        [94m0.8592[0m        [36m0.8936[0m  0.0010  2.8487
      4  [36m0.5504[0m       0.8346    [35m0.4106[0m        [31m0.9405[0m        [94m0.8501[0m        [36m0.8845[0m  0.0010  3.6234
      5  0.4931       0.8494    0.3474        [31m0.9397[0m        [94m0.8375[0m        [36m0.8830[0m  0.0010  3.3253
      6  0.4781       [32m0.8581[0m    0.3313        [31m0.9387[0m        [94m0.8264[0m        [36m0.8811[

<class 'skorch.net.NeuralNet'>[initialized](
  module_=deepnwide(
    (emb_UserID): Embedding(943, 30)
    (emb_Gender): Embedding(2, 30)
    (emb_Age): Embedding(7, 30)
    (emb_Occupation): Embedding(21, 30)
    (emb_MovieID): Embedding(1682, 30)
    (h1): Linear(in_features=150, out_features=500, bias=True)
    (h2): Linear(in_features=500, out_features=500, bias=True)
    (h3): Linear(in_features=500, out_features=500, bias=True)
    (dropout1): Dropout(p=0.2, inplace=False)
    (dropout2): Dropout(p=0.2, inplace=False)
    (dropout3): Dropout(p=0.2, inplace=False)
    (last_layer): Linear(in_features=845, out_features=1, bias=True)
  ),
)

#### GridsearchCV

In [None]:
# params = {
#     'lr': [0.001, 0.01],
#     'module__size_emb': [30, 60, 120],
#     'module__dropout': [0.5],
#     'module__linear_size': [400, 500, 600]
# }
params = {
    'lr': [0.01, 0.001],
    'module__size_emb': [30, 60, 120],
    'module__dropout': [0.3, 0.5],
    'module__linear_size': [400, 500, 600]
}
gs = GridSearchCV(deepnwidenet,
                  params,
                  verbose=50,
                  refit=False,
                  #pre_dispatch=8,
                  n_jobs=1,
                  cv=3,
                  scoring='neg_mean_squared_error')

X_ds = SliceDataset(train, idx=0)
y_ds = SliceDataset(train, idx=1)
gs.fit(X_ds, y_ds)

print(gs.best_score_, gs.best_params_)

Fitting 3 folds for each of 36 candidates, totalling 108 fits
[CV 1/3; 1/36] START lr=0.01, module__dropout=0.3, module__linear_size=400, module__size_emb=30
  epoch      f1    precision    recall    rmse_score    train_loss    valid_loss      lr     dur
-------  ------  -----------  --------  ------------  ------------  ------------  ------  ------
      1  [36m0.3928[0m       [32m0.8538[0m    [35m0.2551[0m        [31m0.9929[0m        [94m0.9993[0m        [36m0.9858[0m  0.0100  3.6870
      2  [36m0.4698[0m       0.8389    [35m0.3262[0m        [31m0.9738[0m        [94m0.8905[0m        [36m0.9482[0m  0.0100  3.6836
      3  [36m0.5845[0m       0.8091    [35m0.4575[0m        0.9857        [94m0.8617[0m        0.9717  0.0100  4.2670
      4  0.5128       0.8349    0.3700        [31m0.9646[0m        [94m0.8515[0m        [36m0.9305[0m  0.0100  3.6281
      5  [36m0.6784[0m       0.7743    [35m0.6037[0m        1.0065        [94m0.8368[0m        1.0131

### Two embeddings - Basic matrix factorization

#### Manually specify hyperparamers

In [28]:
twoembedsnet = NeuralNet(
    twoembeds,
    module__size_emb=128,
    module__y_range=train.y_range,
    max_epochs=30,
    lr=0.001,
    optimizer=torch.optim.Adam,
    criterion=torch.nn.MSELoss,
    device=device,
    iterator_train__batch_size=4096,
    iterator_train__num_workers=0,
    iterator_train__shuffle=True,
    iterator_valid__batch_size=4096,
    train_split=predefined_split(valid_dataset),
    callbacks=[earlystopping,
               epoch_rmse,
               epoch_precision,
               epoch_recall,
               epoch_f1,
               #checkpoint,
               lr_scheduler]
)

In [None]:
twoembedsnet.fit(train_dataset)

  epoch      f1    precision    recall    rmse_score    train_loss    valid_loss      lr     dur
-------  ------  -----------  --------  ------------  ------------  ------------  ------  ------
      1  [36m0.0000[0m       [32m0.0000[0m    [35m0.0000[0m        [31m1.2246[0m        [94m1.5324[0m        [36m1.4996[0m  0.0010  3.9334


  _warn_prf(average, modifier, msg_start, len(result))


      2  0.0000       0.0000    0.0000        [31m1.1950[0m        [94m1.4785[0m        [36m1.4281[0m  0.0010  6.3667


  _warn_prf(average, modifier, msg_start, len(result))


      3  0.0000       0.0000    0.0000        [31m1.1286[0m        [94m1.3628[0m        [36m1.2738[0m  0.0010  6.8263


  _warn_prf(average, modifier, msg_start, len(result))
Exception ignored in: <function _xla_gc_callback at 0x7e2ca9ea5900>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/jax/_src/lib/__init__.py", line 97, in _xla_gc_callback
    def _xla_gc_callback(*args):
KeyboardInterrupt: 


      4  0.0000       0.0000    0.0000        [31m1.0460[0m        [94m1.1728[0m        [36m1.0941[0m  0.0010  4.0623


  _warn_prf(average, modifier, msg_start, len(result))


      5  [36m0.1819[0m       [32m0.8932[0m    [35m0.1013[0m        [31m0.9964[0m        [94m1.0072[0m        [36m0.9928[0m  0.0010  4.8329
      6  [36m0.3995[0m       0.8504    [35m0.2611[0m        [31m0.9762[0m        [94m0.9211[0m        [36m0.9530[0m  0.0010  6.7228
      7  [36m0.4543[0m       0.8476    [35m0.3103[0m        [31m0.9645[0m        [94m0.8750[0m        [36m0.9302[0m  0.0010  4.0794
      8  [36m0.4569[0m       0.8480    [35m0.3126[0m        [31m0.9635[0m        [94m0.8496[0m        [36m0.9283[0m  0.0001  4.5573
      9  [36m0.4577[0m       0.8487    [35m0.3134[0m        [31m0.9625[0m        [94m0.8464[0m        [36m0.9265[0m  0.0001  3.8373
     10  [36m0.4592[0m       0.8484    [35m0.3148[0m        [31m0.9616[0m        [94m0.8434[0m        [36m0.9247[0m  0.0001  3.8165
     11  [36m0.4598[0m       0.8479    [35m0.3154[0m        [31m0.9608[0m        [94m0.8404[0m        [36m0.9231[0m  0.0001  4.74

<class 'skorch.net.NeuralNet'>[initialized](
  module_=twoembeds(
    (emb_UserID): Embedding(943, 128)
    (emb_MovieID): Embedding(1682, 128)
    (emb_UserID_b): Embedding(943, 1)
    (emb_MovieID_b): Embedding(1682, 1)
  ),
)

#### GridSearchCV

In [None]:
params = {
    'lr': [0.01],
    'module__size_emb': [600, 700, 800, 900, 1000]
}
gs = GridSearchCV(twoembedsnet,
                  params,
                  verbose=50,
                  refit=False,
                  #pre_dispatch=8,
                  #n_jobs=8,
                  cv=3,
                  scoring='neg_mean_squared_error')

X_ds = SliceDataset(train, idx=0)
y_ds = SliceDataset(train, idx=1)
gs.fit(X_ds, y_ds)

print(gs.best_score_, gs.best_params_)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV 1/3; 1/5] START lr=0.01, module__size_emb=600...............................
  epoch      f1    precision    recall    rmse_score    train_loss    valid_loss      lr      dur
-------  ------  -----------  --------  ------------  ------------  ------------  ------  -------
      1  [36m0.5491[0m       [32m0.8208[0m    [35m0.4125[0m        [31m1.0011[0m        [94m1.2399[0m        [36m1.0023[0m  0.0100  10.2454
      2  0.5246       [32m0.9604[0m    0.3608        [31m0.8418[0m        [94m0.6454[0m        [36m0.7087[0m  0.0100  7.5932
      3  [36m0.6880[0m       [32m0.9882[0m    [35m0.5277[0m        [31m0.7527[0m        [94m0.2842[0m        [36m0.5665[0m  0.0100  7.1685
      4  [36m0.7137[0m       [32m0.9945[0m    [35m0.5566[0m        [31m0.7242[0m        [94m0.1335[0m        [36m0.5245[0m  0.0100  7.5094
      5  0.7032       [32m0.9965[0m    0.5432        [31m0.7112[0m      

### Benchmark with scikit-surprise SVD algorithm

In [None]:
!pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise (from surprise)
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/772.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.1/772.0 kB[0m [31m5.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3163765 sha256=d9f3d8dac7c135ab0e4772500fbfaa880c694df7ce5e0e381c317e20daad7ff2
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740d

In [None]:
from surprise import NormalPredictor
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import cross_validate, train_test_split, KFold

In [None]:
train[dataloaders['train'].dataset.indices][1]

tensor([4., 4., 4.,  ..., 2., 2., 2.])

In [None]:
user = train[dataloaders['train'].dataset.indices][0][0][:, 0].data.numpy()
movie = train[dataloaders['train'].dataset.indices][0][2][:, 0].data.numpy()
y = train[dataloaders['train'].dataset.indices][1].data.numpy()
df = pd.DataFrame({'user': user, 'movie': movie, 'y': y})
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['user', 'movie', 'y']], reader)

In [None]:
data = Dataset.load_from_df(train.ratings.loc[dataloaders['train'].dataset.indices, ['UserID', 'MovieID', 'Rating']], reader)

In [None]:
a = train.ratings.loc[dataloaders['train'].dataset.indices, ['UserID', 'MovieID', 'Rating']]
b = pd.DataFrame({'UserID': user, 'MovieID': movie, 'Rating': y})

In [None]:
trainset, testset = train_test_split(data, test_size=.25)

algo = SVD()

algo.fit(trainset)
predictions = algo.test(testset)

#RMSE
accuracy.rmse(predictions)

RMSE: 0.9532


0.9532214288290356