In [1]:
from time import time
from scipy.sparse import csc_matrix
import numpy as np
import h5py
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from torch.nn.parameter import Parameter
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, mean_absolute_error, mean_squared_error

torch.manual_seed(1284)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [2]:
def load_data_from_csv(path='./', delimiter=',', max_user = None):
    df = pd.read_csv(path, index_col=0)
    print(df.columns)
    rows_with_nan = df[df.isna().any(axis=1)]
    print(rows_with_nan)
    df = df[['userId', 'movieId', 'rating']]

    # print( df['movieId'])
    if max_user:
        df = df.loc[df['userId'] <= max_user]
    def discrete(column):
        sorted_values = sorted(df[column].unique(), reverse=True)
        def custom_rank(value):
            return sorted_values.index(value) + 1
        return custom_rank

    # df['userId'] = df['userId'].apply(discrete('userId'))
    # df['movieId'] = df['movieId'].apply(discrete('movieId'))
    # df.to_csv('discreted_link_ratings.csv')
    n_u = df['userId'].nunique()
    n_m = df['movieId'].nunique()
    # Chia tập train từ tập dữ liệu
    train, df_temp = train_test_split(df, test_size=0.4, shuffle=True, random_state=42)

    # Chia tập test và tập validation từ tập dữ liệu còn lại
    test, val = train_test_split(df_temp, test_size=0.5, shuffle=True, random_state=42)
    print(train.shape, val.shape, test.shape, n_m, n_u)
    # print(train.iloc[0])
    # print(df.iloc[0])
    train_r = np.zeros((n_m, n_u))
    val_r = np.zeros((n_m, n_u))
    test_r = np.zeros((n_m, n_u))

    n_train = train.shape[0]  # num of training ratings
    n_val = val.shape[0]  # num of valid ratings
    n_test = test.shape[0]  # num of test ratings

    for i in range(n_train):
        # print(train.iloc[i,1]-1, train.iloc[i,0]-1)
        train_r[train.iloc[i,1]-1, train.iloc[i,0]-1] = train.iloc[i,2]

    for i in range(n_val):
        # print(train.iloc[i,1]-1, train.iloc[i,0]-1)
        val_r[val.iloc[i,1]-1, val.iloc[i,0]-1] = val.iloc[i,2]

    for i in range(n_test):
        test_r[test.iloc[i,1]-1, test.iloc[i,0]-1] = test.iloc[i,2]

    train_m = np.greater(train_r, 1e-12).astype('float32')  # masks indicating non-zero entries
    val_m = np.greater(val_r, 1e-12).astype('float32')  # masks indicating non-zero entries
    test_m = np.greater(test_r, 1e-12).astype('float32')

    print('data matrix loaded')
    print('num of users: {}'.format(n_u))
    print('num of movies: {}'.format(n_m))
    print('num of training ratings: {}'.format(n_train))
    print('num of valid ratings: {}'.format(n_val))
    print('num of test ratings: {}'.format(n_test))
    return n_m, n_u, train_r, train_m, val_r, val_m, test_r, test_m


# Load Data

In [3]:
# Insert the path of a data directory by yourself (e.g., '/content/.../data')
# .-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._
data_path = 'discreted_link_ratings_p0.csv'
# .-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._


In [4]:
# Data Load
path = data_path
n_m, n_u, train_r, train_m, val_r, val_m, test_r, test_m = load_data_from_csv(path=path, delimiter='\t')


Index(['userId', 'movieId', 'rating'], dtype='object')
Empty DataFrame
Columns: [userId, movieId, rating]
Index: []
(117680, 3) (39227, 3) (39227, 3) 14363 1999
data matrix loaded
num of users: 1999
num of movies: 14363
num of training ratings: 117680
num of valid ratings: 39227
num of test ratings: 39227


In [5]:
# Common hyperparameter settings
n_hid = 500 # size of hidden layers
n_dim = 5 # inner AE embedding size
n_layers = 3 # number of hidden layers
gk_size = 3 # width=height of kernel for convolution

# Hyperparameters to tune for specific case
max_epoch_p = 500 # max number of epochs for pretraining
max_epoch_f = 500 # max number of epochs for finetuning
patience_p = 5 # number of consecutive rounds of early stopping condition before actual stop for pretraining
patience_f = 10 # and finetuning
tol_p = 1e-4 # minimum threshold for the difference between consecutive values of train rmse, used for early stopping, for pretraining
tol_f = 1e-5 # and finetuning
lambda_2 = 20. # regularisation of number or parameters
lambda_s = 0.01 # regularisation of sparsity of the final matrix
dot_scale = 1 # dot product weight for global kernel

In [6]:
class Matrix_Factorization(torch.nn.Module):

    def __init__(self, n_m, n_u, n_hid):
        super().__init__()
        # output (n_m, n_u)
        self.X = nn.Parameter(torch.randn(n_m, n_hid)) # item
        self.W = nn.Parameter(torch.randn(n_u, n_hid)) # user
        
        self.bm = nn.Parameter(torch.randn(n_m, 1))
        self.bu = nn.Parameter(torch.randn(1, n_u))
    
    def forward(self):

        pre = self.X.matmul(self.W.T) + self.bm + self.bu
        return pre

class Loss(nn.Module):
    def forward(self, pred_p, reg_loss, train_m, train_r):
      # L2 loss
      diff = train_m * (train_r - pred_p)
      sqE = torch.nn.functional.mse_loss(diff, torch.zeros_like(diff))
      loss_p = sqE + reg_loss
      return loss_p

# Network Functions

In [7]:
model = Matrix_Factorization(n_m, n_u, n_hid).to(device)
model.load_state_dict(torch.load('matrix_factorization_best.pth'))

<All keys matched successfully>


# Evaluation code

In [8]:
def dcg_k(score_label, k):
    dcg, i = 0., 0
    for s in score_label:
        if i < k:
            dcg += (2**s[1]-1) / np.log2(2+i)
            i += 1
    return dcg

In [9]:
def ndcg_k(y_hat, y, k):
    score_label = np.stack([y_hat, y], axis=1).tolist()
    score_label = sorted(score_label, key=lambda d:d[0], reverse=True)
    score_label_ = sorted(score_label, key=lambda d:d[1], reverse=True)
    norm, i = 0., 0
    for s in score_label_:
        if i < k:
            norm += (2**s[1]-1) / np.log2(2+i)
            i += 1
    dcg = dcg_k(score_label, k)
    return dcg / norm

In [10]:
def call_ndcg(y_hat, y):
    ndcg_sum, num = 0, 0
    y_hat, y = y_hat.T, y.T
    n_users = y.shape[0]

    for i in range(n_users):
        y_hat_i = y_hat[i][np.where(y[i])]
        y_i = y[i][np.where(y[i])]

        if y_i.shape[0] < 2:
            continue

        ndcg_sum += ndcg_k(y_hat_i, y_i, y_i.shape[0])  # user-wise calculation
        num += 1

    return ndcg_sum / num

# Training and Test Loop

In [11]:

def precision(y_true, y_pred):
    y_true_binary = (y_true >= 4).astype(int)
    y_pred_binary = (y_pred >= 4).astype(int)
    ps = precision_score(y_true_binary, y_pred_binary)
    return ps
def recall(y_true, y_pred):
    y_true_binary = (y_true >= 4).astype(int)
    y_pred_binary = (y_pred >= 4).astype(int)
    rs = recall_score(y_true_binary, y_pred_binary)
    return rs
def f1(y_true, y_pred):
    y_true_binary = (y_true >= 4).astype(int)
    y_pred_binary = (y_pred >= 4).astype(int)
    ps = f1_score(y_true_binary, y_pred_binary)
    return ps
def mae(y_true, y_pred):
    y_pred = np.clip(y_pred, 1., 5.)
    res = mean_absolute_error(y_true, y_pred)
    return res

def rmse(y_true, y_pred):
    y_pred = np.clip(y_pred, 1., 5.)
    res = mean_squared_error(y_true, y_pred)
    return np.sqrt(res)

In [12]:
best_rmse_ep, best_mae_ep, best_ndcg_ep = 0, 0, 0
best_rmse, best_mae, best_ndcg = float("inf"), float("inf"), 0

time_cumulative = 0
tic = time()

# Fine-Tuning
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=lambda_s)

def closure():
  optimizer.zero_grad()
  x = torch.Tensor(train_r).double().to(device)
  m = torch.Tensor(train_m).double().to(device)
  model.train()
  pred = model()
  loss = Loss().to(device)(pred, 0, m, x)
  loss.backward()
  return loss

last_rmse = np.inf
counter = 0

for i in range(300, max_epoch_f):
  optimizer.step(closure)
  model.eval()
  t = time() - tic
  time_cumulative += t

  pre = model()
  
  pre = pre.float().cpu().detach().numpy()

  error = (val_m * (np.clip(pre, 1., 5.) - val_r) ** 2).sum() / test_m.sum()  # test error
  val_rmse = np.sqrt(error)

  error_train = (train_m * (np.clip(pre, 1., 5.) - train_r) ** 2).sum() / train_m.sum()  # train error
  train_rmse = np.sqrt(error_train)

  val_mae = (val_m * np.abs(np.clip(pre, 1., 5.) - test_r)).sum() / test_m.sum()
  train_mae = (train_m * np.abs(np.clip(pre, 1., 5.) - train_r)).sum() / train_m.sum()

  val_ndcg = call_ndcg(np.clip(pre, 1., 5.), test_r)
  train_ndcg = call_ndcg(np.clip(pre, 1., 5.), train_r)

  if val_rmse < best_rmse:
      best_rmse = val_rmse
      best_rmse_ep = i+1

  if val_mae < best_mae:
      best_mae = val_mae
      torch.save(model.state_dict(), 'matrix_factorization_best.pth')
      best_mae_ep = i+1

  if best_ndcg < val_ndcg:
      best_ndcg = val_ndcg
      best_ndcg_ep = i+1

  if last_rmse-train_rmse < tol_f:
    counter += 1
  else:
    counter = 0

  last_rmse = train_rmse

  if patience_f == counter:
    print('.-^-._' * 12)
    print('FINE-TUNING')
    print('Epoch:', i+1, 'val rmse:', val_rmse, 'val mae:', val_mae, 'val ndcg:', val_ndcg)
    print('Epoch:', i+1, 'train rmse:', train_rmse, 'train mae:', train_mae, 'train ndcg:', train_ndcg)
    print('Time:', t, 'seconds')
    print('Time cumulative:', time_cumulative, 'seconds')
    print('.-^-._' * 12)
    break


  if i % 50 != 0:
    continue

  print('.-^-._' * 12)
  print('FINE-TUNING')
  print('Epoch:', i, 'val rmse:', val_rmse, 'val mae:', val_mae, 'val ndcg:', val_ndcg)
  print('Epoch:', i, 'train rmse:', train_rmse, 'train mae:', train_mae, 'train ndcg:', train_ndcg)
  print('Time:', t, 'seconds')
  print('Time cumulative:', time_cumulative, 'seconds')
  print(' best rmse:', best_rmse)
  print(' best mae:', best_mae)
  print(' best ndcg:', best_ndcg)
  print('.-^-._' * 12)

.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._
FINE-TUNING
Epoch: 300 val rmse: 2.2958633745877934 val mae: 2.771623465075052 val ndcg: 0.8478429442103421
Epoch: 300 train rmse: 2.271706183799881 train mae: 1.9700929302830295 train ndcg: 0.8313156763362318
Time: 1.8353769779205322 seconds
Time cumulative: 1.8353769779205322 seconds
 best rmse: 2.2958633745877934
 best mae: 2.771623465075052
 best ndcg: 0.8478429442103421
.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._
.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._
FINE-TUNING
Epoch: 350 val rmse: 2.294636691630281 val mae: 2.7724370703707417 val ndcg: 0.847189164242488
Epoch: 350 train rmse: 2.190293239505432 train mae: 1.8563982015863134 train ndcg: 0.8409751289841386
Time: 302.7968544960022 seconds
Time cumulative: 8014.76665353775 seconds
 best rmse: 2.294636691630281
 best mae: 2.771623465075052
 best ndcg: 0.8481803306178561
.-^-._.-^-._.-^-._.-^-._.-^-._

In [13]:
print('Epoch:', best_rmse_ep, ' best rmse:', best_rmse)
print('Epoch:', best_mae_ep, ' best mae:', best_mae)
print('Epoch:', best_ndcg_ep, ' best ndcg:', best_ndcg)

Epoch: 500  best rmse: 2.2874108558515767
Epoch: 301  best mae: 2.771623465075052
Epoch: 319  best ndcg: 0.8481803306178561


In [14]:
best_model = Matrix_Factorization(n_m, n_u, n_hid).to(device)
best_model.load_state_dict(torch.load('matrix_factorization_best.pth'))
pred = best_model()
pred = pred.float().cpu().detach().numpy()
y_pred = np.extract(test_m, pred)
y_true = np.extract(test_m, test_r)

best_mae = mae(y_true, y_pred)
best_rmse = rmse(y_true, y_pred)
best_precision = precision(y_true, y_pred)
best_recall = recall(y_true, y_pred)
best_f1 = f1(y_true, y_pred)

print('data: ', data_path)
print('best_mae: ', best_mae)
print('best_rmse: ', best_rmse)
print('best_precision: ', best_precision)
print('best_recall: ', best_recall)
print('best_f1: ', best_f1)

data:  discreted_link_ratings_p0.csv
best_mae:  2.0008756559590974
best_rmse:  2.2908428935320675
best_precision:  0.4989463543861762
best_recall:  0.4300467047223664
best_f1:  0.4619415256835475
