In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from utils import import_data_to_matrix_split, extract_submission, import_data_to_matrix
from utils import get_rmse_score

In [None]:
class IRSVD():
    """
    Biased matrix factorization trained with stochastic gradient descent.

    A rating is predicted as
    ``global_mean + Bu[user] + Bi[item] + dot(U[user], V[item])``.
    Entries of the input matrix that are zero are treated as missing and are
    never used as training samples.
    """

    def __init__(self, A, biases="mean", features=324, eta=0.01, lambda1=0.02, lambda2=0.05, epochs=15, seed=None):
        """
        Perform matrix decomposition to predict empty
        entries in a matrix.

        Parameters
        ----------
        A : 2-D numpy array, shape (num_users, num_items)
            Rating matrix; a zero marks a missing rating.
        biases : str
            "zero" starts every user/item bias at 0. Any other value starts
            them at the centred per-user / per-item mean rating.
        features : int
            Number of latent features per user and per item.
        eta : float
            SGD learning rate.
        lambda1 : float
            L2 regularization applied to the latent factors U and V.
        lambda2 : float
            L2 regularization applied to the biases Bu and Bi.
        epochs : int
            Number of passes over the training entries made by ``train``.
        seed : int or None
            When given, a private ``np.random.RandomState(seed)`` drives the
            factor initialization and the per-epoch shuffling so runs are
            reproducible. ``None`` keeps using the global ``np.random`` state.
        """
        self.A = A
        train_users, train_items = self.A.nonzero()
        # One (user, item, rating) triple per observed rating.
        self.train_entries = list(zip(train_users, train_items, self.A[train_users, train_items]))
        # 0/1 mask of the strictly positive entries.
        self.W = (self.A > 0).astype(int)
        self.num_users, self.num_items = self.A.shape
        self.features = features
        self.eta = eta
        self.lambda1 = lambda1
        self.lambda2 = lambda2
        self.epochs = epochs

        # The np.random module and a RandomState expose the same normal() and
        # shuffle() functions, so either can be used interchangeably below.
        self._rng = np.random if seed is None else np.random.RandomState(seed)

        # Initialize user and item latent feature matrices
        self.U = self._rng.normal(scale=1./self.features, size=(self.num_users, self.features))
        self.V = self._rng.normal(scale=1./self.features, size=(self.num_items, self.features))

        # Initialize the biases
        observed = self.W * self.A
        num_ratings = np.sum(self.W)
        # Fall back to 0 instead of NaN when the matrix holds no rating at all.
        self.global_mean = np.sum(observed) / num_ratings if num_ratings else 0.0
        if biases == "zero":
            self.Bu = np.zeros(self.num_users)
            self.Bi = np.zeros(self.num_items)
        else:
            self.Bu = self._centered_means(observed.sum(axis=1), self.W.sum(axis=1))
            self.Bi = self._centered_means(observed.sum(axis=0), self.W.sum(axis=0))

        # Keep the biases as column vectors: Bu is (num_users, 1), Bi is (num_items, 1).
        self.Bu = self.Bu.reshape(-1, 1)
        self.Bi = self.Bi.reshape(-1, 1)

    @staticmethod
    def _centered_means(totals, counts):
        """
        Return each row's mean rating minus the average of those means.

        Rows without any rating (count 0) would otherwise yield 0/0 = NaN and
        poison every bias through the centring mean; they get a bias of 0 and
        are left out of the centring mean instead.
        """
        rated = counts > 0
        means = np.zeros(counts.shape[0], dtype=float)
        means[rated] = totals[rated] / counts[rated]
        if not rated.any():
            return means
        centered = means - means[rated].mean()
        centered[~rated] = 0.0
        return centered

    def train(self, test_matrix=None):
        """
        Run ``self.epochs`` epochs of SGD over the training entries.

        After every epoch the full matrix is reconstructed and scored against
        the training matrix and, when ``test_matrix`` is given, against it too.

        Returns
        -------
        dict
            ``{"train_rmse": [...], "test_rmse": [...]}`` with one value per
            epoch ("test_rmse" stays empty when no test matrix is given).
        """
        error_progress = {
            "train_rmse": [],
            "test_rmse": [],
        }
        for _ in tqdm(range(self.epochs)):
            # shuffling will help during training
            self._rng.shuffle(self.train_entries)
            self._sgd()
            rec_A = self.reconstruct_matrix()
            error_progress["train_rmse"].append(get_rmse_score(rec_A, self.A))
            if test_matrix is not None:
                error_progress["test_rmse"].append(get_rmse_score(rec_A, test_matrix))
        return error_progress

    def _sgd(self):
        """
        Perform one pass of stochastic gradient descent over the training
        entries, updating Bu, Bi, U and V in place.
        """
        for user, item, rating in self.train_entries:
            # Compute prediction and error
            prediction = (self.global_mean + self.Bu[user, 0] + self.Bi[item, 0]
                          + np.dot(self.U[user, :], self.V[item, :]))
            error = rating - prediction

            # Update biases
            self.Bu[user, 0] += self.eta * (error - self.lambda2 * self.Bu[user, 0])
            self.Bi[item, 0] += self.eta * (error - self.lambda2 * self.Bi[item, 0])

            # Update user and item feature matrices. The item update must use
            # the user vector from before this step, hence the copy.
            old_user_factors = np.copy(self.U[user, :])
            self.U[user, :] += self.eta * (error * self.V[item, :] - self.lambda1 * self.U[user, :])
            self.V[item, :] += self.eta * (error * old_user_factors - self.lambda1 * self.V[item, :])

    def reconstruct_matrix(self):
        """
        Compute the reconstructed matrix using biases, U and V.

        Returns an array of shape (num_users, num_items).
        """
        # Bu is (num_users, 1) and Bi.T is (1, num_items); broadcasting expands
        # both to the full matrix without building intermediate Python lists.
        biases = self.global_mean + self.Bu + self.Bi.T
        return biases + np.dot(self.U, self.V.T)

- Best parameters found so far (`k` is the `features` argument of `IRSVD`):
$$\text{irsvd}(k=324,\ \eta=0.01,\ \lambda_1=0.02,\ \lambda_2=0.05,\ \text{epochs}=15)$$
$$\text{irsvd}(k=296,\ \eta=0.01,\ \lambda_1=0.02,\ \lambda_2=0.05,\ \text{epochs}=14)$$
$$\text{irsvd}(k=148,\ \eta=0.01,\ \lambda_1=0.02,\ \lambda_2=0.05,\ \text{epochs}=13)$$
$$\text{irsvd}(k=96,\ \eta=0.01,\ \lambda_1=0.02,\ \lambda_2=0.05,\ \text{epochs}=13)$$

In [None]:
# A, test_matrix = import_data_to_matrix_split()
# model = IRSVD(A, biases="zero", features=324, eta=0.01, lambda1=0.02, lambda2=0.05, epochs=13)
# model.train(test_matrix=test_matrix)

In [None]:
# Fit the final model with the best hyperparameters listed above, then rebuild
# the dense prediction matrix.
# NOTE(review): import_data_to_matrix presumably loads the full training
# matrix without a hold-out split — confirm in utils.
A = import_data_to_matrix()
model = IRSVD(
    A,
    biases="mean",
    features=324,
    eta=0.01,
    lambda1=0.02,
    lambda2=0.05,
    epochs=15,
)
model.train()
rec_A = model.reconstruct_matrix()

In [None]:
# Clamp the predictions to the [1, 5] range, modifying rec_A in place.
np.clip(rec_A, 1, 5, out=rec_A)

In [None]:
# Hand the clipped prediction matrix to the project helper.
# NOTE(review): presumably writes a submission file named after "irsvd" —
# confirm the output location and format in utils.extract_submission.
extract_submission(rec_A, file="irsvd")

In [None]:
data_pd = pd.read_csv("./data/data_train.csv")
from sklearn.model_selection import KFold

kf = KFold(n_splits=10, shuffle=True)
print(len(data_pd))
for train_i, test_i in kf.split(data_pd):
    print(train_i)
    print(test_i)
    split = data_pd.iloc[train_i]