In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from utils import import_data_to_matrix_split, import_data_to_matrix, extract_submission
from utils import zscore_masked_items, get_rmse_score

# Vanilla SVD
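$$A = U \Sigma V^{T}, \quad \Sigma = \mathrm{diag}(\sigma_1, \ldots, \sigma_{\min\{n,m\}})$$
- $A \in \mathbb{R}^{n \times m}$ and $\Sigma \in \mathbb{R}^{n \times m}$; $U \in \mathbb{R}^{n \times n}$ and $V \in \mathbb{R}^{m \times m}$ are both orthogonal matrices.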
- Keep only the $K$ largest singular values and set the rest to zero, which gives a rank-$K$ approximation of $A$.

In [2]:
class SVD():
    """Rank-K truncated SVD of a normalized user x item matrix.

    The input matrix ``A`` is normalized with ``zscore_masked_items`` (from
    ``utils``; presumably a per-item z-score over the entries flagged in
    ``W`` -- confirm there), factorized with ``numpy.linalg.svd``, truncated
    to the ``K`` largest singular values, and finally mapped back to the
    original scale with the per-item standard deviation and mean.

    Attributes set by ``__init__``:
        A: the matrix passed in, rows are users and columns are items.
        W: integer mask, 1 where ``A > 0`` and 0 elsewhere.
        num_users, num_items: the two dimensions of ``A``.
        K: number of singular values kept by ``train``.
        norm_A, mean_A, stddev_A: outputs of ``zscore_masked_items``.

    Attributes set by ``train``:
        U, Vt: thin SVD factors of ``norm_A``.
        S: square diagonal matrix holding the kept singular values; every
            other entry is zero.
    """

    def __init__(self, A, K=8):
        """Store ``A``, build the ``A > 0`` mask and normalize the matrix.

        Args:
            A: 2-D array of shape (num_users, num_items).
            K: how many of the largest singular values ``train`` keeps.
        """
        self.A = A
        # Entries greater than zero are flagged with 1 in the mask.
        self.W = (self.A > 0).astype(int)
        self.num_users, self.num_items = self.A.shape
        self.K = K
        # The factors stay None until train() has been called.
        self.U = None
        self.S = None
        self.Vt = None
        self.norm_A, self.mean_A, self.stddev_A = zscore_masked_items(self.A, self.W)

    def train(self, test_matrix=None):
        """Factorize ``norm_A`` and keep the ``K`` largest singular values.

        Args:
            test_matrix: optional matrix; when given, the reconstruction is
                also scored against it.

        Returns:
            dict with the keys ``"train_rmse"`` and ``"test_rmse"``. Each maps
            to a list; ``"train_rmse"`` receives one score and ``"test_rmse"``
            receives one score only when ``test_matrix`` is not None.
        """
        error_progress = {
            "train_rmse": [],
            "test_rmse": [],
        }
        # Thin SVD: U is (n, r), s has r entries, Vt is (r, m), r = min(n, m).
        self.U, s, self.Vt = np.linalg.svd(self.norm_A, full_matrices=False)
        # Size S from the number of singular values actually returned so that
        # U @ S @ Vt is well-formed even when there are fewer users than items.
        rank = s.shape[0]
        self.S = np.zeros((rank, rank))
        # Keep the top-K singular values (s is sorted in descending order).
        self.S[:self.K, :self.K] = np.diag(s[:self.K])
        rec_A = self.reconstruct_matrix()
        train_rmse = get_rmse_score(rec_A, self.A)
        error_progress["train_rmse"].append(train_rmse)
        if test_matrix is not None:
            test_rmse = get_rmse_score(rec_A, test_matrix)
            error_progress["test_rmse"].append(test_rmse)
        return error_progress

    def reconstruct_matrix(self):
        """
        Compute the full matrix using U, S and V from SVD and undo normalization.

        Returns:
            2-D array ``U @ S @ Vt`` with column ``j`` multiplied by
            ``stddev_A[j]`` and shifted by ``mean_A[j]``.

        Raises:
            RuntimeError: if ``train`` has not been called yet.
        """
        if self.U is None or self.S is None or self.Vt is None:
            raise RuntimeError("SVD.train() must be called before reconstruct_matrix()")
        # S is diagonal, so scaling the columns of U by its diagonal equals
        # U @ S without the dense matrix product.
        rec_A = (self.U * np.diag(self.S)).dot(self.Vt)
        # Undo normalization: one scale and one shift per item column.
        stddev = np.asarray(self.stddev_A).reshape(-1)
        mean = np.asarray(self.mean_A).reshape(-1)
        return rec_A * stddev + mean


In [3]:
# A, test_matrix = import_data_to_matrix_split()
# model = SVD(A, K=12)
# model.train(test_matrix=test_matrix)

In [4]:
# Load the data matrix via the utils helper (its exact format is not shown here).
A = import_data_to_matrix()
# Build the model, keeping K=8 singular values.
model = SVD(A, K=8)
# train() returns a dict of RMSE lists; the return value is discarded here.
model.train()
# Rebuild the rank-K matrix with the normalization undone.
rec_A = model.reconstruct_matrix()

In [5]:
# Clamp every reconstructed value into the closed interval [1, 5], writing
# the result back into the same array.
rec_A[...] = np.clip(rec_A, 1, 5)

In [6]:
# Hand the clipped matrix to the utils helper; presumably it writes a
# submission file named after "svd" -- confirm in utils.extract_submission.
extract_submission(rec_A, file="svd")