In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from utils import import_data_to_matrix_split, import_data_to_matrix, extract_submission
from utils import zscore_masked_items, get_rmse_score

# Singular Value Thresholding / Nuclear norm relaxation

## Data Preprocessing
- Extract data to row-column format
- Impute missing data with 0
- Rating matrix A
- Observation matrix Ω
- Normalize item by item (z-scores)
$$A_{ij} = \frac{A_{ij} - \overline{A_{j}}}{\text{std}(A_{j})}$$

Important: 
- Only observed entries are updated
- Mean and std are computed only over observed entries

## Projected gradient descent
Let $A = U \text{diag}(\sigma_{i}) V^{\top}$, define
$$ \text{shrink}_{\tau}(A) = U \text{diag}(\sigma_{i} - \tau)_{+} V^{\top}$$
The algorithm is:
$$A^0 = 0$$
$$A^{t+1} = A^t + \eta\Pi_{\Omega}(A - \text{shrink}_{\tau}(A^t))$$

## Reconstruction
- Shrink last $A^t$
- Then undo the normalization

In [None]:
class SVT():
    """Singular value thresholding for matrix completion.

    Starting from ``A_t = 0`` the model repeats
    ``A_t <- A_t + eta * W * (norm_A - shrink_tau(A_t))``, where ``W`` masks the
    observed entries and ``norm_A`` is the normalized rating matrix returned
    by ``zscore_masked_items``.
    """

    def __init__(self, A, eta=1.2, tau=800, epochs=12):
        """Store hyperparameters, build the observation mask and normalize A.

        Args:
            A: rating matrix (users x items); entries <= 0 count as unobserved.
            eta: step size of the update.
            tau: amount subtracted from every singular value when shrinking.
            epochs: number of update steps performed by ``train``.
        """
        self.eta, self.tau, self.epochs = eta, tau, epochs

        self.A = A
        self.num_users, self.num_items = self.A.shape
        # 1 where a rating is present (> 0), 0 elsewhere
        self.W = (self.A > 0).astype(int)

        normalized = zscore_masked_items(self.A, self.W)
        self.norm_A, self.mean_A, self.stddev_A = normalized
        # the iterate starts at the zero matrix
        self.A_t = np.zeros((self.num_users, self.num_items))

    def train(self, test_matrix=None):
        """Run ``epochs`` update steps, scoring the reconstruction after each.

        Args:
            test_matrix: optional second matrix; when given, every epoch's
                reconstruction is also scored against it.

        Returns:
            dict with keys ``"train_rmse"`` and ``"test_rmse"``; each maps to a
            list holding one ``get_rmse_score`` value per epoch
            (``"test_rmse"`` stays empty when ``test_matrix`` is None).
        """
        train_scores, test_scores = [], []
        has_test = test_matrix is not None
        for _ in tqdm(range(self.epochs)):
            self._shrinkgd()
            # NOTE: a decaying step size (eta / sqrt(epoch + 1)) was tried here
            # and is currently disabled.
            prediction = self.reconstruct_matrix()
            train_scores.append(get_rmse_score(prediction, self.A))
            if has_test:
                test_scores.append(get_rmse_score(prediction, test_matrix))
        return {"train_rmse": train_scores, "test_rmse": test_scores}

    def _shrinkgd(self):
        """Perform one update of ``A_t``, restricted to the observed entries."""
        residual = self.norm_A - self._shrink()
        self.A_t = self.A_t + self.eta * self.W * residual

    def _shrink(self):
        """Return ``A_t`` with each singular value lowered by tau, floored at 0."""
        left, sigma, right_t = np.linalg.svd(self.A_t, full_matrices=False)
        sigma = np.maximum(sigma - self.tau, 0)
        # scaling the columns of U by sigma equals U @ diag(sigma)
        return np.dot(left * sigma, right_t)

    def reconstruct_matrix(self):
        """
        Build the full prediction matrix: shrink the current A_t, then undo
        the per-item normalization (multiply by the item std, add the item mean).
        """
        # _shrink() hands back a new array, so the in-place ops leave A_t intact
        rec_A = self._shrink()
        # one std / mean per item column, broadcast over all user rows
        rec_A *= np.ravel(self.stddev_A)
        rec_A += np.ravel(self.mean_A)
        return rec_A

- Uncomment the cell below to experiment with the hyperparameters (`eta`, `tau`, `epochs`) on a train/test split, and watch how the test-set error evolves

In [None]:
# A, test_matrix = import_data_to_matrix_split()
# model = SVT(A, eta=1.2, tau=2000, epochs=28)
# model.train(test_matrix=test_matrix)

In [None]:
# Fit the model on the matrix returned by import_data_to_matrix()
# (presumably the full, unsplit rating data - confirm in utils).
A = import_data_to_matrix()
model = SVT(A, eta=1.2, tau=800, epochs=12)
# No test matrix is passed, so only the training score is tracked;
# the returned error history is discarded here.
model.train()
# Dense prediction matrix with the item-wise normalization undone.
rec_A = model.reconstruct_matrix()

In [None]:
# Clamp every prediction into [1, 5], writing back into the same array.
rec_A[:] = np.clip(rec_A, 1, 5)

In [None]:
# Hand the clipped predictions to the project helper; presumably it writes a
# submission file labelled by `file` - confirm in utils.extract_submission.
extract_submission(rec_A, file="svt")