In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from utils import import_data_to_matrix_split, import_data_to_matrix, extract_submission
from utils import zscore_masked_items, get_rmse_score

# Baseline Method

## Data Preprocessing
- Extract data to row-column format
- Impute missing data with 0
- Rating matrix A
- Observation matrix Ω
- Normalize item by item (z-scores)
$$A_{ij} = \frac{A_{ij} - \overline{A_{j}}}{\operatorname{std}(A_{j})}$$

Important: 
- Only observed entries are updated
- The mean and standard deviation are computed only over observed entries

## SVD & ALS initialization
- Initial U and V obtained from SVD
$$A_{n \times m} \approx U_{n \times k} V_{k \times m}$$

## Alternating Least Square
- Always converges, but there is no guarantee that it converges to the global optimum
- Objective function:
$$l(U, V) = \frac{1}{2}||Π_{Ω}(A - UV)||_{F}^{2} + \frac{λ}{2}(||U||_{F}^{2} + ||V||_{F}^{2})$$

### Solve
$$ v_j^* = \left(\sum_{i=1}^{n} ω_{ij}u_iu_i^T + λI\right)^{-1} \left(\sum_{i=1}^{n} ω_{ij}a_{ij}u_{i}\right)$$

$$ u_i^* = \left(\sum_{j=1}^{m} ω_{ij}v_jv_j^T + λI\right)^{-1} \left(\sum_{j=1}^{m} ω_{ij}a_{ij}v_{j}\right)$$

- Trick: Solve a system of linear equations instead of finding the inverse

$$ \left(\sum_{i=1}^{n} ω_{ij}u_iu_i^T + λI\right)v_j^* = \sum_{i=1}^{n} ω_{ij}a_{ij}u_{i}$$

$$ \left(\sum_{j=1}^{m} ω_{ij}v_jv_j^T + λI\right)u_i^* = \sum_{j=1}^{m} ω_{ij}a_{ij}v_{j}$$

- Note that $u_{i}$ is the $i^{th}$ row of $U$ and $v_{j}$ is the $j^{th}$ column of $V$.

## Reconstruction
- Reconstruct data from the result of ALS (UV)
- Then undo normalization

In [None]:
class Baseline():
    """Regularized low-rank matrix factorization fitted with Alternating Least Squares.

    The rating matrix ``A`` (users x items, with 0 marking a missing rating) is
    normalized item by item via ``zscore_masked_items`` and approximated by the
    product ``U @ V`` where ``U`` is (num_users, K) and ``V`` is (K, num_items).
    Only observed entries contribute to the data term of the objective.

    Parameters
    ----------
    A : np.ndarray
        2-D rating matrix; entries greater than 0 are treated as observed.
    K : int
        Number of latent factors (rank of the factorization).
    lambda1 : float
        Weight of the Frobenius-norm (ridge) penalty on ``U`` and ``V``.
    epochs : int
        Number of full ALS sweeps performed by ``train``.

    Raises
    ------
    ValueError
        If ``K`` is larger than the smaller dimension of ``A``.
    """

    def __init__(self, A, K=3, lambda1=0.1, epochs=20):
        self.A = A
        # Observation mask: 1 where a rating is present, 0 where it was imputed.
        self.W = (self.A > 0).astype(int)
        self.num_users, self.num_items = self.A.shape
        self.K = K
        self.lambda1 = lambda1
        self.epochs = epochs

        if self.K > min(self.num_users, self.num_items):
            raise ValueError(
                "K must not exceed min(num_users, num_items) = "
                f"{min(self.num_users, self.num_items)}, got {self.K}"
            )

        # NOTE(review): zscore_masked_items is defined in utils; presumably it
        # returns the normalized matrix plus per-item mean and std computed over
        # observed entries only -- confirm against the helper.
        self.norm_A, self.mean_A, self.stddev_A = zscore_masked_items(self.A, self.W)

        # Initialize the factors from the top-K singular triplets. Scaling the
        # singular vectors directly avoids building a dense square S matrix and
        # works for any matrix shape (more users than items or vice versa).
        U, s, Vt = np.linalg.svd(self.norm_A, full_matrices=False)
        top_s = s[:self.K]
        self.U = U[:, :self.K] * top_s
        # V is fully overwritten by the first half-sweep of _als, so only its
        # shape matters here; the values mirror the original initialization.
        self.V = top_s[:, np.newaxis] * Vt[:self.K]

    def _loss(self):
        """Return the ALS objective l(U, V) on the normalized matrix.

        The data term is evaluated against ``norm_A`` because that is the matrix
        the ALS updates actually fit; unobserved entries are masked out by ``W``.
        """
        residual = self.W * (self.norm_A - np.dot(self.U, self.V))
        data_term = 0.5 * np.sum(residual ** 2)
        penalty = (self.lambda1 / 2) * (np.sum(self.U ** 2) + np.sum(self.V ** 2))
        return data_term + penalty

    def train(self, test_matrix=None):
        """Run ``epochs`` ALS sweeps and record the RMSE after each one.

        Parameters
        ----------
        test_matrix : optional
            Held-out ratings passed to ``get_rmse_score`` after every sweep.

        Returns
        -------
        dict
            ``{"train_rmse": [...], "test_rmse": [...]}`` with one value per
            epoch; ``"test_rmse"`` stays empty when ``test_matrix`` is None.
        """
        error_progress = {
            "train_rmse": [],
            "test_rmse": [],
        }
        for _ in tqdm(range(self.epochs)):
            self._als()
            rec_A = self.reconstruct_matrix()
            # NOTE(review): get_rmse_score lives in utils; presumably it scores
            # only the observed entries of its second argument -- confirm.
            error_progress["train_rmse"].append(get_rmse_score(rec_A, self.A))
            if test_matrix is not None:
                error_progress["test_rmse"].append(get_rmse_score(rec_A, test_matrix))
        return error_progress

    def _als(self):
        """Perform one ALS sweep: update every column of V, then every row of U.

        Each update solves the K x K ridge-regularized normal equations built
        from the observed entries only. Selecting the observed rows/columns with
        a boolean mask gives the same system as multiplying by diag(W) without
        allocating an n x n (or m x m) diagonal matrix per solve.
        """
        ridge = self.lambda1 * np.eye(self.K)
        observed = np.asarray(self.W).astype(bool)

        # Item factors: regress each item's observed ratings on the user factors.
        for j, rated_by in enumerate(observed.T):
            U_obs = self.U[rated_by]
            self.V[:, j] = np.linalg.solve(
                np.dot(U_obs.T, U_obs) + ridge,
                np.dot(U_obs.T, self.norm_A[rated_by, j]),
            )

        # User factors: regress each user's observed ratings on the item factors.
        for i, rated_items in enumerate(observed):
            V_obs = self.V[:, rated_items]
            self.U[i] = np.linalg.solve(
                np.dot(V_obs, V_obs.T) + ridge,
                np.dot(V_obs, self.norm_A[i, rated_items]),
            )

    def reconstruct_matrix(self):
        """
        Compute the full matrix using U and V from ALS and undo the normalization.

        Returns a new (num_users, num_items) array where column j of ``U @ V``
        has been multiplied by ``stddev_A[j]`` and shifted by ``mean_A[j]``.
        """
        rec_A = np.dot(self.U, self.V)
        # Broadcast the per-item statistics across all users (rows).
        scale = np.asarray(self.stddev_A).reshape(1, -1)
        shift = np.asarray(self.mean_A).reshape(1, -1)
        return rec_A * scale + shift

In [None]:
# Optional: evaluate on a held-out split (uncomment all three lines to run).
# A, test_matrix = import_data_to_matrix_split()
# model = Baseline(A, K=3, lambda1=0.1, epochs=20)
# model.train(test_matrix=test_matrix)

In [None]:
# Load the rating matrix via the project helper (presumably the full data set,
# as opposed to import_data_to_matrix_split -- confirm in utils).
A = import_data_to_matrix()
# K latent factors, ridge weight lambda1, and number of ALS sweeps.
model = Baseline(A, K=3, lambda1=0.1, epochs=20)
# train() returns the per-epoch RMSE history; it is discarded here.
model.train()
# Dense prediction matrix with the item-wise normalization undone.
rec_A = model.reconstruct_matrix()

In [None]:
# Clamp predictions in place to the closed interval [1, 5]
# (presumably the valid rating scale -- confirm against the data set).
rec_A[...] = np.clip(rec_A, 1, 5)

In [None]:
# Hand the clipped predictions to the project helper that writes the submission;
# the output location/format is defined in utils.extract_submission (not shown here).
extract_submission(rec_A, file="baseline")