In [None]:
import numpy as np
import pandas as pd
import os

from sklearn.model_selection import KFold
from utils import create_matrix_from_raw, user_movies_pred, extract_submission
from utils import RAND_SEED

## Stacking Ensemble - Blending
Assume we have $n$ models and denote the matrix reconstructed by the
$i$-th model by $\hat{A}^i$. The final prediction is then
$$\hat{A} = w_0 + \sum_{i=1}^{n}w_i \hat{A}^i$$

The level-0 reconstructions $\hat{A}^i$ are kept fixed; only the weights $w_i$, for $i = 0, 1, \dots, n$, are learned.

We split the data into training and holdout set (say 90% - 10%). The level-0 $n$ models are trained on the training set. The weights $w_i$ of the level-1 ensemble model are then learned on the holdout set.

### Improvement
- Use K=10-fold cross-validation: learn one set of weights per fold and average them to obtain the final ensemble weights
- All models were produced as shown in generate_ensembles.ipynb

In [None]:
class Ensemble():
    """Linear blending of pre-computed model reconstructions.

    For every cross-validation fold, the reconstructed matrices stored in
    ``./data/ensemble/train/<fold>/`` are sampled at the fold's held-out
    entries, and a least-squares fit with an intercept yields one weight per
    model. The per-fold weights are averaged and applied to the matrices
    stored in ``./data/ensemble/final/``.

    Weights are matched to models by position in the alphabetically sorted
    list of ``.npy`` file names, so the fold directories and the final
    directory must hold model files whose names sort in the same order.
    """

    def __init__(self, num_folds=10):
        """
        Load the rating data and prepare the 10-fold splitter.

        num_folds: how many of the 10 folds are used to learn weights.
            Values outside 1..10 fall back to 1.
        """
        self.num_folds = num_folds
        if self.num_folds not in range(1, 11):
            self.num_folds = 1
        self.data_pd = pd.read_csv("./data/data_train.csv")
        self.sample_pd = pd.read_csv("./data/sampleSubmission.csv")
        # Always split into 10 folds; num_folds only limits how many are used.
        self.kf = KFold(n_splits=10, shuffle=True, random_state=RAND_SEED)
        self.weights = None      # per-fold weights, shape (folds, n_models + 1)
        self.avg_weights = None  # fold-averaged weights, intercept at index 0

    @staticmethod
    def _model_files(path):
        """Return the paths of the ``.npy`` files in *path*, sorted by name."""
        # os.listdir gives no ordering guarantee; sorting keeps the
        # weight <-> model pairing identical across directories.
        return [
            os.path.join(path, name)
            for name in sorted(os.listdir(path))
            if name.endswith(".npy")
        ]

    def train(self):
        """
        Learn the blending weights on the held-out part of each fold and
        store their average in ``self.avg_weights``.
        """
        fold_weights = []
        for idx, (_, val_set) in enumerate(self.kf.split(self.data_pd)):
            if idx == self.num_folds:
                break

            # Only the held-out rows are needed: the level-0 models are
            # loaded from disk rather than trained here.
            val_data = self.data_pd.iloc[val_set]
            val_users, val_movies, val_pred = user_movies_pred(val_data)

            # First row of ones is the intercept term w_0.
            rows = [np.ones(val_pred.shape[0])]
            path = "./data/ensemble/train/" + str(idx + 1) + "/"
            for file_path in self._model_files(path):
                rec_A = np.load(file_path)
                rows.append(rec_A[val_users, val_movies])
            X_ensemble = np.vstack(rows).T

            b, _, _, _ = np.linalg.lstsq(X_ensemble, val_pred, rcond=None)
            fold_weights.append(b)

        # Stacking a list keeps the result 2-D even for a single fold and
        # resets the state when train() is called again.
        self.weights = np.vstack(fold_weights)
        self.avg_weights = self.weights.mean(axis=0)

    def reconstruct_matrix(self):
        """
        Reconstruct matrix based on the ensemble weights, on full trained models

        Returns the intercept plus the weighted sum of the matrices stored in
        ./data/ensemble/final/.

        Raises RuntimeError if train() has not been called, and ValueError if
        the number of model files does not match the number of model weights.
        """
        if self.avg_weights is None:
            raise RuntimeError("train() must be called before reconstruct_matrix()")

        path = "./data/ensemble/final/"
        files = self._model_files(path)
        model_weights = self.avg_weights[1:]
        if not files or len(files) != len(model_weights):
            raise ValueError(
                "found %d model files in %s but have %d model weights"
                % (len(files), path, len(model_weights))
            )

        rec_A = None
        for weight, file_path in zip(model_weights, files):
            term = np.load(file_path) * weight
            if rec_A is None:
                rec_A = term
            else:
                # Each model is added exactly once.
                rec_A += term
        rec_A += self.avg_weights[0]
        return rec_A

In [None]:
# Build the ensemble with its default of 10 folds, learn the fold-averaged
# blending weights, then combine the models in ./data/ensemble/final/.
model = Ensemble()
model.train()
rec_A = model.reconstruct_matrix()

In [None]:
# Clamp every entry of the blended matrix into [1, 5], writing the result
# back into the same array.
rec_A[...] = np.clip(rec_A, 1, 5)

In [None]:
# Pass the clipped matrix to the project's extract_submission helper
# (presumably writes a submission named "ensemble" — confirm in utils).
extract_submission(rec_A, file="ensemble")