In [None]:
import os

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

from utils import create_matrix_from_raw, RAND_SEED
from models import SVD, IRSVD, Baseline, GBias, SVP, SVT, RSVD

## Some useful functions for analysis and plots

In [None]:
# Use a larger default font for every figure produced in this notebook.
matplotlib.rcParams["font.size"] = 15
def plot_graph(xlabel, ylabel, x_vals, y_vals, name):
    """Plot ``y_vals`` against ``x_vals``, highlight the minimum and save a PDF.

    Parameters
    ----------
    xlabel, ylabel : str
        Axis labels.
    x_vals : sequence
        X coordinates; they are also used verbatim as the tick positions.
    y_vals : sequence of numbers
        Y coordinates (list, tuple or numpy array), one per entry of ``x_vals``.
    name : str
        File stem; the figure is written to ``./experiments_out/<name>.pdf``.

    Raises
    ------
    ValueError
        If ``y_vals`` is empty.
    """
    # Locate the minimum before creating the figure so that an empty input
    # fails without leaving a blank figure behind. argmin returns the first
    # minimum and works for any sequence type, not just lists.
    min_idx = int(np.argmin(y_vals))

    fig, axs = plt.subplots(1, 1, figsize=(9, 5))
    axs.plot(x_vals, y_vals)
    # Red dot on the best (lowest) point.
    axs.plot(x_vals[min_idx], y_vals[min_idx], "ro")
    axs.set_xlabel(xlabel)
    axs.set_ylabel(ylabel)
    axs.set_xticks(x_vals)

    out_path = "./experiments_out/" + name + ".pdf"
    # savefig does not create missing directories, so make sure the target exists.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    fig.savefig(out_path, bbox_inches='tight')

def plot_graph2(train_err, test_err, name):
    """Plot train and test RMSE per epoch, highlight the best test epoch, save a PDF.

    Parameters
    ----------
    train_err, test_err : sequence of numbers
        Per-epoch errors (list, tuple or numpy array). Epochs are numbered
        from 1 and the x axis has ``len(train_err)`` entries.
    name : str
        File stem; the figure is written to ``./experiments_out/<name>.pdf``.

    Raises
    ------
    ValueError
        If ``test_err`` is empty.
    """
    # Find the best test epoch first so an empty input fails before a figure
    # is created. argmin returns the first minimum and accepts any sequence.
    best_idx = int(np.argmin(test_err))
    x_vals = list(range(1, len(train_err) + 1))

    fig, axs = plt.subplots(1, 1, figsize=(9, 5))
    axs.plot(x_vals, train_err, label="train")
    axs.plot(x_vals, test_err, label="test")
    axs.legend(loc="lower left")
    # Red dot on the epoch with the lowest test error.
    axs.plot(x_vals[best_idx], test_err[best_idx], "ro")
    axs.set_xlabel("Epoch")
    axs.set_ylabel("RMSE")
    axs.set_xticks(x_vals)

    out_path = "./experiments_out/" + name + ".pdf"
    # savefig does not create missing directories, so make sure the target exists.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    fig.savefig(out_path, bbox_inches='tight')

def analyse_results(results):
    """Summarise per-fold test RMSE curves at one shared epoch.

    The epoch is selected on the FIRST fold only (the epoch with the lowest
    test RMSE there); every fold, including the first, is then read at that
    same epoch.

    Parameters
    ----------
    results : iterable of dict
        One entry per fold. Each entry must provide ``"test_rmse"``: the
        per-epoch test RMSE values as a list or numpy array.

    Returns
    -------
    tuple
        ``(mean, std, best_epoch)`` where ``mean`` and ``std`` (population
        standard deviation, numpy's default) are taken over the per-fold
        errors and ``best_epoch`` is the selected epoch, counted from 1.

    Raises
    ------
    ValueError
        If ``results`` is empty or the first fold has no epochs.
    IndexError
        If a later fold has fewer epochs than the selected one.
    """
    results = list(results)
    if not results:
        raise ValueError("analyse_results needs at least one fold result")

    # argmin returns the first minimum, like list.index(min(...)), but also
    # accepts numpy arrays.
    best_epoch = int(np.argmin(results[0]["test_rmse"]))

    fold_errors = np.array([fold["test_rmse"][best_epoch] for fold in results])
    return fold_errors.mean(), fold_errors.std(), best_epoch + 1

## KFolds
- Hyperparameter tuning using 10-fold cross-validation (k=10)

In [None]:
# Raw training data; every experiment below slices rows out of this frame.
data_pd = pd.read_csv("./data/data_train.csv")

# Shuffled 10-fold splitter shared by all experiments, seeded with RAND_SEED.
kf = KFold(n_splits=10, random_state=RAND_SEED, shuffle=True)

# Check whether we have the same splits: print the train and test row
# indices of every fold.
for train_set, test_set in kf.split(data_pd):
    print(train_set, test_set, sep="\n")

## Experiments

### SVD - Baseline 1
- In this experiment we will find the best value for k (the number of most important singular values to keep)

In [None]:
# Sweep the number of singular values k over 1..20 and score each setting
# with the 10-fold splitter `kf`.
print("SVD finding best k value")
data = dict()  # k -> mean per-fold test RMSE
for k in range(1,21):
    results = []  # one `train()` result per fold
    for idx, (train_set, test_set) in enumerate(kf.split(data_pd)):
        # Rows of data_pd belonging to this fold's train / test part
        train_data = data_pd.iloc[train_set]
        test_data = data_pd.iloc[test_set]
        
        # presumably converts the raw rows into the matrix form the models
        # expect — confirm in utils.create_matrix_from_raw
        train_matrix = create_matrix_from_raw(train_data)
        test_matrix = create_matrix_from_raw(test_data)
        
        model = SVD(train_matrix, K=k)
        # The result is consumed by analyse_results, which reads result["test_rmse"]
        result = model.train(test_matrix=test_matrix)
        results.append(result)
    # analyse_results also returns the selected epoch; it is not needed here
    mean, std, _ = analyse_results(results)
    print(f"For k={k}: Mean={mean}, Std={std}")
    data[k] = mean
ks = list(data.keys())
scores = list(data.values())
# Writes ./experiments_out/svd.pdf with the lowest score marked by a red dot
plot_graph("Number of singular values", "Avg. RMSE", ks, scores, "svd")


### ALS (U,V decomposition) - Baseline 2
- In this experiment we will use the best value found for k (k=3) and explore the number of epochs needed until convergence.
- We will fix $λ$ to 0.1

In [None]:
# We will only perform this test on the first fold
print("Finding number of epochs until convergence")

# Take just the first (train, test) split instead of looping and breaking.
train_set, test_set = next(iter(kf.split(data_pd)))
train_data = data_pd.iloc[train_set]
test_data = data_pd.iloc[test_set]

train_matrix = create_matrix_from_raw(train_data)
test_matrix = create_matrix_from_raw(test_data)

model = Baseline(train_matrix, K=3, lambda1=0.1, epochs=5)
result = model.train(test_matrix=test_matrix)
results = [result]

print("For k=3 and lambda=0.1")
scores = results[0]["test_rmse"]
# Derive the x axis from the recorded curve rather than repeating the epoch
# count, so x and y always have the same length when plotted.
epochs = list(range(1, len(scores) + 1))
plot_graph("Epoch", "RMSE", epochs, scores, "als")

- Cross validation score for k=3, λ=0.1, epochs=5

In [None]:
# 10-fold cross-validation of the ALS baseline at the fixed setting
# K=3, lambda1=0.1, epochs=5.
results = []  # one `train()` result per fold
for idx, (train_set, test_set) in enumerate(kf.split(data_pd)):
    # Rows of data_pd belonging to this fold's train / test part
    train_data = data_pd.iloc[train_set]
    test_data = data_pd.iloc[test_set]
    
    train_matrix = create_matrix_from_raw(train_data)
    test_matrix = create_matrix_from_raw(test_data)
    
    model = Baseline(train_matrix, K=3, lambda1=0.1, epochs=5)
    result = model.train(test_matrix=test_matrix)
    results.append(result)
# analyse_results picks the epoch with the lowest test RMSE on the first fold
# and reads every fold at that epoch; the epoch itself is discarded here.
mean, std, _ = analyse_results(results)
print(f"For k=3: Mean={mean}, Std={std}")

### Global biases (ALS2)
- In this experiment we will find the best value of $λ$ (regularization factor)
- We will also automatically find the epoch at which this method converges.

In [None]:
# Sweep the regularization factor lambda1 for the global-biases model and
# score each value with the 10-fold splitter `kf`.
print("Global biases finding best lambda value")
data = dict()  # lambda1 -> mean per-fold test RMSE
for lambda1 in [0.0005, 0.001, 0.0015, 0.002, 0.0025, 0.003, 0.01]:
    results = []  # one `train()` result per fold
    for idx, (train_set, test_set) in enumerate(kf.split(data_pd)):
        # Rows of data_pd belonging to this fold's train / test part
        train_data = data_pd.iloc[train_set]
        test_data = data_pd.iloc[test_set]
        
        train_matrix = create_matrix_from_raw(train_data)
        test_matrix = create_matrix_from_raw(test_data)
        
        model = GBias(train_matrix, lambda1=lambda1, epochs=5)
        result = model.train(test_matrix=test_matrix)
        results.append(result)
    # best_epoch is the 1-based epoch with the lowest test RMSE on the first
    # fold; mean/std are taken over all folds at that epoch.
    mean, std, best_epoch = analyse_results(results)
    print(f"For lambda={lambda1}: Mean={mean}, Std={std}, Epoch(conv.)={best_epoch}")
    data[lambda1] = mean
lambdas = list(data.keys())
scores = list(data.values())
# Writes ./experiments_out/global.pdf with the lowest score marked by a red dot
plot_graph("Regularization", "Avg. RMSE", lambdas, scores, "global")

### Singular value thresholding (SVT) / Nuclear norm relaxation
- In this experiment we will find the best value for $τ$.
- We will also learn for free the number of epochs necessary to converge.
- Learning rate will be set to 1.2 (reference paper)

In [None]:
# Sweep the SVT threshold tau over 200, 400, ..., 1400 (the range end 1600 is
# exclusive) with eta fixed to 1.2 and 23 epochs per run.
print("SVT finding best tau value")
data = dict()  # tau -> mean per-fold test RMSE
for tau in range(200,1600,200):
    results = []  # one `train()` result per fold
    for idx, (train_set, test_set) in enumerate(kf.split(data_pd)):
        # Rows of data_pd belonging to this fold's train / test part
        train_data = data_pd.iloc[train_set]
        test_data = data_pd.iloc[test_set]
        
        train_matrix = create_matrix_from_raw(train_data)
        test_matrix = create_matrix_from_raw(test_data)
        
        model = SVT(train_matrix, eta=1.2, tau=tau, epochs=23)
        result = model.train(test_matrix=test_matrix)
        results.append(result)
    # best_epoch is the 1-based epoch with the lowest test RMSE on the first
    # fold; mean/std are taken over all folds at that epoch.
    mean, std, best_epoch = analyse_results(results)
    print(f"For tau={tau}: Mean={mean}, Std={std}, Epoch(conv.)={best_epoch}")
    data[tau] = mean
taus = list(data.keys())
scores = list(data.values())
# Writes ./experiments_out/svt.pdf with the lowest score marked by a red dot
plot_graph("Threshold", "Avg. RMSE", taus, scores, "svt")

### Singular value projection (SVP)
- In this experiment we will find the best value for K
- We will also learn for free the number of epochs necessary to converge.
- Learning rate will be set to 5 (reference paper)

In [None]:
# Sweep the SVP projection rank k over 1..20 with 20 epochs per run and score
# each setting with the 10-fold splitter `kf`.
print("SVP finding best projection rank (k) value")
data = dict()  # k -> mean per-fold test RMSE
for k in range(1,21):
    results = []  # one `train()` result per fold
    for idx, (train_set, test_set) in enumerate(kf.split(data_pd)):
        # Rows of data_pd belonging to this fold's train / test part
        train_data = data_pd.iloc[train_set]
        test_data = data_pd.iloc[test_set]
        
        train_matrix = create_matrix_from_raw(train_data)
        test_matrix = create_matrix_from_raw(test_data)
        
        # NOTE(review): no learning rate is passed, so SVP's own default is used;
        # the notes for this experiment say it should be 5 — confirm the default.
        model = SVP(train_matrix,K=k, epochs=20)
        result = model.train(test_matrix=test_matrix)
        results.append(result)
    # best_epoch is the 1-based epoch with the lowest test RMSE on the first
    # fold; mean/std are taken over all folds at that epoch.
    mean, std, best_epoch = analyse_results(results)
    print(f"For k={k}: Mean={mean}, Std={std}, Epoch(conv.)={best_epoch}")
    data[k] = mean
ks = list(data.keys())
scores = list(data.values())
# Writes ./experiments_out/svp.pdf with the lowest score marked by a red dot
plot_graph("Projection rank", "Avg. RMSE", ks, scores, "svp")

### Improved Regularized SVD
- In this experiment we will learn the best number of features (k)
- We will set learning rate to 0.01, $λ_1$ to 0.02 and $λ_2$ to 0.05 (reference paper)
- We will learn epochs to converge for free

In [None]:
# Sweep the number of IRSVD features with eta=0.01, lambda1=0.02, lambda2=0.05
# and 18 epochs per run; score each setting with the 10-fold splitter `kf`.
print("IRSVD finding best number of features (k)")
data = dict()  # features -> mean per-fold test RMSE
all_results = dict()  # features -> list of raw per-fold `train()` results
for k in [75, 125, 175, 225, 275, 325, 375, 425]:
    results = []  # one `train()` result per fold
    for idx, (train_set, test_set) in enumerate(kf.split(data_pd)):
        # Rows of data_pd belonging to this fold's train / test part
        train_data = data_pd.iloc[train_set]
        test_data = data_pd.iloc[test_set]
        
        train_matrix = create_matrix_from_raw(train_data)
        test_matrix = create_matrix_from_raw(test_data)
        
        model = IRSVD(train_matrix, biases="mean", features=k, eta=0.01, lambda1=0.02, lambda2=0.05, epochs=18)
        result = model.train(test_matrix=test_matrix)
        results.append(result)
    # best_epoch is the 1-based epoch with the lowest test RMSE on the first
    # fold; mean/std are taken over all folds at that epoch.
    mean, std, best_epoch = analyse_results(results)
    print(f"For features={k}: Mean={mean}, Std={std}, Epoch(conv.)={best_epoch}")
    # Dumps the raw per-fold results for this setting (verbose output)
    print(results)
    data[k] = mean
    # Kept so a later cell can plot the train/test curves of the best setting
    all_results[k] = results
ks = list(data.keys())
scores = list(data.values())
# Writes ./experiments_out/irsvd.pdf with the lowest score marked by a red dot
plot_graph("Number of features", "Avg. RMSE", ks, scores, "irsvd")

- In this experiment we will illustrate overfitting for the best number of features.

In [None]:
# Pick the IRSVD feature count with the lowest cross-validated RMSE directly
# from `all_results`. Only the IRSVD sweep fills `all_results`, whereas
# `ks`/`scores` are overwritten by several other experiment cells, so reading
# them here would depend on which cell happened to run last.
mean_rmse_by_k = {
    n_features: analyse_results(fold_results)[0]
    for n_features, fold_results in all_results.items()
}
# min() keeps the first key on ties, matching the order of the sweep.
k = min(mean_rmse_by_k, key=mean_rmse_by_k.get)

# Train/test curves of the first fold for the best setting.
results = all_results[k][0]
plot_graph2(results["train_rmse"], results["test_rmse"], "irsvd-overfitting")

### Regularized SVD
- In this experiment we will learn the best number of features (k)
- We will set learning rate to 0.01, $λ$ to 0.02 (reference paper)
- We will learn epochs to converge for free

In [None]:
# Sweep the number of RSVD features with eta=0.01, lambda1=0.02 and 18 epochs
# per run; score each setting with the 10-fold splitter `kf`.
print("RSVD finding best number of features (k)")
data = dict()  # features -> mean per-fold test RMSE
for k in [75, 125, 175, 225, 275, 325, 375, 425]:
    results = []  # one `train()` result per fold
    for idx, (train_set, test_set) in enumerate(kf.split(data_pd)):
        # Rows of data_pd belonging to this fold's train / test part
        train_data = data_pd.iloc[train_set]
        test_data = data_pd.iloc[test_set]
        
        train_matrix = create_matrix_from_raw(train_data)
        test_matrix = create_matrix_from_raw(test_data)
        
        model = RSVD(train_matrix, features=k, eta=0.01, lambda1=0.02, epochs=18)
        result = model.train(test_matrix=test_matrix)
        results.append(result)
    # best_epoch is the 1-based epoch with the lowest test RMSE on the first
    # fold; mean/std are taken over all folds at that epoch.
    mean, std, best_epoch = analyse_results(results)
    print(f"For features={k}: Mean={mean}, Std={std}, Epoch(conv.)={best_epoch}")
    data[k] = mean
ks = list(data.keys())
scores = list(data.values())
# Writes ./experiments_out/rsvd.pdf with the lowest score marked by a red dot
plot_graph("Number of features", "Avg. RMSE", ks, scores, "rsvd")