In [3]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from utils import extract_for_ensemble, create_matrix_from_raw, RAND_SEED
from models import IRSVD, Baseline, GBias, SVP, SVT, RSVD

## KFolds
- Split the data into 10 folds (each split trains on 90% of the rows and holds out the remaining 10%)
- Use a fixed random state (`RAND_SEED`) so the splits are reproducible

In [4]:
# Load the raw training data and build the shuffled, seeded 10-fold splitter
# that the cells below reuse.
data_pd = pd.read_csv("./data/data_train.csv")
kf = KFold(shuffle=True, n_splits=10, random_state=RAND_SEED)

# Dump each fold's train/test row positions so that two runs can be compared
# by eye to confirm the splits are the same.
for train_set, test_set in kf.split(data_pd):
    print(train_set, test_set, sep="\n")

[      0       1       3 ... 1176948 1176950 1176951]
[      2       7      14 ... 1176940 1176941 1176949]
[      0       1       2 ... 1176948 1176949 1176951]
[      5      12      13 ... 1176920 1176927 1176950]
[      1       2       3 ... 1176949 1176950 1176951]
[      0      36      43 ... 1176933 1176936 1176945]
[      0       1       2 ... 1176949 1176950 1176951]
[      3      10      17 ... 1176922 1176925 1176943]
[      0       1       2 ... 1176949 1176950 1176951]
[      6       9      32 ... 1176907 1176928 1176929]
[      0       1       2 ... 1176948 1176949 1176950]
[     11      26      29 ... 1176893 1176946 1176951]
[      0       1       2 ... 1176949 1176950 1176951]
[     28      35      61 ... 1176931 1176935 1176948]
[      0       1       2 ... 1176949 1176950 1176951]
[      4      15      16 ... 1176912 1176917 1176934]
[      0       2       3 ... 1176949 1176950 1176951]
[      1       8      21 ... 1176919 1176930 1176944]
[      0       1       2 ...

## Models
- Predict the full matrix with each model and each parameter setting
- Save the predicted matrix of every fold for the ensemble
- Also train each model on the entire dataset

### Improved Regularized SVD

- For ensemble training

In [5]:
# One tuple per IRSVD run: (biases, features, eta, lambda1, lambda2, epochs).
params = (
    ("mean", 96, 0.01, 0.02, 0.05, 13),
    ("mean", 148, 0.01, 0.02, 0.05, 13),
    ("mean", 296, 0.01, 0.02, 0.05, 14),
    ("mean", 324, 0.01, 0.02, 0.05, 15),
    ("zero", 96, 0.01, 0.02, 0.05, 13),
    ("zero", 148, 0.01, 0.02, 0.05, 13),
    ("zero", 296, 0.01, 0.02, 0.05, 14),
    ("zero", 324, 0.01, 0.02, 0.05, 15),
)

# Train every configuration on each fold's training rows and evaluate it on
# that fold's held-out rows.
for idx, (train_set, test_set) in enumerate(kf.split(data_pd)):
    train_data, test_data = data_pd.iloc[train_set], data_pd.iloc[test_set]
    train_matrix, test_matrix = map(create_matrix_from_raw, (train_data, test_data))

    for param in params:
        print(param)
        biases, features, eta, lambda1, lambda2, epochs = param
        fname = "irsvd_{}_{}".format(biases, features)
        model = IRSVD(
            train_matrix,
            biases=biases,
            features=features,
            eta=eta,
            lambda1=lambda1,
            lambda2=lambda2,
            epochs=epochs,
        )
        # train() reports the per-epoch train/test RMSE lists shown in the output.
        print(model.train(test_matrix=test_matrix))
        rec_matrix = model.reconstruct_matrix()
        # Folds are numbered from 1 when handed to the ensemble extraction helper.
        extract_for_ensemble(rec_matrix, fname, idx + 1, train=True)

('mean', 96, 0.01, 0.02, 0.05, 13)


100%|██████████| 13/13 [07:40<00:00, 35.41s/it]


{'train_rmse': [0.9928676922975986, 0.9918426269254519, 0.9918486763409934, 0.991125206101243, 0.9902663735465557, 0.989069888814417, 0.9861477389209141, 0.9799428860827163, 0.9682538009695231, 0.9510397479373288, 0.9288328726413521, 0.9016282070873272, 0.8700921113027357], 'test_rmse': [1.0037524033844163, 1.0030437870005222, 1.0027006664794795, 1.0024598112025538, 1.0017754613821215, 1.0019236615015619, 1.001530768242282, 0.9992211821398828, 0.9950817199945327, 0.9909491006985983, 0.9871527051390491, 0.9848431564677214, 0.9848267504102693]}
('mean', 148, 0.01, 0.02, 0.05, 13)


100%|██████████| 13/13 [10:14<00:00, 47.29s/it]


{'train_rmse': [0.9927189913825504, 0.9921557928045148, 0.9920041332109162, 0.9914389014093807, 0.9909997278959303, 0.9898153165077408, 0.9880969785674063, 0.9835078559120553, 0.9747289443366217, 0.9606272725773228, 0.9419624595735441, 0.9180997806909194, 0.8880585561108983], 'test_rmse': [1.0038568078380468, 1.003129739023154, 1.0029745809912536, 1.0026483230072485, 1.0022573837124362, 1.001758252252774, 1.0021261854789516, 0.9997636149112271, 0.9963780177413344, 0.9921704577847621, 0.9880162152281193, 0.9848738179006072, 0.9830231978779059]}
('mean', 296, 0.01, 0.02, 0.05, 14)


100%|██████████| 14/14 [13:35<00:00, 58.28s/it]


{'train_rmse': [0.9929555485612032, 0.9922004929638176, 0.9918162511212755, 0.9917500428889585, 0.9913256999432767, 0.9909424728191467, 0.9898511177147326, 0.9872471814126621, 0.9816803819974786, 0.9715049969799117, 0.9573218712043627, 0.9394111946442293, 0.915711412196087, 0.8855183116174847], 'test_rmse': [1.0035472452012146, 1.0026357760874092, 1.002293275748977, 1.0023667431654093, 1.0021004011499084, 1.0021994196310011, 1.0018588111420783, 1.0008635748314514, 0.9981147491853798, 0.9943414328042629, 0.9898664065994027, 0.986686945253791, 0.9836349194548305, 0.9814612829018748]}
('mean', 324, 0.01, 0.02, 0.05, 15)


 67%|██████▋   | 10/15 [07:36<03:50, 46.08s/it]

- Entire dataset training

In [None]:
# One tuple per IRSVD run: (biases, features, eta, lambda1, lambda2, epochs).
params = (
    ("mean", 96, 0.01, 0.02, 0.05, 13),
    ("mean", 148, 0.01, 0.02, 0.05, 13),
    ("mean", 296, 0.01, 0.02, 0.05, 14),
    ("mean", 324, 0.01, 0.02, 0.05, 15),
    ("zero", 96, 0.01, 0.02, 0.05, 13),
    ("zero", 148, 0.01, 0.02, 0.05, 13),
    ("zero", 296, 0.01, 0.02, 0.05, 14),
    ("zero", 324, 0.01, 0.02, 0.05, 15),
)

# Fit every configuration on the complete dataset (no held-out test matrix).
train_matrix = create_matrix_from_raw(data_pd)
for param in params:
    print(param)
    biases, features, eta, lambda1, lambda2, epochs = param
    fname = "irsvd_{}_{}".format(biases, features)
    model = IRSVD(
        train_matrix,
        biases=biases,
        features=features,
        eta=eta,
        lambda1=lambda1,
        lambda2=lambda2,
        epochs=epochs,
    )
    print(model.train())
    rec_matrix = model.reconstruct_matrix()
    # Index 0 with train=False marks the full-dataset run (folds use 1..10).
    extract_for_ensemble(rec_matrix, fname, 0, train=False)

### Baseline

- For ensemble training

In [None]:
# One tuple per Baseline run: (K, lambda1, epochs).
params = (
    (3, 0.1, 3),
)

# Train every configuration on each fold's training rows and evaluate it on
# that fold's held-out rows.
for idx, (train_set, test_set) in enumerate(kf.split(data_pd)):
    train_data, test_data = data_pd.iloc[train_set], data_pd.iloc[test_set]
    train_matrix, test_matrix = map(create_matrix_from_raw, (train_data, test_data))

    for param in params:
        print(param)
        K, lambda1, epochs = param
        fname = "baseline_{}_{}".format(K, epochs)
        model = Baseline(
            train_matrix,
            K=K,
            lambda1=lambda1,
            epochs=epochs,
        )
        print(model.train(test_matrix=test_matrix))
        rec_matrix = model.reconstruct_matrix()
        # Folds are numbered from 1 when handed to the ensemble extraction helper.
        extract_for_ensemble(rec_matrix, fname, idx + 1, train=True)

- Entire dataset training

In [None]:
# One tuple per Baseline run: (K, lambda1, epochs).
params = (
    (3, 0.1, 3),
)

# Fit every configuration on the complete dataset (no held-out test matrix).
train_matrix = create_matrix_from_raw(data_pd)
print(data_pd.shape)
for param in params:
    print(param)
    K, lambda1, epochs = param
    fname = "baseline_{}_{}".format(K, epochs)
    model = Baseline(
        train_matrix,
        K=K,
        lambda1=lambda1,
        epochs=epochs,
    )
    print(model.train())
    rec_matrix = model.reconstruct_matrix()
    # Index 0 with train=False marks the full-dataset run.
    extract_for_ensemble(rec_matrix, fname, 0, train=False)

### Global biases

- For ensemble training

In [None]:
# One tuple per GBias run: (lambda1, epochs).
params = (
    (0.001, 5),
)

# Train every configuration on each fold's training rows and evaluate it on
# that fold's held-out rows.
for idx, (train_set, test_set) in enumerate(kf.split(data_pd)):
    train_data, test_data = data_pd.iloc[train_set], data_pd.iloc[test_set]
    train_matrix, test_matrix = map(create_matrix_from_raw, (train_data, test_data))

    for param in params:
        print(param)
        lambda1, epochs = param
        fname = "global_{}".format(epochs)
        model = GBias(
            train_matrix,
            lambda1=lambda1,
            epochs=epochs,
        )
        print(model.train(test_matrix=test_matrix))
        rec_matrix = model.reconstruct_matrix()
        # Folds are numbered from 1 when handed to the ensemble extraction helper.
        extract_for_ensemble(rec_matrix, fname, idx + 1, train=True)

- Entire dataset training

In [None]:
# One tuple per GBias run: (lambda1, epochs).
params = (
    (0.001, 5),
)

# Fit every configuration on the complete dataset (no held-out test matrix).
train_matrix = create_matrix_from_raw(data_pd)
for param in params:
    print(param)
    lambda1, epochs = param
    fname = "global_{}".format(epochs)
    model = GBias(
        train_matrix,
        lambda1=lambda1,
        epochs=epochs,
    )
    print(model.train())
    rec_matrix = model.reconstruct_matrix()
    # Index 0 with train=False marks the full-dataset run.
    extract_for_ensemble(rec_matrix, fname, 0, train=False)

### SVProjection

- For ensemble training

In [None]:
# One tuple per SVP run: (eta, K, epochs).
params = (
    (5, 3, 10),
)

# Train every configuration on each fold's training rows and evaluate it on
# that fold's held-out rows.
for idx, (train_set, test_set) in enumerate(kf.split(data_pd)):
    train_data, test_data = data_pd.iloc[train_set], data_pd.iloc[test_set]
    train_matrix, test_matrix = map(create_matrix_from_raw, (train_data, test_data))

    for param in params:
        print(param)
        eta, K, epochs = param
        fname = "svp_{}_{}_{}".format(eta, K, epochs)
        model = SVP(
            train_matrix,
            eta=eta,
            K=K,
            epochs=epochs,
        )
        print(model.train(test_matrix=test_matrix))
        rec_matrix = model.reconstruct_matrix()
        # Folds are numbered from 1 when handed to the ensemble extraction helper.
        extract_for_ensemble(rec_matrix, fname, idx + 1, train=True)

- Entire dataset training

In [None]:
# One tuple per SVP run: (eta, K, epochs).
params = (
    (5, 3, 10),
)

# Fit every configuration on the complete dataset (no held-out test matrix).
train_matrix = create_matrix_from_raw(data_pd)
for param in params:
    print(param)
    eta, K, epochs = param
    fname = "svp_{}_{}_{}".format(eta, K, epochs)
    model = SVP(
        train_matrix,
        eta=eta,
        K=K,
        epochs=epochs,
    )
    print(model.train())
    rec_matrix = model.reconstruct_matrix()
    # Index 0 with train=False marks the full-dataset run.
    extract_for_ensemble(rec_matrix, fname, 0, train=False)

### Nuclear norm relaxation / SVT

- For ensemble training

In [None]:
# One tuple per SVT run: (eta, tau, epochs).
params = (
    (1.2, 2000, 28),
    (1.2, 1000, 15),
    (1.2, 1500, 21),
    (1.5, 2000, 22),
)

# Train every configuration on each fold's training rows and evaluate it on
# that fold's held-out rows.
for idx, (train_set, test_set) in enumerate(kf.split(data_pd)):
    train_data, test_data = data_pd.iloc[train_set], data_pd.iloc[test_set]
    train_matrix, test_matrix = map(create_matrix_from_raw, (train_data, test_data))

    for param in params:
        print(param)
        eta, tau, epochs = param
        fname = "svt_{}_{}_{}".format(eta, tau, epochs)
        model = SVT(
            train_matrix,
            eta=eta,
            tau=tau,
            epochs=epochs,
        )
        print(model.train(test_matrix=test_matrix))
        rec_matrix = model.reconstruct_matrix()
        # Folds are numbered from 1 when handed to the ensemble extraction helper.
        extract_for_ensemble(rec_matrix, fname, idx + 1, train=True)

- Entire dataset training

In [None]:
# One tuple per SVT run: (eta, tau, epochs).
params = (
    (1.2, 2000, 28),
    (1.2, 1000, 15),
    (1.2, 1500, 21),
    (1.5, 2000, 22),
)

# Fit every configuration on the complete dataset (no held-out test matrix).
train_matrix = create_matrix_from_raw(data_pd)
for param in params:
    print(param)
    eta, tau, epochs = param
    fname = "svt_{}_{}_{}".format(eta, tau, epochs)
    model = SVT(
        train_matrix,
        eta=eta,
        tau=tau,
        epochs=epochs,
    )
    print(model.train())
    rec_matrix = model.reconstruct_matrix()
    # Index 0 with train=False marks the full-dataset run.
    extract_for_ensemble(rec_matrix, fname, 0, train=False)

### Regularized SVD

- For ensemble training

In [None]:
# One tuple per RSVD run: (features, eta, lambda1, epochs).
params = (
    (96, 0.01, 0.02, 13),
)

# Train every configuration on each fold's training rows and evaluate it on
# that fold's held-out rows.
for idx, (train_set, test_set) in enumerate(kf.split(data_pd)):
    train_data, test_data = data_pd.iloc[train_set], data_pd.iloc[test_set]
    train_matrix, test_matrix = map(create_matrix_from_raw, (train_data, test_data))

    for param in params:
        print(param)
        features, eta, lambda1, epochs = param
        fname = "rsvd_{}_{}".format(features, epochs)
        model = RSVD(
            train_matrix,
            features=features,
            eta=eta,
            lambda1=lambda1,
            epochs=epochs,
        )
        print(model.train(test_matrix=test_matrix))
        rec_matrix = model.reconstruct_matrix()
        # Folds are numbered from 1 when handed to the ensemble extraction helper.
        extract_for_ensemble(rec_matrix, fname, idx + 1, train=True)

- Entire dataset training

In [None]:
# One tuple per RSVD run: (features, eta, lambda1, epochs).
params = (
    (96, 0.01, 0.02, 13),
)

# Fit every configuration on the complete dataset (no held-out test matrix).
train_matrix = create_matrix_from_raw(data_pd)
for param in params:
    print(param)
    features, eta, lambda1, epochs = param
    fname = "rsvd_{}_{}".format(features, epochs)
    model = RSVD(
        train_matrix,
        features=features,
        eta=eta,
        lambda1=lambda1,
        epochs=epochs,
    )
    print(model.train())
    rec_matrix = model.reconstruct_matrix()
    # Index 0 with train=False marks the full-dataset run.
    extract_for_ensemble(rec_matrix, fname, 0, train=False)