In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from matrix_factorization import BaselineModel, KernelMF

In [54]:
# Load the ratings slice; the first CSV column is used as the row index.
df = pd.read_csv("data/private_slice_ratings_3.csv", index_col=0)
# Positional names for the seven remaining columns. The third column must be
# called 'rating' (it is read later as df['rating']); naming it 'old_rating',
# like the last column, would make the drop below remove it as well.
cols = ['user_id','item_id', 'rating','timestamp','Counts Users', 'Counts Items','old_rating']
df.columns = cols
# Keep only the columns used for modelling; reassign rather than mutate in place.
df = df.drop(columns=['Counts Users','Counts Items','old_rating'])
df

Unnamed: 0,user_id,item_id,rating,timestamp
0,A2IDCSC6NVONIZ,0972683275,5.0,1367280000
1,A3BMUBUC1N77U8,0972683275,4.0,1385164800
2,AQBLWW13U66XD,0972683275,5.0,1375574400
3,A3IIGCFLKVFW8M,0972683275,5.0,1393459200
4,A6J8D9V5S9MBE,0972683275,5.0,1306886400
...,...,...,...,...
269038,A3OOQH73VQ97VN,B00L3YHF6O,5.0,1404777600
269039,A2XCCN239AR1XK,B00L3YHF6O,5.0,1404604800
269040,A35Q0RBM3YNQNF,B00L3YHF6O,5.0,1405555200
269041,A26VF18X91983P,B00L3YHF6O,5.0,1405987200


In [51]:
# Model inputs: the (user, item) identifier pairs.
X = df.loc[:, ['user_id', 'item_id']]
# Regression target: the rating given for each pair.
y = df.loc[:, 'rating']

In [None]:
from tqdm import tqdm
from sklearn import model_selection
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Candidate values for the random hyperparameter search.
param_dist = dict(
    n_epochs=[5, 10, 20, 50, 100],
    n_factors=[20, 50, 100, 200, 500],
    # Learning rates from 0.001 in steps of 0.001, stopping before 0.02.
    lr=list(np.arange(0.001, 0.02, step=0.001)),
    # Regularisation strengths from 0.005 in steps of 0.001, stopping before 0.02.
    reg=list(np.arange(0.005, 0.02, step=0.001)),
)



In [None]:
# Random search over param_dist: every sampled configuration is scored by its
# mean RMSE over the K folds, and the best-scoring configuration is kept.
best_rmse = 10000
hparam = {}
kfold = KFold(n_splits=5)
# NOTE(review): the folds are contiguous (no shuffle). The displayed rows look
# ordered by item_id, so shuffle=True may be more appropriate — confirm intent.
for h in tqdm(model_selection.ParameterSampler(param_dist, n_iter=50, random_state=123456)):
    matrix_fact = KernelMF(n_epochs=h['n_epochs'], n_factors=h['n_factors'], verbose=0, lr=h['lr'], reg=h['reg'])
    fold_rmses = []
    for train_idx, test_idx in kfold.split(X):
        # KFold yields positional indices, so rows are selected with .iloc
        # (label-based .loc is only equivalent when the index is 0..n-1).
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        matrix_fact.fit(X_train, y_train)

        pred = matrix_fact.predict(X_test)
        # RMSE as sqrt(MSE): the `squared=False` argument is deprecated in
        # scikit-learn 1.4 and removed in 1.6.
        fold_rmses.append(np.sqrt(mean_squared_error(y_test, pred)))
    # Average over however many folds the splitter produces (no hard-coded 5).
    rmse = sum(fold_rmses) / kfold.get_n_splits()
    print(f"\nTest RMSE: {rmse:.4f}")
    if rmse < best_rmse:
        hparam = h
        best_rmse = rmse


    

In [None]:
# Hard-coded hyperparameters used by the evaluation cells below.
hparam = {
    'reg': 0.009000000000000001,
    'n_factors': 20,
    'n_epochs': 5,
    'lr': 0.017,
}

In [None]:
# Display the lowest mean cross-validated RMSE recorded by the search loop.
best_rmse

In [None]:
# Debug aid: show the train and test indices produced for each of the 10 folds.
kfold = KFold(n_splits=10)
for train_idx, test_idx in kfold.split(X):
    # One array per line: training indices first, then test indices.
    print(train_idx, test_idx, sep="\n")

In [52]:
# 10-fold evaluation with the chosen hyperparameters. For every fold a fresh
# model is trained, scored on the held-out rows, and the held-out rows are
# written to MF_test_<fold>.csv together with their true and predicted ratings.
from sklearn.model_selection import KFold
kfold = KFold(n_splits=10)
# `count` is the 1-based fold number used in the output file name.
for count, (train_idx, test_idx) in enumerate(kfold.split(X), start=1):
    # KFold yields positional indices, so rows are selected with .iloc
    # (label-based .loc is only equivalent when the index is 0..n-1).
    X_train = X.iloc[train_idx].copy()
    y_train = y.iloc[train_idx].copy()
    X_test = X.iloc[test_idx].copy()
    y_test = y.iloc[test_idx].copy()

    # Initial training
    matrix_fact = KernelMF(n_epochs=hparam['n_epochs'], n_factors=hparam['n_factors'], verbose=1, lr=hparam['lr'], reg=hparam['reg'])
    matrix_fact.fit(X_train, y_train)

    pred = matrix_fact.predict(X_test)
    # RMSE as sqrt(MSE): the `squared=False` argument is deprecated in
    # scikit-learn 1.4 and removed in 1.6.
    rmse = np.sqrt(mean_squared_error(y_test, pred))
    print(f"\nTest RMSE: {rmse:.4f}")
    # Attach ground truth and predictions to the held-out pairs before export.
    X_test['rating'] = y_test
    X_test['prediction'] = pred
    X_test.to_csv(f"MF_test_{count}.csv",encoding='utf-8', index=False)
    



Epoch  1 / 5  -  train_rmse: 1.0002873649294757
Epoch  2 / 5  -  train_rmse: 0.9548951546146806
Epoch  3 / 5  -  train_rmse: 0.926392005441788
Epoch  4 / 5  -  train_rmse: 0.9051921347809841
Epoch  5 / 5  -  train_rmse: 0.8871427289567273

Test RMSE: 1.0422
Epoch  1 / 5  -  train_rmse: 0.9973526532461451
Epoch  2 / 5  -  train_rmse: 0.9520824424208557
Epoch  3 / 5  -  train_rmse: 0.9238485118607594
Epoch  4 / 5  -  train_rmse: 0.9028217316004638
Epoch  5 / 5  -  train_rmse: 0.8849216295691485

Test RMSE: 1.0707
Epoch  1 / 5  -  train_rmse: 0.9971656248653166
Epoch  2 / 5  -  train_rmse: 0.952152402745721
Epoch  3 / 5  -  train_rmse: 0.9236735608828675
Epoch  4 / 5  -  train_rmse: 0.9026918464609126
Epoch  5 / 5  -  train_rmse: 0.8848131339154214

Test RMSE: 1.0713
Epoch  1 / 5  -  train_rmse: 0.9987636317459362
Epoch  2 / 5  -  train_rmse: 0.9533633008032814
Epoch  3 / 5  -  train_rmse: 0.9248218571067661
Epoch  4 / 5  -  train_rmse: 0.9037631863961664
Epoch  5 / 5  -  train_rmse: 0.88

In [None]:
# Single random hold-out split: each row goes to the training set with
# probability 0.8. A seeded generator keeps the split reproducible across
# kernel restarts (the previous unseeded draw changed on every run).
mask = np.random.default_rng(123456).random(len(df)) < 0.8

# Copies, so that adding columns to the test frame later does not touch X / y.
X_train = X[mask].copy()
y_train = y[mask].copy()
X_test = X[~mask].copy()
y_test = y[~mask].copy()
    

In [None]:
# Initial training: fit on the hold-out training split with the chosen
# hyperparameters, then score the predictions on the held-out rows.
matrix_fact = KernelMF(n_epochs=hparam['n_epochs'], n_factors=hparam['n_factors'], verbose=1, lr=hparam['lr'], reg=hparam['reg'])
matrix_fact.fit(X_train, y_train)

pred = matrix_fact.predict(X_test)
# RMSE as sqrt(MSE): the `squared=False` argument is deprecated in
# scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(y_test, pred))
print(f"\nTest RMSE: {rmse:.4f}")


In [None]:
# Put the true rating and the model's prediction next to each held-out
# (user, item) pair, then display the resulting frame.
X_test = X_test.assign(rating=y_test, prediction=pred)
X_test