In [1]:
import pandas as pd
import numpy as np
from numba import njit, prange

from tqdm.notebook import tqdm, trange

from numba.typed import List
from numba import types

from numba_progress import ProgressBar

In [2]:
# Sizes of the bias vectors / factor matrices allocated by fit_svd; after the
# shift below every id must be a valid index into arrays of these sizes.
users = 3974
movies = 3564

# Fixed seed so the train/validation partition (and therefore every validation
# score computed from it) is the same on each Restart & Run All.
SEED = 42

DATA_DIR = "/home/nubol23/Desktop/Codes/USP/SCC5966/kaggle/data"
train_csv = pd.read_csv(f"{DATA_DIR}/train_data.csv")
test_csv = pd.read_csv(f"{DATA_DIR}/test_data.csv")

# Shift ids down by one (presumably 1-based in the CSVs -- confirm) using
# vectorised arithmetic instead of a per-row lambda.
train_csv["user_id"] = train_csv["user_id"] - 1
train_csv["movie_id"] = train_csv["movie_id"] - 1

test_csv["user_id"] = test_csv["user_id"] - 1
test_csv["movie_id"] = test_csv["movie_id"] - 1

# The numba-compiled routines index arrays with these ids and numba does not
# bounds-check by default, so fail loudly here rather than read/write garbage.
assert train_csv["user_id"].between(0, users - 1).all(), "train user_id outside [0, users)"
assert train_csv["movie_id"].between(0, movies - 1).all(), "train movie_id outside [0, movies)"
assert test_csv["user_id"].between(0, users - 1).all(), "test user_id outside [0, users)"
assert test_csv["movie_id"].between(0, movies - 1).all(), "test movie_id outside [0, movies)"

# Split into train and validation (80% / 20%), without the timestamp column.
ratings = train_csv.drop(columns=["timestamp"])
train_data = ratings.sample(frac=0.8, random_state=SEED)
validation_data = ratings.drop(train_data.index)

assert train_data.shape[0] + validation_data.shape[0] == train_csv.shape[0]

In [3]:
# NumPy arrays of the two splits, in the form the numba-compiled routines consume.
uir_val = validation_data.values
uir_train = train_data.values

# Number of validation rows; used as the RMSE denominator in later cells.
n_val = len(uir_val)

In [4]:
@njit
def step(
    train_data, 
    n_users, 
    n_items, 
    k, 
    α1,
    α2,
    α3,
    α4,
    λ1, 
    λ2,
    μ, bi, bu, P, Q,
):
    """Run one SGD pass over the training rows, updating the model in place.

    Each row of ``train_data`` is unpacked as ``(u, i, r)``. The prediction is
    ``μ + bu[u] + bi[i] + Q[i]·P[u]`` and every parameter touched by the row is
    moved along its regularised gradient.

    Parameters
    ----------
    train_data : 2-D array whose rows unpack to (user index, item index, rating).
    n_users, n_items, k : unused here; kept so existing callers keep working.
    α1, α2, α3, α4 : learning rates for ``bu``, ``bi``, ``P`` and ``Q`` respectively.
    λ1 : regularisation strength for both bias vectors.
    λ2 : regularisation strength for both factor matrices.
    μ : global offset added to every prediction.
    bi, bu : item / user bias vectors, modified in place.
    P, Q : user / item factor matrices, modified in place.

    Returns
    -------
    The square root of the mean squared error over the pass, where each error
    is measured before that row's update is applied.
    """
    squared_error = 0.0
    for u, i, r in train_data:
        pred = μ + bu[u] + bi[i] + np.dot(Q[i], P[u])
        error = r - pred

        # Updating
        bu[u] += α1 * (error - λ1*bu[u])
        bi[i] += α2 * (error - λ1*bi[i])

        # P[u] is a view into P, so it must be copied: otherwise the in-place
        # update of P[u] below would leak into the Q[i] update, which is meant
        # to use the user factors from before this row's step.
        Pu = P[u].copy()
        # A view is enough for Qi: it is only read before Q[i] is written.
        Qi = Q[i]
        P[u] += α3*(error*Qi - λ2*Pu)
        Q[i] += α4*(error*Pu - λ2*Qi)

        squared_error += error**2

    return np.sqrt(squared_error/len(train_data))

In [87]:
# RS HD page 171 (chrome), 84 book
def fit_svd(
    train_data, val_data, n_users, n_items, k, α1=.01, α2=.01, α3=.01, α4=.01, λ1=.01, λ2=.01, n_iters=20
):
    """Fit the biased matrix-factorisation model with SGD.

    Parameters
    ----------
    train_data : array Nx3; rows are consumed by ``step`` as (user, item, rating)
        and column 2 is averaged to obtain the global offset ``μ``.
    val_data : 2-D array; its first two columns are passed to ``predict_batch``
        and its last column is the target for the per-epoch validation RMSE.
    n_users, n_items : lengths of the user / item bias vectors and number of
        rows of the factor matrices ``P`` / ``Q``.
    k : number of latent factors (columns of ``P`` and ``Q``).
    α1, α2, α3, α4 : learning rates forwarded to ``step``.
    λ1, λ2 : regularisation strengths forwarded to ``step``.
    n_iters : number of passes over ``train_data``.

    Returns
    -------
    Tuple ``(μ, bu, bi, P, Q)`` in the layout expected by ``predict`` and
    ``predict_batch``.
    """
    # Use the validation set that was passed in, not a notebook-level global,
    # so the reported "Val" metric always matches the caller's data.
    val_ui = val_data[:, :2]
    val_exp = val_data[:, -1]
    n_val_rows = len(val_exp)
    
    bu = np.zeros(n_users, np.double)
    bi = np.zeros(n_items, np.double)
    
    # Small random initial factors (normal, mean 0, std 0.1).
    P = np.random.normal(0, .1, (n_users, k))
    Q = np.random.normal(0, .1, (n_items, k))
    
    μ = np.mean(train_data[:, 2])
    
    t = trange(n_iters, leave=True)
    for it in t:
        # step mutates bi, bu, P and Q in place and returns the training RMSE.
        loss = step(train_data, n_users, n_items, k, α1, α2, α3, α4, λ1, λ2, μ, bi, bu, P, Q)
        
        val_preds = predict_batch(val_ui, (μ, bu, bi, P, Q))
        val_loss = np.sqrt(1/n_val_rows * np.sum((val_preds - val_exp)**2))
        t.set_postfix({"Loss": loss, "Val": val_loss})
    
    return μ, bu, bi, P, Q

# Fit

In [6]:
@njit
def predict(u, i, params):
    """Return the unclipped model prediction for user index ``u`` and item index ``i``.

    ``params`` is the tuple ``(μ, bu, bi, P, Q)``; the result is
    ``μ + bu[u] + bi[i] + Q[i]·P[u]``.
    """
    μ, bu, bi, P, Q = params

    return μ + bu[u] + bi[i] + np.dot(Q[i], P[u])

In [7]:
@njit(parallel=True, nogil=True)
def predict_batch(ui_mat, params):
    """Predict one rating per row of ``ui_mat`` and clip the results to [1, 5].

    Each row of ``ui_mat`` is unpacked as ``(u, i)`` and scored with ``predict``
    using ``params``; rows are processed in a ``prange`` loop.
    """
    n_rows = len(ui_mat)
    raw_scores = np.zeros(n_rows)
    for row in prange(n_rows):
        u, i = ui_mat[row]
        raw_scores[row] = predict(u, i, params)

    return np.clip(raw_scores, 1., 5.)

In [85]:
# Learning rates: α1/α2 drive the user/item bias updates, α3/α4 the P/Q updates.
α1 = α2 = 0.005
α3 = α4 = 0.01
# Regularisation: lamb_1 for the biases, lamb_2 for the latent factors.
lamb_1, lamb_2 = 0.05, 0.1
# Number of latent factors; candidate values: 4, 5, 6, 50, 100, 150
k = 150

fitted_params = fit_svd(
    uir_train,
    uir_val,
    users,
    movies,
    k,
    α1=α1,
    α2=α2,
    α3=α3,
    α4=α4,
    λ1=lamb_1,
    λ2=lamb_2,
    n_iters=75,
)

In [86]:
# RMSE of the fitted model on the validation rows.
val_expected = uir_val[:, 2]
val_preds = predict_batch(uir_val[:, :2], fitted_params)

squared_residuals = (val_preds - val_expected)**2
error = np.sqrt(1/n_val * np.sum(squared_residuals))
print(error)

0.8614500882871435


## Grid search over hyperparameters

In [10]:
from joblib import Parallel, delayed
from itertools import product

In [64]:
# `num` was only used by an earlier np.linspace-based grid (alphas in
# [0.003, 0.02], lamb_1 in [.004, .006], lamb_2 in [.015, .02]) that has since
# been replaced by the explicit candidate lists below.
num = 5

# Candidate learning rates, regularisation strengths and latent dimensions.
alphas = [0.005, 0.006, 0.007, 0.01]
lambdas = [0.008, 0.01, 0.05, 0.1]
factors = [4, 5, 6, 50, 100, 150]

# Full Cartesian grid; each tuple is (alpha, alpha, lambda, lambda, factor).
params_product = [*product(alphas, alphas, lambdas, lambdas, factors)]

In [72]:
def train_val(
    uir_train,
    uir_val,
    users,
    movies,
    k,
    α1,
    α2,
    α3,
    α4,
    λ1,
    λ2,
    n_iters,
):
    """Fit one model and score it on ``uir_val``.

    All arguments are forwarded to ``fit_svd``. The fitted parameters are then
    used to predict the first two columns of ``uir_val``, and the RMSE against
    column 2 is computed.

    Returns
    -------
    Tuple ``(α1, α2, α3, α4, λ1, λ2, k, error)`` so that a result can be matched
    back to the hyperparameters that produced it.
    """
    fitted_params = fit_svd(uir_train, uir_val, users, movies, k, α1, α2, α3, α4, λ1, λ2, n_iters)
    
    val_preds = predict_batch(uir_val[:, :2], fitted_params)
    val_expected = uir_val[:, 2]
    # Normalise by the size of the validation set that was actually passed in,
    # not by a notebook-level global that may describe a different array.
    error = np.sqrt(1/len(val_expected) * np.sum((val_preds - val_expected)**2))
    
    return α1, α2, α3, α4, λ1, λ2, k, error

In [73]:
# One lazily-built train_val task per grid point. The first learning rate of a
# grid tuple is shared by both bias updates (α1, α2) and the second by both
# factor updates (α3, α4). tqdm advances as tasks are pulled from the generator.
grid_jobs = (
    delayed(train_val)(
        uir_train,
        uir_val,
        users,
        movies,
        k=n_factors,
        α1=bias_lr,
        α2=bias_lr,
        α3=factor_lr,
        α4=factor_lr,
        λ1=bias_reg,
        λ2=factor_reg,
        n_iters=100,
    )
    for bias_lr, factor_lr, bias_reg, factor_reg, n_factors in tqdm(params_product)
)

# Run the tasks on 12 workers; `out` holds one result tuple per grid point.
out = Parallel(n_jobs=12)(grid_jobs)

  0%|          | 0/1536 [00:00<?, ?it/s]

In [74]:
# Rank every grid-search result by its last field (the validation error),
# lowest first, without modifying `out` itself.
ranked_results = list(out)
ranked_results.sort(key=lambda result: result[-1])
print(ranked_results)

[(0.005, 0.005, 0.01, 0.01, 0.05, 0.1, 0.861592248314689), (0.005, 0.005, 0.01, 0.01, 0.05, 0.1, 0.8616350675531405), (0.005, 0.005, 0.01, 0.01, 0.008, 0.1, 0.8616786835092534), (0.006, 0.006, 0.01, 0.01, 0.008, 0.1, 0.8616878060881559), (0.006, 0.006, 0.01, 0.01, 0.05, 0.1, 0.8616996947089881), (0.005, 0.005, 0.01, 0.01, 0.01, 0.1, 0.8617030527372601), (0.005, 0.005, 0.007, 0.007, 0.05, 0.1, 0.8617109973971565), (0.005, 0.005, 0.01, 0.01, 0.008, 0.1, 0.8617164560353313), (0.006, 0.006, 0.01, 0.01, 0.01, 0.1, 0.861729135776594), (0.005, 0.005, 0.01, 0.01, 0.05, 0.1, 0.8617350391515219), (0.005, 0.005, 0.01, 0.01, 0.01, 0.1, 0.8617393601509175), (0.005, 0.005, 0.01, 0.01, 0.01, 0.1, 0.8617584082920631), (0.006, 0.006, 0.007, 0.007, 0.05, 0.1, 0.8617814746096667), (0.006, 0.006, 0.007, 0.007, 0.05, 0.1, 0.8618282424434232), (0.007, 0.007, 0.01, 0.01, 0.05, 0.1, 0.8618300879568617), (0.006, 0.006, 0.01, 0.01, 0.01, 0.1, 0.8618493746971276), (0.006, 0.006, 0.01, 0.01, 0.05, 0.1, 0.86186741

# Predict

In [88]:
# All rows of train_csv (i.e. before the train/validation split), minus the
# timestamp column, as a NumPy array.
uir_total = train_csv.drop(columns=["timestamp"]).values

In [89]:
# Hyperparameters used for the final model.
final_hyperparams = dict(
    k=150,
    α1=0.005,
    α2=0.005,
    α3=0.01,
    α4=0.01,
    λ1=0.05,
    λ2=0.1,
    n_iters=100,
)

# NOTE: uir_total is built from all of train_csv, which includes the rows of
# uir_val, so the "Val" figure shown during this fit is measured on data the
# model is also trained on.
fitted_final_params = fit_svd(uir_total, uir_val, users, movies, **final_hyperparams)

  0%|          | 0/100 [00:00<?, ?it/s]

In [90]:
# Whatever columns remain after removing "id" and "timestamp" are handed to
# predict_batch, which unpacks each row as (user index, item index).
ui_test = test_csv.drop(columns=["id", "timestamp"]).values
test_predictions = predict_batch(ui_test, fitted_final_params)

# Quick look at the first few predictions.
print(test_predictions[:10])

[3.32919788 3.2503533  2.75018159 3.57997493 3.05216082 3.0176957
 3.43405484 3.28735224 3.1396555  3.16087528]


In [91]:
# Pair each test-row id with its predicted rating.
submission_columns = {
    "id": list(test_csv["id"]),
    "rating": test_predictions,
}
out_df = pd.DataFrame(submission_columns)

out_df.head()

Unnamed: 0,id,rating
0,0,3.329198
1,1,3.250353
2,2,2.750182
3,3,3.579975
4,4,3.052161


In [92]:
# Write the id/rating table to disk without the DataFrame index column.
out_df.to_csv(path_or_buf="out_svd_joblib_3.csv", index=False)