In [1]:
import pandas as pd
import numpy as np
import cupy as cp
from numba import njit, prange

from tqdm.notebook import tqdm, trange

from numba.typed import List
from numba import types

from numba_progress import ProgressBar

In [2]:
users = 3974
movies = 3564

DATA_DIR = "/home/nubol23/Desktop/Codes/USP/SCC5966/kaggle/data"
SPLIT_SEED = 42  # fixed seed so the train/validation split is the same on every run

train_csv = pd.read_csv(f"{DATA_DIR}/train_data.csv")
test_csv = pd.read_csv(f"{DATA_DIR}/test_data.csv")

# Shift both id columns down by one (presumably 1-based ids -> 0-based array indices).
# NOTE(review): the ids are later used as array indices inside njit helpers;
# consider asserting they fall in [0, users) and [0, movies).
for frame in (train_csv, test_csv):
    for id_col in ("user_id", "movie_id"):
        frame[id_col] = frame[id_col] - 1

# Split into train and validation (80% / 20% of the rows, timestamp dropped)
ratings = train_csv.drop(columns=["timestamp"])
train_data = ratings.sample(frac=0.8, random_state=SPLIT_SEED)
validation_data = ratings.drop(train_data.index)

assert train_data.shape[0] + validation_data.shape[0] == train_csv.shape[0]

In [3]:
# Plain ndarray views of the two splits; downstream code unpacks each row as
# three values (presumably user, item, rating — column order comes from the CSV).
uir_train = train_data.to_numpy()
uir_val = validation_data.to_numpy()

# Number of validation rows.
n_val = len(uir_val)

In [4]:
@njit
def get_items_rated_by_users(train_data, n_users, n_items):
    """Build a per-user indicator of the items that occur in ``train_data``.

    train_data: 2-D array whose rows unpack into three values; the first two
        are used as (user index, item index) and the third is ignored.
    n_users, n_items: sizes of the first two axes of the result.

    Returns a uint8 array of shape (n_users, n_items, 1) that is 1 where the
    (user, item) pair appears in ``train_data`` and 0 elsewhere.
    """
    # Size the result from the arguments, not from the notebook-level `users`.
    res = np.zeros((n_users, n_items, 1), dtype=np.uint8)

    for u, i, _ in train_data:
        res[u, i] = 1

    return res

irbu = get_items_rated_by_users(uir_train, users, movies)

In [5]:
@njit
def predict(u, i, Rus, params):
    """Score one (user, item) pair with the fitted parameters.

    u, i: user and item indices.
    Rus: per-user array; ``Rus[u]`` is multiplied element-wise with Y and
        summed over axis 0.
    params: tuple (μ, bu, bi, P, Q, Y).

    Returns μ + bu[u] + bi[i] + Q[i] · (P[u] + implicit term).
    """
    global_mean, user_bias, item_bias, user_factors, item_factors, implicit_factors = params

    rated = Rus[u]
    # NOTE(review): len(rated) is the size of the first axis of Rus[u]. When Rus
    # is the indicator array from get_items_rated_by_users this equals n_items
    # for every user, not the number of rated items — confirm whether
    # rated.sum() was intended. step() uses the same normaliser, so change both
    # together.
    norm = np.sqrt(len(rated)) + 1e-15

    implicit = np.sum(rated * implicit_factors, axis=0) / norm
    user_vec = user_factors[u] + implicit

    return global_mean + user_bias[u] + item_bias[i] + np.dot(item_factors[i], user_vec)

In [12]:
# @njit
def step(
    train_data, 
    Rus, 
    n_users, 
    n_items, 
    k, 
    α1,
    α2,
    α3,
    α4,
    α5,
    λ1, 
    λ2,
    μ, bi, bu, P, Q, Y,
):
    """Run one SGD pass over ``train_data``, updating the parameters in place.

    train_data: array whose rows unpack as (u, i, r).
    Rus: per-user array; ``Rus[u]`` must broadcast against Y and is used as a
        row mask for the Y update.
    n_users, n_items, k: accepted for signature compatibility; not read here.
    α1..α5: learning rates for bu, bi, P, Q and Y respectively.
    λ1, λ2: regularisation weights for the biases and for the factor matrices.
    μ: global offset added to every prediction.
    bi, bu, P, Q, Y: parameter arrays, modified in place.

    Returns the square root of the mean squared error accumulated over the pass
    (each error is measured before that row's update).
    """
    loss = 0
    for u, i, r in tqdm(train_data):
        Ru = Rus[u]
        # NOTE(review): len(Ru) is the size of Ru's first axis; for the indicator
        # layout produced by get_items_rated_by_users that is n_items, not the
        # number of rated items. predict() uses the same value, so any change
        # must be made in both places.
        norm = cp.sqrt(len(Ru)) + 1e-15

        implicit_feedback = cp.sum(Ru * Y, axis=0) / norm

        # Snapshot the factor rows so every update below is computed from the
        # pre-update values, whether or not indexing returns a view.
        Pu = P[u].copy()
        Qi = Q[i].copy()

        pred = μ + bu[u] + bi[i] + cp.dot(Qi, Pu + implicit_feedback)
        error = r - pred

        # Updating
        bu[u] += α1 * (error - λ1*bu[u])
        bi[i] += α2 * (error - λ1*bi[i])

        P[u] += α3*(error*Qi - λ2*Pu)
        Q[i] += α4*(error*(Pu+implicit_feedback) - λ2*Qi)

        # Mask the whole update with Ru: only the rows that fed the implicit
        # term are moved, instead of pushing the gradient into every row of Y.
        term_1 = error*(Qi/norm)
        Y += α5 * Ru * (term_1 - λ2*Y)

        loss += error**2

    return cp.sqrt(loss/len(train_data))

In [13]:
# RS HD page 171 (chrome), 84 book
def fit_svdpp(train_data, Rus, n_users, n_items, k, α1=.01, α2=.01, α3=.01, α4=.01, α5=.01, λ1=.01, λ2=.01, n_iters=20):
    """Fit the model by calling ``step`` for ``n_iters`` passes over the data.

    train_data: array Nx3; column 2 is averaged to obtain μ and the rows are
        handed to ``step``. NumPy or cupy input is accepted.
    Rus: per-user array forwarded to ``step``. NumPy or cupy input is accepted.
    n_users, n_items, k: sizes of the parameter arrays (bu: n_users,
        bi: n_items, P: n_users x k, Q and Y: n_items x k).
    α1..α5: initial learning rates; each is multiplied by 0.9 after every pass.
    λ1, λ2: regularisation weights forwarded to ``step``.
    n_iters: number of passes.

    Returns the tuple (μ, bu, bi, P, Q, Y) of cupy arrays.
    """
    # step() does its arithmetic with cupy; cp.asarray moves host arrays to the
    # device so they can be combined with the cupy parameters created below.
    train_data = cp.asarray(train_data)
    Rus = cp.asarray(Rus)

    bu = cp.zeros(n_users, np.double)
    bi = cp.zeros(n_items, np.double)
    
    P = cp.random.normal(0, .1, (n_users, k))
    Q = cp.random.normal(0, .1, (n_items, k))
    Y = cp.random.normal(0, .1, (n_items, k))
    
    μ = cp.mean(train_data[:, 2])
    
    t = trange(n_iters, leave=True)
    for it in t:
        loss = step(train_data, Rus, n_users, n_items, k, α1, α2, α3, α4, α5, λ1, λ2, μ, bi, bu, P, Q, Y)
        # Show a plain number in the progress bar rather than an array repr.
        t.set_postfix({"Loss": float(loss)})
        # Decay every learning rate; α5 was previously left out of the schedule.
        α1 *= 0.9
        α2 *= 0.9
        α3 *= 0.9
        α4 *= 0.9
        α5 *= 0.9
    
    return μ, bu, bi, P, Q, Y

# Fit

In [14]:
# One shared learning rate for all five parameter groups in this trial run.
lr = .007

# Device copies of the training triples and the per-user indicator array.
uir_train_gpu = cp.asarray(uir_train)
irbu_gpu = cp.asarray(irbu)

fitted_params = fit_svdpp(
    uir_train_gpu, irbu_gpu, users, movies,
    k=10,
    α1=lr, α2=lr, α3=lr, α4=lr, α5=lr,
    λ1=.005, λ2=.015,
    n_iters=3,
)

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/428627 [00:00<?, ?it/s]

  0%|          | 0/428627 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
@njit(parallel=True, nogil=True)
def predict_batch_inner(ui_mat, Rus, params, progress_hook):
    """Compiled batch predictor: one ``predict`` call per row of ``ui_mat``.

    ui_mat: 2-D array whose rows unpack as (u, i).
    Rus, params: forwarded unchanged to ``predict``.
    progress_hook: object whose ``update(1)`` is called once per row.

    Prints the (u, i) pair for any row whose prediction is NaN and returns the
    predictions clipped to the range [1, 5].
    """
    n_pairs = ui_mat.shape[0]
    predictions = np.zeros(n_pairs)

    for row in prange(n_pairs):
        u, i = ui_mat[row]
        predictions[row] = predict(u, i, Rus, params)

        # Report the offending pair when the model produced a NaN.
        if np.isnan(predictions[row]):
            print(u, i)

        progress_hook.update(1)

    return np.clip(predictions, 1., 5.)

def predict_batch_progress_bar(ui_mat, Rus, params):
    """Run ``predict_batch_inner`` with a ProgressBar sized to ``len(ui_mat)``."""
    n_pairs = len(ui_mat)
    with ProgressBar(total=n_pairs) as progress:
        clipped = predict_batch_inner(ui_mat, Rus, params, progress)
    return clipped
    
def predict_batch(ui_mat, Rus, params):
    """Predict a clipped rating for every (user, item) row of ``ui_mat``.

    ui_mat: 2-D array whose rows unpack as (u, i).
    Rus: per-user array forwarded to ``predict``.
    params: tuple (μ, bu, bi, P, Q, Y); NumPy or cupy arrays are accepted.

    Returns a float array of ``len(ui_mat)`` predictions clipped to [1, 5].
    """
    # fit_svdpp creates its parameters with cupy, while predict is compiled with
    # numba and needs host data, so move everything to NumPy first. μ becomes a
    # plain float so the compiled function sees a scalar.
    μ, *factors = params
    host_params = (float(μ),) + tuple(cp.asnumpy(arr) for arr in factors)
    ui_mat = cp.asnumpy(ui_mat)
    Rus = cp.asnumpy(Rus)

    predictions = np.zeros(len(ui_mat))
    # Plain range: this function is not jitted, so prange added nothing here.
    for it in range(ui_mat.shape[0]):
        u, i = ui_mat[it]
        predictions[it] = predict(u, i, Rus, host_params)
        
    return np.clip(predictions, 1., 5.)

In [None]:
# val_preds = predict_batch(uir_val[:, :2], irbu, fitted_params)
# val_expected = uir_val[:, 2]

# error = np.sqrt(1/n_val * np.sum((val_preds - val_expected)**2))
# print(error)

## Multiple training runs (hyperparameter grid search)

In [None]:
from joblib import Parallel, delayed
from itertools import product

In [None]:
# Candidate values: `num` learning rates, three values for the first
# regularisation weight and two for the second.
num = 5
alphas = np.linspace(0.003, 0.007, num=num)
lamb_1 = np.linspace(.004, .006, num=3)
lamb_2 = np.linspace(.015, .02, num=2)

# Full cartesian grid of (lr1, lr2, lamb1, lamb2); the rightmost factor varies fastest.
params_product = [
    (lr_a, lr_b, reg_a, reg_b)
    for lr_a in alphas
    for lr_b in alphas
    for reg_a in lamb_1
    for reg_b in lamb_2
]

In [None]:
def train_val(
    uir_train, 
    arr_irbu,
    users, 
    movies,
    k, 
    α1,
    α2,
    α3,
    α4,
    α5,
    λ1, 
    λ2, 
    n_iters,
    uir_val,
):
    """Fit one configuration and score it on ``uir_val``.

    uir_train, arr_irbu, users, movies, k, α1..α5, λ1, λ2, n_iters: forwarded
        to ``fit_svdpp`` in that order.
    uir_val: array whose first two columns are passed to ``predict_batch`` and
        whose third column holds the expected ratings.

    Returns (α1, α2, α3, α4, λ1, λ2, rmse); α5 is not part of the tuple.
    """
    fitted_params = fit_svdpp(uir_train, arr_irbu, users, movies, k, α1, α2, α3, α4, α5,λ1, λ2, n_iters)
    
    val_preds = predict_batch(uir_val[:, :2], arr_irbu, fitted_params)
    val_expected = uir_val[:, 2]
    # Average over the rows actually passed in rather than the notebook-level
    # n_val, so the score is right for any validation set.
    error = np.sqrt(np.mean((val_preds - val_expected)**2))
    
    return α1, α2, α3, α4, λ1, λ2, error

In [None]:
# One delayed train_val call per grid point. The first grid value drives α1/α2,
# the second drives α3/α4/α5. The generator is consumed lazily by Parallel.
grid_jobs = (
    delayed(train_val)(
        uir_train,
        irbu,
        users,
        movies,
        k=5,
        α1=lr1, α2=lr1,
        α3=lr2, α4=lr2, α5=lr2,
        λ1=lamb1,
        λ2=lamb2,
        n_iters=30,
        uir_val=uir_val,
    )
    for lr1, lr2, lamb1, lamb2 in tqdm(params_product)
)

out = Parallel(n_jobs=12)(grid_jobs)

In [None]:
# Order the grid results by their last field (the validation error), smallest first.
ranked_out = sorted(out, key=lambda result: result[-1])
print(ranked_out)

# Predict

In [None]:
# All labelled ratings (train + validation rows), timestamp dropped.
uir_total = train_csv.drop(["timestamp"], axis=1).values

# Build the indicator with all three arguments and keep its native
# (users, movies, 1) layout. That is the layout `irbu` has for the split data,
# so it broadcasts against Y the same way.
irbu_total = get_items_rated_by_users(uir_total, users, movies)

In [None]:
# Refit on every labelled rating. The inputs are moved to the GPU first, as in
# the earlier fit_svdpp call, because the training step computes with cupy.
fitted_final_params = fit_svdpp(
    cp.asarray(uir_total), 
    cp.asarray(irbu_total), 
    users, 
    movies, 
    k=5, 
    α1=.004, 
    α2=.004, 
    α3=.005, 
    α4=.006, 
    α5=.006, 
    λ1=.02,
    λ2=.01, 
    n_iters=50
)

In [None]:
# Keep only the columns left after removing "id" and "timestamp"
# (presumably user_id and movie_id) as a plain array.
ui_test = test_csv.drop(columns=["id", "timestamp"]).to_numpy()

test_predictions = predict_batch(ui_test, irbu_total, fitted_final_params)

# Quick look at the first ten predictions.
first_ten = test_predictions[:10]
print(first_ten)

In [None]:
# Two-column submission table: the test-set ids next to their predicted ratings.
submission_columns = {
    "id": test_csv["id"].tolist(),
    "rating": test_predictions,
}
out_df = pd.DataFrame(submission_columns)

out_df.head()

In [None]:
# Write the submission without the DataFrame index column.
submission_path = "out_svd++2.csv"
out_df.to_csv(submission_path, index=False)