In [1]:
import pandas as pd
import numpy as np
from numba import njit, prange

from tqdm.notebook import tqdm

In [2]:
# RS HD page 171 (chrome), 84 book
@njit
def fit_funk_svd(train_data, n_users, n_items, k, α1=.01, α2=.01, α3=.01, α4=.01, λ=.01, n_iters=20):
    """
    Fit a biased matrix-factorisation (Funk SVD) model with per-sample SGD.

    The prediction for user ``u`` and item ``i`` is
    ``μ + bu[u] + bi[i] + P[u]·Q[i]``.

    Parameters
    ----------
    train_data : array Nx3
        Rows of ``(user index, item index, rating)``. The two index columns are
        cast to ``int`` before use, so an all-float array (e.g. fractional
        ratings) is accepted as well as an all-integer one.
    n_users, n_items : int
        Number of rows allocated for the user / item parameters.
    k : int
        Number of latent factors.
    α1, α2, α3, α4 : float
        Learning rates for ``bu``, ``bi``, ``P`` and ``Q`` respectively.
    λ : float
        L2 regularisation strength used in every parameter update.
    n_iters : int
        Number of full passes over ``train_data``.

    Returns
    -------
    tuple
        ``(μ, bu, bi, P, Q)``: mean training rating, user biases ``(n_users,)``,
        item biases ``(n_items,)``, user factors ``(n_users, k)`` and item
        factors ``(n_items, k)``.
    """
    bu = np.zeros(n_users, np.double)
    bi = np.zeros(n_items, np.double)

    P = np.random.normal(0, .1, (n_users, k))
    Q = np.random.normal(0, .1, (n_items, k))

    μ = np.mean(train_data[:, 2])

    n_samples = train_data.shape[0]
    for _ in range(n_iters):
        for s in range(n_samples):
            # Explicit int casts: a float-typed array would otherwise hand
            # float values to the indexing expressions below.
            u = int(train_data[s, 0])
            i = int(train_data[s, 1])
            r = train_data[s, 2]

            pred = μ + bu[u] + bi[i] + np.dot(P[u], Q[i])
            error = r - pred

            # Updating
            bu[u] += α1 * (error - λ*bu[u])
            bi[i] += α2 * (error - λ*bi[i])

            # Both factor steps are computed from the pre-update vectors and
            # only then written back (simultaneous update).
            p_new = P[u] + α3*(error*Q[i] - λ*P[u])
            q_new = Q[i] + α4*(error*P[u] - λ*Q[i])
            P[u] = p_new
            Q[i] = q_new

    return μ, bu, bi, P, Q

In [3]:
@njit
def predict(u, i, params):
    """
    Score a single (user, item) pair with a fitted model.

    ``params`` is the ``(μ, bu, bi, P, Q)`` tuple; the result is the global
    mean plus both biases plus the dot product of the two factor rows.
    No clipping is applied here.
    """
    global_mean, user_bias, item_bias, user_factors, item_factors = params

    interaction = np.dot(user_factors[u], item_factors[i])
    return global_mean + user_bias[u] + item_bias[i] + interaction

In [4]:
users = 3974
movies = 3564

# Fixed seed so the train/validation split (and every score derived from it)
# is identical on each Restart & Run All.
SPLIT_SEED = 42

train_csv = pd.read_csv("../../data/train_data.csv")
test_csv = pd.read_csv("../../data/test_data.csv")

# Shift ids down by one so they can be used directly as array indices
# (vectorised subtraction instead of a per-element lambda).
for frame in (train_csv, test_csv):
    frame["user_id"] = frame["user_id"] - 1
    frame["movie_id"] = frame["movie_id"] - 1

    # The model arrays are allocated with `users` / `movies` rows and the jitted
    # code indexes them without bounds checking, so fail loudly here instead.
    assert frame["user_id"].between(0, users - 1).all(), "user_id outside [0, users)"
    assert frame["movie_id"].between(0, movies - 1).all(), "movie_id outside [0, movies)"

# Split into train and validation
ratings_df = train_csv.drop(["timestamp"], axis=1)
train_data = ratings_df.sample(frac=0.8, random_state=SPLIT_SEED)
validation_data = ratings_df.drop(train_data.index)

assert train_data.shape[0] + validation_data.shape[0] == train_csv.shape[0]

In [14]:
import torch

In [16]:
# Scratch: two random tensors with the same number of rows (20) and
# different column counts (3 and 4).
t1, t2 = torch.randn(20, 3), torch.randn(20, 4)

In [19]:
# Scratch check: joining along dim=1 keeps the row count and adds the column counts.
torch.concat([t1, t2], dim=1).shape

torch.Size([20, 7])

In [11]:
# Spot-check: movie_id stored in the first row of train_csv.
train_csv.iloc[0]["movie_id"]

1159

# Fit

In [5]:
@njit(parallel=True)
def predict_batch(ui_mat, params):
    """
    Predict ratings for many (user, item) pairs at once.

    Parameters
    ----------
    ui_mat : 2-D array
        One row per pair; column 0 is the user index, column 1 the item index.
        Both are cast to ``int`` before being used as indices.
    params : tuple
        ``(μ, bu, bi, P, Q)`` as returned by ``fit_funk_svd``.

    Returns
    -------
    1-D float array with one prediction per row of ``ui_mat``, clipped to
    the interval [1, 5].
    """
    n_pairs = ui_mat.shape[0]
    predictions = np.zeros(n_pairs)

    # `prange` only runs in parallel when the decorator sets parallel=True;
    # under a plain @njit it silently behaves like `range`. Every iteration
    # writes to its own slot of `predictions`, so the loop is race-free.
    for it in prange(n_pairs):
        u = int(ui_mat[it, 0])
        i = int(ui_mat[it, 1])
        predictions[it] = predict(u, i, params)

    return np.clip(predictions, 1., 5.)

In [6]:
# Raw (user, item, rating) arrays for the jitted routines.
uir_train = train_data.to_numpy()
uir_val = validation_data.to_numpy()

# Number of validation rows, used as the RMSE denominator.
n_val = len(uir_val)

## Fit multiple

In [7]:
@njit
def calc_error(preds, expected):
    """
    Root-mean-square error between two equally long 1-D sequences.

    Accumulates the squared differences element by element and returns
    the square root of their mean over ``len(preds)``.
    """
    count = len(preds)
    squared_sum = 0
    for idx in range(count):
        diff = preds[idx] - expected[idx]
        squared_sum += diff**2

    return np.sqrt(squared_sum/count)

In [None]:
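# np.random.seed(42)
# NOTE: seeding here (from the interpreter) would not make fit_funk_svd reproducible.
# Numba-compiled code keeps its own random generator state, so the seed has to be
# set by calling np.random.seed from inside an @njit function.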

In [8]:
# SGD settings shared by every run: α1/α2 are the bias learning rates,
# α3/α4 the factor learning rates and λ the regularisation strength.
α1 = .006
α2 = .006
α3 = .004
α4 = .004
λ = .03

# Best validation score seen so far and the factor count that produced it.
min_err = np.inf
best_k = 2

# Fitted parameters for every k tried, keyed by k.
params_dict = {}

# A wider sweep is kept for reference: range(2, 20).
for k in tqdm(range(2, 9)):
    fitted_params = fit_funk_svd(
        uir_train, users, movies,
        k=k, α1=α1, α2=α2, α3=α3, α4=α4, λ=λ, n_iters=100,
    )
    params_dict[k] = fitted_params

    val_expected = uir_val[:, 2]
    val_preds = predict_batch(uir_val[:, :2], fitted_params)

    # Validation RMSE; calc_error(val_preds, val_expected) is the jitted alternative.
    error = np.sqrt(1/n_val * np.sum((val_preds - val_expected)**2))
    print(f"Error with {k} factors: {error}")

    if error < min_err:
        min_err, best_k = error, k

  0%|          | 0/7 [00:00<?, ?it/s]

Error with 2 factors: 0.879764419764806
Error with 3 factors: 0.8723638337398438
Error with 4 factors: 0.8692350160845315
Error with 5 factors: 0.8715894087242227
Error with 6 factors: 0.8688742752496171
Error with 7 factors: 0.8688789403628632
Error with 8 factors: 0.8714506465014964


In [56]:
# Report the selected factor count and its validation RMSE.
# NOTE(review): the output stored with this cell (0.86698...) is lower than every
# error in the stored search log (minimum 0.86887... at k=6), and the execution
# counts are out of order (56 here vs 8 for the search) -- the outputs come from
# different runs; re-run top to bottom before relying on these numbers.
print(best_k, min_err)

6 0.8669836349955178


In [None]:
# from numba.typed import Dict
# from numba.core import types

In [None]:
# @njit(parallel=True)
# def find_best_params(uir_mat, uir_val, val_exp, users, movies):
# #     d = Dict.empty(key_type=types.Tuple([types.int64, types.float64, types.float64]), value_type=types.float64)
# #     d = Dict.empty(key_type=types.UniTuple(types.float64, 3), value_type=types.float64)
#     d = Dict()
#     d[(1, 1., 1.)] = 1.
#     del d[(1, 1., 1.)]
    
#     print("Starting")
#     for k in range(2, 10):
#         print(k)
#         learning_rates = np.linspace(0.003, 0.01, 24)
#         for idx in prange(len(learning_rates)):
#             lr = learning_rates[idx]
#             for λ in np.linspace(0.001, 0.02, 10):
#                 fitted_params = fit_funk_svd(uir_mat, users, movies, k=k, α=lr, λ=λ, n_iters=50)
#                 val_preds = predict_batch(uir_val, fitted_params)
                
#                 error = calc_error(val_preds, val_exp)
                
#                 d[(k, lr, λ)] = error
                
#     return d

In [None]:
# params_dict = find_best_params(uir_train, uir_val[:, :2], uir_val[:, -1], users, movies)

# Predict

In [124]:
# Refit on all rows of train_csv (train + validation) with the selected k.
uir_total = train_csv.drop(columns=["timestamp"]).to_numpy()

final_fit_kwargs = dict(k=best_k, α1=α1, α2=α2, α3=α3, α4=α4, λ=λ, n_iters=100)
fitted_final_params = fit_funk_svd(uir_total, users, movies, **final_fit_kwargs)

# Unpack for inspection; P and Q should have best_k columns each.
μ, bu, bi, P, Q = fitted_final_params
print(P.shape, Q.shape)

(3974, 6) (3564, 6)


In [125]:
# Keep only the (user, item) columns of the test set and score them.
ui_test = test_csv.drop(columns=["id", "timestamp"]).to_numpy()
test_predictions = predict_batch(ui_test, fitted_final_params)

print(test_predictions[:10])

[3.99348401 3.29987161 2.91307955 3.41485166 2.33274277 3.21715346
 3.47018139 2.91642027 3.02578512 3.16292718]


In [126]:
# Two-column frame: the test row ids and their predicted ratings.
submission_columns = {
    "id": list(test_csv["id"]),
    "rating": test_predictions,
}
out_df = pd.DataFrame(submission_columns)

out_df.head()

Unnamed: 0,id,rating
0,0,3.993484
1,1,3.299872
2,2,2.91308
3,3,3.414852
4,4,2.332743


In [127]:
# Run number embedded in the output file name.
out_n = 7
out_path = f"out_fine_tune/csvs/out_funk_svd_cpu_fine_tune_{out_n}.csv"
out_df.to_csv(out_path, index=False)

In [97]:
# np.savez_compressed(
#     "out_fine_tune/params/out_funk_svd_cpu_fine_tune_6", 
#     μ=fitted_final_params[0], 
#     bu=fitted_final_params[1], 
#     bi=fitted_final_params[2], 
#     P=fitted_final_params[3], 
#     Q=fitted_final_params[4],
# )