In [1]:
import pandas as pd
import numpy as np
from numba import njit, prange

from tqdm.notebook import tqdm

In [2]:
# RS HD page 171 (chrome), 84 book
@njit
def fit_funk_svd(train_data, n_users, n_items, k, α=.01, λ=.01, n_iters=20):
    """
    Fit a biased matrix-factorisation (Funk SVD) model with per-sample SGD.

    train_data: array Nx3; each row is unpacked as (u, i, r). u and i are
        used directly as indices into the user/item arrays, r is the
        observed rating.
    n_users: number of rows allocated for user biases and user factors.
    n_items: number of rows allocated for item biases and item factors.
    k: number of latent factors per user / item.
    α: step size applied to every update.
    λ: shrinkage coefficient applied to the parameter being updated.
    n_iters: number of full passes over train_data (rows are visited in
        their stored order; no shuffling is done).

    Returns the tuple (μ, bu, bi, P, Q): global mean rating, user biases,
    item biases, user factors (n_users x k), item factors (n_items x k).
    """
    user_bias = np.zeros(n_users, np.double)
    item_bias = np.zeros(n_items, np.double)

    # Small random initialisation; user factors are drawn before item factors.
    user_factors = np.random.normal(0, .1, (n_users, k))
    item_factors = np.random.normal(0, .1, (n_items, k))

    global_mean = np.mean(train_data[:, 2])

    n_samples = train_data.shape[0]
    for _ in range(n_iters):
        for row in range(n_samples):
            u, i, r = train_data[row]

            residual = r - (
                global_mean
                + user_bias[u]
                + item_bias[i]
                + np.dot(user_factors[u], item_factors[i])
            )

            user_bias[u] += α * (residual - λ * user_bias[u])
            item_bias[i] += α * (residual - λ * item_bias[i])

            # Snapshot both rows first so each factor update sees the other
            # side's value from before this step.
            p_prev = user_factors[u].copy()
            q_prev = item_factors[i].copy()
            user_factors[u] = p_prev + α * (residual * q_prev - λ * p_prev)
            item_factors[i] = q_prev + α * (residual * p_prev - λ * q_prev)

    return global_mean, user_bias, item_bias, user_factors, item_factors

In [3]:
@njit
def predict(u, i, params):
    """
    Predict the rating of user index u for item index i.

    params: the tuple (μ, bu, bi, P, Q) as returned by fit_funk_svd.

    Returns the unclipped score: global mean + user bias + item bias +
    dot product of the user's and the item's factor rows.
    """
    global_mean, user_bias, item_bias, user_factors, item_factors = params

    baseline = global_mean + user_bias[u] + item_bias[i]
    interaction = np.dot(user_factors[u], item_factors[i])
    return baseline + interaction

In [4]:
# Hard-coded dataset dimensions; they size the bias vectors and factor
# matrices when passed to fit_funk_svd.
users = 3974
movies = 3564

# Fixed seed so the train/validation split is the same on every run.
SPLIT_SEED = 42

train_csv = pd.read_csv("../../data/train_data.csv")
test_csv = pd.read_csv("../../data/test_data.csv")

# Shift ids down by one (presumably 1-based in the CSVs -- confirm) so they
# can be used as 0-based array indices. Vectorised instead of .apply(lambda).
train_csv["user_id"] = train_csv["user_id"] - 1
train_csv["movie_id"] = train_csv["movie_id"] - 1

test_csv["user_id"] = test_csv["user_id"] - 1
test_csv["movie_id"] = test_csv["movie_id"] - 1

# The ids index arrays inside numba-compiled code, which does not bounds-check
# by default, so an out-of-range id would silently read/write foreign memory.
assert train_csv["user_id"].between(0, users - 1).all(), "train user_id out of range"
assert train_csv["movie_id"].between(0, movies - 1).all(), "train movie_id out of range"
assert test_csv["user_id"].between(0, users - 1).all(), "test user_id out of range"
assert test_csv["movie_id"].between(0, movies - 1).all(), "test movie_id out of range"

# Split into train and validation (80% / 20%, rows drawn at random)
train_data = train_csv.drop(columns=["timestamp"]).sample(frac=0.8, random_state=SPLIT_SEED)
validation_data = train_csv.drop(train_data.index).drop(columns=["timestamp"])

assert train_data.shape[0] + validation_data.shape[0] == train_csv.shape[0]

# Fit

In [5]:
@njit(parallel=True)
def predict_batch(ui_mat, params):
    """
    Predict ratings for a batch of (user, item) index pairs.

    ui_mat: 2-D array; column 0 holds user indices, column 1 item indices.
    params: the tuple (μ, bu, bi, P, Q) as returned by fit_funk_svd.

    Returns a 1-D float array with one prediction per row of ui_mat,
    clipped to the range [1, 5].
    """
    n_pairs = ui_mat.shape[0]
    predictions = np.empty(n_pairs)

    # prange only runs in parallel when the function is compiled with
    # parallel=True; every iteration writes its own slot, so there is no
    # cross-iteration dependency.
    for it in prange(n_pairs):
        predictions[it] = predict(ui_mat[it, 0], ui_mat[it, 1], params)

    return np.clip(predictions, 1., 5.)

In [6]:
# Plain NumPy views of the two splits; rows are later unpacked as
# (user, item, rating) by the fitting and scoring code.
uir_train = train_data.to_numpy()
uir_val = validation_data.to_numpy()

# Number of validation rows, used to average the squared error.
n_val = len(uir_val)

In [20]:
# Baseline fit on the training split with 10 latent factors.
fitted_params = fit_funk_svd(
    uir_train,
    users,
    movies,
    k=10,
    α=.001,
    λ=.001,
    n_iters=100,
)

In [21]:
# Score the baseline model on the held-out split (root mean squared error).
val_expected = uir_val[:, 2]
val_preds = predict_batch(uir_val[:, :2], fitted_params)

error = np.sqrt(1 / n_val * np.square(val_preds - val_expected).sum())
print(error)

0.893832564775578


## Fit with multiple numbers of latent factors (k)

In [32]:
# Track the lowest validation RMSE seen so far and the k that produced it.
min_err = np.inf
best_k = 2

# The validation targets do not change between runs.
val_expected = uir_val[:, 2]

for k in tqdm(range(2, 20)):
    fitted_params = fit_funk_svd(
        uir_train, users, movies, k=k, α=.005, λ=.02, n_iters=50
    )
    val_preds = predict_batch(uir_val[:, :2], fitted_params)

    error = np.sqrt(1 / n_val * np.square(val_preds - val_expected).sum())

    print(f"Error with {k} factors: {error}")
    if error < min_err:
        min_err, best_k = error, k

  0%|          | 0/18 [00:00<?, ?it/s]

Error with 2 factors: 0.8782470153000029
Error with 3 factors: 0.8725105466934414
Error with 4 factors: 0.8750550229422127
Error with 5 factors: 0.8736892957676095
Error with 6 factors: 0.8727804502407278
Error with 7 factors: 0.8741232913040251
Error with 8 factors: 0.8725397487566549
Error with 9 factors: 0.8726427279647022
Error with 10 factors: 0.8773889413627548
Error with 11 factors: 0.8766312746761147
Error with 12 factors: 0.8784057159690379
Error with 13 factors: 0.8804835794732176
Error with 14 factors: 0.8852395089311771
Error with 15 factors: 0.8814935535654314
Error with 16 factors: 0.8830477128909969
Error with 17 factors: 0.8840098896210096
Error with 18 factors: 0.8864164571254068
Error with 19 factors: 0.8857415631609636


In [33]:
# Best number of factors found by the search and its validation RMSE.
print(f"{best_k} {min_err}")

3 0.8725105466934414


# Predict

In [11]:
# Refit on the full training file (train + validation rows) for submission.
# NOTE(review): α, λ and n_iters here differ from the values used while
# searching over k, so best_k was selected under different settings -- confirm
# this is intended.
# NOTE(review): the recorded output of this cell shows 6 factors while the
# recorded search result is best_k = 3; the cells were likely run out of
# order -- re-run top to bottom before trusting the output.
uir_total = train_csv.drop(columns=["timestamp"]).to_numpy()

fitted_final_params = fit_funk_svd(
    uir_total, users, movies, k=best_k, α=.008, λ=.01, n_iters=40
)

μ, bu, bi, P, Q = fitted_final_params
print(P.shape, Q.shape)

(3974, 6) (3564, 6)


In [12]:
# Select the (user, item) columns by name so the pair order handed to
# predict_batch does not depend on the CSV's column order or on which other
# columns the file happens to contain.
ui_test = test_csv[["user_id", "movie_id"]].to_numpy()
test_predictions = predict_batch(ui_test, fitted_final_params)

print(test_predictions[:10])

[3.61832631 3.55256098 2.87740342 3.21966497 3.07892273 3.12135311
 3.46657511 3.14597566 3.24858979 3.33496846]


In [13]:
# Submission table: one predicted rating per test-row id.
out_df = pd.DataFrame(
    {
        "id": test_csv["id"].tolist(),
        "rating": test_predictions,
    }
)

out_df.head()

Unnamed: 0,id,rating
0,0,3.618326
1,1,3.552561
2,2,2.877403
3,3,3.219665
4,4,3.078923


In [14]:
# out_df.to_csv("out_funk_svd_cpu_2.csv", index=False)