In [1]:
import pandas as pd
import numpy as np
from numba import njit, prange

from tqdm.notebook import tqdm

In [2]:
SEED = 42

# Seeds NumPy's global generator, which pandas `DataFrame.sample` falls back
# to when it is called without an explicit `random_state`.
np.random.seed(SEED)


@njit
def _seed_numba_rng(seed):
    """Seed the random generator used inside Numba-compiled functions.

    Numba keeps its own generator state, independent from NumPy's, so the
    interpreter-level `np.random.seed` call above does not affect the
    `np.random.normal` draws made inside `@njit` code such as `fit_funk_svd`.
    Seeding has to happen from within compiled code.
    """
    np.random.seed(seed)


_seed_numba_rng(SEED)

In [3]:
# RS HD page 171 (chrome), 84 book
@njit
def fit_funk_svd(train_data, n_users, n_items, k, α=.01, λ=.01, n_iters=20):
    """
    Fit a biased matrix-factorisation (Funk SVD) model by stochastic
    gradient descent.

    A rating is modelled as μ + bu[u] + bi[i] + P[u]·Q[i]. Every pass walks
    `train_data` in row order and updates the parameters touched by each row.

    train_data: array Nx3 with columns (user index, item index, rating).
        The two index columns are cast to int before use, so both integer
        and floating-point arrays are accepted. Indices must lie in
        [0, n_users) and [0, n_items) respectively; Numba does not
        bounds-check array accesses by default.
    n_users: length of `bu` and number of rows of `P`.
    n_items: length of `bi` and number of rows of `Q`.
    k: number of latent factors (columns of `P` and `Q`).
    α: learning rate.
    λ: L2 regularisation strength applied to biases and factors.
    n_iters: number of full passes over `train_data`.

    Returns the tuple (μ, bu, bi, P, Q), where μ is the mean of the rating
    column.
    """
    bu = np.zeros(n_users, np.double)
    bi = np.zeros(n_items, np.double)

    P = np.random.normal(0, .1, (n_users, k))
    Q = np.random.normal(0, .1, (n_items, k))

    μ = np.mean(train_data[:, 2])

    n_ratings = train_data.shape[0]

    for it in range(n_iters):
        for n in range(n_ratings):
            # Explicit int casts keep the indexing valid when the array is
            # floating point (e.g. fractional ratings force a float dtype).
            u = int(train_data[n, 0])
            i = int(train_data[n, 1])
            r = train_data[n, 2]

            pred = μ + bu[u] + bi[i] + np.dot(P[u], Q[i])

            error = r - pred

            # Updating
            bu[u] += α * (error - λ*bu[u])
            bi[i] += α * (error - λ*bi[i])

            # Both factor rows are updated from their *old* values: each
            # pair is read into scalars before either entry is overwritten.
            # Doing it element-wise avoids allocating temporary arrays for
            # every rating.
            for f in range(k):
                p_uf = P[u, f]
                q_if = Q[i, f]
                P[u, f] = p_uf + α*(error*q_if - λ*p_uf)
                Q[i, f] = q_if + α*(error*p_uf - λ*q_if)

    return μ, bu, bi, P, Q

In [4]:
@njit
def predict(u, i, params):
    """
    Predicted (unclipped) rating of user index `u` for item index `i`.

    params: tuple (μ, bu, bi, P, Q) as returned by `fit_funk_svd`.
    """
    global_mean, user_bias, item_bias, user_factors, item_factors = params

    baseline = global_mean + user_bias[u] + item_bias[i]
    interaction = np.dot(user_factors[u], item_factors[i])

    return baseline + interaction

In [5]:
users = 3974
movies = 3564

train_csv = pd.read_csv("../../data/train_data.csv")
test_csv = pd.read_csv("../../data/test_data.csv")

# Shift the ids down by one so they can be used directly as array indices
# (presumably they are 1-based in the CSV files -- confirm against the data).
for ratings_csv in (train_csv, test_csv):
    ratings_csv["user_id"] = ratings_csv["user_id"] - 1
    ratings_csv["movie_id"] = ratings_csv["movie_id"] - 1

    # Numba-compiled code does not bounds-check array accesses by default,
    # so an out-of-range id would silently read or write the wrong memory.
    assert ratings_csv["user_id"].between(0, users - 1).all(), "user_id outside [0, users)"
    assert ratings_csv["movie_id"].between(0, movies - 1).all(), "movie_id outside [0, movies)"

# Split into train and validation
# `sample` is called without `random_state`, so the split follows NumPy's
# global random state.
ratings = train_csv.drop(columns=["timestamp"])
train_data = ratings.sample(frac=0.8)
validation_data = ratings.drop(train_data.index)

assert train_data.shape[0] + validation_data.shape[0] == train_csv.shape[0]

# Fit

In [6]:
@njit(parallel=True)
def predict_batch(ui_mat, params):
    """
    Predict ratings for a batch of (user index, item index) pairs.

    ui_mat: array Nx2; column 0 holds user indices and column 1 item
        indices. Values are cast to int before indexing, so integer and
        floating-point arrays are both accepted.
    params: tuple (μ, bu, bi, P, Q) as returned by `fit_funk_svd`.

    Returns a length-N float array of predictions clipped to [1, 5].
    """
    n_pairs = ui_mat.shape[0]
    predictions = np.zeros(n_pairs)

    # `prange` only runs in parallel when the function is compiled with
    # `parallel=True`; each iteration writes its own slot, so the loop has
    # no cross-iteration dependency.
    for it in prange(n_pairs):
        u = int(ui_mat[it, 0])
        i = int(ui_mat[it, 1])
        predictions[it] = predict(u, i, params)

    return np.clip(predictions, 1., 5.)

In [7]:
# Plain NumPy views of the two splits for the Numba routines
# (presumably user, item, rating columns, as the `uir` prefix suggests).
uir_train = train_data.to_numpy()
uir_val = validation_data.to_numpy()

n_val = len(uir_val)

In [8]:
# Baseline fit: 10 latent factors, 100 passes over the training split.
fitted_params = fit_funk_svd(
    uir_train,
    users,
    movies,
    k=10,
    α=.001,
    λ=.001,
    n_iters=100,
)

In [9]:
# Validation RMSE of the baseline fit
val_expected = uir_val[:, 2]
val_preds = predict_batch(uir_val[:, :2], fitted_params)

squared_residuals = (val_preds - val_expected)**2
error = np.sqrt(1/n_val * np.sum(squared_residuals))
print(error)

0.890220564716749


## Fit with multiple numbers of latent factors

In [22]:
min_err = np.inf
best_k = 2

# Sweep the number of latent factors and remember the value with the
# lowest validation RMSE.
for k in tqdm(range(2, 20)):
    fitted_params = fit_funk_svd(uir_train, users, movies, k=k, α=.006, λ=.02, n_iters=50)

    val_expected = uir_val[:, 2]
    val_preds = predict_batch(uir_val[:, :2], fitted_params)

    squared_residuals = (val_preds - val_expected)**2
    error = np.sqrt(1/n_val * np.sum(squared_residuals))

    print(f"Error with {k} factors: {error}")
    if error < min_err:
        min_err, best_k = error, k

  0%|          | 0/18 [00:00<?, ?it/s]

Error with 2 factors: 0.8774102203524774
Error with 3 factors: 0.8746829552465841
Error with 4 factors: 0.8733644638686336
Error with 5 factors: 0.8693785071450706
Error with 6 factors: 0.8699140462056538
Error with 7 factors: 0.8726581105867528
Error with 8 factors: 0.8763359787339388
Error with 9 factors: 0.8764317551685534
Error with 10 factors: 0.876657929395583
Error with 11 factors: 0.8802347477820115
Error with 12 factors: 0.8816192668885297
Error with 13 factors: 0.8815450223375342
Error with 14 factors: 0.8828622251372666
Error with 15 factors: 0.88480756081031
Error with 16 factors: 0.8875911241870459
Error with 17 factors: 0.8863628514724707
Error with 18 factors: 0.8914207462204439
Error with 19 factors: 0.8927715740918283


In [23]:
# Best number of factors found by the sweep and its validation RMSE
sweep_result = (best_k, min_err)
print(*sweep_result)

5 0.8693785071450706


# Predict

In [31]:
# Refit on every row of the training file (train + validation splits)
# using the number of factors selected by the sweep.
uir_total = train_csv.drop(columns=["timestamp"]).to_numpy()

fitted_final_params = fit_funk_svd(
    uir_total,
    users,
    movies,
    k=best_k,
    α=.006,
    λ=.02,
    n_iters=50,
)

μ, bu, bi, P, Q = fitted_final_params
print(P.shape, Q.shape)

(3974, 5) (3564, 5)


In [32]:
# Whatever remains after dropping `id` and `timestamp` is passed on as the
# (user, item) index matrix.
ui_test = test_csv.drop(columns=["id", "timestamp"]).to_numpy()
test_predictions = predict_batch(ui_test, fitted_final_params)

print(test_predictions[:10])

[3.48259025 3.28477013 2.87260105 3.50625012 3.12212833 2.94511264
 3.6382356  2.94776053 3.00670225 3.36064835]


In [33]:
# NOTE: `savez_compressed` appends ".npz" when the file name does not
# already end with it, so the archive lands on disk as
# "svd_params_alpha_.006.npy.npz".
np.savez_compressed(
    "svd_params_alpha_.006.npy",
    μ=μ,
    bu=bu,
    bi=bi,
    P=P,
    Q=Q,
)

In [13]:
# Submission frame: one predicted rating per test-row id.
# Re-run after the prediction cell so it reflects the latest `test_predictions`.
submission_columns = {
    "id": list(test_csv["id"]),
    "rating": test_predictions,
}
out_df = pd.DataFrame(submission_columns)

out_df.head()

Unnamed: 0,id,rating
0,0,3.618326
1,1,3.552561
2,2,2.877403
3,3,3.219665
4,4,3.078923


In [14]:
# out_df.to_csv("out_funk_svd_cpu_2.csv", index=False)