In [1]:
import numpy as np
import pandas as pd
from numba import njit, prange
from numpy.typing import NDArray
from tqdm.notebook import tqdm

In [2]:
np.set_printoptions(suppress=True)

In [3]:
train_csv = pd.read_csv("../../data/train_data.csv")
test_csv = pd.read_csv("../../data/test_data.csv")

train_csv["user_id"] = train_csv["user_id"].apply(lambda x: x - 1)
train_csv["movie_id"] = train_csv["movie_id"].apply(lambda x: x - 1)

test_csv["user_id"] = test_csv["user_id"].apply(lambda x: x - 1)
test_csv["movie_id"] = test_csv["movie_id"].apply(lambda x: x - 1)

# Split into train and validation
train_data = train_csv.drop(["timestamp"], axis=1).sample(frac=0.8)
validation_data = train_csv.drop(train_data.index).drop(["timestamp"], axis=1)

assert train_data.shape[0] + validation_data.shape[0] == train_csv.shape[0]## Split into train and validation

In [6]:
@njit
def build_matrix_jit(data: NDArray[np.uint8], n_users: int, n_items: int):
    """
    data: Array of size (n_entries, 3) with columns representing:
        col 0: user_id
        col 1: item_id
         col 2: rating
    """
    ratings_mat = np.zeros((n_users, n_items), dtype=np.float64)

    for row in data:
        u, i, r = row
        ratings_mat[u, i] = r
    
    return ratings_mat

users = 3974
movies = 3564

ratings_mat = build_matrix_jit(train_data.values, users, movies)
print(ratings_mat)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [7]:
# Fill zeros with the mean of known values
mean = np.mean(train_data.values[:, 2])
for i in range(ratings_mat.shape[0]):
    for j in range(ratings_mat.shape[1]):
        if ratings_mat[i, j] == 0:
            ratings_mat[i, j] = mean
            
print(ratings_mat)

[[3.60451395 3.60451395 3.60451395 ... 3.60451395 3.60451395 3.60451395]
 [3.60451395 3.60451395 3.60451395 ... 3.60451395 3.60451395 3.60451395]
 [3.60451395 3.60451395 3.60451395 ... 3.60451395 3.60451395 3.60451395]
 ...
 [3.60451395 3.60451395 3.60451395 ... 3.60451395 3.60451395 3.60451395]
 [3.60451395 3.60451395 3.60451395 ... 3.60451395 3.60451395 3.60451395]
 [3.60451395 3.60451395 3.60451395 ... 3.60451395 3.60451395 3.60451395]]


$U$: $|users| \times |features|$\
$\Sigma$: $|features| \times |features|$\
$V$: $|features| \times |items|$

## Fitting best $k$

In [10]:
U, S, V = np.linalg.svd(ratings_mat)
print(U.shape, S.shape, V.shape)

(3974, 3974) (3564,) (3564, 3564)


In [11]:
uir_val = validation_data.values

In [12]:
init = 2
K = 100

min_error = np.inf
min_k = init
for k in tqdm(range(init, K)):
    Uk, Sk, Vk = U[:, :k], S[:k], V[:k]
    train_reconstructed = Uk@np.diag(Sk)@Vk
    
    predicted = train_reconstructed[uir_val[:,0], uir_val[:,1]]

    error = np.sqrt(np.sum((predicted-uir_val[:,2])**2)/len(uir_val))
    
    if min_error > error:
        min_error = error
        min_k = k
        
print(min_error, min_k)

  0%|          | 0/98 [00:00<?, ?it/s]

1.0144514166049041 20


## Predict

In [22]:
uir_total = train_csv.drop(["timestamp"], axis=1).values
total_ratings = build_matrix_jit(uir_total, users, movies)

print(total_ratings)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [24]:
# Fill zeros with the mean of known values
mean_total = np.mean(uir_total[:, 2])
for i in range(total_ratings.shape[0]):
    for j in range(total_ratings.shape[1]):
        if total_ratings[i, j] == 0:
            total_ratings[i, j] = mean_total
            
print(total_ratings)

[[3.60381422 3.60381422 3.60381422 ... 3.60381422 3.60381422 3.60381422]
 [3.60381422 3.60381422 3.60381422 ... 3.60381422 3.60381422 3.60381422]
 [3.60381422 3.60381422 3.60381422 ... 3.60381422 3.60381422 3.60381422]
 ...
 [3.60381422 3.60381422 3.60381422 ... 3.60381422 3.60381422 3.60381422]
 [3.60381422 3.60381422 3.60381422 ... 3.60381422 3.60381422 3.60381422]
 [3.60381422 3.60381422 3.60381422 ... 3.60381422 3.60381422 3.60381422]]


In [25]:
ui_test = test_csv.drop(["id", "timestamp"], axis=1).values

In [26]:
U_total, S_total, V_total = np.linalg.svd(total_ratings)
Uk_total, Sk_total, Vk_total = U_total[:, :min_k], S_total[:min_k], V_total[:min_k]

In [27]:
total_reconstructed = Uk_total@np.diag(Sk_total)@Vk_total

In [34]:
test_predictions = total_reconstructed[ui_test[:, 0], ui_test[:, 1]]
test_predictions = np.clip(test_predictions, 1, 5)

In [35]:
out_df = pd.DataFrame.from_dict(
    {
        "id": list(test_csv["id"]),
        "rating": test_predictions
    }
)

out_df.head()

Unnamed: 0,id,rating
0,0,3.590959
1,1,3.585889
2,2,3.595837
3,3,3.601659
4,4,3.586008


In [38]:
out_df.to_csv("out_svd_2.csv", index=False)