In [1]:
import numpy as np
import pandas as pd

from numba import jit, prange
from numpy.typing import NDArray
from typing import Tuple

from typing import List

from tqdm.notebook import tqdm

In [2]:
train_csv = pd.read_csv("../../data/train_data.csv")
test_csv = pd.read_csv("../../data/test_data.csv")

## Split into train and validation

In [3]:
train_data = train_csv.drop(["timestamp"], axis=1).sample(frac=0.8)
validation_data = train_csv.drop(train_data.index).drop(["timestamp"], axis=1)

assert train_data.shape[0] + validation_data.shape[0] == train_csv.shape[0]## Split into train and validation

## Build train matrix

In [4]:
@jit(nopython=True)
def build_matrix_jit(data: NDArray[np.uint8], n_users: int, n_items: int):
    """
    data: Array of size (n_entries, 3) with columns representing:
        col 0: user_id
        col 1: item_id
         col 2: rating
    """
    ratings_mat = np.zeros((n_users+1, n_items+1), dtype=np.uint8)

    for row in data:
        u, i, r = row
        ratings_mat[u, i] = r
    
    return ratings_mat

In [5]:
users = 3974
movies = 3564

ratings_mat = build_matrix_jit(train_data.values, users, movies)## Compute predictions

## Compute predictions

In [6]:
test_ratings = np.array([
    [4, 0, 3, 4, 0],
    [1, 2, 5, 0, 3],
    [1, 0, 0, 5, 0],
    [0, 3, 4, 5, 3],
    [2, 0, 5, 4, 5],
])

In [7]:
@jit(nopython=True, parallel=True)
def get_means(ratings: np.array, eps=1e-15) -> List[int]:
    means = np.zeros(ratings.shape[0])
    counts = np.zeros(ratings.shape[0])
    
    for u in prange(ratings.shape[0]):
        for i in range(ratings.shape[1]):
            if ratings[u, i] != 0:
                means[u] += ratings[u, i]
                counts[u] += 1
                
    return means/(counts+eps)

assert np.allclose(get_means(test_ratings).round(2), np.array([3.67, 2.75, 3, 3.75, 4])) is True

In [8]:
@jit(nopython=True)
def sim(u: int, v: int, ratings: NDArray[np.uint8], means: NDArray[np.float64], eps=1e-15):
    item_idxs = []
    
    # Get items (indexes) that are rated by both users u and v
    for i in range(ratings.shape[1]):
        if ratings[u, i] != 0 and ratings[v, i] != 0:
            item_idxs.append(i)
    
    # Calculate similarity
    n_ui, n_vi, cov_ui_vi = 0, 0, 0
    for i in item_idxs:
        r_ui_r = ratings[u, i] - means[u]
        r_vi_r = ratings[v, i] - means[v]
        
        cov_ui_vi += (r_ui_r * r_vi_r)
        n_ui += r_ui_r**2
        n_vi += r_vi_r**2
        
    return cov_ui_vi/(np.sqrt(n_ui)*np.sqrt(n_vi) + eps)

for i, expected in zip(range(1, 5), [-0.98, 0, 0.26, -0.73]):
    assert abs(round(sim(0, i, test_ratings, get_means(test_ratings)), 2) - expected) < 1e-5

In [9]:
@jit(nopython=True)
def pred(u: int, i: int, k: int, ratings: np.array, means: np.array, eps=1e-15) -> float:
    similarities = []
    for v in range(ratings.shape[0]):
        if u != v and ratings[v, i] != 0:
            similarities.append((sim(u, v, ratings, means), ratings[v, i], means[v]))
    
    numerator, denominator = 0, 0
    for similarity_uv, rating_vi, mean_v in sorted(similarities, reverse=True)[:k]:
        numerator += similarity_uv*(rating_vi - mean_v)
        denominator += similarity_uv
        
    return means[u] + numerator/(denominator + eps)
    

out_pred = pred(0, 4, 1, test_ratings, get_means(test_ratings))
assert abs(round(out_pred, 2) - 2.92) < 1e-5

## Make predictions

In [10]:
@jit(nopython=True, nogil=True, parallel=True)
def predict_batch(X, ratings, neighbours=5):
    """
    X: Array of size (n_entries, 3) with columns representing:
        col 0: user_id
        col 1: item_id
    """
    means = get_means(ratings)

    predictions = np.zeros(len(X), dtype=np.float64)
    
    for i in prange(len(X)):
        user_id, movie_id = X[i]
        predictions[i] = pred(user_id, movie_id, neighbours, ratings, means)
        
    return np.clip(predictions, 1, 5)

In [13]:
data_arr = validation_data.drop(["rating"], axis=1).values
predictions_per_k = []

for k in tqdm(range(1, 20+1)):
    predictions_per_k.append(
        predict_batch(data_arr, ratings_mat, neighbours=k)
    )

  0%|          | 0/20 [00:00<?, ?it/s]

In [14]:
errors = []

for k in range(len(predictions_per_k)):
    num = 0
    for pred_r, real_r in zip(predictions_per_k[k], validation_data["rating"].values):
        num += (pred_r - real_r)**2
        
    rmse = np.sqrt(num/len(predictions_per_k[k]))
    
    print(f"k={k+1}: {rmse=}")
    errors.append(rmse)

k=1: rmse=1.2723773184146214
k=2: rmse=1.1255631905375514
k=3: rmse=1.0630722169399351
k=4: rmse=1.028073952030296
k=5: rmse=1.0064811292891318
k=6: rmse=0.9911143013570936
k=7: rmse=0.9796397089346004
k=8: rmse=0.9709723311687054
k=9: rmse=0.963975884553812
k=10: rmse=0.9586998544376439
k=11: rmse=0.9538196200823372
k=12: rmse=0.9498844985064158
k=13: rmse=0.9468236915339332
k=14: rmse=0.9439704237296446
k=15: rmse=0.9416867804784717
k=16: rmse=0.9394471489336677
k=17: rmse=0.9373945460600306
k=18: rmse=0.9357052023339824
k=19: rmse=0.9343985244630861
k=20: rmse=0.9331652360065557


Choose $k=20$

In [16]:
import plotly.graph_objects as go
import numpy as np

fig = go.Figure(
    data=go.Scatter(
        x=np.arange(1, 20+1),
        y=errors
    )
)
fig.show()

## Test prediction

In [17]:
total_ratings = build_matrix_jit(train_csv.drop(["timestamp"], axis=1).values, users, movies)

In [18]:
test_predictions = predict_batch(
    test_csv.drop(["id", "timestamp"], axis=1).values,
    total_ratings,
    neighbours=20
)

In [21]:
out_df = pd.DataFrame.from_dict(
    {
        "id": list(test_csv["id"]),
        "rating": test_predictions
    }
)

out_df.head()

Unnamed: 0,id,rating
0,0,3.103972
1,1,3.190187
2,2,2.837657
3,3,3.580249
4,4,3.173072


In [22]:
out_df.to_csv("out_fc_users_2.csv", index=False)