In [1]:
import numpy as np
import pandas as pd

from numba import jit, prange
from numpy.typing import NDArray
from typing import Tuple

from typing import List

from tqdm.notebook import tqdm

For colaborative filtering, we will only be using the ratings data

In [2]:
train_csv = pd.read_csv("../../data/train_data.csv")
test_csv = pd.read_csv("../../data/test_data.csv")

In [3]:
print(train_csv.shape)
train_csv.head()

(535784, 4)


Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1160,5,974769817
1,1,1129,3,974769817
2,1,3328,4,974769817
3,1,2659,2,974769817
4,1,980,3,974769817


## Split into train and validation

In [4]:
train_data = train_csv.drop(["timestamp"], axis=1).sample(frac=0.8)
validation_data = train_csv.drop(train_data.index).drop(["timestamp"], axis=1)

assert train_data.shape[0] + validation_data.shape[0] == train_csv.shape[0]

In [5]:
train_data.head()

Unnamed: 0,user_id,movie_id,rating
283077,2252,1625,3
193896,1666,1804,5
293979,2317,365,3
472599,3423,3312,3
190361,1652,1747,4


## Build train matrix

In [6]:
def build_matrix(df, n_users, n_items):
    ratings_mat = np.ones((n_users+1, n_items+1), dtype=np.int8)*-1

    for _, row in df.iterrows():
        u = row["user_id"]
        i = row["movie_id"]
        r = row["rating"]

        ratings_mat[u, i] = r
        
    return ratings_mat

# train_mat = build_matrix(train_data, 3974, 3564)

In [7]:
@jit(nopython=True)
def build_matrix_jit(data: NDArray[np.uint8], n_users: int, n_items: int):
    """
    data: Array of size (n_entries, 3) with columns representing:
        col 0: user_id
        col 1: item_id
        col 2: rating
    """
    ratings_mat = np.ones((n_users+1, n_items+1), dtype=np.int8)*-1

    for row in data:
        u, i, r = row
        ratings_mat[u, i] = r
        
    return ratings_mat

In [8]:
users = 3974
movies = 3564

ratings_mat = build_matrix_jit(train_data.values, users, movies)

## Compute predictions

In [12]:
@jit(nopython=True, parallel=True)
def get_means(ratings: np.array, eps=1e-15) -> List[int]:
    means = np.zeros(ratings.shape[1])
    counts = np.zeros(ratings.shape[1])
    
    for i in prange(ratings.shape[1]):
        for u in range(ratings.shape[0]):
            if ratings[u, i] != -1:
                means[i] += ratings[u, i]
                counts[i] += 1
                
    return means/(counts+eps)

@jit(nopython=True)
def sim(i: int, j: int, ratings: np.array, means: np.array, eps=1e-15) -> float:
    user_idxs = []
    
    # Get users (indexes) that rated both items i and j
    for u in range(ratings.shape[0]):
        if ratings[u, i] != -1 and ratings[u, j] != -1:
            user_idxs.append(u)
    
    # Calculate similarity        
    n_ui, n_uj, cov_ui_uj = 0, 0, 0
    for u in user_idxs:
        r_ui_r = ratings[u, i] - means[i]
        r_uj_r = ratings[u, j] - means[j]
        
        cov_ui_uj += (r_ui_r * r_uj_r)
        n_ui += r_ui_r**2
        n_uj += r_uj_r**2
        
    return cov_ui_uj/(np.sqrt(n_ui)*np.sqrt(n_uj) + eps)

@jit(nopython=True, nogil=True, parallel=True)
def calculate_similarities(ratigns, means):
    similarities = np.zeros((movies, movies))
    
    for i in prange(movies):
        for j in range(movies):
            similarities[i, j] = sim(i, j, ratigns, means)
    
    return similarities

@jit(nopython=True)
def pred(u: int, i: int, k: int, ratings: np.array, means: np.array, sim_mat: np.array, eps=1e-15) -> float:
    similarities = []
    for j in range(ratings.shape[1]):
        if i != j and ratings[u, j] != -1:
            similarities.append((sim_mat[i, j], ratings[u, j]))
            
    numerator, denominator = 0, 0
    for similarity_ij, rating_uj in sorted(similarities, reverse=True)[:k]:
        numerator += similarity_ij*rating_uj
        denominator += similarity_ij
    
    return numerator/(denominator + eps)

In [21]:
@jit(nopython=True, nogil=True, parallel=True)
def predict_batch(X, ratings, sims_mat, neighbours=5):
    """
    X: Array of size (n_entries, 3) with columns representing:
        col 0: user_id
        col 1: item_id
    """
    means = get_means(ratings)

    predictions = np.zeros(len(X), dtype=np.int8)
    
    for i in prange(len(X)):
        user_id, movie_id = X[i]
        predictions[i] = pred(user_id, movie_id, neighbours, ratings, means, sims_mat)
        
    return np.clip(predictions, 1, 5)

## Making predictions for different k values on the validation data

In [20]:
sims = calculate_similarities(ratings_mat, get_means(ratings_mat))

In [31]:
predictions_per_k = []
data_arr = validation_data.drop(["rating"], axis=1).values

for k in tqdm(range(1, 200+1)):
    predictions_per_k.append(
        predict_batch(data_arr, ratings_mat, sims, neighbours=k)
    )

  0%|          | 0/200 [00:00<?, ?it/s]

## Calculate RMSE on validation data

In [32]:
errors = []

for k in range(len(predictions_per_k)):
    num = 0
    for pred_r, real_r in zip(predictions_per_k[k], validation_data["rating"].values):
        num += (pred_r - real_r)**2
        
    rmse = np.sqrt(num/len(predictions_per_k[k]))
    
    if k % 10 == 0:
        print(f"k={k+1}: {rmse=}")
    errors.append(rmse)

k=1: rmse=1.8364215667168584
k=11: rmse=1.3214507800220998
k=21: rmse=1.2724059715469815
k=31: rmse=1.2512020965658273
k=41: rmse=1.23595348859544
k=51: rmse=1.2274753695383602
k=61: rmse=1.2201052539152881
k=71: rmse=1.2149622012326737
k=81: rmse=1.2114276491600227
k=91: rmse=1.2076161777946643
k=101: rmse=1.2047186762273332
k=111: rmse=1.202423587911242
k=121: rmse=1.2002835068603932
k=131: rmse=1.1984433290037146
k=141: rmse=1.1972084766903777
k=151: rmse=1.1955392077297313
k=161: rmse=1.193777709393322
k=171: rmse=1.1922406231782507
k=181: rmse=1.1912343822573324
k=191: rmse=1.1902625728388743


## Plot rmse

In [33]:
import plotly.graph_objects as go
import numpy as np

In [35]:
fig = go.Figure(
    data=go.Scatter(
        x=np.arange(1, 200+1),
        y=errors
    )
)
fig.show()

Choose $k=200$

# Test prediction

## Build matrix with all the data

In [36]:
total_ratings = build_matrix_jit(train_csv.drop(["timestamp"], axis=1).values, users, movies)

In [37]:
test_csv.head()

Unnamed: 0,id,user_id,movie_id,timestamp
0,0,5,2962,974769784
1,1,5,3177,974769768
2,2,5,3153,974769768
3,3,5,501,974769768
4,4,5,3159,974769768


## Predict test set

In [42]:
test_predictions = predict_batch(
    test_csv.drop(["id", "timestamp"], axis=1).values,
    total_ratings,
    sims,
    neighbours=200,
)

In [43]:
out_df = pd.DataFrame.from_dict(
    {
        "id": list(test_csv["id"]),
        "rating": test_predictions
    }
)

out_df.head()

Unnamed: 0,id,rating
0,0,3
1,1,3
2,2,3
3,3,3
4,4,3


In [44]:
out_df.to_csv("out_fc_items_200.csv", index=False)