Let's take a look at the data. First, we'll use ratings to create a collaborative filtering algorithm. Then, we'll use movie metadata to find similar movies.

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the movie metadata table and the small user-rating table from the
# local ./data directory.
movies_metadata = pd.read_csv('./data/movies_metadata.csv')
ratings = pd.read_csv('./data/ratings_small.csv')

In [None]:
# Preview the first rows of the metadata table.
movies_metadata.head()

In [None]:
# C: mean of `vote_average` over all movies; the value the weighted score is
# pulled toward for movies with few votes.
C = movies_metadata['vote_average'].mean()
# m: 90th percentile of `vote_count`; movies with fewer votes are not ranked.
m = movies_metadata['vote_count'].quantile(0.9)

# Filter first and copy only the surviving rows: this avoids deep-copying the
# whole metadata table just to discard most of it, and still yields an
# independent frame that can safely receive a new `score` column.
q_movies = movies_metadata.loc[movies_metadata['vote_count'] >= m].copy()

In [None]:
def weighted_rating(x, m=m, C=C):
    """Return the vote-count-weighted rating for `x`.

    The result is a weighted average of the item's own `vote_average` and the
    reference value `C`: the more votes the item has relative to `m`, the
    closer the result is to its own average.

    Args:
        x: Mapping-like object exposing 'vote_count' and 'vote_average'
            (a DataFrame row, or anything supporting the same arithmetic).
        m: Vote-count weight given to `C`; defaults to the module-level `m`
            at definition time.
        C: Reference rating; defaults to the module-level `C` at definition
            time.

    Returns:
        `votes/(votes+m) * average + m/(votes+m) * C`.
    """
    votes = x['vote_count']
    average = x['vote_average']
    total = votes + m
    return (votes / total * average) + (m / total * C)

# weighted_rating only performs arithmetic on the 'vote_count' and
# 'vote_average' columns, so it can be evaluated on the whole frame at once
# instead of once per row through apply(axis=1).
q_movies['score'] = weighted_rating(q_movies)
q_movies = q_movies.sort_values('score', ascending=False)

# Show the ten best-scoring movies.
q_movies[['title', 'vote_count', 'vote_average', 'score']].head(10)

In [None]:
# Read the ratings file (the same file is also loaded in the first data cell),
# then print a sample of rows and the total number of ratings.
ratings = pd.read_csv("./data/ratings_small.csv")
print(ratings.head())
print(len(ratings))

In [None]:
# Number of distinct users that appear in the ratings table.
print(ratings['userId'].nunique(dropna=False))

In [None]:
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

# Take the rating scale from the data rather than relying on Reader()'s
# default of (1, 5): Surprise clips predictions to this scale, so a scale
# narrower than the data would make some observed ratings unpredictable.
rating_scale = (ratings['rating'].min(), ratings['rating'].max())
reader = Reader(rating_scale=rating_scale)
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

svd = SVD()
cv_results = cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# cross_validate only ever fits on the training part of a fold. Refit on every
# available rating so later svd.predict(...) calls use the complete data.
svd.fit(data.build_full_trainset())

# Keep the cross-validation summary as the cell's displayed result.
cv_results

In [None]:
# Estimated ratings of raw user id 1 for raw item ids 0 through 99.
pred = [svd.predict(1, i).est for i in range(100)]

In [None]:
# Show the list of estimated ratings.
print(pred)

Problem: new users, or new ratings by existing users, are expected to come in quickly. It's not really feasible to retrain the entire model each time a new rating comes in. Unfortunately, common implementations of SVD or even KNN-based collaborative filtering algorithms do not support online learning for new ratings without retraining the whole model.

For this, I'll try to implement the online-updating algorithm presented in: https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.165.8010&rep=rep1&type=pdf

The implementation will follow default parameters from the surprise package: https://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.SVD

In [None]:
from scipy import sparse

# Pull the (rating, user id, movie id) triplets out of the ratings table.
values = ratings['rating'].to_numpy()
rows = ratings['userId'].to_numpy()
cols = ratings['movieId'].to_numpy()

# Ids are shifted by -1 to become zero-based indices (assumes ids start at 1),
# so the largest index is max_id - 1 and the matrix needs exactly max_id rows
# and columns; max_id + 1 would leave an unused empty row and column.
n_rows = rows.max()
n_cols = cols.max()

# User x movie rating matrix in CSR format.
sparse_ratings = sparse.csr_matrix((values, (rows - 1, cols - 1)), shape=(n_rows, n_cols))

In [None]:
# Largest rating value present in the data.
print(values.max())

In [None]:
import torch
from torch import nn


class KMF(nn.Module):
    """Matrix-factorization rating model with a scaled-sigmoid output.

    Every user and every item owns an embedding vector and a scalar bias.
    The predicted score is
    ``max_score * sigmoid(user_bias + item_bias + <user_emb, item_emb>)``,
    which bounds predictions between 0 and ``max_score``.
    """

    def __init__(self, n_users: int, n_items: int, emb_dim: int, max_score: int):
        """Create the embedding tables and the bias vectors.

        Args:
            n_users: Number of rows in the user embedding table / user biases.
            n_items: Number of rows in the item embedding table / item biases.
            emb_dim: Size of each embedding vector.
            max_score: Factor the sigmoid output is multiplied by.
        """
        super().__init__()
        self.user_emb = nn.Embedding(n_users, emb_dim)
        self.user_bias = nn.Parameter(torch.zeros(n_users))
        self.item_emb = nn.Embedding(n_items, emb_dim)
        self.item_bias = nn.Parameter(torch.zeros(n_items))
        # Overwrite the default embedding initialisation with N(0, 0.1) noise;
        # biases start at zero.
        for table in (self.user_emb, self.item_emb):
            nn.init.normal_(table.weight, mean=0, std=0.1)
        self.max_score = max_score

    def forward(self, users, items):
        """Predict scores for paired user and item indices.

        Args:
            users: Integer index tensor into the user table.
            items: Integer index tensor into the item table, paired with
                `users` element by element.

        Returns:
            Tensor with one predicted score per (user, item) pair.
        """
        interaction = torch.sum(self.item_emb(items) * self.user_emb(users), dim=1)
        logits = self.user_bias[users] + self.item_bias[items] + interaction
        return self.max_score * torch.sigmoid(logits)

In [None]:
import random
from torch.utils.data import DataLoader


def mse(scores, pred):
    """Return the mean squared error between true and predicted scores.

    This is the pure data term only. It deliberately contains no
    regularisation and reads no global state: the training loop adds its own
    L2 penalty, so including one here would apply the penalty twice and would
    make the reported train/test "MSE" values something other than an MSE.

    Args:
        scores: Tensor of observed scores.
        pred: Tensor of predicted scores, broadcastable against `scores`.

    Returns:
        Scalar tensor holding the mean of the squared differences.
    """
    return (scores - pred).pow(2).mean()


def get_param_squared_norms(model, include_bias: bool = False):
    """Return the sum of squared parameters used as the L2 penalty.

    Args:
        model: Object exposing `user_emb.weight`, `item_emb.weight`,
            `user_bias` and `item_bias` tensors.
        include_bias: When true, also add the squared deviations of the user
            and item biases from their respective means.

    Returns:
        Scalar tensor: squared norm of both embedding tables, plus the
        mean-centred bias term if `include_bias` is set.
    """
    squared_norm = model.user_emb.weight.pow(2).sum() + model.item_emb.weight.pow(2).sum()
    if not include_bias:
        return squared_norm

    # XXX problem: the logistic output needs high bias norms, which is why the
    # biases are penalised relative to their mean rather than to zero.
    centered_user, centered_item = (
        bias - bias.mean() for bias in (model.user_bias, model.item_bias)
    )
    bias_term = centered_user.pow(2).sum() + centered_item.pow(2).sum()
    return squared_norm + bias_term


# Size the parameter tables by the largest ids; ids are shifted by -1 when the
# dataset is built below, so indices run from 0 to max_id - 1.
n_users = ratings['userId'].max()
n_items = ratings['movieId'].max()

# Use the GPU when one is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = KMF(n_users=n_users, n_items=n_items, emb_dim=100, max_score=5)
model.to(device)

# One float tensor per rating: [rating, user id - 1, movie id - 1].
dataset = [
    torch.tensor([rating, user_id - 1, movie_id - 1]).float().to(device)
    for rating, user_id, movie_id in zip(values, rows, cols)
]

# Random 80/20 split into training and held-out examples.
random.shuffle(dataset)
split_point = int(0.8 * len(dataset))
train_dataset, test_dataset = dataset[:split_point], dataset[split_point:]

train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=2048, shuffle=False)

# alpha scales the L2 penalty that the training loop adds to the loss.
alpha = 0.02
optimizer = torch.optim.Adam(model.parameters(), lr=5e-3)

# Train for 20 epochs; after every epoch, evaluate on the held-out split.
for epoch in range(20):
    # Running sums for progress reporting; reset each time they are printed.
    mse_acc = l2_acc = steps = 0
    for i, batch in enumerate(train_loader):
        optimizer.zero_grad()
        # Each batch row is [rating, user index, item index]; the indices are
        # stored as floats and must be cast back to integers for lookup.
        scores = batch[:, 0]
        users = batch[:, 1].long()
        items = batch[:, 2].long()
        pred = model(users, items)
        mse_loss = mse(scores, pred)
        # L2 penalty on the embedding weights (biases are excluded by default).
        l2_loss = alpha * get_param_squared_norms(model)
        mse_acc += float(mse_loss)
        l2_acc += float(l2_loss)
        loss = mse_loss + l2_loss
        loss.backward()
        steps += 1
        optimizer.step()
        # Every 50 batches (skipping batch 0), print the averages accumulated
        # since the last report and start a new window.
        if i and i % 50 == 0:
            mse_acc /= steps
            l2_acc /= steps
            print(f"MSE = {mse_acc:.3f} | L2 reg = {l2_acc:.2f}")
            mse_acc = l2_acc = steps = 0
    # Held-out evaluation: mean of the per-batch losses, no gradients tracked.
    test_loss = test_steps = 0
    with torch.no_grad():
        for batch in test_loader:
            scores = batch[:, 0]
            users = batch[:, 1].long()
            items = batch[:, 2].long()
            pred = model(users, items)
            mse_loss = mse(scores, pred)
            test_loss += mse_loss
            test_steps += 1
    print(f"Test: MSE = {test_loss / test_steps:.3f}")

Next step: implement steps to add/update users and check if it's too slow