# Matrix factorization on the MovieLens dataset

## Download a small subset of MovieLens

In [None]:
!wget https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
!unzip ml-latest-small.zip

## Fast SGD-based method

In [None]:
import pandas as pd
import torch
import torch.nn as nn

from tqdm import tqdm

import plotly.express as px
px.defaults.template = 'plotly_dark'

from sklearn.model_selection import train_test_split

# Hyper-parameters
EPOCHS = 250      # number of full-batch optimisation steps
LR = 1e-2         # Adam learning rate
LATENT_DIM = 2    # length of each user / movie factor vector
LAMBDA = 1e-5     # weight of the L2-norm penalty on W and H
SEED = 1          # seeds torch and the train/test split
torch.manual_seed(SEED)

# Load data
df = pd.read_csv('./ml-latest-small/ratings.csv')

unique_users = df.userId.unique()
unique_movies = df.movieId.unique()

# Contiguous 0-based indices for users and movies (order of first appearance).
user_codes = torch.as_tensor(df.userId.factorize()[0], dtype=torch.long)
movie_codes = torch.as_tensor(df.movieId.factorize()[0], dtype=torch.long)

# Create user-item matrix: NaN marks "no rating"; observed cells are filled
# with one vectorised assignment instead of one tensor write per rating.
r_ui = torch.full((len(unique_users), len(unique_movies)), float('nan'))
r_ui[user_codes, movie_codes] = torch.as_tensor(
    df.rating.to_numpy(), dtype=r_ui.dtype
)

# Split data.
# Observed cells are located through the NaN mask itself, so a genuine 0.0
# rating would not be mistaken for a missing one. The coordinates come back
# in row-major order.
rows, cols = torch.nonzero(~torch.isnan(r_ui), as_tuple=True)
idx = range(len(rows))
train_idx, test_idx = train_test_split(idx, test_size=0.2, random_state=SEED)

# The split never changes, so gather index tensors and targets once rather
# than re-indexing with Python lists on every epoch.
u_train, i_train = rows[train_idx], cols[train_idx]
u_test, i_test = rows[test_idx], cols[test_idx]
r_train = r_ui[u_train, i_train]
r_test = r_ui[u_test, i_test]

# Initialize parameters: W is (users x LATENT_DIM), H is (LATENT_DIM x movies)
W = nn.Parameter(torch.rand(len(unique_users), LATENT_DIM) / LATENT_DIM)
H = nn.Parameter(torch.rand(LATENT_DIM, len(unique_movies)) / LATENT_DIM)

l2 = nn.MSELoss()
optim = torch.optim.Adam([W, H], lr=LR)

# One (train MSE, test MSE) pair per epoch. The train value is measured
# before the optimiser step, the test value after it.
losses = []
for it in tqdm(range(EPOCHS)):
    V = W @ H  # dense reconstruction of the full rating matrix
    loss = l2(V[u_train, i_train], r_train)
    # Only the regularised objective is differentiated; `loss` stays the
    # plain MSE so it is comparable with the test loss.
    loss_reg = loss + LAMBDA * W.norm(p=2) + LAMBDA * H.norm(p=2)
    optim.zero_grad()
    loss_reg.backward()
    optim.step()

    with torch.no_grad():
        V = W @ H
        loss_test = l2(V[u_test, i_test], r_test)
    losses.append((loss.item(), loss_test.item()))

# Show a few held-out predictions. The reconstruction is computed once,
# without building an autograd graph.
with torch.no_grad():
    V = W @ H
for k in range(5):
    u = rows[test_idx[k]]
    i = cols[test_idx[k]]
    predicted = V[u, i].item()
    actual = r_ui[u, i].item()

    print(f'Predicted: {predicted}')
    print(f'Actual: {actual}')
    print(f'Difference: {abs(predicted - actual)}\n')

px.line(losses).show()

## Explicit MF
[Explicit Matrix Factorization: ALS, SGD, and All That Jazz](https://blog.insightdatascience.com/explicit-matrix-factorization-als-sgd-and-all-that-jazz-b00e4d9b21ea)

In [None]:
import pandas as pd
import torch

import plotly.express as px
px.defaults.template = 'plotly_dark'

from sklearn.model_selection import train_test_split

# Hyper-parameters
EPOCHS = 50
BATCH_SIZE = 1000  # ratings processed per parameter update
LATENT_DIM = 2     # length of each user / item factor vector
LR = 0.01          # per-rating SGD step size
LAMBDA = 2e-1      # L2 regularisation strength
SEED = 1           # makes the split and the initialisation reproducible
torch.manual_seed(SEED)

# Load data
df = pd.read_csv('./ml-latest-small/ratings.csv')
unique_users = df.userId.unique()
unique_movies = df.movieId.unique()

# Craft user and item indices (contiguous, order of first appearance)
rows = df.userId.factorize()[0]
cols = df.movieId.factorize()[0]

# Create user-item matrix (0 where no rating exists; only observed cells are
# ever read below).
r_ui = torch.zeros(len(unique_users), len(unique_movies))
r_ui[torch.from_numpy(rows), torch.from_numpy(cols)] = torch.as_tensor(
    df.rating.to_numpy(), dtype=r_ui.dtype
)

idx = range(len(rows))
train_idx, test_idx = train_test_split(idx, test_size=0.2, random_state=SEED)

# Initialize latent factors, one column per user / item
x_u = torch.rand(LATENT_DIM, r_ui.shape[0]) / LATENT_DIM
y_i = torch.rand(LATENT_DIM, r_ui.shape[1]) / LATENT_DIM

# Global mean from training ratings only, so nothing leaks from the test set.
mean = float(df.rating.iloc[train_idx].mean())
# Biases are deviations from `mean`, so they start at zero. They are indexed
# by the factorize codes above, like every other per-user / per-item tensor.
b_u = torch.zeros(r_ui.shape[0])
b_i = torch.zeros(r_ui.shape[1])


def _predict(u, i):
    """Return the predicted rating for each pair ``(u[k], i[k])``.

    ``u`` and ``i`` are equal-length index tensors; the result has that same
    length. Reads the current global ``mean``, ``b_u``, ``b_i``, ``x_u`` and
    ``y_i``.
    """
    return mean + b_u[u] + b_i[i] + (x_u[:, u] * y_i[:, i]).sum(0)


# Test indices never change, so build them once.
u_test = torch.from_numpy(rows[test_idx])
i_test = torch.from_numpy(cols[test_idx])

# One (train MSE, test MSE) pair per epoch
errors = []
for it in range(EPOCHS):
    squared_error = 0.0

    for start in range(0, len(train_idx), BATCH_SIZE):
        batch = train_idx[start:start + BATCH_SIZE]
        # User indices
        u = torch.from_numpy(rows[batch])
        # Item indices
        i = torch.from_numpy(cols[batch])

        # Pairwise error: one value per observed rating in the batch
        e_ui = r_ui[u, i] - _predict(u, i)

        # All steps are computed from the pre-update parameters.
        step_b_u = LR * (e_ui - LAMBDA * b_u[u])
        step_b_i = LR * (e_ui - LAMBDA * b_i[i])
        step_x = LR * (e_ui * y_i[:, i] - LAMBDA * x_u[:, u])
        step_y = LR * (e_ui * x_u[:, u] - LAMBDA * y_i[:, i])

        # index_add_ accumulates every contribution when a user or item
        # occurs more than once in the batch; plain indexed assignment would
        # keep only one of them.
        b_u.index_add_(0, u, step_b_u)
        b_i.index_add_(0, i, step_b_i)
        x_u.index_add_(1, u, step_x)
        y_i.index_add_(1, i, step_y)

        squared_error += e_ui.square().sum().item()

    # Exact epoch MSE: the last, shorter batch is not over-weighted.
    mse = squared_error / len(train_idx)

    # Pairwise evaluation is cheap enough to score the whole test set at once.
    mse_test = (r_ui[u_test, i_test] - _predict(u_test, i_test)).square().mean().item()

    errors.append((mse, mse_test))
    print(f'{mse = } {mse_test = }')