In [1]:
import fastai
from fastai import learner
from fastai.losses import MSELossFlat
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from fastai.data.core import DataLoaders
import numpy as np
from sklearn.model_selection import train_test_split

from typing import List

import sys

sys.path.append("/Users/marc/Documents/collab/")

from functions import PolarsCollabDataset

import polars as pl

In [2]:
df_ratings = pl.read_csv("../data/ml-latest-small/ratings.csv")

In [3]:
# Load metadata
df_movies = pl.read_csv("../data/ml-latest-small/movies.csv")

df_movies.head()

movieId,title,genres
i64,str,str
1,"""Toy Story (1995)""","""Adventure|Animation|Children|C…"
2,"""Jumanji (1995)""","""Adventure|Children|Fantasy"""
3,"""Grumpier Old Men (1995)""","""Comedy|Romance"""
4,"""Waiting to Exhale (1995)""","""Comedy|Drama|Romance"""
5,"""Father of the Bride Part II (1…","""Comedy"""


In [None]:
df_ratings = df_ratings.join(df_movies, on="movieId", how="inner")

In [5]:
df_ratings.head()

userId,movieId,rating,timestamp,title,genres
i64,i64,f64,i64,str,str
1,1,4.0,964982703,"""Toy Story (1995)""","""Adventure|Animation|Children|C…"
1,3,4.0,964981247,"""Grumpier Old Men (1995)""","""Comedy|Romance"""
1,6,4.0,964982224,"""Heat (1995)""","""Action|Crime|Thriller"""
1,47,5.0,964983815,"""Seven (a.k.a. Se7en) (1995)""","""Mystery|Thriller"""
1,50,5.0,964982931,"""Usual Suspects, The (1995)""","""Crime|Mystery|Thriller"""


In [6]:
# Create ordered ids for the users and movies
userId = df_ratings["userId"].unique().to_numpy()
movieId = df_ratings["movieId"].unique().to_numpy()

userId2userChId = {j:i  for i, j in enumerate(userId)}
movieId2movieChId = {j:i for i, j in enumerate(movieId)}

userChId2userId = {v: k for k, v in userId2userChId.items()}
movieChId2movieId = {v: k for k, v in movieId2movieChId.items()}

q = [
    pl.col("userId").replace(userId2userChId).alias("userChId"),
    pl.col("movieId").replace(movieId2movieChId).alias("movieChId"),
]

df_ratings = df_ratings.with_columns(q)

df_ratings.head()

userId,movieId,rating,timestamp,title,genres,userChId,movieChId
i64,i64,f64,i64,str,str,i64,i64
1,1,4.0,964982703,"""Toy Story (1995)""","""Adventure|Animation|Children|C…",0,0
1,3,4.0,964981247,"""Grumpier Old Men (1995)""","""Comedy|Romance""",0,2
1,6,4.0,964982224,"""Heat (1995)""","""Action|Crime|Thriller""",0,5
1,47,5.0,964983815,"""Seven (a.k.a. Se7en) (1995)""","""Mystery|Thriller""",0,43
1,50,5.0,964982931,"""Usual Suspects, The (1995)""","""Crime|Mystery|Thriller""",0,46


In [7]:
df_ratings.select(pl.col(["userId", "movieId", "userChId", "movieChId"]).max())

userId,movieId,userChId,movieChId
i64,i64,i64,i64
610,193609,609,9723


In [8]:
df_train, df_valid = train_test_split(df_ratings, test_size=0.3)

ds_train = PolarsCollabDataset(df_train, label_column="rating", user_column="userChId", item_column="movieChId")
ds_valid = PolarsCollabDataset(df_valid, label_column="rating", user_column="userChId", item_column="movieChId")

batch_size = 32
dl_train = DataLoader(ds_train, batch_size=batch_size, shuffle=True)
dl_valid = DataLoader(ds_valid, batch_size=batch_size, shuffle=True)

In [9]:
# Sigmoid activation with limiter
x = 8
max_x = 5
min_x = 0

output = torch.sigmoid(torch.tensor([-3])) * (max_x - min_x) + min_x

class BoundedSigmoid(nn.Module):
    def __init__(self, min_x: float, max_x: float):
        super().__init__()
        self.min_x = min_x
        self.max_x = max_x

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return torch.sigmoid(x) * (self.max_x - self.min_x) + self.min_x

In [10]:
class CollabNN(nn.Module):
    def __init__(
        self,
        n_users: int,
        n_items: int,
        min_outcome: float,
        max_outcome: float,
        n_embeddings: int = 64,
    ):
        super().__init__()

        self.user_embeddings = nn.Embedding(n_users + 1, n_embeddings)
        self.item_embeddings = nn.Embedding(n_items, n_embeddings)

        self.fc_layers = nn.Sequential(
            nn.Linear(n_embeddings * 2, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 1),
            BoundedSigmoid(min_outcome, max_outcome),
        )

    def forward(self, user_ids, item_ids):
        user_embeds = self.user_embeddings(user_ids)
        item_embeds = self.item_embeddings(item_ids)

        # That's fine, because we pick n from users and n form items (repeating)
        x = torch.cat([user_embeds, item_embeds], dim=1)

        return self.fc_layers(x)

In [11]:
collab_nn = CollabNN(10, 5, 0, 5)

collab_nn(torch.tensor([1, 2]), torch.tensor([1, 1]))

tensor([[2.6639],
        [2.6590]], grad_fn=<AddBackward0>)

In [12]:
def train_model(model, train_loader, valid_loader, optimizer, epochs=10, patience=3, min_delta=1e-3):
    criterion = nn.MSELoss()

    best_valid_loss = float("inf")
    patience_counter = 0
    best_model_state = None

    train_losses = []
    valid_losses = []

    for epoch in range(epochs):
        # Training phase
        model.train()
        epoch_train_loss = 0.0
        train_batches = 0

        for batch_user_ids, batch_item_ids, batch_ratings in train_loader:
            optimizer.zero_grad()
            predictions = model(batch_user_ids, batch_item_ids)
            loss = criterion(predictions, batch_ratings[:, None])
            loss.backward()
            optimizer.step()

            epoch_train_loss += loss.item()
            train_batches += 1

        avg_train_loss = epoch_train_loss / train_batches
        train_losses.append(avg_train_loss)

        # Validation phase
        model.eval()
        epoch_valid_loss = 0.0
        valid_batches = 0

        with torch.no_grad():
            for batch_user_ids, batch_item_ids, batch_ratings in valid_loader:
                predictions = model(batch_user_ids, batch_item_ids)
                loss = criterion(predictions, batch_ratings)

                epoch_valid_loss += loss.item()
                valid_batches += 1

        avg_valid_loss = epoch_valid_loss / valid_batches
        valid_losses.append(avg_valid_loss)

        # Print progress
        print(f'Epoch [{epoch+1}/{epochs}]')
        print(f'Training Loss: {avg_train_loss:.4f}')
        print(f'Validation Loss: {avg_valid_loss:.4f}')

        # Early stopping on epoch
        if avg_valid_loss <= (best_valid_loss - min_delta):
            best_valid_loss = avg_valid_loss
            patience_counter = 0
            best_model_state = model.state_dict().copy()

        else:
            patience_counter += 1

        if patience_counter >= patience:
            model.load_state_dict(best_model_state)
            break

    return {
        "train_losses": train_losses,
        "valid_losses": valid_losses,
        "best_valid_loss": best_valid_loss,
        "stopped_epoch": epoch + 1
    }

In [13]:
def finetune_user(model, user_id: int, item_ids: List[int], item_ratings: List[float], learning_rate: float = 0.01, epochs: int = 100):
    criterion = nn.MSELoss()

    # I left the last user embedding of the model free, so can user user_embedding - 1
    for param in model.parameters():
        param.require_grad = False

    # Keep item embeddings constant
    model.user_embeddings.weight.requires_grad = True

    optimizer = torch.optim.Adam([model.user_embeddings.weight], lr=learning_rate)

    for epoch in range(epochs):
        optimizer.zero_grad()
        predictions = model(
            torch.tensor([user_id] * len(item_ids)),
            torch.tensor(item_ids)
        )
        loss = criterion(predictions, torch.tensor(item_ratings).float()[:, None])
        loss.backward()
        optimizer.step()

In [51]:
def get_recommendations(model, user_id, item_ids, top_k=10):
    model.eval()
    with torch.no_grad():
        user_ids = torch.tensor([user_id] * len(item_ids))
        item_ids = torch.tensor(item_ids)
        predictions = model(user_ids, item_ids).flatten()

        # Get top-k recommendations
        top_k_values, top_k_indices = torch.topk(predictions, k=top_k)
        recommended_items = item_ids[top_k_indices]

    return recommended_items, top_k_values

In [52]:
df_ratings.head()

userId,movieId,rating,timestamp,title,genres,userChId,movieChId
i64,i64,f64,i64,str,str,i64,i64
1,1,4.0,964982703,"""Toy Story (1995)""","""Adventure|Animation|Children|C…",0,0
1,3,4.0,964981247,"""Grumpier Old Men (1995)""","""Comedy|Romance""",0,2
1,6,4.0,964982224,"""Heat (1995)""","""Action|Crime|Thriller""",0,5
1,47,5.0,964983815,"""Seven (a.k.a. Se7en) (1995)""","""Mystery|Thriller""",0,43
1,50,5.0,964982931,"""Usual Suspects, The (1995)""","""Crime|Mystery|Thriller""",0,46


In [53]:
df_ratings.select(pl.col(["userChId", "movieChId"]).n_unique())

userChId,movieChId
u32,u32
610,9724


In [54]:
collab_nn = CollabNN(610, 9724, 0.0, 5.0)
learning_rate = 1e-4
optimizer = torch.optim.Adam(collab_nn.parameters())

In [55]:
training_output = train_model(collab_nn, dl_train, dl_valid, optimizer, epochs=10, patience=2)

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [1/10]
Training Loss: 0.9495
Validation Loss: 1.2921
Epoch [2/10]
Training Loss: 0.8237
Validation Loss: 1.3340
Epoch [3/10]
Training Loss: 0.7568
Validation Loss: 1.3903


In [56]:
# The Godfather: 4.0
# The Dark Knight: 4.0
# Pulp Fiction: 3.5
# Inception: 3.0
# Interstellar: 4.5
# Se7en: 4.5
# The Prestige: 4.0

In [57]:
df_selection = df_movies.filter(pl.col("title").str.contains("Godfather|Dark Knight|Pulp Fiction|Inception|Interstellar|Se7en|The Prestige"))

df_selection.head()

movieId,title,genres
i64,str,str
47,"""Seven (a.k.a. Se7en) (1995)""","""Mystery|Thriller"""
296,"""Pulp Fiction (1994)""","""Comedy|Crime|Drama|Thriller"""
858,"""Godfather, The (1972)""","""Crime|Drama"""
1221,"""Godfather: Part II, The (1974)""","""Crime|Drama"""
2023,"""Godfather: Part III, The (1990…","""Crime|Drama|Mystery|Thriller"""


In [58]:
collab_nn.user_embeddings

Embedding(611, 64)

In [59]:
movie_ids = df_selection["movieId"].to_numpy().tolist()

item_ids = list(map(movieId2movieChId.get, movie_ids))[:3]
item_ids

[43, 257, 659]

In [60]:
user_id = 610
item_ratings = [4.0, 3.0, 4.0]

In [61]:
finetune_user(collab_nn, user_id, item_ids, item_ratings, learning_rate=1e-3, epochs=100)

In [62]:
movie_ids = df_movies["movieId"].unique().to_numpy()

In [63]:
all_item_ids = list(map(movieId2movieChId.get, movie_ids))
all_item_ids = [item_id for item_id in all_item_ids if item_id is not None]

In [73]:
recs_chid, scores = get_recommendations(collab_nn, user_id, all_item_ids, top_k=10)

recs_chid.numpy(), scores.numpy()

(array([ 224,  602, 3254,  704, 2717, 7638,  585,  277, 6631, 4075]),
 array([4.128145 , 4.0935063, 4.0793743, 4.0777736, 4.077164 , 4.05412  ,
        4.016326 , 4.0107784, 3.9975376, 3.9751916], dtype=float32))

In [74]:
id_recs = list(map(movieChId2movieId.get, recs_chid.numpy()))

df_movies.filter(pl.col("movieId").is_in(id_recs))

movieId,title,genres
i64,str,str
260,"""Star Wars: Episode IV - A New …","""Action|Adventure|Sci-Fi"""
318,"""Shawshank Redemption, The (199…","""Crime|Drama"""
720,"""Wallace & Gromit: The Best of …","""Adventure|Animation|Comedy"""
750,"""Dr. Strangelove or: How I Lear…","""Comedy|War"""
922,"""Sunset Blvd. (a.k.a. Sunset Bo…","""Drama|Film-Noir|Romance"""
3653,"""Endless Summer, The (1966)""","""Documentary"""
4406,"""Man Who Shot Liberty Valance, …","""Crime|Drama|Western"""
5828,"""Blackrock (1997)""","""Drama|Thriller"""
56782,"""There Will Be Blood (2007)""","""Drama|Western"""
88448,"""Paper Birds (Pájaros de papel)…","""Comedy|Drama"""
