In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the ratings CSV loaded further down
# is read from a path under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


## Overview
A learning exercise in matrix factorization. Time spent: ~8 hrs

Runtime environment: Google Colab (T4 GPU or CPU)

Credit: [github - oniani](https://github.com/oniani/ai/blob/main/model/dl/gmf.py)

Dataset: [MovieLens ml-latest-small](https://grouplens.org/datasets/movielens/)

## Matrix Factorization

In [None]:
import torch
import torch.nn as nn

class GMF(nn.Module):
  '''Generalized Matrix Factorization (GMF) scorer.

  Holds one embedding table for users and one for items. A (user, item) pair
  is scored by multiplying the two embeddings element-wise and passing the
  product through a single linear layer that outputs one raw value.
  '''
  def __init__(self, num_users: int, num_items: int, embedding_dim: int) -> None:
    '''Builds the embedding tables and the output layer.

    Args:
      num_users: number of rows in the user embedding table.
      num_items: number of rows in the item embedding table.
      embedding_dim: width shared by both embedding tables.
    '''
    super().__init__()
    self.user_embedding = nn.Embedding(num_users, embedding_dim)
    self.item_embedding = nn.Embedding(num_items, embedding_dim)

    # Replace the default initialisation: every entry of both tables is drawn
    # uniformly from [0.5, 1.0), so users and items start on the same scale.
    for table in (self.user_embedding, self.item_embedding):
      table.weight.data.uniform_(0.5, 1.0)

    self.affine_transform = nn.Linear(embedding_dim, 1)

  def forward(self, users: torch.Tensor, items: torch.Tensor) -> torch.Tensor:
    '''Scores each (user, item) pair.

    Args:
      users: integer index tensor into the user embedding table.
      items: integer index tensor into the item embedding table.

    Returns:
      Raw (un-activated) scores; the last dimension has size 1.
    '''
    interaction = self.user_embedding(users) * self.item_embedding(items)
    return self.affine_transform(interaction)

In [None]:
# Training
import pandas as pd
import torch.optim as optim

# Source archive: wget https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
# Read the ratings table and discard the timestamp column, which is not used for training.
df = pd.read_csv(
    "/content/drive/MyDrive/ml-projects/recommender-system/datasets/MovieLens-ml-latest-small/ratings.csv"
).drop(columns=['timestamp'])
df

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [None]:
def _encode_ids(ids):
  '''Maps every distinct id to a 0-based index in order of first appearance.'''
  return {raw_id: idx for idx, raw_id in enumerate(ids.unique())}

# Min-max scale the ratings into [0, 1]
rating = df['rating']
min_rating, max_rating = rating.min(), rating.max()
df['rating'] = (rating - min_rating) / (max_rating - min_rating)
print(f"rating is from {df['rating'].min()} to {df['rating'].max()}")

# Binarize the target: a scaled rating below 0.5 means "do not recommend" (0),
# anything else means "recommend" (1). `condition` marks the rows that become 0.
condition = df['rating'] < 0.5
df['rating'] = (~condition).astype(df['rating'].dtype)

# Replace raw movie ids with contiguous indices usable as embedding rows
enc_movie = _encode_ids(df['movieId'])
df['movieId'] = df['movieId'].map(enc_movie)
print(f"movieId is from {df['movieId'].min()} to {df['movieId'].max()}")

# Same re-indexing for user ids
enc_user = _encode_ids(df['userId'])
df['userId'] = df['userId'].map(enc_user)
print(f"userId is from {df['userId'].min()} to {df['userId'].max()}")

rating is from 0.0 to 1.0
movieId is from 0 to 9723
userId is from 0 to 609


In [None]:
# PyTorch dataset
class MovieLensSmall(torch.utils.data.Dataset):
  '''Map-style dataset that serves the rows of a ratings DataFrame.

  Each item is one row of the wrapped frame, returned as a list of that row's
  column values in column order. The training loop unpacks items as
  (user, item, rating), so the frame is expected to have exactly those three
  columns in that order.
  '''
  def __init__(self, df: pd.DataFrame) -> None:
    '''Stores the frame to serve rows from.

    Args:
      df: the ratings table; it is kept by reference, not copied.
    '''
    self.df = df

  def __len__(self) -> int:
    '''Returns the number of rows in the wrapped frame.'''
    return len(self.df)

  def __getitem__(self, idx: int):
    '''Returns the row at position `idx` as a list of its column values.'''
    # Read from the frame this instance was constructed with. The previous
    # version read the notebook-global `df`, silently ignoring the constructor
    # argument and failing with NameError when no such global exists.
    return list(self.df.iloc[idx])

# Shuffled mini-batches over the preprocessed ratings table
train_dataloader = torch.utils.data.DataLoader(
    dataset=MovieLensSmall(df),
    batch_size=4,
    shuffle=True,
    num_workers=2,  # or 8
)

# Embedding tables are sized from the id encoders, so every encoded id has a row.
# The model is wrapped in DataParallel and switched to training mode.
model = nn.DataParallel(
    GMF(num_users=len(enc_user), num_items=len(enc_movie), embedding_dim=10)
).train()

# The model outputs raw scores, so the loss applies the sigmoid itself
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-3)

In [None]:
# Training loop
log_idx = 1_000  # print the mean loss once every `log_idx` steps

# nn.DataParallel refuses to run when CUDA is visible but the wrapped module is
# still on the CPU, so place the model on the GPU whenever one is available.
# Module.to() moves the parameters in place, so the optimizer built earlier
# keeps pointing at the same parameters.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(5):
  running_loss = 0.0
  for idx, (users, items, ratings) in enumerate(train_dataloader):
    # move the batch onto the model's device; embedding lookups need integer indices
    users = users.to(device).long()
    items = items.to(device).long()
    ratings = ratings.to(device)

    # zero the parameter gradients
    optimizer.zero_grad()

    # forward + backward + optimize
    outputs = model(users, items).reshape(-1)

    # the collated targets arrive as float64 while the logits are float32;
    # cast the targets so the loss sees matching dtypes
    loss = criterion(outputs, ratings.to(outputs.dtype))
    loss.backward()
    optimizer.step()

    # accumulate loss and log
    running_loss += loss.item()
    if idx % log_idx == log_idx - 1:
      print(f"Epoch {epoch} | Steps: {idx + 1:<4} | Loss: {running_loss / log_idx:.3f}")
      running_loss = 0.0


## Preprocessing

Credit: [Kaggle - MovieLens Preprocessing](https://www.kaggle.com/code/colinmorris/movielens-preprocessing)

Credit: [Kaggle - Embedding Layers](https://www.kaggle.com/code/colinmorris/embedding-layers#Training-it)

Credit: [Kaggle - Matrix Factorization](https://www.kaggle.com/code/colinmorris/matrix-factorization/notebook)