# pytorch CF tutorial

https://www.youtube.com/watch?v=Wj-nkk7dFS8

resources: https://drive.google.com/drive/folders/1B_NZl0GeDrdx67dd_7mymYRF3Ktkdrd0

In [26]:
import pandas as pd
import numpy as np
from sklearn import model_selection, metrics, preprocessing
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader

from fmh_notes.datasets import MovieLensBuilder, MovieLens, MovieLensDataset
from fmh_notes import utils

In [6]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
device = utils.get_pytorch_device()
device

'mps'

# get movielens data

In [30]:
movielens: MovieLens = MovieLensBuilder(datadir="./data").download().build()

MovieLens already downloaded


In [33]:
movielens.train_dataset[0], movielens.val_dataset[0]

({'users': tensor(134), 'movies': tensor(622), 'ratings': tensor(4)},
 {'users': tensor(428), 'movies': tensor(221), 'ratings': tensor(4)})

In [35]:
train_loader = DataLoader(
    dataset=movielens.train_dataset, batch_size=4, shuffle=True, num_workers=0
)

In [34]:
validation_loader = DataLoader(
    dataset=movielens.val_dataset, batch_size=4, shuffle=True, num_workers=0
)

In [41]:
i, data = next(enumerate(train_loader))
data

{'users': tensor([599, 108, 287, 219]),
 'movies': tensor([ 607,  134, 2185, 6753]),
 'ratings': tensor([0, 3, 3, 3])}

In [49]:
data["ratings"], data["ratings"].reshape(4, -1)

(tensor([0, 3, 3, 3]),
 tensor([[0],
         [3],
         [3],
         [3]]))

# train

In [22]:
class RecSysModel(nn.Module):
    def __init__(self, n_users, n_movies):
        super().__init__()

        # trainable lookup matrix for shallow embedding vectors
        self.user_embed = nn.Embedding(n_users, 32)
        self.movie_embed = nn.Embedding(n_movies, 32)

        # user, movie embedding concat
        self.out = nn.Linear(64, 1)

    def forward(self, users, movies, ratings=None):
        user_embeds = self.user_embed(users)
        movie_embeds = self.movie_embed(movies)
        output = torch.cat([user_embeds, movie_embeds], dim=1)

        output = self.out(output)

        return output

In [37]:
model = RecSysModel(
    n_users=movielens.n_users,
    n_movies=movielens.n_movies,
).to(device)

optimizer = torch.optim.Adam(model.parameters())
sch = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.7)

loss_func = nn.MSELoss()

In [38]:
epochs = 1
total_loss = 0
plot_steps, print_steps = 5000, 5000
step_cnt = 0
all_losses_list = []

model.train()
for epoch_i in range(epochs):
    for i, train_data in enumerate(train_loader):
        output = model(train_data["users"].to(device), train_data["movies"].to(device))

        # .view(4, -1) is to reshape the rating to match the shape of model output which is 4x1
        rating = train_data["ratings"].view(4, -1).to(torch.float32).to(device)

        loss = loss_func(output, rating)
        total_loss = total_loss + loss.sum().item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        step_cnt = step_cnt + len(train_data["users"])

        if step_cnt % plot_steps == 0:
            avg_loss = total_loss / (len(train_data["users"]) * plot_steps)
            print(f"epoch {epoch_i} loss at step: {step_cnt} is {avg_loss}")
            all_losses_list.append(avg_loss)
            total_loss = 0  # reset total_loss

epoch 0 loss at step: 5000 is 0.5172129237443208
epoch 0 loss at step: 10000 is 0.19656794695239513
epoch 0 loss at step: 15000 is 0.10866193754337727
epoch 0 loss at step: 20000 is 0.08510973144900054
epoch 0 loss at step: 25000 is 0.07516497840899974
epoch 0 loss at step: 30000 is 0.06739943576809018
epoch 0 loss at step: 35000 is 0.06381229525641538
epoch 0 loss at step: 40000 is 0.06190685783848166
epoch 0 loss at step: 45000 is 0.06257421865025535
epoch 0 loss at step: 50000 is 0.060862274226453156
epoch 0 loss at step: 55000 is 0.062382963575562465
epoch 0 loss at step: 60000 is 0.05921544357277453
epoch 0 loss at step: 65000 is 0.058380976187996564
epoch 0 loss at step: 70000 is 0.05764615434869193
epoch 0 loss at step: 75000 is 0.05931314347162843
epoch 0 loss at step: 80000 is 0.05800028632543981
epoch 0 loss at step: 85000 is 0.05579527057604864
epoch 0 loss at step: 90000 is 0.05672175200143829


In [33]:
rating

tensor([[1.],
        [4.],
        [4.],
        [1.]], device='mps:0')

In [35]:
from sklearn.metrics import mean_squared_error

model_output_list = []
target_rating_list = []

model.eval()

with torch.no_grad():
    for i, batched_data in enumerate(validation_loader):
        u = batched_data["users"].to(device)
        m = batched_data["movies"].to(device)
        r = batched_data["ratings"].to(device)
        model_output = model(u, m)

        model_output_list.append(model_output.sum().item() / len(u))

        # target_rating = batched_data["ratings"]

        target_rating_list.append(r.sum().item() / len(batched_data["users"]))

        # print(f"model_output: {model_output}, target_rating: {target_rating}")


# squared If True returns MSE value, if False returns RMSE value.
rms = mean_squared_error(target_rating_list, model_output_list, squared=False)
print(f"rms: {rms}")

  target_rating_list.append(r.sum().item() / len(batched_data['users']))


rms: 0.46929448444177374
