**Udemy 8-3. Recommender Systems with Deep Learning Code**

The unmodified version:

- Is slower

- Finds a sub-optimal answer

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.utils import shuffle

In [None]:
# Use the first CUDA GPU when one is available; otherwise run on the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda:0")
else:
  device = torch.device("cpu")
print(device)

cuda:0


In [None]:
# data is from: https://grouplens.org/datasets/movielens/
# MovieLens 20M movie ratings. Stable benchmark dataset. 20 million ratings and 465,000 tag applications applied to 27,000 movies by 138,000 users. Includes tag genome data with 12 million relevance scores across 1,100 tags. Released 4/2015; updated 10/2016 to update links.csv and add tag genome data.
!wget -nc https://files.grouplens.org/datasets/movielens/ml-20m.zip

--2022-03-02 08:19:10--  https://files.grouplens.org/datasets/movielens/ml-20m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 198702078 (189M) [application/zip]
Saving to: ‘ml-20m.zip’


2022-03-02 08:19:12 (108 MB/s) - ‘ml-20m.zip’ saved [198702078/198702078]



In [None]:
!unzip -n /content/ml-20m.zip
# -n : never overwrite existing files. If a file already exists, skip the extraction of that file without prompting.

Archive:  /content/ml-20m.zip
   creating: ml-20m/
  inflating: ml-20m/genome-scores.csv  
  inflating: ml-20m/genome-tags.csv  
  inflating: ml-20m/links.csv        
  inflating: ml-20m/movies.csv       
  inflating: ml-20m/ratings.csv      
  inflating: ml-20m/README.txt       
  inflating: ml-20m/tags.csv         


In [None]:
!ls

ml-20m	ml-20m.zip  sample_data


In [None]:
# Load the MovieLens ratings table extracted above and preview the first rows.
df = pd.read_csv("/content/ml-20m/ratings.csv")
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [None]:
# The raw userId / movieId values are not guaranteed to be numbered 0...N-1,
# so let's assign our own ids: convert each column to a categorical and use
# its integer category codes as the new id.

df["userId"] = pd.Categorical(df["userId"])
df["new_user_id"] = df["userId"].cat.codes

df["movieId"] = pd.Categorical(df["movieId"])
df["new_movie_id"] = df["movieId"].cat.codes

In [None]:
# Pull the re-encoded ids and the ratings out of the frame as NumPy arrays.
user_ids = df["new_user_id"].to_numpy()
movie_ids = df["new_movie_id"].to_numpy()
# Targets are the ratings shifted down by 2.5.
ratings = df["rating"].to_numpy() - 2.5

In [None]:
# Count the distinct users (N) and distinct movies (M).
N = len(np.unique(user_ids))
M = len(np.unique(movie_ids))

In [None]:
# Set the embedding dimension
D = 10

In [None]:
# Make a Neural Network
class Model(nn.Module):
  """Rating regressor built on learned user and movie embeddings.

  A user id and a movie id are each looked up in their own embedding table,
  the two vectors are concatenated, and the result goes through one hidden
  ReLU layer followed by a single-output linear layer.

  Args:
    n_users: number of rows in the user embedding table.
    n_movies: number of rows in the movie embedding table.
    embed_dim: width of each embedding vector.
    n_hidden: width of the hidden layer (default 1024).
  """

  def __init__(self, n_users, n_movies, embed_dim, n_hidden=1024):
    super().__init__()
    self.N, self.M, self.D = n_users, n_movies, embed_dim

    # One embedding table per entity type.
    self.u_emb = nn.Embedding(self.N, self.D)
    self.m_emb = nn.Embedding(self.M, self.D)

    # The hidden layer takes the concatenated (user, movie) vector: 2*D inputs.
    self.fc1 = nn.Linear(2 * self.D, n_hidden)
    self.fc2 = nn.Linear(n_hidden, 1)

  def forward(self, u, m):
    """Return one prediction per (u, m) pair, shaped (number_of_samples, 1)."""
    user_vecs = self.u_emb(u)   # (number_of_samples, D)
    movie_vecs = self.m_emb(m)  # (number_of_samples, D)

    # Join along the feature axis -> (number_of_samples, 2D)
    joined = torch.cat([user_vecs, movie_vecs], dim=1)

    hidden = F.relu(self.fc1(joined))
    return self.fc2(hidden)


In [None]:
# Build the network from the dataset sizes and move its parameters to `device`.
model = Model(n_users=N, n_movies=M, embed_dim=D)
model.to(device)

Model(
  (u_emb): Embedding(138493, 10)
  (m_emb): Embedding(26744, 10)
  (fc1): Linear(in_features=20, out_features=1024, bias=True)
  (fc2): Linear(in_features=1024, out_features=1, bias=True)
)

In [None]:
# Loss and Optimizer
# Mean squared error for the regression target; Adam is created with its
# default hyperparameters (no learning rate is passed).
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters())

In [None]:
# Shuffle the three arrays together so that each (user, movie, rating)
# triple stays aligned. No random_state is passed, so the order differs
# from run to run.
user_ids, movie_ids, ratings = shuffle(user_ids, movie_ids, ratings)

In [None]:
# Convert to tensors
# Ids are cast to int64 because they are used as nn.Embedding lookup indices.
user_ids_t = torch.from_numpy(user_ids).long()
movie_ids_t = torch.from_numpy(movie_ids).long()
# The targets are `rating - 2.5`, i.e. fractional values. They must stay
# floating point: casting to long would truncate them toward zero and
# corrupt the regression targets.
ratings_t = torch.from_numpy(ratings).float()

In [None]:
# Make datasets
# The data was shuffled above, so a simple positional split is random:
# the first 80% of rows are used for training, the remaining 20% for testing.
Ntrain = int(0.8 * len(ratings))

train_dataset = torch.utils.data.TensorDataset(
    user_ids_t[:Ntrain],
    movie_ids_t[:Ntrain],
    ratings_t[:Ntrain],
)

test_dataset = torch.utils.data.TensorDataset(
    user_ids_t[Ntrain:],
    movie_ids_t[Ntrain:],
    ratings_t[Ntrain:],
)

In [None]:
# Make dataloaders
batch_size = 512

# Training batches are reshuffled on every pass; test batches keep a fixed order.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [None]:
# A function to encapsulate the training loop
def batch_gd(model, criterion, optimizer, train_iter, test_iter, epochs, device=None):
  """Train `model` with mini-batch gradient descent and record per-epoch losses.

  Args:
    model: module invoked as `model(users, movies)`.
    criterion: loss invoked as `criterion(outputs, targets)`.
    optimizer: optimizer that updates the model's parameters.
    train_iter: iterable yielding `(users, movies, targets)` training batches.
    test_iter: iterable yielding `(users, movies, targets)` evaluation batches.
    epochs: number of full passes over `train_iter`.
    device: device each batch is moved to. When omitted, the device of the
      model's first parameter is used (CPU if the model has no parameters).

  Returns:
    `(train_losses, test_losses)`: two NumPy arrays of length `epochs` holding
    the mean batch loss of each epoch.
  """
  if device is None:
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

  def _prepare(users, movies, targets):
    # The criterion compares against (batch, 1) outputs, so reshape the targets
    # to a float column before moving everything to the compute device.
    targets = targets.view(-1, 1).float()
    return users.to(device), movies.to(device), targets.to(device)

  was_training = model.training
  train_losses = np.zeros(epochs)
  test_losses = np.zeros(epochs)

  for it in range(epochs):
    t0 = datetime.now()

    # ---- training pass ----
    model.train()
    train_loss = []
    for batch in train_iter:
      users, movies, targets = _prepare(*batch)

      # zero the gradients
      optimizer.zero_grad()

      # Forward pass
      outputs = model(users, movies)
      loss = criterion(outputs, targets)

      # Backward pass and parameter update
      loss.backward()
      optimizer.step()

      train_loss.append(loss.item())

    # Store the epoch mean in the per-epoch array (not in the per-batch list).
    train_losses[it] = np.mean(train_loss)

    # ---- evaluation pass: no gradients are needed ----
    model.eval()
    test_loss = []
    with torch.no_grad():
      for batch in test_iter:
        users, movies, targets = _prepare(*batch)
        outputs = model(users, movies)
        loss = criterion(outputs, targets)
        test_loss.append(loss.item())

    test_losses[it] = np.mean(test_loss)

    t1 = datetime.now()
    print(f"Epoch: {it+1}/{epochs}, Train Loss: {train_losses[it]:.4f}, Test Loss: {test_losses[it]:.4f}, Duration: {t1-t0}")

  # Leave the model in whatever train/eval mode the caller had it in.
  model.train(was_training)
  return train_losses, test_losses


In [None]:
#train_losses, test_losses = batch_gd(model, criterion, optimizer, train_loader, test_loader, epochs=25)

In [None]:
# Profile a short (3-epoch) training run with %prun
%prun train_losses, test_losses = batch_gd(model, criterion, optimizer, train_loader, test_loader, epochs=3)

Epoch: 1/3, Train Loss: 0.6774, Test Loss: 0.6386
Epoch: 2/3, Train Loss: 0.5898, Test Loss: 0.5271
Epoch: 3/3, Train Loss: 0.5584, Test Loss: 0.6300
 

In [None]:
# Draw the per-epoch train and test loss curves on one set of axes.
fig, ax = plt.subplots()
ax.plot(train_losses, label="train loss")
ax.plot(test_losses, label="test loss")
ax.legend()
plt.show()