In [1]:
from fastbook import untar_data, URLs, Learner, MSELossFlat
import pandas as pd
import torch
import torch.nn.functional as F

### Title

Smoe text here.

In [2]:
path = untar_data(URLs.ML_100k)

path.ls()

(#23) [Path('/Users/modsoussi/.fastai/data/ml-100k/u.item'),Path('/Users/modsoussi/.fastai/data/ml-100k/u3.test'),Path('/Users/modsoussi/.fastai/data/ml-100k/u1.base'),Path('/Users/modsoussi/.fastai/data/ml-100k/u.info'),Path('/Users/modsoussi/.fastai/data/ml-100k/u2.test'),Path('/Users/modsoussi/.fastai/data/ml-100k/u5.test'),Path('/Users/modsoussi/.fastai/data/ml-100k/u.genre'),Path('/Users/modsoussi/.fastai/data/ml-100k/ub.test'),Path('/Users/modsoussi/.fastai/data/ml-100k/ua.base'),Path('/Users/modsoussi/.fastai/data/ml-100k/u.data')...]

In [3]:
ratings = pd.read_csv(path/'u.data', delimiter='\t', header=None, names=['user', 'movie', 'rating', 'timestamp'])

ratings.head()

Unnamed: 0,user,movie,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [4]:
movies = pd.read_csv(path/'u.item', delimiter='|', header=None, names=['movie', 'title'], encoding='latin-1', usecols=(0,1))

movies.head()

Unnamed: 0,movie,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [5]:
ratings = ratings.merge(movies)
ratings.head()

Unnamed: 0,user,movie,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


In [6]:
ratings = ratings[['user', 'title', 'rating']]
ratings.head()

Unnamed: 0,user,title,rating
0,196,Kolya (1996),3
1,63,Kolya (1996),3
2,226,Kolya (1996),5
3,154,Kolya (1996),3
4,306,Kolya (1996),5


In [7]:
movie_mapping = {title: idx for idx,title in enumerate(ratings['title'].unique())}
movie_mapping

{'Kolya (1996)': 0,
 'L.A. Confidential (1997)': 1,
 'Heavyweights (1994)': 2,
 'Legends of the Fall (1994)': 3,
 'Jackie Brown (1997)': 4,
 'Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)': 5,
 'Hunt for Red October, The (1990)': 6,
 'Jungle Book, The (1994)': 7,
 'Grease (1978)': 8,
 'Remains of the Day, The (1993)': 9,
 'Men in Black (1997)': 10,
 "Romy and Michele's High School Reunion (1997)": 11,
 'Star Trek: First Contact (1996)': 12,
 'To Wong Foo, Thanks for Everything! Julie Newmar (1995)': 13,
 'Batman Forever (1995)': 14,
 'Only You (1994)': 15,
 'Age of Innocence, The (1993)': 16,
 'Sabrina (1995)': 17,
 'Just Cause (1995)': 18,
 'Endless Summer 2, The (1994)': 19,
 'Man Without a Face, The (1993)': 20,
 'Sabrina (1954)': 21,
 'Die Hard (1988)': 22,
 'Twister (1996)': 23,
 'Toy Story (1995)': 24,
 'Broken Arrow (1996)': 25,
 'Aladdin (1992)': 26,
 'Casper (1995)': 27,
 'Restoration (1995)': 28,
 'Jaws (1975)': 29,
 'Chasing Amy (1997)': 30,
 'S

In [8]:
ratings['movie'] = ratings['title'].map(movie_mapping)
ratings.head()

Unnamed: 0,user,title,rating,movie
0,196,Kolya (1996),3,0
1,63,Kolya (1996),3,0
2,226,Kolya (1996),5,0
3,154,Kolya (1996),3,0
4,306,Kolya (1996),5,0


In [9]:
train_x = torch.tensor(ratings[['user', 'movie']].to_numpy(), dtype=int)
train_y = torch.tensor(ratings['rating'], dtype=torch.float32).reshape(100000, 1)

train_x[:5], train_y[:5]

(tensor([[196,   0],
         [ 63,   0],
         [226,   0],
         [154,   0],
         [306,   0]]),
 tensor([[3.],
         [3.],
         [5.],
         [3.],
         [5.]]))

In [10]:

import torch.utils.data as data

def train(model: torch.nn.Module, train_x: torch.Tensor, train_y: torch.Tensor, n_epochs=5, lr=.1, loss_fn=F.mse_loss, wd=0.0): 
  if torch.backends.mps.is_available() and torch.backends.mps.is_built():
    model.to('mps')
    x = train_x.to('mps')
    y = train_y.to('mps')
  
  dataset = data.TensorDataset(x,y)
  
  train_size = round(.8 * len(x))
  valid_size = len(x) - train_size
  train_set, validation_set = data.random_split(dataset, [train_size, valid_size])
  
  t_loader = data.DataLoader(train_set, 64, True)
  v_loader = data.DataLoader(validation_set, 64, True)
  
  optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=wd)
  for i in range(n_epochs):
    model.train()
    t_loss = 0.0
    for xb, yb in t_loader:
      optimizer.zero_grad()
      preds = model(xb)
      loss = loss_fn(preds, yb)
      loss.backward()
      optimizer.step()
      t_loss += loss.item()
    
    t_loss /= len(t_loader)
    
    model.eval()
    v_loss = 0.0
    with torch.no_grad():
      for vbx, vby in v_loader:
        preds = model(vbx)
        loss = loss_fn(preds, vby)
        v_loss += loss.item()
      
    v_loss /= len(v_loader)
    
    print(f"t_loss: {t_loss} - v_loss: {v_loss}")
    
  model.cpu()

In [11]:
num_users = len(ratings['user'].unique()) + 1
num_movies = len(ratings['title'].unique()) + 1

num_users, num_movies

(944, 1665)

In [12]:
class Collab(torch.nn.Module):
  def __init__(self, n_users, n_items, n_factors=50, y_range=(0, 5.5)) -> None:
    super().__init__()
    
    self.user_embs = torch.nn.Embedding(n_users, n_factors)
    torch.nn.init.normal_(self.user_embs.weight, 0, .1)
    self.user_bias = torch.nn.Embedding(n_users, 1)
    self.item_embs = torch.nn.Embedding(n_items, n_factors)
    torch.nn.init.normal_(self.item_embs.weight, 0, .1)
    self.item_bias = torch.nn.Embedding(n_items, 1)
    
    self.y_range = y_range
    
  def forward(self, x: torch.Tensor):
    u = self.user_embs(x[:, 0])
    u_bias = self.user_bias(x[:, 0])
    i = self.item_embs(x[:, 1])
    i_bias = self.item_bias(x[:, 1])
    
    interaction =  (u*i).sum(1, keepdim=True)
    interaction += u_bias + i_bias
    
    return F.sigmoid(interaction)*(self.y_range[1]-self.y_range[0]) + self.y_range[0]

In [61]:
model = Collab(num_users, num_movies, n_factors=75)

train(model, train_x, train_y, lr=5e-1)

t_loss: 1.8311157792568207 - v_loss: 1.1883699880621303
t_loss: 1.0026702100753784 - v_loss: 1.009565673697109
t_loss: 0.827119101524353 - v_loss: 0.9529589980174178
t_loss: 0.7006183949947358 - v_loss: 0.9295648237386831
t_loss: 0.5803426013231278 - v_loss: 0.9254319572601074


In [62]:
movie_bias = model.item_bias.weight.squeeze()
movie_bias

tensor([ 0.7325,  1.0048, -0.8229,  ...,  0.2921, -1.5642, -1.0243], grad_fn=<SqueezeBackward0>)

In [63]:
idx = movie_bias.argsort(descending=True)[:5].tolist()

idx

[1381, 1022, 1509, 1417, 1659]

In [64]:
movies_unique = ratings['title'].unique()

[movies_unique[i] for i in idx]

['Prisoner of the Mountains (Kavkazsky Plennik) (1996)',
 'Two Much (1996)',
 'Night Flier (1997)',
 'Celestial Clockwork (1994)',
 'Mamma Roma (1962)']

In [65]:
[movie_mapping[s] for s in [movies_unique[i] for i in idx]]

[1381, 1022, 1509, 1417, 1659]