In [1]:
! pip install -Uqq fastbook
from fastbook import *

In [2]:
path = untar_data(URLs.ML_100k)

In [3]:
path.ls()

(#23) [Path('/root/.fastai/data/ml-100k/u3.test'),Path('/root/.fastai/data/ml-100k/u4.test'),Path('/root/.fastai/data/ml-100k/u4.base'),Path('/root/.fastai/data/ml-100k/u.data'),Path('/root/.fastai/data/ml-100k/ub.test'),Path('/root/.fastai/data/ml-100k/allbut.pl'),Path('/root/.fastai/data/ml-100k/u2.base'),Path('/root/.fastai/data/ml-100k/u.genre'),Path('/root/.fastai/data/ml-100k/u1.base'),Path('/root/.fastai/data/ml-100k/mku.sh')...]

In [4]:
ratings = pd.read_csv(path/'u.data', delimiter='\t', header=None,
                 names=['user', 'movie', 'rating', 'timestamp'])
ratings.head()

Unnamed: 0,user,movie,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [5]:
movies = pd.read_csv(path/'u.item', delimiter='|', encoding='latin-1',
                     usecols=(0,1), names=['movie', 'title'], header=None)
movies.head()

Unnamed: 0,movie,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [6]:
ratings = ratings.merge(movies)
ratings.head()

Unnamed: 0,user,movie,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


In [7]:
from fastai.collab import *

dls = CollabDataLoaders.from_df(ratings, item_name='title', bs=64)
dls.show_batch()

Unnamed: 0,user,title,rating
0,542,My Left Foot (1989),4
1,422,Event Horizon (1997),3
2,311,"African Queen, The (1951)",4
3,595,Face/Off (1997),4
4,617,Evil Dead II (1987),1
5,158,Jurassic Park (1993),5
6,836,Chasing Amy (1997),3
7,474,Emma (1996),3
8,466,Jackie Chan's First Strike (1996),3
9,554,Scream (1996),3


In [8]:
u_factors = torch.nn.Embedding(25, 5)
i_factors = torch.nn.Embedding(15, 5)

idx = torch.tensor([[1,2], [4, 3]])

In [9]:
u_emb, i_emb = u_factors(idx[:, 0]), i_factors(idx[:, 1])
u_emb, i_emb

(tensor([[-0.5195,  0.7613, -0.4365,  0.1365,  1.3300],
         [-0.1345, -1.5280,  0.0200, -0.1748, -0.1233]], grad_fn=<EmbeddingBackward0>),
 tensor([[-1.0512,  1.1215,  1.1948, -0.0144,  0.7229],
         [ 0.7811, -0.8685, -0.0818, -0.6450, -1.0731]], grad_fn=<EmbeddingBackward0>))

In [10]:
(u_emb * i_emb).sum(dim=1)

tensor([1.8378, 1.4654], grad_fn=<SumBackward1>)

In [11]:
class Collab(torch.nn.Module):
  def __init__(self, num_users, num_items, num_factors):
    super().__init__()
    self.user_factors = torch.nn.Embedding(num_users, num_factors)
    self.item_factors = torch.nn.Embedding(num_items, num_factors)
  
  def forward(self, x):
    users = self.user_factors(x[:, 0])
    items = self.item_factors(x[:, 1])
    return (users*items).sum(dim=1)

In [12]:
x, y = dls.one_batch()
x.shape, y.shape

(torch.Size([64, 2]), torch.Size([64, 1]))

In [13]:
x[:5], y[:5]

(tensor([[ 661,  659],
         [ 861,  730],
         [ 766,    8],
         [  13, 1153],
         [ 429, 1185]]),
 tensor([[4],
         [4],
         [4],
         [1],
         [3]], dtype=torch.int8))

In [14]:
num_users = len(dls.classes['user'])
num_movies = len(dls.classes['title'])

In [15]:
model = Collab(num_users, num_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())

In [16]:
learn.fit_one_cycle(5, 5e-3)

epoch,train_loss,valid_loss,time
0,47.097916,46.625027,00:08
1,22.784897,25.091343,00:09
2,8.122038,12.430268,00:08
3,3.698257,8.743163,00:08
4,2.690001,8.311974,00:08


In [17]:
class Collab(torch.nn.Module):
  def __init__(self, num_users, num_items, num_factors, y_range=(0, 5.5)):
    super().__init__()
    self.user_factors = torch.nn.Embedding(num_users, num_factors)
    self.item_factors = torch.nn.Embedding(num_items, num_factors)
    self.y_range = y_range
    
  def forward(self, x):
    users = self.user_factors(x[:, 0])
    items = self.item_factors(x[:, 1])
    return sigmoid_range((users * items).sum(dim=1), *self.y_range)
    

In [18]:
model = Collab(num_users, num_movies,50)
learn = Learner(dls, model, loss_func=MSELossFlat())

In [19]:
learn.fit_one_cycle(5, 5e-3)

epoch,train_loss,valid_loss,time
0,7.621876,7.59193,00:09
1,6.635986,7.230602,00:09
2,5.34433,6.808065,00:08
3,4.486285,6.566131,00:08
4,4.125021,6.522335,00:08


In [20]:
class CollabBias(torch.nn.Module):
  def __init__(self, num_users, num_items, num_factors, y_range=(0, 5.5)):
    super().__init__()
    self.user_factors = Embedding(num_users, num_factors)
    self.user_bias = Embedding(num_users, 1)
    self.item_factors = Embedding(num_items, num_factors)
    self.item_bias = Embedding(num_items, 1)
    self.y_range = y_range
    
  def forward(self, x):
    users = self.user_factors(x[:, 0])
    items = self.item_factors(x[:, 1])
    res = (users * items).sum(dim=1, keepdim=True)
    res += self.user_bias(x[:, 0]) + self.item_bias(x[:, 1])
    return sigmoid_range(res, *self.y_range)

In [21]:
model = CollabBias(num_users, num_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())

In [31]:
learn.fit_one_cycle(5, 5e-3, wd=.1)

epoch,train_loss,valid_loss,time
0,0.389793,0.891906,00:09
1,0.518963,0.883271,00:10
2,0.444614,0.878117,00:09
3,0.330777,0.878838,00:09
4,0.235888,0.879659,00:09


In [32]:
movie_bias = learn.model.item_bias.weight.squeeze()
movie_bias

tensor([-1.9823e-05, -1.1537e-01,  4.0276e-02,  ..., -3.1001e-02,  5.2063e-02,  2.5484e-02], device='cuda:0', grad_fn=<SqueezeBackward0>)

In [33]:
idx = movie_bias.argsort()[:5]
[dls.classes['title'][i] for i in idx]

['Children of the Corn: The Gathering (1996)',
 'Lawnmower Man 2: Beyond Cyberspace (1996)',
 'Crow: City of Angels, The (1996)',
 'Bio-Dome (1996)',
 'Robocop 3 (1993)']

In [34]:
idx = movie_bias.argsort(descending=True)[:5]
[dls.classes['title'][i] for i in idx]

['Shawshank Redemption, The (1994)',
 "Schindler's List (1993)",
 'Titanic (1997)',
 'Wrong Trousers, The (1993)',
 'Rear Window (1954)']

In [35]:
class CollabNN(torch.nn.Module):
  def __init__(self, user_sz, item_sz, y_range=(0, 5.5), n_acts=100):
    super().__init__()
    self.user_factors = torch.nn.Embedding(*user_sz)
    self.item_factors = torch.nn.Embedding(*item_sz)
    self.layers = torch.nn.Sequential(
      torch.nn.Linear(user_sz[1]+ item_sz[1], n_acts),
      torch.nn.ReLU(),
      torch.nn.Linear(n_acts, 1)
    )
    self.y_range = y_range
    
  def forward(self, x):
    embs = self.user_factors(x[:, 0]), self.item_factors(x[:, 1])
    x = self.layers(torch.cat(embs, dim=1))
    return sigmoid_range(x, *self.y_range)

In [36]:
from fastai.collab import get_emb_sz, collab_learner

In [37]:
embs = get_emb_sz(dls)
embs

[(944, 74), (1665, 102)]

In [38]:
model = CollabNN(*embs)

In [39]:
learner = Learner(dls, model, loss_func=MSELossFlat())
learner.fit_one_cycle(5, 5e-3, wd=.1)

epoch,train_loss,valid_loss,time
0,0.989208,1.010429,00:11
1,0.891999,0.923363,00:10
2,0.860893,0.893432,00:11
3,0.809145,0.871637,00:10
4,0.718665,0.868822,00:11
