In [1]:
import fastbook
fastbook.setup_book()

In [2]:
from fastbook import *

### Collaborative Filtering Deep Dive

Collaborative filtering usually calls thing *items* rather than *products*

Key foundational idea is *latent factors* 

#### A First Look at the Data

We're going to use a dataset called [MovieLens](https://grouplens.org/datasets/movielens/)

- Contains 10s of millions of rankings(a combo of a movie id, user id, and numeric rating)

- Just going to use on a subset of 100k for this

- Good practice to go back and try this on the full set at some point

In [3]:
from fastai.collab import *
from fastai.tabular.all import *
path = untar_data(URLs.ML_100k)

In [4]:
path.ls()

(#23) [Path('/home/petewin/.fastai/data/ml-100k/u2.base'),Path('/home/petewin/.fastai/data/ml-100k/u.genre'),Path('/home/petewin/.fastai/data/ml-100k/u.user'),Path('/home/petewin/.fastai/data/ml-100k/u3.test'),Path('/home/petewin/.fastai/data/ml-100k/ub.test'),Path('/home/petewin/.fastai/data/ml-100k/u.occupation'),Path('/home/petewin/.fastai/data/ml-100k/ua.test'),Path('/home/petewin/.fastai/data/ml-100k/u5.base'),Path('/home/petewin/.fastai/data/ml-100k/u4.test'),Path('/home/petewin/.fastai/data/ml-100k/u.info')...]

In [5]:
ratings = pd.read_csv(path/'u.data', delimiter='\t', header=None,
                      names=['user','movie','rating','timestamp'])
ratings.head()

Unnamed: 0,user,movie,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


Above has everything we need, there's also cross-tabulated data that's more human-readable

In the cross-tab format we can see that users have rated some movies, not others, we're going to try to predict how much they would enjoy a given movie that they have not yet rated

In [6]:
last_skywalker = np.array([0.98, 0.9, -0.9])

In [7]:
user1 = np.array([0.9, 0.8, -0.6])

In [8]:
(user1 * last_skywalker).sum()

2.1420000000000003

When multiplying two vectors together and adding up results, it's known as the *dot product*

In [9]:
casablanca = np.array([-0.99, -0.3, 0.8])

In [10]:
(user1 * casablanca).sum()

-1.611

In [11]:
user1 * casablanca

array([-0.891, -0.24 , -0.48 ])

In [12]:
movies = pd.read_csv(path/'u.item', delimiter='|', encoding='latin-1',
                     usecols=(0,1), names=('movie', 'title'), header=None)
movies.head()                     

Unnamed: 0,movie,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


Merge with ratings table

In [13]:
ratings = ratings.merge(movies)
ratings.head()

Unnamed: 0,user,movie,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


In [26]:
pb_ratings = pd.concat([ratings.iloc[5:10], ratings.iloc[500:505]], axis=0)

In [27]:
pb_ratings

Unnamed: 0,user,movie,rating,timestamp,title
5,296,242,4,884196057,Kolya (1996)
6,34,242,5,888601628,Kolya (1996)
7,271,242,4,885844495,Kolya (1996)
8,201,242,4,884110598,Kolya (1996)
9,209,242,4,883589606,Kolya (1996)
500,320,51,5,884750992,Legends of the Fall (1994)
501,896,51,2,887159951,Legends of the Fall (1994)
502,378,51,3,880333195,Legends of the Fall (1994)
503,551,51,5,892784780,Legends of the Fall (1994)
504,632,51,4,879459166,Legends of the Fall (1994)


In [28]:
pb_ratings = ratings.sample(n=10)

In [29]:
pb_ratings

Unnamed: 0,user,movie,rating,timestamp,title
75721,498,693,3,881957625,Casino (1995)
80184,642,542,5,885606609,Pocahontas (1995)
19864,58,135,4,884305150,2001: A Space Odyssey (1968)
76699,495,674,3,888635995,Cat People (1982)
92991,618,735,3,891308571,Philadelphia (1993)
76434,725,300,4,876106729,Air Force One (1997)
84004,794,473,4,891036222,James and the Giant Peach (1996)
80917,678,1129,1,879544915,Chungking Express (1994)
60767,43,161,4,883955467,Top Gun (1986)
50074,752,350,4,891208357,Fallen (1998)


In [15]:
dls = CollabDataLoaders.from_df(ratings, item_name='title', bs=64)
dls.show_batch()

Unnamed: 0,user,title,rating
0,542,My Left Foot (1989),4
1,422,Event Horizon (1997),3
2,311,"African Queen, The (1951)",4
3,595,Face/Off (1997),4
4,617,Evil Dead II (1987),1
5,158,Jurassic Park (1993),5
6,836,Chasing Amy (1997),3
7,474,Emma (1996),3
8,466,Jackie Chan's First Strike (1996),3
9,554,Scream (1996),3


In [35]:
pb_dls = CollabDataLoaders.from_df(pb_ratings, item_name='title')
pb_dls.items

Unnamed: 0,user,movie,rating,timestamp,title
50074,8,350,4,891208357,6
60767,1,161,4,883955467,10
75721,4,693,3,881957625,3
76699,3,674,3,888635995,4
92991,5,735,3,891308571,8
19864,2,135,4,884305150,1
80184,6,542,5,885606609,9
80917,7,1129,1,879544915,5


In [None]:
dls_item_test = CollabDataLoaders.from_df(ratings,bs=64)
dls.show_batch()

Unnamed: 0,user,title,rating
0,615,Forrest Gump (1994),4
1,280,"Terminator, The (1984)",3
2,792,"Birdcage, The (1996)",2
3,423,Red Corner (1997),4
4,178,Alien (1979),4
5,448,Bean (1997),1
6,413,Excess Baggage (1997),3
7,637,Primal Fear (1996),3
8,846,To Catch a Thief (1955),4
9,385,"Godfather, The (1972)",4


In [None]:
dls_item_test.x_names

(#2) ['user','movie']

In [None]:
dls.x_names

(#2) ['user','title']

In [None]:
n_factors_pb = np.argmax(ratings['rating'])
n_factors_pb = ratings.iloc[n_factors_pb]['rating']
n_factors_pb

5

In [None]:
n_users = len(dls.classes['user'])
n_movies = len(dls.classes['title'])
n_factors = 5

# generate intial user and movie factors matrices
user_factors = torch.randn(n_users, n_factors)
movie_factors = torch.randn(n_movies, n_factors)

In [None]:
n_users, n_movies

(944, 1665)

In [None]:
user_factors, user_factors.shape, user_factors[2]

(tensor([[-1.0121,  0.3655,  1.2114, -1.9843, -0.0388],
         [ 0.0442,  1.5872,  0.6515,  0.6069,  0.8705],
         [-0.3740,  0.2867,  0.0123, -0.5935, -0.4335],
         ...,
         [ 1.7389, -1.0821,  0.9117,  0.2775, -0.5347],
         [ 0.5703,  0.6749, -0.8394,  1.8712,  0.5934],
         [ 1.1493, -0.5515, -0.0361, -1.7820, -1.5276]]),
 torch.Size([944, 5]),
 tensor([-0.3740,  0.2867,  0.0123, -0.5935, -0.4335]))

In [None]:
movie_factors, movie_factors.shape

(tensor([[ 1.9362,  0.1181, -0.1383,  0.2589, -2.8390],
         [ 0.0647,  0.6145, -0.2475, -0.4561,  1.0631],
         [ 0.1570, -1.1048,  0.0163, -0.7524, -0.1065],
         ...,
         [-1.3079, -0.3138,  1.6426,  2.0051, -0.5943],
         [-0.4056,  0.4642, -0.2313,  2.5167,  1.7877],
         [ 0.2071, -1.7562, -0.7170, -0.2199, -2.3932]]),
 torch.Size([1665, 5]))

Deep learning models don't know how to "look up at index" to check the corresponding index values between a movie latent factor matrix and a user latent factor matrix. If it could we could do the *dot product* of the two values to compare

The workaround is to represent *look up at an index* as a matrix product.

Trick is to replace our indices with *one-hot encoded* vectors

In [None]:
one_hot_2 = one_hot(2, n_users).float()
one_hot_2, one_hot_2.shape

(tensor([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,

In [None]:
user_factors.t(), user_factors

(tensor([[-1.0121,  0.0442, -0.3740,  ...,  1.7389,  0.5703,  1.1493],
         [ 0.3655,  1.5872,  0.2867,  ..., -1.0821,  0.6749, -0.5515],
         [ 1.2114,  0.6515,  0.0123,  ...,  0.9117, -0.8394, -0.0361],
         [-1.9843,  0.6069, -0.5935,  ...,  0.2775,  1.8712, -1.7820],
         [-0.0388,  0.8705, -0.4335,  ..., -0.5347,  0.5934, -1.5276]]),
 tensor([[-1.0121,  0.3655,  1.2114, -1.9843, -0.0388],
         [ 0.0442,  1.5872,  0.6515,  0.6069,  0.8705],
         [-0.3740,  0.2867,  0.0123, -0.5935, -0.4335],
         ...,
         [ 1.7389, -1.0821,  0.9117,  0.2775, -0.5347],
         [ 0.5703,  0.6749, -0.8394,  1.8712,  0.5934],
         [ 1.1493, -0.5515, -0.0361, -1.7820, -1.5276]]))

In [None]:
user_factors.t().shape, user_factors.shape

(torch.Size([5, 944]), torch.Size([944, 5]))

In [None]:
user_factors.t()

tensor([[-1.0121,  0.0442, -0.3740,  ...,  1.7389,  0.5703,  1.1493],
        [ 0.3655,  1.5872,  0.2867,  ..., -1.0821,  0.6749, -0.5515],
        [ 1.2114,  0.6515,  0.0123,  ...,  0.9117, -0.8394, -0.0361],
        [-1.9843,  0.6069, -0.5935,  ...,  0.2775,  1.8712, -1.7820],
        [-0.0388,  0.8705, -0.4335,  ..., -0.5347,  0.5934, -1.5276]])

This:

In [None]:
user_factors.t() @ one_hot_2

tensor([-0.3740,  0.2867,  0.0123, -0.5935, -0.4335])

Gives us the same result as this:

In [None]:
user_factors[2]

tensor([-0.3740,  0.2867,  0.0123, -0.5935, -0.4335])

Doing it this way would be fine but storing and searching that one-hot encoded vector is slow and hogs memory, ideally we should be able to directly index into an array using an integer

So most libraries, including pytorch provide a way to do this OOTB

- There is a special layer that does this, it indexes a vector using an integer
- But it does by calculating its derivative in such a way that it's identical to if we actually did the above one hot matrix multiplication

**This is called an embedding**

The thing you multiply the one-hot encoded matrix by in this case is called the *embedding matrix*

### Collaborative Filtering from Scratch

In [None]:
class Example:
    def __init__(self, a): 
        self.a = a
    def say(self, x): 
        return f'Hello {self.a}, {x}'

In [None]:
ex = Example("Peter")

In [None]:
ex.say('hello')

'Hello Peter, hello'

In [None]:
class DotProduct(Module):
    def __init__(self, n_users, n_movies, n_factors, y_range=(0,5.5)):
        self.user_factors = Embedding(n_users, n_factors)
        self.movie_factors = Embedding(n_movies, n_factors)
        self.y_range = y_range

    def forward(self, x):
        users = self.user_factors(x[:,0])
        movies = self.movie_factors(x[:,1])
        return sigmoid_range((users * movies).sum(dim=1), *self.y_range)

In [None]:
model = DotProduct(n_users, n_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())

In [None]:
learn.fit_one_cycle(5, 5e-3)

epoch,train_loss,valid_loss,time
0,0.996949,0.988729,00:07
1,0.849827,0.900416,00:06
2,0.698534,0.858112,00:06
3,0.498979,0.862065,00:06
4,0.362767,0.866606,00:06
