In [1]:
from fastai.collab import *
from fastai.tabular.all import *

# Collaborative filtering

In [2]:
# Fetch the MovieLens 100k archive referenced by URLs.ML_100k and list the
# files in the resulting directory.
path = untar_data(URLs.ML_100k)
path.ls()

(#23) [Path('/storage/data/ml-100k/u4.test'),Path('/storage/data/ml-100k/u1.base'),Path('/storage/data/ml-100k/u4.base'),Path('/storage/data/ml-100k/u.info'),Path('/storage/data/ml-100k/u2.base'),Path('/storage/data/ml-100k/README'),Path('/storage/data/ml-100k/u.item'),Path('/storage/data/ml-100k/u.data'),Path('/storage/data/ml-100k/u5.test'),Path('/storage/data/ml-100k/u5.base')...]

In [3]:
# u.data is tab-separated and has no header row, so column names are given explicitly.
ratings = pd.read_csv(
    path/'u.data',
    delimiter='\t',
    header=None,
    names=['user', 'movie', 'rating', 'timestamp'],
)
ratings.head()

Unnamed: 0,user,movie,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


# DataLoaders

In [4]:
# u.item is pipe-separated, latin-1 encoded and header-less; only the first two
# columns (movie id and title) are read.
movies = pd.read_csv(
    path/'u.item',
    sep='|',
    encoding='latin-1',
    header=None,
    usecols=(0,1),
    names=('movie', 'title'),
)
movies.head()

Unnamed: 0,movie,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [5]:
# Attach the movie title to every rating; the join uses the columns the two
# frames have in common.
ratings = pd.merge(ratings, movies)
ratings.head()

Unnamed: 0,user,movie,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


In [6]:
# Build the DataLoaders from the merged frame, using the human-readable
# 'title' column as the item and a batch size of 64.
dls = CollabDataLoaders.from_df(ratings, item_name='title', bs=64)
dls.show_batch()

Unnamed: 0,user,title,rating
0,938,Face/Off (1997),4
1,480,Babe (1995),5
2,112,Kiss the Girls (1997),4
3,655,Fools Rush In (1997),2
4,493,Swingers (1996),5
5,733,Georgia (1995),4
6,737,Seven (Se7en) (1995),3
7,880,Batman Returns (1992),2
8,279,Showgirls (1995),1
9,524,Mighty Aphrodite (1995),4


In [7]:
# Vocabulary sizes as seen by the DataLoaders; these are passed to every model below.
n_users = len(dls.classes['user'])
n_movies = len(dls.classes['title'])
# Only used for the two illustrative tensors in this cell; the models below pass 50 explicitly.
n_factors = 5

# Randomly initialised latent-factor matrices of shape (n_users, n_factors) and
# (n_movies, n_factors). No later cell in this notebook reads these two variables.
user_factors = torch.randn(n_users, n_factors)
movie_factors = torch.randn(n_movies, n_factors)

n_users, n_movies

(944, 1665)

# Model from scratch

In [8]:
class DotProduct(Module):
    "Dot-product collaborative-filtering model with one embedding table per user and per movie."
    def __init__(self, n_users, n_movies, n_factors):
        # One row of `n_factors` latent factors per user / per movie.
        self.user_factors = Embedding(n_users, n_factors)
        self.movie_factors = Embedding(n_movies, n_factors)

    def forward(self, x):
        """Return one score per row of `x`.

        Column 0 of `x` holds the user ids, column 1 the movie ids.
        """
        user_ids, movie_ids = x[:,0], x[:,1]
        user_vecs = self.user_factors(user_ids)
        movie_vecs = self.movie_factors(movie_ids)

        # Element-wise product summed over the factor axis is the per-row dot product.
        return (user_vecs * movie_vecs).sum(dim=1)

In [9]:
# Pull a single batch to inspect the input layout: one row per rating, with the
# two columns the models index as x[:,0] (user) and x[:,1] (movie).
x,y = dls.one_batch()
x.shape

torch.Size([64, 2])

In [10]:
# Plain dot-product model with 50 latent factors, trained with a mean-squared-error loss.
model = DotProduct(n_users, n_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())

In [11]:
# Train for 5 epochs with a learning rate of 5e-3.
learn.fit_one_cycle(5, 5e-3)

epoch,train_loss,valid_loss,time
0,1.361865,1.27657,00:11
1,1.077682,1.096708,00:11
2,0.985228,0.977119,00:11
3,0.818475,0.884883,00:11
4,0.780821,0.865171,00:11


Help the model a bit by forcing the prediction to be in the correct range:

In [12]:
class DotProduct(Module):
    "Dot-product model whose output is squashed into `y_range` with `sigmoid_range`."
    def __init__(self, n_users, n_movies, n_factors, y_range=(0,5.5)):
        self.user_factors = Embedding(n_users, n_factors)
        self.movie_factors = Embedding(n_movies, n_factors)
        # (low, high) bounds forwarded to `sigmoid_range` in `forward`.
        self.y_range = y_range

    def forward(self, x):
        """Return one bounded score per row of `x`.

        Column 0 of `x` holds the user ids, column 1 the movie ids.
        """
        user_vecs = self.user_factors(x[:,0])
        movie_vecs = self.movie_factors(x[:,1])

        # Per-row dot product of the user and movie factors.
        raw_scores = (user_vecs * movie_vecs).sum(dim=1)
        return sigmoid_range(raw_scores, *self.y_range)

In [13]:
# Same setup as before, now using the range-limited DotProduct defined just above
# (default y_range), again for 5 epochs at 5e-3.
model = DotProduct(n_users, n_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 5e-3)

epoch,train_loss,valid_loss,time
0,1.01528,0.969445,00:11
1,0.875156,0.883015,00:11
2,0.677835,0.861358,00:11
3,0.472456,0.868314,00:11
4,0.361848,0.873076,00:11


Some users are simply more negative or more positive than others. The same goes for movies: some are just better or worse in general. This can be modelled by adding a bias:

In [32]:
class DotProductBias(Module):
    "Dot-product model with an additional learned scalar bias per user and per movie."
    def __init__(self, n_users, n_movies, n_factors, y_range=(0,5.5)):
        self.user_factors = Embedding(n_users, n_factors)
        self.movie_factors = Embedding(n_movies, n_factors)

        # One scalar per user / per movie (embedding width 1).
        self.user_bias = Embedding(n_users, 1)
        self.movies_bias = Embedding(n_movies, 1)

        # (low, high) bounds forwarded to `sigmoid_range` in `forward`.
        self.y_range = y_range

    def forward(self, x):
        """Return one bounded score per row of `x`, kept as a trailing dimension of size 1.

        Column 0 of `x` holds the user ids, column 1 the movie ids.
        """
        user_ids, movie_ids = x[:,0], x[:,1]
        user_vecs = self.user_factors(user_ids)
        movie_vecs = self.movie_factors(movie_ids)

        # keepdim=True keeps a trailing axis of size 1 so the width-1 bias
        # embeddings can be added directly.
        interaction = (user_vecs * movie_vecs).sum(dim=1, keepdim=True)
        bias = self.user_bias(user_ids) + self.movies_bias(movie_ids)

        return sigmoid_range(interaction + bias, *self.y_range)

In [33]:
# Train the bias-augmented model with the same settings as the previous runs
# (50 factors, MSE loss, 5 epochs at 5e-3) so the loss tables are comparable.
model = DotProductBias(n_users, n_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 5e-3)

epoch,train_loss,valid_loss,time
0,0.957165,0.93278,00:12
1,0.833389,0.846126,00:13
2,0.607541,0.853311,00:12
3,0.389398,0.875636,00:13
4,0.287767,0.883028,00:12


Weight decay against overfitting:

In [34]:
# Fresh model, identical settings, but with weight decay wd=0.1 added to the fit call.
model = DotProductBias(n_users, n_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 5e-3, wd=.1)

epoch,train_loss,valid_loss,time
0,0.98476,0.930608,00:12
1,0.832384,0.854933,00:12
2,0.750734,0.819578,00:12
3,0.59304,0.807778,00:12
4,0.495993,0.810094,00:12


## Embedding from scratch

In [35]:
def create_params(size):
    """Return a trainable `nn.Parameter` of shape `size`.

    The values are drawn from a normal distribution with mean 0 and
    standard deviation 0.01. `size` is an iterable of dimension lengths.
    """
    weights = torch.zeros(*size)
    weights.normal_(mean=0, std=0.01)
    return nn.Parameter(weights)

In [58]:
class DotProductBias(Module):
    "Dot-product-plus-bias model built on raw parameter matrices from `create_params` instead of `Embedding` layers."
    def __init__(self, n_users, n_movies, n_factors, y_range=(0,5.5)):
        # Latent-factor matrices: one row per user / per movie.
        self.user_factors = create_params((n_users, n_factors))
        self.movie_factors = create_params((n_movies, n_factors))

        # One scalar bias per user / per movie, stored as a column.
        self.user_bias = create_params((n_users, 1))
        self.movies_bias = create_params((n_movies, 1))

        # (low, high) bounds forwarded to `sigmoid_range` in `forward`.
        self.y_range = y_range

    def forward(self, x):
        """Return one bounded score per row of `x`, kept as a trailing dimension of size 1.

        Column 0 of `x` holds the user ids, column 1 the movie ids.
        """
        user_ids, movie_ids = x[:,0], x[:,1]

        # Indexing the parameter matrices by id takes the place of an embedding lookup.
        user_vecs = self.user_factors[user_ids]
        movie_vecs = self.movie_factors[movie_ids]

        interaction = (user_vecs * movie_vecs).sum(dim=1, keepdim=True)
        bias = self.user_bias[user_ids] + self.movies_bias[movie_ids]

        return sigmoid_range(interaction + bias, *self.y_range)

In [59]:
# Train the from-scratch-parameter version with the same settings as the
# weight-decay run above (50 factors, 5 epochs at 5e-3, wd=0.1).
model = DotProductBias(n_users, n_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 5e-3, wd=.1)

epoch,train_loss,valid_loss,time
0,0.94548,0.938534,00:12
1,0.860198,0.870919,00:11
2,0.749549,0.818406,00:11
3,0.578061,0.808568,00:11
4,0.474516,0.809301,00:11


# Interpretation

## Bias

In [60]:
# The movie bias is stored as an (n_movies, 1) column; squeeze it to a flat
# vector with one entry per movie class.
movie_bias = learn.model.movies_bias.squeeze()
movie_bias.shape

torch.Size([1665])

In [61]:
# argsort is ascending, so the first five indices are the movies with the lowest bias.
worst_5 = movie_bias.argsort()[:5]
print(*learn.dls.classes['title'][worst_5], sep='\n')

Children of the Corn: The Gathering (1996)
Cable Guy, The (1996)
Lawnmower Man 2: Beyond Cyberspace (1996)
Robocop 3 (1993)
Bloodsport 2 (1995)


In [62]:
# argsort is ascending, so the last five indices are the movies with the highest bias.
best_5 = movie_bias.argsort()[-5:]
print(*learn.dls.classes['title'][best_5], sep='\n')

L.A. Confidential (1997)
Star Wars (1977)
Schindler's List (1993)
Titanic (1997)
Shawshank Redemption, The (1994)


In [63]:
# Raw class indices of the five highest-bias movies (positions in dls.classes['title']).
best_5

tensor([ 830, 1399, 1282, 1501, 1318])

In [51]:
# Re-display the merged ratings table for reference before aggregating it.
ratings.head()

Unnamed: 0,user,movie,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


Ranking movies by their learned bias is actually more meaningful than just taking the movies with the best and worst mean rating, shown below for comparison:

In [55]:
# Per-title mean of every numeric column, sorted ascending by mean rating.
# Only the 'rating' column is meaningful here; the averaged user / movie /
# timestamp columns are a by-product of calling mean() on the whole frame.
# The sort is chained (no inplace=True), so re-running the cell always rebuilds
# the frame from `ratings` instead of mutating an existing object.
mean_ratings = ratings.groupby('title').mean().sort_values('rating')
mean_ratings.head()

Unnamed: 0_level_0,user,movie,rating,timestamp
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"Eye of Vichy, The (Oeil de Vichy, L') (1993)",405.0,1562.0,1.0,885549506.0
Butterfly Kiss (1995),551.0,1621.0,1.0,892785194.0
Daens (1992),405.0,1565.0,1.0,885549463.0
JLG/JLG - autoportrait de décembre (1994),181.0,1366.0,1.0,878962200.0
Touki Bouki (Journey of the Hyena) (1973),405.0,1571.0,1.0,885549463.0


In [56]:
# The frame is sorted ascending by 'rating', so the tail holds the highest mean ratings.
mean_ratings.tail()

Unnamed: 0_level_0,user,movie,rating,timestamp
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Someone Else's America (1995),437.0,1599.0,5.0,880142600.0
Star Kid (1997),384.666667,1293.0,5.0,888718300.0
Santa with Muscles (1996),530.0,1500.0,5.0,877470400.0
Prefontaine (1997),345.0,1189.0,5.0,888537800.0
Marlene Dietrich: Shadow and Light (1996),90.0,1201.0,5.0,891383700.0


## Distance

In [76]:
# Look the movie up by title instead of hard-coding its class index, so the cell
# keeps working if the title vocabulary (and therefore the index) changes.
title = 'Star Wars (1977)'
titles = learn.dls.classes['title']
# Guard against a misspelt or missing title instead of silently using a wrong row.
assert title in titles.o2i, f'{title!r} is not in the title vocabulary'
movie_id = titles.o2i[title]

# NOTE: indexing `movie_factors` directly assumes `learn.model` is the
# from-scratch DotProductBias, whose factors are a raw parameter matrix.
embedding = learn.model.movie_factors[movie_id]
# Mean squared difference between the query embedding and every movie embedding;
# smaller means closer. The query movie itself has distance 0 and is listed first.
distances = ((learn.model.movie_factors - embedding)**2).mean(dim=1)
closest_ids = distances.argsort()[:5]
closest_titles = titles[closest_ids]

print('Movies closest to: ', title)
print('')
print('\n'.join(closest_titles))

Movies closest to:  Star Wars (1977)

Star Wars (1977)
Empire Strikes Back, The (1980)
Return of the Jedi (1983)
Raiders of the Lost Ark (1981)
Princess Bride, The (1987)


# With fast.ai premade learner

In [57]:
# fastai's ready-made collaborative-filtering learner, configured like the
# from-scratch runs above (50 factors, y_range (0, 5.5), 5 epochs at 5e-3, wd=0.1).
learn = collab_learner(dls, n_factors=50, y_range=(0, 5.5))
learn.fit_one_cycle(5, 5e-3, wd=.1)

epoch,train_loss,valid_loss,time
0,0.96476,0.930992,00:12
1,0.850207,0.86474,00:13
2,0.728615,0.822953,00:12
3,0.60146,0.807646,00:12
4,0.504141,0.807711,00:13


# With deep learning

In [78]:
# Embedding sizes derived from the DataLoaders: one (n_classes, emb_dim) pair per
# categorical column — here users first, then titles (see the output below).
embs = get_emb_sz(dls)
embs

[(944, 74), (1665, 102)]

In [79]:
class CollabNN(Module):
    "Collaborative-filtering model that concatenates user and item embeddings and feeds them through a small MLP."
    def __init__(self, user_sz, item_sz, y_range=(0,5.5), n_act=100):
        # `user_sz` / `item_sz` are (n_classes, emb_dim) pairs unpacked into Embedding.
        self.user_factors = Embedding(*user_sz)
        self.item_factors = Embedding(*item_sz)

        # The MLP input is the two embeddings side by side.
        n_in = user_sz[1] + item_sz[1]
        self.layers = nn.Sequential(
            nn.Linear(n_in, n_act),
            nn.ReLU(),
            nn.Linear(n_act, 1),
        )
        # (low, high) bounds forwarded to `sigmoid_range` in `forward`.
        self.y_range = y_range

    def forward(self, x):
        """Return one bounded score per row of `x`, kept as a trailing dimension of size 1.

        Column 0 of `x` is fed to the user embedding, column 1 to the item embedding.
        """
        user_vecs = self.user_factors(x[:,0])
        item_vecs = self.item_factors(x[:,1])
        mlp_in = torch.cat((user_vecs, item_vecs), dim=1)
        return sigmoid_range(self.layers(mlp_in), *self.y_range)

In [80]:
# Unpack the two (n_classes, emb_dim) pairs from `embs` into `user_sz` and `item_sz`.
model = CollabNN(*embs)

In [81]:
# Train the neural model with the same loss and schedule as before, but with a
# smaller weight decay (wd=0.01) than the dot-product runs (wd=0.1).
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 5e-3, wd=0.01)

epoch,train_loss,valid_loss,time
0,0.935815,0.944403,00:14
1,0.89627,0.886026,00:14
2,0.870257,0.866168,00:14
3,0.808649,0.854973,00:14
4,0.774447,0.858966,00:14
