# Collaborative Filtering
This notebook follows along with fast.ai chapter 8, with a similar dataset.

## Getting the data

In [1]:
from fastai.collab import *
from fastai.tabular.all import *

In [2]:
import pandas as pd
url = "https://github.com/zygmuntz/goodbooks-10k/blob/master/ratings.csv?raw=true"
ratings = pd.read_csv(url)
ratings.head()

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3


In [3]:
# Grab the book data so we can get titles
url = "https://github.com/zygmuntz/goodbooks-10k/blob/master/books.csv?raw=true"
books = pd.read_csv(url)
books.head()

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m/2767052.jpg,https://images.gr-assets.com/books/1447303603s/2767052.jpg
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m/3.jpg,https://images.gr-assets.com/books/1474154022s/3.jpg
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m/41865.jpg,https://images.gr-assets.com/books/1361039443s/41865.jpg
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m/2657.jpg,https://images.gr-assets.com/books/1361975680s/2657.jpg
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m/4671.jpg,https://images.gr-assets.com/books/1490528560s/4671.jpg


In [4]:
books = books[["book_id", "original_title"]]
books.head()

Unnamed: 0,book_id,original_title
0,1,The Hunger Games
1,2,Harry Potter and the Philosopher's Stone
2,3,Twilight
3,4,To Kill a Mockingbird
4,5,The Great Gatsby


In [5]:
# Add the titles to our ratings
ratings = ratings.merge(books)
ratings.head()

Unnamed: 0,user_id,book_id,rating,original_title
0,1,258,5,La sombra del viento
1,2,4081,4,
2,2,260,5,How to Win Friends and Influence People
3,2,9296,5,Das Drama des begabten Kindes und die Suche nach dem wahren Selbst: eine Um- und Fortschreibung
4,2,2318,3,The Millionaire Next Door: The Surprising Secrets of America's Wealthy


In [6]:
# Creating the dataloader
dls = CollabDataLoaders.from_df(ratings, item_name='original_title', bs=64)
dls.show_batch()

Unnamed: 0,user_id,original_title,rating
0,7582,White Noise,4
1,32278,Black Beauty,5
2,5823,"The Twilight Collection (Twilight, #1-3)",2
3,30245,You Can Heal Your Life,5
4,42124,Shantaram,5
5,4071,Het Achterhuis: Dagboekbrieven 14 juni 1942 - 1 augustus 1944,4
6,51904,The Prodigal Daughter,5
7,27166,Breaking Dawn,5
8,5472,El juego del ángel,5
9,22462,The Heist,4


In [7]:
n_users  = len(dls.classes['user_id'])
n_books = len(dls.classes['original_title'])
n_factors = 5

user_factors = torch.randn(n_users, n_factors)
book_factors = torch.randn(n_books, n_factors)

## Creating a Model
Starting with the simplest dot product model

In [8]:
class DotProduct(Module):
    def __init__(self, n_users, n_books, n_factors):
        self.user_factors = Embedding(n_users, n_factors)
        self.book_factors = Embedding(n_books, n_factors)
        
    def forward(self, x):
        users = self.user_factors(x[:,0])
        books = self.book_factors(x[:,1])
        return (users * books).sum(dim=1)

In [9]:
model = DotProduct(n_users, n_books, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 5e-3)

epoch,train_loss,valid_loss,time
0,1.579887,1.61814,07:09
1,1.504545,1.558202,05:57
2,1.169542,1.204554,05:58
3,0.817565,0.860654,05:55
4,0.72286,0.750372,05:48


In [10]:
# To improve the model, we can add biases and but the predictions through a sigmoid function
class DotProductBias(Module):
    def __init__(self, n_users, n_books, n_factors, y_range=(0,5.5)):
        self.user_factors = Embedding(n_users, n_factors)
        self.user_bias = Embedding(n_users, 1)
        self.book_factors = Embedding(n_books, n_factors)
        self.book_bias = Embedding(n_books, 1)
        self.y_range = y_range
        
    def forward(self, x):
        users = self.user_factors(x[:,0])
        books = self.book_factors(x[:,1])
        res = (users * books).sum(dim=1, keepdim=True)
        res += self.user_bias(x[:,0]) + self.book_bias(x[:,1])
        return sigmoid_range(res, *self.y_range)

In [11]:
model = DotProductBias(n_users, n_books, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(3, 5e-3, wd=0.1) 
# Specify weight decay which keeps the coefficents as small as possible and helps prevent overfitting

epoch,train_loss,valid_loss,time
0,1.132111,1.137334,06:43
1,1.034185,1.025449,06:50
2,0.896952,0.913163,06:39


## Creating Our Own Embedding Module

In [12]:
def create_params(size):
    return nn.Parameter(torch.zeros(*size).normal_(0, 0.01))

In [13]:
class DotProductBias(Module):
    def __init__(self, n_users, n_books, n_factors, y_range=(0,5.5)):
        self.user_factors = create_params([n_users, n_factors])
        self.user_bias = create_params([n_users])
        self.book_factors = create_params([n_books, n_factors])
        self.book_bias = create_params([n_books])
        self.y_range = y_range
        
    def forward(self, x):
        users = self.user_factors[x[:,0]]
        books = self.book_factors[x[:,1]]
        res = (users*books).sum(dim=1)
        res += self.user_bias[x[:,0]] + self.book_bias[x[:,1]]
        return sigmoid_range(res, *self.y_range)

In [14]:
model = DotProductBias(n_users, n_books, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(3, 5e-3, wd=0.1)

epoch,train_loss,valid_loss,time
0,1.110223,1.136308,07:14
1,1.011215,1.02621,07:04
2,0.915109,0.91413,07:13


## Interpreting Embeddings and Biases
The easiest part of our model to interpret is the bias. By looking at the lowest or highest biased books, we can see which books are considred to be the best and worst.

In [15]:
book_bias = learn.model.boook_bias.squeeze()
idxs = book_bias.argsort()[:5]
[dls.classes['original_title'][i] for i in idxs]

AttributeError: 'DotProductBias' object has no attribute 'boook_bias'

In [None]:
idxs = book_bias.argsort(descending=True)[:5]
[dls.classes['original_title'][i] for i in idxs]

## Using fastai.collab

In [None]:
learn = collab_learner(dls, n_factors=50, y_range=(0, 5.5))
learn.fit_one_cycle(5, 5e-3, wd=0.1)

In [None]:
book_bias = learn.model.i_bias.weight.squeeze()
idxs = book_bias.argsort(descending=True)[:5]
[dls.classes['original_title'][i] for i in idxs]

### Embedding Disatance
We can use the distance between the vectors in our embedding matrix to find books that are similar to one another.

In [None]:
book_factors = learn.model.i_weight.weight
idx = dls.classes['original_title'].o2i['The Hunger Games']
distances = nn.CosineSimilarity(dim=1)(book_factors, book_factors[idx][None])
idx = distances.argsort(descending=True)[1]
dls.classes['original_title'][idx]

## Deep Learning for Collaborative Filtering

In [None]:
class CollabNN(Module):
    def __init__(self, user_sz, item_sz, y_range=(0,5.5), n_act=100):
        self.user_factors = Embedding(*user_sz)
        self.item_factors = Embedding(*item_sz)
        self.layers = nn.Sequential(
            nn.Linear(user_sz[1]+item_sz[1], n_act),
            nn.ReLU(),
            nn.Linear(n_act, 1))
        self.y_range = y_range
        
    def forward(self, x):
        embs = self.user_factors(x[:,0]),self.item_factors(x[:,1])
        x = self.layers(torch.cat(embs, dim=1))
        return sigmoid_range(x, *self.y_range)

In [None]:
embs = get_emb_sz(dls)
model = CollabNN(*embs)

In [None]:
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(3, 5e-3, wd=0.01)

In [None]:
learn = collab_learner(dls, use_nn=True, y_range=(0, 5.5), layers=[100,50])
learn.fit_one_cycle(3, 5e-3, wd=0.1)