In [None]:
'''
    Author: Huyen Nguyen
    Date created: 30/06/2021
'''
!pip install -Uqq fastbook
import fastbook
from fastbook import *

In [None]:
from fastai.collab import *
from fastai.tabular.all import *
# `untar_data` presumably downloads and extracts the dataset named by URLs.ML_100k
# (see the fastai docs); `path` is the directory the data files are read from below.
path = untar_data(URLs.ML_100k)
# u.data is parsed as tab-separated with no header row, so the four column
# names are supplied explicitly.
ratings = pd.read_csv(path/'u.data', delimiter='\t', header=None,
                      names=['user','movie','rating','timestamp'])
ratings.head()

In [None]:
# u.item is parsed as pipe-delimited, latin-1 encoded text with no header row.
# Only the first two columns are kept and named 'movie' and 'title'.
movies = pd.read_csv(path/'u.item',  delimiter='|', encoding='latin-1',
                     usecols=(0,1), names=('movie','title'), header=None)
movies.head()

In [None]:
# No `on=` is given, so pandas joins on the columns the two frames have in
# common ('movie' on a first run) using its default inner join; the result
# carries a 'title' column next to each rating.
ratings = ratings.merge(movies)
ratings.head()

## Question 1 ##
a. Use the documentation of CollabDataLoaders to complete the code below, using 'title' as the item name and a batch size of 128 (see https://docs.fast.ai/collab.html#CollabDataLoaders.from_df).


In [None]:
dls = CollabDataLoaders.from_df(   ???    )
dls.show_batch()

b. (Optional) Can you rearrange the ```ratings``` dataframe into the form shown below? Hint: see https://pbpython.com/pandas-crosstab.html ![](https://drive.google.com/uc?export=view&id=1nTAwITd33VXvYPszak8DUqT7-3Q_ElcU)





c. How sparse is this matrix? (i.e., the number of ratings divided by the number of all possible user-movie combinations; the smaller this fraction, the sparser the matrix)

In [None]:
???

## Question 2 ##
a. The step below defines the model including the forward pass and creates the embedding matrices for users and movies. Please fill in the correct dimensions of the matrices. 

b. 
Can you explain what is happening in these steps? 

```
     def forward(self, x):
        users = self.user_factors(x[:,0])
        movies = self.movie_factors(x[:,1])
        res = (users * movies).sum(dim=1, keepdim=True)
        res += self.user_bias(x[:,0]) + self.movie_bias(x[:,1])
        return sigmoid_range(res, *self.y_range)
```

c. What's the meaning of ```y_range=(0,5.5)``` and why do we want that constraint?

d. Why do we use MSELossFlat() as the loss function instead of cross entropy?


In [None]:
# Exercise stub (Question 2a): the `???` placeholders below are intentionally
# left for the reader, so this cell does not run until they are filled in.
#
# The model keeps four embedding tables: a factor table and a bias table for
# users, and the same pair for movies. `y_range` is stored and later unpacked
# into `sigmoid_range` in `forward`.
class DotProductBias(Module):
    def __init__(self, n_users, n_movies, n_factors, y_range=(0,5.5)):
        self.user_factors = Embedding(???, ???)
        self.user_bias = Embedding(???, ???)
        self.movie_factors = Embedding(???, ???)
        self.movie_bias = Embedding(???, ???)
        self.y_range = y_range
        
    def forward(self, x):
        # Column 0 of `x` indexes the user tables and column 1 the movie tables
        # (assumes x is a 2-D batch of index pairs -- confirm against `dls`).
        users = self.user_factors(x[:,0])
        movies = self.movie_factors(x[:,1])
        # Elementwise product summed over dim 1; keepdim=True keeps that
        # dimension so the bias lookups below can be added to the result.
        res = (users * movies).sum(dim=1, keepdim=True)
        res += self.user_bias(x[:,0]) + self.movie_bias(x[:,1])
        # `sigmoid_range` receives the raw score plus the two y_range bounds;
        # presumably it squashes the score into that interval (see fastai docs).
        return sigmoid_range(res, *self.y_range)

In [None]:
model = DotProductBias(???, ???, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())

In [None]:
learn.fit_one_cycle(3, 5e-3)


e. The code below counts the number of weights in your model. Can you explain the result and calculate it yourself?

In [None]:
def count_parameters(model):
    """Return how many scalar values `model` holds in trainable parameters.

    Walks `model.parameters()` and adds up `numel()` for every parameter
    whose `requires_grad` flag is set; parameters without that flag are
    skipped. Returns 0 when nothing qualifies.
    """
    total = 0
    for param in model.parameters():
        if not param.requires_grad:
            continue
        total += param.numel()
    return total
count_parameters(model)

## Question 3 (Optional) ##
Can you get one batch of data from dls and show the output of the model?

In [None]:
x,y = ???
output = ???
output

User 100 has never seen movie 250. Can you show what the predicted rating would be?

In [None]:
???

Can you plot the output of the model on the validation data vs the actual ratings?

In [None]:
???

## Question 4 (Optional) ##
```movie_bias``` captures the inherent popularity of the movies. Can you display the top 5 most popular titles?

In [None]:
movie_bias = ???
idxs = movie_bias.argsort(???)[:5]
[dls.classes['title'][i] for i in idxs]

## Question 5 ##
a. What are the arguments user_sz and item_sz in the model below?

b. Why is the dimension of the linear layer ```nn.Linear(user_sz[1]+item_sz[1], n_act)```?

c. What happens in the forward pass?

In [None]:
class CollabNN(Module):
    """Collaborative-filtering model that feeds concatenated embeddings to a small MLP.

    `user_sz` and `item_sz` are each unpacked straight into `Embedding(...)`,
    and their element at index 1 is used as that embedding's output width when
    sizing the first linear layer. In this notebook they come from
    `get_emb_sz(dls)`.

    `n_act` is the width of the single hidden layer. `y_range` is stored and
    unpacked into `sigmoid_range` when producing the final output.
    """
    def __init__(self, user_sz, item_sz, y_range=(0,5.5), n_act=100):
        self.user_factors = Embedding(*user_sz)
        self.item_factors = Embedding(*item_sz)
        # The MLP input is the two embeddings laid side by side, so its
        # width is the sum of the two embedding widths.
        concat_width = user_sz[1] + item_sz[1]
        hidden = nn.Linear(concat_width, n_act)
        head = nn.Linear(n_act, 1)
        self.layers = nn.Sequential(hidden, nn.ReLU(), head)
        self.y_range = y_range

    def forward(self, x):
        """Look up both embeddings for each row of `x`, run the MLP, and apply `sigmoid_range`."""
        # Column 0 indexes the user table, column 1 the item table.
        user_vecs = self.user_factors(x[:,0])
        item_vecs = self.item_factors(x[:,1])
        joined = torch.cat((user_vecs, item_vecs), dim=1)
        raw = self.layers(joined)
        return sigmoid_range(raw, *self.y_range)

## Question 6 ##
Based on fastai documentation, what does this step ```embs = get_emb_sz(dls)``` do? 

In [None]:
embs = get_emb_sz(dls)
model = CollabNN(*embs)

In [None]:
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 5e-3, wd=0.01)

## Collaborative filtering the fastai way ##

## Question 7 ##
a. What does ```n_factors``` mean?

In [None]:
learn = collab_learner(dls, n_factors=50, y_range=(0, 5.5))
learn.fit_one_cycle(3, 5e-3, wd=0.1)

b. Can you define a model with 2 hidden layers of 50 and 20 nodes, respectively?

In [None]:
learn = collab_learner(dls, use_nn=True, y_range=(0, 5.5), layers=???)
learn.fit_one_cycle(3, 5e-3, wd=0.1)

c. (Hard, optional) What is the number of parameters in your model? Can you explain the result?
Note: the ReLU layers do not have any weights.
Each BatchNorm layer ```BatchNorm1d(n)``` has 2*n weights (see https://pytorch.org/docs/stable/generated/torch.nn.BatchNorm1d.html).

In [None]:
learn.model

In [None]:
count_parameters(learn.model)

## Question 8 (hard/optional but you'll learn a great deal from doing it)
a. If we want to turn this regression task into a classification task using the CollabNN model above, what changes do we have to make?

b. Can you implement it yourself? What do you observe about the training of this new model? 