### [ GLOBAL IMPORTS ]

In [87]:
import pandas as pd
import numpy as np

# torch import : more compatible with m1
import torch
import torch.nn as nn
# from torch.nn import Module, Embedding

# fastai import : less compatible with m1
# @audit : why does the fastai version of Module and Embedding have better model
# performance than torch.nn vanilla import?
from fastai.torch_basics import Module, Embedding
from fastai.data.external import untar_data, URLs
from fastai.collab import CollabDataLoaders
# import and use fastai's collab learner instead of setting up our own
from fastai.collab import collab_learner
from fastai.learner import Learner
from fastai.losses import MSELossFlat
from fastai.torch_core import one_hot
from fastai.tabular.model import get_emb_sz

# super charged version of Python's list
# enhanced functionality and is used extensively throughout the fastai library
# built-in list does not have map to item by default
from fastcore.foundation import L

# get path to MovieLens data
path = untar_data(URLs.ML_100k)

# -- COLLABORATIVE FILTERING --

<iframe src="https://giphy.com/embed/MXX2jE0MChtYl1AgBe" width="480" height="480" frameBorder="0" class="giphy-embed" allowFullScreen></iframe><p><a href="https://giphy.com/gifs/fun-thumb-up-MXX2jE0MChtYl1AgBe">via GIPHY</a></p>

- Correlates user account preferences to similar accounts
- Predictively interpolate and fill in empty table ITEMS with values
- NN doesn't even need FACTORS : it learns the LATENT factors!

- These items can be :
    - products : movies, goods, services
    - url links
    - diagnosis
    - ... etc

### Product Applications

- recommendation systems
    - Netflix : movies rated
    - Amazon  : products viewed/bought
- social post feeds
    - Twitter : scroll speed, post click, likes
    - Tik Tok : likes?

### Key Concept : Latent Factors

Examples of Netflix Latent Concepts :

- science fiction
- action heavy
- 1970's

<iframe src="https://giphy.com/embed/yznEXxtq7wQlG" width="480" height="204" frameBorder="0" class="giphy-embed" allowFullScreen></iframe><p><a href="https://giphy.com/gifs/yznEXxtq7wQlG">via GIPHY</a></p>

There is surprisingly little difference between :
- Explicitly SPECIFYING a Latent Factors
- Implicitly LEARNING with a GENERAL gradient descent approach 


In [94]:
# Explicitly SPECIFYING Latent Factors
'''
array index :
    - sci-fi
    - action
    - old movies

Range -1 to 1 where :
    - positive number indicates STRONGER match
    - negative number indicates WEAKER match
'''

# user ID
user1 = np.array([0.90, 0.80, -0.60]) # user who likes modern sci-fi action

# movie ID
rise_skywalker = np.array([0.98, 0.90, -0.90]) # Rise of Skywalker movie
casablanca = np.array([-0.99, -0.30, 0.80])    # Casablanca

# Dot Product :
#   - Multiply two VECTORS
#   - Add up the results
print(f"user1 * skywalker  => [ {(user1 * rise_skywalker).sum()} ]")
print(f"user1 * casablanca => [ {(user1 * casablanca).sum()} ]")

user1 * skywalker  => [ 2.1420000000000003 ]
user1 * casablanca => [ -1.611 ]


In [None]:
# Implicitly LEARNING with a GENERAL gradient descent approach

# Step 1 :
# - randomly initialize parameters
# - parameters are a set of latent factors for userID and movieID

# Step 2 :
# - calculate our predictions
# - dot product userID and movieID
#   - GREATER product if either :
#       - userID.action is HIGH and movieID.action HIGH
#       - userID.action is LOW and movieID.action LOW
#   - LOWER product if either :
#       - userID.action is LOW while movieID.action is HIGH
#       - userID.action is HIGH while movieID.action is LOW

# Step 3 :
# - calculate our loss
#   - use MEAN SQUARED ERROR as a reasonable starting point
#   - but almost any loss function works ... @audit : EXPLAIN THIS!

# -- MOVIELENS DATA SET --

- Set contains :
    - movie ID
    - user ID
    - rating
    - timestamp

- Full set has 25 million entries
    - We will work with 100,000 of them

- Latent PREFERENCES for user ID our model are likely to predict
    - genre
    - preferred director
    - preferred actor
    - age
    - etc...

#### read csv

In [70]:
# panda is commonly used to work with structured data in tabular form
# - csv, xlsx

# read in a csv file
ratings = pd.read_csv(
    # read the 'u.data' csv file at the MovieLens data path
    path/'u.data',
    # default is comma separated, but MovieLens is tab separated
    delimiter='\t',
    # csv does NOT have a header row
    header=None,
    # use these as column names
    names=['user', 'movie', 'rating', 'timestamp']
)

# displays the first N (default=5) rows of ratings table
ratings.head()

Unnamed: 0,user,movie,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


#### log movie titles

In [71]:
movies = pd.read_csv(
    path/'u.item',
    delimiter='|',
    encoding='latin-1',
    usecols=(0,1),
    names=('movie', 'title'),
    header=None
)

movies.head()

Unnamed: 0,movie,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


#### log movie rating

In [77]:
# @audit : Explain how this merge works, how come it's organized by movie
# instead of user?
ratings=ratings.merge(movies)
ratings.head()

Unnamed: 0,user,movie,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


### dls => show_batch()

In [86]:
dls=CollabDataLoaders.from_df(
    ratings,
    # @audit : what other item_name can we use?
    item_name='title',
    bs=64
)

dls.show_batch()

Unnamed: 0,user,title,rating
0,472,White Squall (1996),2
1,624,Kids in the Hall: Brain Candy (1996),3
2,463,Down Periscope (1996),1
3,181,Sleepers (1996),3
4,506,Star Trek: The Wrath of Khan (1982),5
5,588,Snow White and the Seven Dwarfs (1937),5
6,62,Blue in the Face (1995),3
7,398,"Quiet Man, The (1952)",5
8,23,Fargo (1996),5
9,851,I'm Not Rappaport (1996),2


### n_users n_movies n_factors

In [6]:
n_users = len(dls.classes['user'])
n_movies = len(dls.classes['title'])
n_factors = 5

user_factors = torch.randn(n_users, n_factors)
movie_factors = torch.randn(n_movies, n_factors)

In [7]:
# we need to manually cast 3 as a tensor because we are using torch one_hot
# not fastai one_hot
one_hot_3 = one_hot(torch.tensor(3), n_users).float()
# print actual index look up
print(user_factors[3])
# should be equal to look up as a matrix operation using one hot encoding
user_factors.t() @ one_hot_3

tensor([-0.0949,  0.0252,  0.8279, -0.2113,  0.3091])


tensor([-0.0949,  0.0252,  0.8279, -0.2113,  0.3091])

# -- MODEL ARCHETECTURE --

### [ DOT PRODUCT ] -- default

In [8]:
class DotProduct(Module):
    def __init__(self, n_users, n_movies, n_factors):
        # Because we are using torch Module not fastai's
        super().__init__() # Add this line
        self.user_factors = Embedding(n_users, n_factors)
        self.movie_factors = Embedding(n_movies, n_factors)

    def forward(self, x):
        users = self.user_factors(x[:,0])
        movies = self.movie_factors(x[:,1])
        return (users * movies).sum(dim=1)

In [9]:
x,y = dls.one_batch()
# @audit : Explain why is x [64, 2] and y [64, 1]
# Guessing that independent (x) is user, movie 
# ... so what is dependent returning (y) ?
print(f"x : [{x.shape}] y : [{y.shape}]")

x : [torch.Size([64, 2])] y : [torch.Size([64, 1])]


In [10]:
model = DotProduct(n_users, n_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())

In [11]:
# @audit : refactoring fastai => pytorch has DRASTICALLY model perf, why?
learn.fit_one_cycle(5, 5e-3)

epoch,train_loss,valid_loss,time
0,1.346614,1.294557,00:02
1,1.088729,1.081342,00:02
2,0.99438,0.978127,00:02
3,0.843148,0.883507,00:02
4,0.808667,0.866061,00:02


In [12]:
def sigmoid_range_mt(x, low, high):
    "Sigmoid function with range `(low, high)`"
    return torch.sigmoid(x) * (high - low) + low

### [ DOT PRODUCT ] -- sigmoid_range

In [13]:
# @audit : Improve model by forcing prediction between 0 and 5 somehow?

class DotProductM(Module):
    def __init__(
        self, 
        n_users, 
        n_movies, 
        n_factors,
        y_range=(0, 5.5)
    ):
        super().__init__()
        self.user_factors = Embedding(n_users, n_factors)
        self.movie_factors = Embedding(n_movies, n_factors)
        self.y_range = y_range
    def forward(self, x):
        users = self.user_factors(x[:,0])
        movies = self.movie_factors(x[:,1])
        # @audit : WTF ... why does manually copyting the sigmoid_range
        # fastai code work with m1 chip ... but NOT when calling function
        # from fastai library?  What xform is fastai applying that is 
        # preventing m1 from working LOL
        return sigmoid_range_mt((users*movies).sum(dim=1), *self.y_range)

In [14]:
model = DotProductM(n_users, n_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())

In [15]:
# @audit : refactoring fastai => pytorch has DRASTICALLY model perf, why?
learn.fit_one_cycle(5, 5e-3)

epoch,train_loss,valid_loss,time
0,0.994168,0.964676,00:02
1,0.850616,0.890451,00:02
2,0.679248,0.857362,00:02
3,0.468255,0.857912,00:02
4,0.36066,0.86169,00:02


### [ DOT PRODUCT ] -- bias

In [16]:
# @audit : Explain why we are adding bias

class DotProductBias(Module):
    def __init__(
        self,
        n_users,
        n_movies,
        n_factors,
        y_range = (0, 5.5)
    ):
        super().__init__()
        self.user_factors = Embedding(n_users, n_factors)
        self.user_bias = Embedding(n_users, 1)
        self.movie_factors = Embedding(n_movies, n_factors)
        self.movie_bias = Embedding(n_movies, 1)
        self.y_range = y_range
    def forward(self, x):
        users = self.user_factors(x[:,0])
        movies = self.movie_factors(x[:,1])
        # Jupyter cell is crashing here, and the difference is keepdim
        # @audit : Explain why
        # res = (users*movies).sum(dim=1, keepdim=True)
        res = (users*movies).sum(dim=1)
        # print(f"res : presqueeze [{res.shape}]")
        res = res.unsqueeze(1) # equivalent to keepdim=True
        # print(f"res : unsqueeze [{res.shape}]")
        # res += self.user_bias(x[:,0]) + self.movie_bias(x[:,1]) # @audit : crashing here??
        bias = self.user_bias(x[:,0]) + self.movie_bias(x[:,1]) # hmmm this is ok ...
        # res = res + self.user_bias(x[:,0]) # but is this ok? NO this crashes
        # res = res + self.movie_bias(x[:,1]) # howabout this? NO also crashes but not immediately
        # print(f"res shape [{res.shape}] bias shape [{bias.shape}]")
        # res += bias # crashing
        res = res + bias # crashing
        # return sigmoid_range_mt(res, *self.y_range)
        return res # maybe this is OK somehow? NOPE does NOTHING to change outcome


In [17]:
user_factors = Embedding(n_users, n_factors)
user_bias = Embedding(n_users, 1)
users = user_factors(x[:,0])

In [18]:
model = DotProductBias(n_users, n_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())

In [19]:
learn.fit_one_cycle(5, 5e-3)

epoch,train_loss,valid_loss,time
0,1.245302,1.228191,00:03
1,1.056693,1.070815,00:02
2,0.934575,0.948646,00:02
3,0.804969,0.860907,00:02
4,0.765749,0.847356,00:02


#### fastai cpu variant

In [20]:
# Do fastai on CPU
class DotProductBiasFAI(Module):
    def __init__(
        self,
        n_users,
        n_movies,
        n_factors,
        y_range = (0, 5.5)
    ):
        super().__init__()
        self.user_factors = Embedding(n_users, n_factors)
        self.user_bias = Embedding(n_users, 1)
        self.movie_factors = Embedding(n_movies, n_factors)
        self.movie_bias = Embedding(n_movies, 1)
        self.y_range = y_range
    def forward(self, x):
        users = self.user_factors(x[:,0])
        movies = self.movie_factors(x[:,1])
        res = (users*movies).sum(dim=1, keepdim=True)
        res += self.user_bias(x[:,0]) + self.movie_bias(x[:,1]) # @audit : crashing here??
        return sigmoid_range_mt(res, *self.y_range)


In [21]:
model = DotProductBiasFAI(n_users, n_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())

In [22]:
learn.fit_one_cycle(5, 5e-3)

epoch,train_loss,valid_loss,time
0,0.950105,0.923197,00:03
1,0.8218,0.848295,00:02
2,0.594166,0.84891,00:02
3,0.395843,0.871276,00:02
4,0.301095,0.877649,00:02


In [23]:
model = DotProductBiasFAI(n_users, n_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())

In [24]:
learn.fit_one_cycle(5, 5e-3, wd=-0.1)

epoch,train_loss,valid_loss,time
0,0.932985,0.915409,00:02
1,0.822885,0.915387,00:03
2,0.627315,1.21255,00:03
3,0.439667,1.444548,00:02
4,0.317416,1.490096,00:02


### [ EMBEDDINGS ]

#### L is a super charged List

In [25]:
a = L(range(10))
print(a) # [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]



[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [26]:
class T(Module):
    def __init__(self):
        self.a = torch.ones(3)

L(T().parameters())

(#0) []

In [27]:
class T(Module):
    def __init__(self):
        self.a = nn.Parameter(torch.ones(3))

L(T().parameters())

(#1) [Parameter containing:
tensor([1., 1., 1.], requires_grad=True)]

In [28]:
class T(Module):
    def __init__(self):
        self.a = nn.Linear(1,3, bias=False)

t = T()
L(t.parameters())

(#1) [Parameter containing:
tensor([[-0.3875],
        [-0.8044],
        [ 0.8442]], requires_grad=True)]

In [29]:
type(t.a.weight)

torch.nn.parameter.Parameter

#### Create Parameter with Random Init

In [30]:
def create_params(size):
    return nn.Parameter(torch.zeros(*size).normal_(0, 0.01))

In [31]:
class DotProductBiasEmb(Module):

    def __init__(
            self, 
            n_users,
            n_movies,
            n_factors,
            y_range=(0, 5.5)
        ):
        self.user_factors = create_params([n_users, n_factors])
        self.user_bias = create_params([n_users])
        self.movie_factors = create_params([n_movies, n_factors])
        self.movie_bias = create_params([n_movies])
        self.y_range = y_range

    def forward(self, x):
        users = self.user_factors[x[:,0]]
        movies = self.movie_factors[x[:,1]]
        res = (users*movies).sum(dim=1)
        res += self.user_bias[x[:,0]] + self.movie_bias[x[:,1]]
        return sigmoid_range_mt(res, *self.y_range)

In [32]:
model = DotProductBiasEmb(n_users, n_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())

In [33]:
learn.fit_one_cycle(5, 5e-3, wd=0.1)

epoch,train_loss,valid_loss,time
0,0.930217,0.937655,00:03
1,0.854108,0.865882,00:03
2,0.732167,0.822986,00:02
3,0.572907,0.81155,00:02
4,0.478006,0.810909,00:02


#### Interpreting Embeddings and Bias

In [34]:
# Extract movie bias terms from the model
# squeeze() removes any dimension of size one
# - effectively returns a single array of bias value
# - as opposed to an array of array with size one of bias values
# - squeeze 2d tensor to 1d tensor, removes any unnecessary dimensions
movie_bias = learn.model.movie_bias.squeeze()
# select the first 5 indices of movies with the smallest bias term
idxs = movie_bias.argsort()[:5]
# use idxs to lookup movie titles in dls.classes['title'] column
[dls.classes['title'][i] for i in idxs]

['Children of the Corn: The Gathering (1996)',
 'Lawnmower Man 2: Beyond Cyberspace (1996)',
 'Robocop 3 (1993)',
 'Cable Guy, The (1996)',
 'Crow: City of Angels, The (1996)']

In [35]:
idxs = movie_bias.argsort(descending=True)[:5]
[dls.classes['title'][i] for i in idxs]

['Titanic (1997)',
 'Shawshank Redemption, The (1994)',
 'L.A. Confidential (1997)',
 'Silence of the Lambs, The (1991)',
 'As Good As It Gets (1997)']

### USING [ FASTAI COLLAB ]

In [40]:
learn = collab_learner(dls, n_factors=50, y_range=(0, 5.5))

In [41]:
learn.fit_one_cycle(5, 5e-3, wd=0.1)

epoch,train_loss,valid_loss,time
0,0.974395,0.933383,00:03
1,0.895607,0.866923,00:02
2,0.759594,0.821885,00:02
3,0.598035,0.806962,00:02
4,0.494833,0.806945,00:03


In [43]:
# The names of the layers can be seen by printing the model
learn.model

EmbeddingDotBias(
  (u_weight): Embedding(944, 50)
  (i_weight): Embedding(1665, 50)
  (u_bias): Embedding(944, 1)
  (i_bias): Embedding(1665, 1)
)

In [45]:
# We can replicate any previous analysis that we did with our previous models
movie_bias = learn.model.i_bias.weight.squeeze()
idxs = movie_bias.argsort(descending=True)[:5]
[dls.classes['title'][i] for i in idxs]

['Shawshank Redemption, The (1994)',
 'Titanic (1997)',
 'L.A. Confidential (1997)',
 'Star Wars (1977)',
 'Silence of the Lambs, The (1991)']

In [62]:
movie_factors = learn.model.i_weight.weight
idx = dls.classes['title'].o2i['Silence of the Lambs, The (1991)']
# idx = dls.classes['title'].o2i['Star Wars (1977)']
distances = nn.CosineSimilarity(dim=1)(movie_factors, movie_factors[idx][None])
idx = distances.argsort(descending=True)[1]
dls.classes['title'][idx]

'Go Fish (1994)'

In [64]:
embs = get_emb_sz(dls)
embs

[(944, 74), (1665, 102)]

In [65]:
class CollabNN(Module):
    def __init__(
            self,
            user_sz,
            item_sz,
            y_range=(0, 5.5),
            n_act=100
    ):
        self.user_factors = Embedding(*user_sz)
        self.item_factors = Embedding(*item_sz)
        self.layers = nn.Sequential(
            nn.Linear(user_sz[1]+item_sz[1], n_act),
            nn.ReLU(),
            nn.Linear(n_act, 1)
        )
        self.y_range = y_range

    def forward(
            self,
            x
    ):

        embs =self.user_factors(
            x[:,0], self.item_factors(x[:,1])   
        )
        x = self.layers(torch.cat(embs, dim=1))
        return sigmoid_range_mt(x, *self.y_range)

In [66]:
collab = CollabNN(*embs)

In [67]:
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 5e-3, wd=0.01)

epoch,train_loss,valid_loss,time
0,0.489128,0.871878,00:03
1,0.498628,0.95046,00:02
2,0.338813,0.995866,00:03
3,0.223272,1.015341,00:02
4,0.138106,1.021616,00:02


In [68]:
learn = collab_learner(dls, use_nn=True, y_range=(0, 5.5))
layers = [100, 50]
learn.fit_one_cycle(5, 5e-3, wd=0.1)

epoch,train_loss,valid_loss,time
0,0.994689,0.970684,00:04
1,0.910055,0.893291,00:04
2,0.854469,0.866114,00:04
3,0.832707,0.841717,00:04
4,0.780209,0.844169,00:04


# -- CONCLUSION --

For our first non-computer vision application, we looked at recommendation
systems and saw how gradient descent can learn intrinsic factors or biases
about items from a history of ratings.  Those can give us information about
the data.

We also built our first model in Pytorch.  We will do a lot more of this in
the next section of the book, but first, let's finish our dive into the other
general applications of deep learning, continuing with tabular data

#### Questionnaire

- 1 - What problem does collaborative filtering solve?

- 2 - How does it solve it

- 3 - Why might a collaborative filtering predictive model fail to be a very
useful recommendation system?

- 4 - What does a crosstab representation of collaborative filtering data look
like?

- 5 - Write the code to create a crosstab representation of the MovieLens data
(you might need to do some web searching!)

- 6 - What is a latent factor?  Why is it "latent"?

- 7 - What is a dot product?  Calculate a dot product manually using pure Python
with lists.

- 8 - What does pandas.DataFrame.merge do?

- 9 - What is an embedding matrix?

- 10 - What is the relationship between an embedding and a matrix of
one-hot-encoded vectors?

- 11 - Why do we need Embedding if we could use one-hot-encoded vectors for the
same thing?

- 12 - What does an embedding contain before we start training (assuming we're
not using a pretrained model)?

- 13 - Create a class (without peeking, if possible!) and use it

- 14 - What does x[:, 0] return?

```sh
ANSWER : ALL the ELEMENTS in the first COLUMN (in the context of a 2D array or
tensor)
- : specifies that we want all the elements along this dimension
- 0 specifies the index along the second dimension (1st column in this case)
```

- 15 - Rewrite the `DotProduct` class (without peeking, if possible!) and train
a model with it

- 16 - What is a good loss function to use for MovieLens?  Why?

- 17 - What would happen if we used cross-entropy loss with MovieLens?  How
would we need to change the model?

