In [None]:
!pip install fastai
import pandas as pd
import numpy as np
import torch
import fastai

import sklearn
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Record the versions of the main libraries used in this notebook
versions = {
    'torch': torch.__version__,
    'fastai': fastai.__version__,
    'pandas': pd.__version__,
    'numpy': np.__version__,
}
print(versions)

# Embeddings

Embeddings are a critical tool for making neural networks efficient, especially when it comes to tabular data.  Embeddings allow NNs to be as powerful as tree ensemble methods for tabular data.  

When a table contains raw data (text or images) or has very high-cardinality categorical features, it is generally recommended to use a NN instead of, for example, random forests.

An embedding may be thought of as a combination of one-hot-encoding + dimensionality reduction.

In [None]:
import torch

# Collaborative Filtering

Collaborative filtering uses embeddings to find latent factors connecting categorical or labeled inputs and outputs (for example, usernames vs. movie titles).  The hypothesis is that there is a structured latent space defined by these embeddings in which similar points reflect similar users or movies.

## Probabilistic Matrix Factorization Approach

This is from ["Deep learning for coders with fastai & pytorch"](https://www.amazon.com/Deep-Learning-Coders-fastai-PyTorch/dp/1492045527) (Chapter 8).

Here we are trying to find the latent factors controlling the connection between users and the movies they like.

We will use a very simple metric (dot product) to define matches / similarity. This approach to collaborative filtering is called [probabilistic matrix factorization](https://towardsdatascience.com/probabilistic-matrix-factorization-b7852244a321).

Here is a nice introduction from [google](https://developers.google.com/machine-learning/recommendation/collaborative/basics).  One thing google points out is that you can improve things by [weighting observed](https://developers.google.com/machine-learning/recommendation/collaborative/matrix) entries differently than unobserved ones during training (which will make sense soon).  The example below is a simpler implementation which does not do this.

Some [limitations](https://developers.google.com/machine-learning/recommendation/dnn/softmax), which can be overcome by using NN (next section), include:
> 1. This only works on the training data - you cannot make predictions on new, unseen data.  In principle, if you had another model (e.g., NN) to predict the user embedding based on other features you should be able to use the filtering model to make predictions, but this adds another layer - using a NN directly essentially accomplishes the same thing in one step.
> 2. "Popular items tend to be recommended for everyone, especially when using dot product as a similarity measure." User-specific interests are not well-tailored.

In [None]:
from fastai.collab import *
from fastai.tabular.all import *

In [None]:
# Fetch the MovieLens-100k dataset through fastai's untar_data helper
path = untar_data(URLs.ML_100k)

# u.data: tab-separated, no header row, so the column names are supplied here
ratings = pd.read_csv(
    path/'u.data',
    delimiter='\t',
    header=None,
    names=['user', 'movie', 'rating', 'timestamp'],
)

# u.item: pipe-separated and latin-1 encoded; only the first two columns are kept
movies = pd.read_csv(
    path/'u.item',
    delimiter='|',
    encoding='latin-1',
    usecols=(0,1),
    names=('movie', 'title'),
    header=None,
)

In [None]:
# Preview the first rows of the movie table (columns: 'movie', 'title')
movies.head()

In [None]:
# Preview the first rows of the ratings table (columns: 'user', 'movie', 'rating', 'timestamp')
ratings.head()

In [None]:
# Attach the movie titles to every rating. No join keys are given, so pandas
# joins on the column(s) the two frames have in common ('movie').
ratings = ratings.merge(movies, how='inner')

In [None]:
# Build the collaborative-filtering dataloaders straight from the dataframe,
# using the movie title (rather than the numeric movie id) as the item column.
dls = CollabDataLoaders.from_df(
    ratings,
    item_name='title',
    bs=64,
)
dls.show_batch()

In [None]:
# Number of entries fastai recorded for the user column and for the title column
n_users, n_movies = (len(dls.classes[col]) for col in ('user', 'title'))

# Arbitrary choice for this example - in reality, this is a hyperparameter for tuning
n_factors = 5

# Randomly initialised latent factors: one row per user / movie, one column per factor
user_factors = torch.randn((n_users, n_factors))
movie_factors = torch.randn((n_movies, n_factors))

In [None]:
# IPython help syntax: show the documentation for sigmoid_range
?sigmoid_range

In [None]:
# Evaluate sigmoid_range with bounds 1 and 10 on a few sample inputs, including extreme values.
# Float literals are used so the call does not rely on implicit integer-to-float promotion.
f = sigmoid_range(torch.tensor([-100., 0., 1., 2., 3., 4., 5., 100.]), 1, 10)
f  # display the result so the cell actually shows something

In [None]:
# Plot sigmoid_range over [-10, 10] for the bounds below
lower = 1
upper = 10
x = torch.tensor(np.linspace(-10, +10, 1000))
y = sigmoid_range(x, lower, upper) # This creates a response variable that is stretched out

plt.plot(x, y)
plt.xlabel('x')
plt.ylabel('sigmoid_range(x, lower, upper)')
plt.title(f'sigmoid_range with lower={lower}, upper={upper}');  # trailing ';' hides the text repr

In [None]:
# Let's do this using fastai's built in Embedding module

class DotProductBias(Module):
  """Dot-product collaborative-filtering model with per-user and per-movie bias terms.

  n_users / n_movies size the embedding tables, n_factors is the length of each
  latent vector, and y_range is the pair of bounds forwarded to sigmoid_range.
  """
  def __init__(self, n_users, n_movies, n_factors, y_range=(0,5.5)):
    # Latent vectors and a single scalar bias for every user ...
    self.user_factors = Embedding(n_users, n_factors)
    self.user_bias = Embedding(n_users, 1)
    # ... and likewise for every movie
    self.movie_factors = Embedding(n_movies, n_factors)
    self.movie_bias = Embedding(n_movies, 1)
    self.y_range = y_range

  def forward(self, x):
    """Score each row of x; column 0 is used as the user index, column 1 as the movie index."""
    user_idx, movie_idx = x[:,0], x[:,1]
    user_vecs = self.user_factors(user_idx)
    movie_vecs = self.movie_factors(movie_idx)
    # The dot product measures how well a user vector and a movie vector match;
    # keepdim=True keeps a trailing axis of size 1, like the bias embeddings' output
    match = (user_vecs * movie_vecs).sum(dim=1, keepdim=True)
    scores = match + self.user_bias(user_idx) + self.movie_bias(movie_idx)
    return sigmoid_range(scores, *self.y_range)

# Model with 50 latent factors, trained with a (flattened) mean-squared-error loss
model = DotProductBias(n_users, n_movies, n_factors=50)
learn = Learner(
    dls,
    model,
    loss_func=MSELossFlat(),
)

In [None]:
# Train for 5 epochs; weight decay (l2 norm) is used to regularize
learn.fit_one_cycle(n_epoch=5, lr_max=5e-3, wd=0.1)

In [None]:
# To be a little more transparent we can make our own "Embedding" module
def create_params(size):
  """Return a trainable nn.Parameter of shape `size`, filled with normal(mean=0, std=0.01) noise."""
  weights = torch.zeros(*size)
  # Overwrite the zeros in place with small random values
  weights.normal_(mean=0, std=0.01)
  # Wrapping in nn.Parameter tells pytorch this is a trainable parameter
  return nn.Parameter(weights)

class ManualDotProductBias(Module):
  """Dot-product model with biases, storing its weights as raw parameters from create_params.

  n_users / n_movies size the parameter tables, n_factors is the length of each
  latent vector, and y_range is the pair of bounds forwarded to sigmoid_range.
  """
  def __init__(self, n_users, n_movies, n_factors, y_range=(0,5.5)):
    # Factor matrices are (count, n_factors); biases are flat vectors of length count
    self.user_factors = create_params([n_users, n_factors])
    self.user_bias = create_params([n_users])
    self.movie_factors = create_params([n_movies, n_factors])
    self.movie_bias = create_params([n_movies])
    self.y_range = y_range

  def forward(self, x):
    """Score each row of x; column 0 is used as the user index, column 1 as the movie index."""
    user_idx, movie_idx = x[:,0], x[:,1]
    # Plain tensor indexing takes the place of an embedding lookup
    user_vecs = self.user_factors[user_idx]
    movie_vecs = self.movie_factors[movie_idx]
    # The dot product measures how well a user vector and a movie vector match
    match = (user_vecs * movie_vecs).sum(dim=1)
    scores = match + self.user_bias[user_idx] + self.movie_bias[movie_idx]
    return sigmoid_range(scores, *self.y_range)

# Same setup as before, but with the hand-written parameter tables.
# Weight decay (l2 norm) is set on the learner to regularize.
model = ManualDotProductBias(n_users, n_movies, n_factors=50)
learn = Learner(
    dls,
    model,
    loss_func=MSELossFlat(),
    wd=0.1,
)

In [None]:
# Train for 5 epochs; weight decay (l2 norm) is used to regularize
learn.fit_one_cycle(n_epoch=5, lr_max=5e-3, wd=0.1)

In [None]:
# The bias tells us what movies are just "bad" - i.e., even if the dot product is high (movie well matched to user's general taste), a low bias means they probably still won't like it
# Conversely, high biases can show movies that are very "good" and even users who don't normally like a certain type of movie will probably still enjoy it.

movie_bias = learn.model.movie_bias.squeeze()
titles = dls.classes['title']

# Five lowest learned biases
idxs = movie_bias.argsort()[:5]
bad_movies = [titles[i] for i in idxs]

# Five highest learned biases
idxs = movie_bias.argsort(descending=True)[:5]
good_movies = [titles[i] for i in idxs]

In [None]:
# Titles of the five movies with the lowest learned bias
bad_movies

In [None]:
# Titles of the five movies with the highest learned bias
good_movies

In [None]:
# We can look at the latent space

# Project the latent factors of the highest- and lowest-bias movies down to 2-D.
# .cpu() makes the conversion to numpy work even when the model lives on a GPU.
factors = learn.model.movie_factors.detach().cpu().numpy()
look_at = 20 # Look at the "best" / "worst" N movies

idxs_best = movie_bias.argsort(descending=True)[:look_at]
idxs_worst = movie_bias.argsort(descending=False)[:look_at]

# Index with the sorted indices themselves rather than a boolean mask: a mask returns
# rows in ascending index order, so row i would no longer correspond to idxs_best[i] /
# idxs_worst[i], and any title looked up through those indices would be attached to
# the wrong point.
best_rows = idxs_best.cpu().numpy()
worst_rows = idxs_worst.cpu().numpy()

# Fit a single PCA on both groups so that their 2-D coordinates share the same axes
# and can be compared when drawn on one figure.
pca = PCA(n_components=2)
low_d = pca.fit_transform(np.concatenate([factors[best_rows], factors[worst_rows]]))
low_d_best, low_d_worst = low_d[:len(best_rows)], low_d[len(best_rows):]

In [None]:
# Overlay the 2-D projections of the highest-bias (green) and lowest-bias (red) movies,
# annotating every point with the title looked up through the matching index list.
plt.figure(figsize=(10,10))
titles = dls.classes['title']
for points, group_idxs, fmt, label in (
    (low_d_best, idxs_best, 'go', 'highest bias'),
    (low_d_worst, idxs_worst, 'ro', 'lowest bias'),
):
  plt.plot(points[:,0], points[:,1], fmt, label=label)
  for (px, py), idx in zip(points, group_idxs):
    plt.text(px, py, titles[idx])

plt.xlabel('PCA component 1')
plt.ylabel('PCA component 2')
plt.title('Movie latent factors projected to 2-D')
plt.legend();  # trailing ';' hides the text repr

In [None]:
# Fastai does the same thing

# Let fastai build the collaborative-filtering learner, using the same
# number of factors and output range as the hand-written models
learn = collab_learner(
    dls,
    n_factors=50,
    y_range=(0, 5.5),
)
learn.fit_one_cycle(n_epoch=5, lr_max=5e-3, wd=0.1)

In [None]:
# Display the model that collab_learner built
learn.model

In [None]:
# i_bias is presumably the item (here: movie title) bias embedding - confirm against the model printout
item_bias_weights = learn.model.i_bias.weight
movie_bias = item_bias_weights.squeeze()

# Titles of the five entries with the highest bias
idxs = movie_bias.argsort(descending=True)[:5]
titles = dls.classes['title']
[titles[i] for i in idxs]

Importantly, the latent space can be used to define distances, for example - similar things should be close in this space (we hope), so we can define recommendations, or authentication/predictions, based on similarity to a known exemplar.

## Deep Neural Network Approach

This is from ["Deep learning for coders with fastai & pytorch"](https://www.amazon.com/Deep-Learning-Coders-fastai-PyTorch/dp/1492045527) (Chapter 8).

Now, we will use a neural network to predict the latent similarity.  This is achieved by concatenating the factors of the movies and the users to create a single input, then a NN predicts the "degree of recommendation / compatibility" based on that net input.

Google also has a nice example [here](https://developers.google.com/machine-learning/recommendation/dnn/softmax).