In [1]:
import fastbook
fastbook.setup_book()

In [2]:
from fastbook import *

### Collaborative Filtering Deep Dive

Collaborative filtering usually calls thing *items* rather than *products*

Key foundational idea is *latent factors* 

#### A First Look at the Data

We're going to use a dataset called [MovieLens](https://grouplens.org/datasets/movielens/)

- Contains 10s of millions of rankings(a combo of a movie id, user id, and numeric rating)

- Just going to use on a subset of 100k for this

- Good practice to go back and try this on the full set at some point

In [3]:
from fastai.collab import *
from fastai.tabular.all import *
path = untar_data(URLs.ML_100k)

In [8]:
path.ls()

(#276) [Path('/home/petewin/.fastai/data/ml-100k/u2.base'),Path('/home/petewin/.fastai/data/ml-100k/u.genre'),Path('/home/petewin/.fastai/data/ml-100k/u.user'),Path('/home/petewin/.fastai/data/ml-100k/u3.test'),Path('/home/petewin/.fastai/data/ml-100k/ub.test'),Path('/home/petewin/.fastai/data/ml-100k/u.occupation'),Path('/home/petewin/.fastai/data/ml-100k/ua.test'),Path('/home/petewin/.fastai/data/ml-100k/u5.base'),Path('/home/petewin/.fastai/data/ml-100k/u4.test'),Path('/home/petewin/.fastai/data/ml-100k/u.info')...]

In [10]:
ratings = pd.read_csv(path/'u.data', delimiter='\t', header=None,
                      names=['user','movie','rating','timestamp'])
ratings.head()

Unnamed: 0,user,movie,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


Above has everything we need, there's also cross-tabulated data that's more human-readable

In the cross-tab format we can see that users have rated some movies, not others, we're going to try to predict how much they would enjoy a given movie that they have not yet rated

In [11]:
last_skywalker = np.array([0.98, 0.9, -0.9])

In [12]:
user1 = np.array([0.9, 0.8, -0.6])

In [13]:
(user1 * last_skywalker).sum()

2.1420000000000003

When multiplying two vectors together and adding up results, it's known as the *dot product*

In [14]:
casablanca = np.array([-0.99, -0.3, 0.8])

In [15]:
(user1 * casablanca).sum()

-1.611

In [17]:
user1 * casablanca

array([-0.891, -0.24 , -0.48 ])

In [18]:
movies = pd.read_csv(path/'u.item', delimiter='|', encoding='latin-1',
                     usecols=(0,1), names=('movie', 'title'), header=None)
movies.head()                     

Unnamed: 0,movie,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


Merge with ratings table

In [19]:
ratings = ratings.merge(movies)
ratings.head()

Unnamed: 0,user,movie,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


In [30]:
dls = CollabDataLoaders.from_df(ratings, item_name='title', bs=64)
dls.show_batch()

Unnamed: 0,user,title,rating
0,90,Arsenic and Old Lace (1944),4
1,770,Face/Off (1997),4
2,881,North by Northwest (1959),4
3,13,Victor/Victoria (1982),1
4,774,"Graduate, The (1967)",1
5,798,Ghost (1990),3
6,294,"Lost World: Jurassic Park, The (1997)",4
7,498,"NeverEnding Story III, The (1994)",2
8,75,Executive Decision (1996),4
9,178,"Usual Suspects, The (1995)",5


In [34]:
dls_item_test = CollabDataLoaders.from_df(ratings, bs=64)
dls.show_batch()

Unnamed: 0,user,title,rating
0,666,Scream (1996),3
1,128,My Fair Lady (1964),3
2,385,"Fugitive, The (1993)",3
3,894,"Secret of Roan Inish, The (1994)",4
4,434,"First Wives Club, The (1996)",4
5,500,"African Queen, The (1951)",4
6,188,Primal Fear (1996),5
7,897,"Indian in the Cupboard, The (1995)",3
8,764,Schindler's List (1993),5
9,514,Multiplicity (1996),3


In [37]:
dls_item_test.x_names

(#2) ['user','movie']

In [38]:
dls.x_names

(#2) ['user','title']

In [54]:
n_users = len(dls.classes['user'])
n_movies = len(dls.classes['title'])
n_factors = 5

# generate intial user and movie factors matrices
user_factors = torch.randn(n_users, n_factors)
movie_factors = torch.randn(n_movies, n_factors)
user_factors_pb = torch.randn(n_users, 5)

In [50]:
n_users, n_movies

(944, 1665)

In [55]:
user_factors, user_factors.shape

(tensor([[-0.5617,  0.4572, -0.3097, -0.2996,  1.4585],
         [ 1.2071, -1.2319, -0.5017, -0.0137,  0.5625],
         [-1.0408,  0.2596, -1.4479,  0.1861,  0.5044],
         ...,
         [-1.4306, -0.3952,  1.5358, -0.0423,  0.4700],
         [ 1.7414,  1.0096,  0.5727,  1.0176, -1.8337],
         [-0.4501, -0.2672, -0.6809,  0.8564,  0.0702]]),
 torch.Size([944, 5]))