In [1]:
from fastai.collab import *
from fastai.tabular import *

## Collaborative filtering example

`collab` models use data in a `DataFrame` of user, items, and ratings.

In [2]:
user,item,title = 'userId','movieId','title'

In [3]:
path = untar_data(URLs.ML_SAMPLE)
path

PosixPath('/home/ubuntu/.fastai/data/movie_lens_sample')

In [4]:
ratings = pd.read_csv(path/'ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,73,1097,4.0,1255504951
1,561,924,3.5,1172695223
2,157,260,3.5,1291598691
3,358,1210,5.0,957481884
4,130,316,2.0,1138999234


That's all we need to create and train a model:

In [5]:
data = CollabDataBunch.from_df(ratings, seed=42)

In [6]:
y_range = [0,5.5]

In [10]:
#collab_learner??
learn = collab_learner??(data, n_factors=50, y_range=y_range)

In [12]:
learn.fit_one_cycle??
#learn.fit_one_cycle(3, 5e-3)

## Movielens 100k

Let's try with the full Movielens 100k data dataset, available from http://files.grouplens.org/datasets/movielens/ml-100k.zip

In [19]:
Config.data_path()
#path=Config.data_path()/'ml-100k'

PosixPath('/home/ubuntu/.fastai/data')

In [20]:
ratings = pd.read_csv(path/'u.data', delimiter='\t', header=None,
                      names=[user,item,'rating','timestamp'])
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [21]:
movies = pd.read_csv(path/'u.item',  delimiter='|', encoding='latin-1', header=None,
                    names=[item, 'title', 'date', 'N', 'url', *[f'g{i}' for i in range(19)]])
movies.head()

Unnamed: 0,movieId,title,date,N,url,g0,g1,g2,g3,g4,...,g9,g10,g11,g12,g13,g14,g15,g16,g17,g18
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [22]:
len(ratings)

100000

In [23]:
rating_movie = ratings.merge(movies[[item, title]])
rating_movie.head()

Unnamed: 0,userId,movieId,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


In [24]:
data = CollabDataBunch.from_df(rating_movie, seed=42, valid_pct=0.1, item_name=title)

In [25]:
data.show_batch()

userId,title,target
606,Matilda (1996),4.0
660,Army of Darkness (1993),3.0
90,Great Expectations (1998),4.0
738,Titanic (1997),5.0
308,Kalifornia (1993),4.0


In [26]:
y_range = [0,5.5]

In [27]:
learn = collab_learner(data, n_factors=40, y_range=y_range, wd=1e-1)

In [38]:
learn.lr_find()
# learn.recorder.plot(skip_end=15)
learn.recorder.get_state

LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.


<bound method Callback.get_state of Recorder
learn: CollabLearner(data=TabularDataBunch;

Train: LabelList (90000 items)
x: CollabList
userId 196; title Kolya (1996); ,userId 63; title Kolya (1996); ,userId 226; title Kolya (1996); ,userId 154; title Kolya (1996); ,userId 306; title Kolya (1996); 
y: FloatList
3.0,3.0,5.0,3.0,5.0
Path: .;

Valid: LabelList (10000 items)
x: CollabList
userId 498; title Casino (1995); ,userId 642; title Pocahontas (1995); ,userId 58; title 2001: A Space Odyssey (1968); ,userId 495; title Cat People (1982); ,userId 618; title Philadelphia (1993); 
y: FloatList
3.0,5.0,4.0,3.0,3.0
Path: .;

Test: None, model=EmbeddingDotBias(
  (u_weight): Embedding(944, 40)
  (i_weight): Embedding(1654, 40)
  (u_bias): Embedding(944, 1)
  (i_bias): Embedding(1654, 1)
), opt_func=functools.partial(<class 'torch.optim.adam.Adam'>, betas=(0.9, 0.99)), loss_func=FlattenedLoss of MSELoss(), metrics=[], true_wd=True, bn_wd=True, wd=0.1, train_bn=True, path=PosixPath('.'), model

In [39]:
learn.fit_one_cycle(5, 5e-3)

epoch,train_loss,valid_loss,time
0,0.983908,0.960738,00:10
1,0.871353,0.885311,00:11
2,0.774986,0.827257,00:10
3,0.678376,0.808172,00:11
4,0.552999,0.807097,00:11


In [41]:
learn.save('dotprod')

Here's [some benchmarks](https://www.librec.net/release/v1.3/example.html) on the same dataset for the popular Librec system for collaborative filtering. They show best results based on RMSE of 0.91, which corresponds to an MSE of `0.91**2 = 0.83`.

## Interpretation

### Setup

In [42]:
learn.load('dotprod');



In [43]:
learn.model

EmbeddingDotBias(
  (u_weight): Embedding(944, 40)
  (i_weight): Embedding(1654, 40)
  (u_bias): Embedding(944, 1)
  (i_bias): Embedding(1654, 1)
)

In [47]:
g = rating_movie.groupby(title)['rating'].count()
top_movies = g.sort_values(ascending=False).index.values[:1000]
top_movies[:10]

array(['Star Wars (1977)', 'Contact (1997)', 'Fargo (1996)', 'Return of the Jedi (1983)', 'Liar Liar (1997)',
       'English Patient, The (1996)', 'Scream (1996)', 'Toy Story (1995)', 'Air Force One (1997)',
       'Independence Day (ID4) (1996)'], dtype=object)

### Movie bias

In [48]:
movie_bias = learn.bias(top_movies, is_item=True)
movie_bias.shape

torch.Size([1000])

In [49]:
mean_ratings = rating_movie.groupby(title)['rating'].mean()
movie_ratings = [(b, i, mean_ratings.loc[i]) for i,b in zip(top_movies,movie_bias)]

In [50]:
item0 = lambda o:o[0]

In [51]:
sorted(movie_ratings, key=item0)[:15]

[(tensor(-0.3601),
  'Children of the Corn: The Gathering (1996)',
  1.3157894736842106),
 (tensor(-0.3185),
  'Lawnmower Man 2: Beyond Cyberspace (1996)',
  1.7142857142857142),
 (tensor(-0.2639), 'Cable Guy, The (1996)', 2.339622641509434),
 (tensor(-0.2613), 'Mortal Kombat: Annihilation (1997)', 1.9534883720930232),
 (tensor(-0.2602), 'Crow: City of Angels, The (1996)', 1.9487179487179487),
 (tensor(-0.2585), 'Barb Wire (1996)', 1.9333333333333333),
 (tensor(-0.2562), 'Striptease (1996)', 2.2388059701492535),
 (tensor(-0.2455), 'Bio-Dome (1996)', 1.903225806451613),
 (tensor(-0.2371), "Stephen King's The Langoliers (1995)", 2.413793103448276),
 (tensor(-0.2197), 'Island of Dr. Moreau, The (1996)', 2.1578947368421053),
 (tensor(-0.2167), 'Tales from the Hood (1995)', 2.037037037037037),
 (tensor(-0.2157), 'Showgirls (1995)', 1.9565217391304348),
 (tensor(-0.2139), 'Lawnmower Man, The (1992)', 2.4461538461538463),
 (tensor(-0.2081), "McHale's Navy (1997)", 2.1884057971014492),
 (tenso

In [52]:
sorted(movie_ratings, key=lambda o: o[0], reverse=True)[:15]

[(tensor(0.6299), "Schindler's List (1993)", 4.466442953020135),
 (tensor(0.5809), 'Shawshank Redemption, The (1994)', 4.445229681978798),
 (tensor(0.5663), 'Silence of the Lambs, The (1991)', 4.28974358974359),
 (tensor(0.5505), 'L.A. Confidential (1997)', 4.161616161616162),
 (tensor(0.5484), 'Titanic (1997)', 4.2457142857142856),
 (tensor(0.5323), 'Good Will Hunting (1997)', 4.262626262626263),
 (tensor(0.5264), 'Rear Window (1954)', 4.3875598086124405),
 (tensor(0.5256), 'Star Wars (1977)', 4.3584905660377355),
 (tensor(0.5197), 'As Good As It Gets (1997)', 4.196428571428571),
 (tensor(0.5064), 'Boot, Das (1981)', 4.203980099502488),
 (tensor(0.5039), 'Casablanca (1942)', 4.45679012345679),
 (tensor(0.4803), 'Close Shave, A (1995)', 4.491071428571429),
 (tensor(0.4798), 'Apt Pupil (1998)', 4.1),
 (tensor(0.4796), 'Godfather, The (1972)', 4.283292978208232),
 (tensor(0.4723), 'Usual Suspects, The (1995)', 4.385767790262173)]

### Movie weights

In [53]:
movie_w = learn.weight(top_movies, is_item=True)
movie_w.shape

torch.Size([1000, 40])

In [54]:
movie_pca = movie_w.pca(3)
movie_pca.shape

torch.Size([1000, 3])

In [59]:
fac0,fac1,fac2 = movie_pca.t()
movie_comp = [(f, i) for f,i in zip(fac0, top_movies)]

In [60]:
sorted(movie_comp, key=itemgetter(0), reverse=True)[:10]

[(tensor(1.1305), 'Wrong Trousers, The (1993)'),
 (tensor(1.1112), 'Close Shave, A (1995)'),
 (tensor(1.0685), 'Chinatown (1974)'),
 (tensor(1.0279), 'Casablanca (1942)'),
 (tensor(1.0268), 'When We Were Kings (1996)'),
 (tensor(0.9937), 'Lawrence of Arabia (1962)'),
 (tensor(0.9837), 'Apocalypse Now (1979)'),
 (tensor(0.9743), 'Third Man, The (1949)'),
 (tensor(0.9687), 'Ran (1985)'),
 (tensor(0.9619), 'Godfather, The (1972)')]

In [61]:
sorted(movie_comp, key=itemgetter(0))[:10]

[(tensor(-1.2888), "McHale's Navy (1997)"),
 (tensor(-1.2278), 'Home Alone 3 (1997)'),
 (tensor(-1.2021), 'Jungle2Jungle (1997)'),
 (tensor(-1.1710), 'Flipper (1996)'),
 (tensor(-1.1410), 'D3: The Mighty Ducks (1996)'),
 (tensor(-1.1071), 'Free Willy 3: The Rescue (1997)'),
 (tensor(-1.1042), 'Leave It to Beaver (1997)'),
 (tensor(-1.1040), 'Grease 2 (1982)'),
 (tensor(-1.0991), 'Batman & Robin (1997)'),
 (tensor(-1.0979), 'Children of the Corn: The Gathering (1996)')]

In [62]:
movie_comp = [(f, i) for f,i in zip(fac1, top_movies)]

In [63]:
sorted(movie_comp, key=itemgetter(0), reverse=True)[:10]

[(tensor(0.8878), 'Ready to Wear (Pret-A-Porter) (1994)'),
 (tensor(0.8525), 'Trainspotting (1996)'),
 (tensor(0.8282), 'Dead Man (1995)'),
 (tensor(0.8025), 'Keys to Tulsa (1997)'),
 (tensor(0.7966), 'Beavis and Butt-head Do America (1996)'),
 (tensor(0.7938), 'Lost Highway (1997)'),
 (tensor(0.7764), 'Clockwork Orange, A (1971)'),
 (tensor(0.7596), 'Brazil (1985)'),
 (tensor(0.7590), 'Stupids, The (1996)'),
 (tensor(0.7578), 'Jude (1996)')]

In [64]:
sorted(movie_comp, key=itemgetter(0))[:10]

[(tensor(-1.1330), 'Braveheart (1995)'),
 (tensor(-1.0290), 'Raiders of the Lost Ark (1981)'),
 (tensor(-1.0011), 'Titanic (1997)'),
 (tensor(-0.8846), 'American President, The (1995)'),
 (tensor(-0.8659), 'Sleepless in Seattle (1993)'),
 (tensor(-0.8621), 'Return of the Jedi (1983)'),
 (tensor(-0.8566), 'Star Wars (1977)'),
 (tensor(-0.8497), 'Independence Day (ID4) (1996)'),
 (tensor(-0.8489), 'Back to the Future (1985)'),
 (tensor(-0.8431), "It's a Wonderful Life (1946)")]

In [65]:
idxs = np.random.choice(len(top_movies), 50, replace=False)
idxs = list(range(50))
X = fac0[idxs]
Y = fac2[idxs]
plt.figure(figsize=(15,15))
plt.scatter(X, Y)
for i, x, y in zip(top_movies[idxs], X, Y):
    plt.text(x,y,i, color=np.random.rand(3)*0.7, fontsize=11)
plt.show()