In [1]:
# Install LightFM, the package the later cells import the model, dataset loader
# and evaluation helpers from.
# NOTE(review): the version is unpinned; the captured output below shows
# lightfm 1.11 being downloaded - consider pinning it for reproducibility.
!pip install lightfm

Collecting lightfm
  Downloading lightfm-1.11.tar.gz (244kB)
[K    100% |████████████████████████████████| 245kB 1.8MB/s ta 0:00:01K    8% |██▊                             | 20kB 1.4MB/s eta 0:00:01[K    16% |█████▍                          | 40kB 1.3MB/s eta 0:00:01[K    25% |████████                        | 61kB 1.7MB/s eta 0:00:01[K    37% |████████████                    | 92kB 2.0MB/s eta 0:00:01[K    46% |██████████████▊                 | 112kB 3.0MB/s eta 0:00:01[K    54% |█████████████████▍              | 133kB 2.6MB/s eta 0:00:01[K    62% |████████████████████            | 153kB 4.0MB/s eta 0:00:01[K    71% |██████████████████████▉         | 174kB 4.3MB/s eta 0:00:01[K    79% |█████████████████████████▌      | 194kB 6.8MB/s eta 0:00:01[K    87% |████████████████████████████▏   | 215kB 5.5MB/s eta 0:00:01[K    96% |██████████████████████████████▉ | 235kB 5.9MB/s eta 0:00:01
Collecting scipy>=0.17.0 (from lightfm)
  Using cached scipy-0.18.1-cp27-cp27m-macosx_10_6_in

In [2]:

# Let's load in our new dataset from lightfm.
# NOTE: fetch_movielens() returns the MovieLens ratings dataset (published by
# GroupLens), not the data from Netflix's $1 million challenge.
# numpy is imported here because a later cell calls np.arange / np.argsort.
import numpy as np
from lightfm.datasets import fetch_movielens

# Dict-like bundle of train/test interaction matrices plus item metadata.
movielens = fetch_movielens()



In [3]:
# Peek inside the dataset: show every entry's name, type and shape.
for key in movielens:
    value = movielens[key]
    print(key, type(value), value.shape)

# The data already comes with separate 'train' and 'test' entries. Convenient!

('test', <class 'scipy.sparse.coo.coo_matrix'>, (943, 1682))
('item_features', <class 'scipy.sparse.csr.csr_matrix'>, (1682, 1682))
('train', <class 'scipy.sparse.coo.coo_matrix'>, (943, 1682))
('item_labels', <type 'numpy.ndarray'>, (1682,))
('item_feature_labels', <type 'numpy.ndarray'>, (1682,))


In [4]:
# Pull the two interaction matrices out of the dataset bundle.
train, test = movielens['train'], movielens['test']

Precision at k: of the top k movies the model recommends to a user, what fraction are movies that user actually rated (known positives)?

In [7]:
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score

# Baseline: a LightFM model trained with the BPR loss for 10 epochs.
model = LightFM(loss='bpr', learning_rate=0.05)
model.fit(train, epochs=10)

# precision@10 on each split, averaged over the values returned for that split.
train_precision, test_precision = (
    precision_at_k(model, split, k=10).mean() for split in (train, test)
)

# AUC on each split, averaged the same way.
train_auc, test_auc = (
    auc_score(model, split).mean() for split in (train, test)
)

# What's our baseline?
print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))
print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))

Precision: train 0.59, test 0.10.
AUC: train 0.89, test 0.86.


In [6]:
# Same experiment as the BPR cell, but trained with the WARP loss.
# NOTE: this rebinds `model`, replacing whatever model was held before.
model = LightFM(loss='warp', learning_rate=0.05)
model.fit_partial(train, epochs=10)

# Metrics on the training interactions.
train_precision = precision_at_k(model, train, k=10).mean()
train_auc = auc_score(model, train).mean()

# Metrics on the test interactions.
test_precision = precision_at_k(model, test, k=10).mean()
test_auc = auc_score(model, test).mean()

print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))
print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))

Precision: train 0.61, test 0.11.
AUC: train 0.94, test 0.90.


In [8]:
import pandas as pd

# Let's look into the sparse matrix and see what's going on
def iterateMovies(user, count, labels=None):
    """Print the first `count` positively-rated entries of one user's ratings.

    Parameters
    ----------
    user : sequence of numbers
        One value per item; only entries greater than 0 are printed.
    count : int
        Maximum number of lines to print. Nothing is printed when it is <= 0.
    labels : sequence, optional
        Item names indexed like `user`. Defaults to the notebook-level
        ``movielens['item_labels']`` so existing two-argument calls keep working.

    Each printed line is "<label> <value>".
    """
    if labels is None:
        labels = movielens['item_labels']

    for idx, x in enumerate(user):
        if count <= 0:
            # Quota reached: no need to scan the remaining items.
            break
        if x > 0:
            # %-formatting gives the same "<label> <value>" line on Python 2 and 3.
            print("%s %s" % (labels[idx], x))
            count -= 1

# Densify the training matrix and keep its first row (user 0).
user0 = movielens['train'].toarray()[0]

# There are a lot of movies in there
print(len(user0))
print("")  # blank separator line; works on both Python 2 and 3

# Let's see the first five movies this user rated ...
iterateMovies(user0, 5)


1682

Toy Story (1995) 5.0
GoldenEye (1995) 3.0
Four Rooms (1995) 4.0
Get Shorty (1995) 3.0
Copycat (1995) 3.0


In [9]:
# This guy's got good taste. Let's see what the system recommends for him
def sample_recommendation(model, data, user_ids, n_show=3):
    """Print known positives and top-scored items for each requested user.

    Parameters
    ----------
    model : object
        Must provide ``predict(user_id, item_ids)`` returning one score per
        item id; higher scores are ranked first.
    data : mapping
        Needs ``data['train']`` (a sparse matrix that supports ``tocsr()``,
        one row per user and one column per item) and ``data['item_labels']``
        (an array of item names that can be indexed with an index array).
    user_ids : iterable of int
        Row indices of the users to report on.
    n_show : int, optional
        How many known positives and how many recommendations to print per
        user. Defaults to 3.

    Items the user already has in ``data['train']`` are not filtered out of
    the recommendations.
    """
    item_labels = data['item_labels']
    # Convert once up front instead of once per user inside the loop.
    train_csr = data['train'].tocsr()
    all_item_ids = np.arange(train_csr.shape[1])

    for user_id in user_ids:
        # Column indices of the entries stored in this user's training row.
        known_positives = item_labels[train_csr[user_id].indices]

        scores = model.predict(user_id, all_item_ids)
        # Negating the scores makes argsort return the best-scored items first.
        top_items = item_labels[np.argsort(-scores)]

        print("User %s" % user_id)
        print("     Known positives:")

        for x in known_positives[:n_show]:
            print("        %s" % x)

        print("     Recommended:")

        for x in top_items[:n_show]:
            print("        %s" % x)

# Show known positives and recommendations for user 0.
# NOTE(review): `model` is whichever model was trained last. The saved execution
# counts show the BPR cell (In [7]) ran after the WARP cell (In [6]), so confirm
# with Restart & Run All which model produced the output below.
sample_recommendation(model, movielens, [0])

# Notice I left the ids open here. Feel free to try it with any of the other users.
# You can even rate things yourself if you'd like

User 0
     Known positives:
        Toy Story (1995)
        GoldenEye (1995)
        Four Rooms (1995)
     Recommended:
        Raiders of the Lost Ark (1981)
        Silence of the Lambs, The (1991)
        Pulp Fiction (1994)
