In [1]:
import numpy as np
import pandas as pd
from lightfm.datasets import fetch_movielens



In [2]:
movielens = fetch_movielens()
movielens.keys()

dict_keys(['train', 'test', 'item_features', 'item_feature_labels', 'item_labels'])

The lightfm library provides out of the box a train/test split, which can be accessed through:

In [3]:
train = movielens['train']
test = movielens['test']
item_features = movielens['item_features']
item_feature_labels = movielens['item_feature_labels']
item_labels = movielens['item_labels']

There are 943 users, 1682 movies.

The train set has 90570 ratings and the test set has 9430.

These elements are stored as sparse matrices due to the large numbers of 0s (combinations of users/movies without ratings)

In [4]:
train

<943x1682 sparse matrix of type '<class 'numpy.int32'>'
	with 90570 stored elements in COOrdinate format>

In [5]:
test

<943x1682 sparse matrix of type '<class 'numpy.int32'>'
	with 9430 stored elements in COOrdinate format>

In [6]:
print(train.shape)
print(test.shape)

(943, 1682)
(943, 1682)


In [7]:
# Inspecting the data format in the dense format

In [8]:
pd.DataFrame(train.todense()).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
0,5,3,4,3,3,5,4,1,5,3,...,0,0,0,0,0,0,0,0,0,0
1,4,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
print(pd.DataFrame(item_features.todense()).head())
print(pd.DataFrame(item_feature_labels).head())
print(pd.DataFrame(item_labels).head())

   0     1     2     3     4     5     6     7     8     9     ...  1672  \
0   1.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
1   0.0   1.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
2   0.0   0.0   1.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
3   0.0   0.0   0.0   1.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
4   0.0   0.0   0.0   0.0   1.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   

   1673  1674  1675  1676  1677  1678  1679  1680  1681  
0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
1   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
2   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
3   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
4   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  

[5 rows x 1682 columns]
                   0
0   Toy Story (1995)
1   GoldenEye (1995)
2  Four Rooms (1995)
3  Get Shorty (1995)
4     Copycat (1995)
                   0
0   Toy Story (1995)
1 

In [10]:
print(movielens['item_features'])

  (0, 0)	1.0
  (1, 1)	1.0
  (2, 2)	1.0
  (3, 3)	1.0
  (4, 4)	1.0
  (5, 5)	1.0
  (6, 6)	1.0
  (7, 7)	1.0
  (8, 8)	1.0
  (9, 9)	1.0
  (10, 10)	1.0
  (11, 11)	1.0
  (12, 12)	1.0
  (13, 13)	1.0
  (14, 14)	1.0
  (15, 15)	1.0
  (16, 16)	1.0
  (17, 17)	1.0
  (18, 18)	1.0
  (19, 19)	1.0
  (20, 20)	1.0
  (21, 21)	1.0
  (22, 22)	1.0
  (23, 23)	1.0
  (24, 24)	1.0
  :	:
  (1657, 1657)	1.0
  (1658, 1658)	1.0
  (1659, 1659)	1.0
  (1660, 1660)	1.0
  (1661, 1661)	1.0
  (1662, 1662)	1.0
  (1663, 1663)	1.0
  (1664, 1664)	1.0
  (1665, 1665)	1.0
  (1666, 1666)	1.0
  (1667, 1667)	1.0
  (1668, 1668)	1.0
  (1669, 1669)	1.0
  (1670, 1670)	1.0
  (1671, 1671)	1.0
  (1672, 1672)	1.0
  (1673, 1673)	1.0
  (1674, 1674)	1.0
  (1675, 1675)	1.0
  (1676, 1676)	1.0
  (1677, 1677)	1.0
  (1678, 1678)	1.0
  (1679, 1679)	1.0
  (1680, 1680)	1.0
  (1681, 1681)	1.0


In [11]:
print(movielens['item_feature_labels'])

['Toy Story (1995)' 'GoldenEye (1995)' 'Four Rooms (1995)' ...
 'Sliding Doors (1998)' 'You So Crazy (1994)'
 'Scream of Stone (Schrei aus Stein) (1991)']


In [12]:
# Exploratory data analysis

Potential Questions:
1-Rating Distribution
2-Number of Ratings per User
3-Number of Rating per Movie

SyntaxError: invalid syntax (3823774515.py, line 3)

# Model

In [15]:
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score

model = LightFM(learning_rate=0.05, loss='bpr')
model.fit(train, epochs=10)

train_precision = precision_at_k(model, train, k=10).mean()
test_precision = precision_at_k(model, test, k=10, train_interactions=train).mean()

train_auc = auc_score(model, train).mean()
test_auc = auc_score(model, test, train_interactions=train).mean()

print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))
print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))

Precision: train 0.61, test 0.20.
AUC: train 0.90, test 0.88.
