# LightFM model on the LFM-2b dataset

In [1]:
import pickle

import dask.array as da
import dask.bag as db
import dask.dataframe as dd
import numpy as np
import pandas as pd
from dask_ml.model_selection import train_test_split
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import auc_score, precision_at_k, recall_at_k, reciprocal_rank

In [2]:
# Lazily read the tab-separated user table, then materialise the per-column
# non-null counts as a quick sanity check of the file.
users = dd.read_csv(
    'lfm-b2/users.tsv',
    sep='\t',
)
users.count().compute()

user_id          120322
country           55186
age              120322
gender           120110
creation_time    120322
dtype: int64

In [3]:
# Work on a seeded 0.1% sample of the users so the rest of the notebook stays
# small enough to iterate on.
# NOTE: this rebinds `users`; re-running this cell on its own samples again
# from the already-sampled frame, so re-run from the loading cell instead.
users = users.sample(random_state=42, frac=0.001)
# lengths=True makes dask compute the chunk sizes of the resulting array.
user_ids = users.user_id.to_dask_array(lengths=True)

In [4]:
# Read the per-user listening counts and keep only the rows that belong to
# the sampled users.
listening_counts = dd.read_csv('lfm-b2/listening-counts.tsv', sep='\t')
counts_in_sample = listening_counts.user_id.isin(user_ids)
listening_counts = listening_counts[counts_in_sample]

In [5]:
# Fit the user-id and track-id mappings from the filtered listening counts.
# NOTE(review): no later active cell uses `dataset` (only the commented-out
# user-features cell does); the interaction matrices are built from `test_ds`.
dataset = Dataset()
dataset.fit(
    users=listening_counts['user_id'],
    items=listening_counts['track_id'],
)

In [6]:
# Second Dataset fitted on exactly the same user and track columns. The later
# cells build both the train and the test interaction matrices from this
# object, so the two matrices share one id mapping and one shape.
test_ds = Dataset()
test_ds.fit(
    users=listening_counts['user_id'],
    items=listening_counts['track_id'],
)

In [7]:
# Drop the reference to the listening-counts frame; it is not used again
# after the id mappings have been fitted.
del listening_counts

In [8]:
# Read the individual listening events and keep only those made by the
# sampled users.
listening_events = dd.read_csv('lfm-b2/listening-events.tsv', sep='\t')
events_in_sample = listening_events.user_id.isin(user_ids)
listening_events = listening_events[events_in_sample]

In [9]:
# Hold out 20% of the listening events for evaluation; the remaining 80%
# are used for training. The seed keeps the split reproducible.
le_train, le_test = train_test_split(
    listening_events,
    test_size=0.20,
    shuffle=False,
    random_state=42,
)

In [10]:
# Drop the reference to the unsplit events frame; only le_train / le_test
# are used from here on.
del listening_events

In [11]:
# Build the held-out interaction matrix from (user_id, track_id) pairs of the
# test events. zip already yields 2-tuples, which is the shape
# build_interactions consumes.
test_pairs = zip(le_test.user_id, le_test.track_id)
test_interactions, test_weights = test_ds.build_interactions(test_pairs)
print(repr(test_interactions))

<120x309395 sparse matrix of type '<class 'numpy.int32'>'
	with 276071 stored elements in COOrdinate format>


In [14]:
# Build the training interaction matrix from (user_id, track_id) pairs of the
# training events. It is built from `test_ds` (not `dataset`) so that it
# shares the id mapping and shape of the test matrix.
# NOTE(review): the recorded output of this cell shows the same number of
# stored elements as the test matrix even though the split is 80/20 -- the
# saved output may be stale; re-run from a fresh kernel to confirm.
train_pairs = zip(le_train.user_id, le_train.track_id)
interactions, weights = test_ds.build_interactions(train_pairs)
print(repr(interactions))

<120x309395 sparse matrix of type '<class 'numpy.int32'>'
	with 276071 stored elements in COOrdinate format>


In [15]:
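# TODO: user features are currently disabled; the model below is fitted on
# the interaction matrix only. Kept for reference until features are wired in.
# user_features = dataset.build_user_features((u.user_id, [u.country, u.age, u.gender]) for u in users)
# print(repr(user_features))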

In [16]:
# Train a LightFM model with the WARP ranking loss on the training
# interactions. No user or item feature matrices are passed to fit().
#
# random_state pins the model's random initialisation so that a
# Restart & Run All reproduces the same model; 42 matches the seed used for
# the user sample and the train/test split. (LightFM is imported in the
# top-of-notebook import cell.)
model = LightFM(loss='warp', random_state=42)
# fit() returns the model itself, which is what the cell displays.
model.fit(interactions)

<lightfm.lightfm.LightFM at 0x7f7e248613a0>

In [17]:
# Persist the fitted model so the evaluation cells can be re-run later
# without retraining.
with open('model-train-test.pickle', 'wb') as model_file:
    pickle.dump(model, model_file)

In [26]:
# Mean of the AUC scores returned for the held-out interactions.
# NOTE(review): train_interactions is not passed, so items seen in training
# are not excluded from the ranking -- confirm this is intended.
np.mean(auc_score(model, test_interactions, num_threads=2))

0.6870096

In [27]:
# Mean precision@10 on the held-out interactions (k=10 is the library
# default, spelled out here so the cutoff is visible).
np.mean(precision_at_k(model, test_interactions, k=10, num_threads=2))

0.1075

In [28]:
# Mean recall@10 on the held-out interactions (k=10 is the library default,
# spelled out here so the cutoff is visible).
np.mean(recall_at_k(model, test_interactions, k=10, num_threads=2))

0.0016217656160171993

In [29]:
# Mean reciprocal rank on the held-out interactions.
np.mean(reciprocal_rank(model, test_interactions, num_threads=2))

0.20356373