In [1]:
import numpy as np
from lightfm.data import Dataset
from lightfm.evaluation import auc_score, precision_at_k, recall_at_k, reciprocal_rank
from lightfm.cross_validation import random_train_test_split
from lightfm import LightFM
import pandas as pd
import dask.dataframe as dd
import dask.array as da
import dask.bag as db
from dask_ml.model_selection import train_test_split
import pickle

In [2]:
users = dd.read_csv('lfm-b2/users.tsv', sep='\t')
users.count().compute()

user_id          120322
country           55186
age              120322
gender           120110
creation_time    120322
dtype: int64

In [3]:
users = users.sample(frac=0.002, random_state=42)
user_ids = users['user_id'].to_dask_array(True)

In [4]:
users.head()

Unnamed: 0,user_id,country,age,gender,creation_time
113847,113847,US,44,m,2012-05-07 16:17:46
116210,116210,,-1,n,2012-05-13 15:42:46
92895,92895,,-1,n,2012-03-02 05:15:09
65264,65264,,-1,n,2011-05-10 21:49:13
91135,91135,UA,-1,m,2012-02-22 12:38:47


In [5]:
genders = users['gender'].unique()
genders.compute()

0      m
1      n
2      f
3    NaN
Name: gender, dtype: object

In [6]:
users['gender'] = users['gender'].fillna('n')

In [7]:
users['gender'].unique().compute()

0    m
1    n
2    f
Name: gender, dtype: object

In [8]:
countries = users['country'].unique()
countries.compute()

0      US
1     NaN
2      UA
3      RU
4      ES
5      JP
6      IT
7      DE
8      FR
9      UK
10     NO
11     SE
12     AU
13     NL
14     RO
15     PL
16     IR
17     TR
18     CA
19     BR
20     DK
21     BE
22     FI
23     TV
24     MX
25     CZ
26     ID
27     ME
Name: country, dtype: object

In [9]:
listening_counts = dd.read_csv('lfm-b2/listening-counts.tsv', sep='\t')
listening_counts = listening_counts[listening_counts['user_id'].isin(user_ids)]

In [10]:
listening_counts.head()

Unnamed: 0,user_id,track_id,count
1749589,20981,17852200,3
1749590,20981,47207788,1
1749591,20981,25391108,1
1749592,20981,37278939,1
1749593,20981,40530567,1


In [11]:
all_user_features = [
  *countries,
  *genders,
]

In [12]:
dataset = Dataset()
dataset.fit(listening_counts.user_id, listening_counts.track_id, user_features=all_user_features)

In [13]:
(interactions, weights) = dataset.build_interactions(listening_counts.itertuples(False, None))

In [14]:
num_users, num_items = dataset.interactions_shape()
print('Num users: {}, Num items {}.'.format(num_users, num_items))

Num users: 241, Num items 597086.


In [15]:
user_features = dataset.build_user_features([(x[0], [x[1], x[3]]) for x in users.itertuples(False, None)])

In [16]:
(train_interactions, test_interactions) = random_train_test_split(interactions, random_state=42)

In [17]:
model = LightFM(loss='warp')
model.fit(train_interactions, user_features=user_features)

<lightfm.lightfm.LightFM at 0x7f1f6f0b3850>

In [18]:
auc_score(model, test_interactions, train_interactions, user_features=user_features, num_threads=2).mean()

0.5026315

In [19]:
precision_at_k(model, test_interactions, train_interactions, user_features=user_features, num_threads=2).mean()

0.055186722

In [20]:
recall_at_k(model, test_interactions, train_interactions, user_features=user_features, num_threads=2).mean()

0.0022587719603890037

In [21]:
reciprocal_rank(model, test_interactions, train_interactions, user_features=user_features, num_threads=2).mean()

0.1634656