In [1]:
import numpy as np
from lightfm.data import Dataset
from lightfm.evaluation import auc_score, precision_at_k, recall_at_k, reciprocal_rank
import pandas as pd
import dask.dataframe as dd
import dask.array as da
import dask.bag as db
from dask_ml.model_selection import train_test_split
import pickle

In [2]:
# Lazily read the tab-separated user table into a dask DataFrame.
users_path = 'lfm-b2/users.tsv'
users = dd.read_csv(users_path, sep='\t')
# Per-column non-null counts; compute() materialises the lazy result for display.
users_non_null = users.count()
users_non_null.compute()

user_id          120322
country           55186
age              120322
gender           120110
creation_time    120322
dtype: int64

In [3]:
# Keep a 0.1% random sample of the users; the fixed seed makes the sample repeatable.
users = users.sample(
    frac=0.001,
    random_state=42,
)
# IDs of the sampled users as a dask array (lengths=True requests known chunk sizes).
user_ids = users.user_id.to_dask_array(lengths=True)

In [4]:
# Lazily read the full listening-count table, then keep only rows whose
# user_id belongs to the sampled users.
all_listening_counts = dd.read_csv('lfm-b2/listening-counts.tsv', sep='\t')
from_sampled_user = all_listening_counts['user_id'].isin(user_ids)
listening_counts = all_listening_counts[from_sampled_user]

In [17]:
# Distinct country values among the sampled users (lazy); the second line
# materialises them so the notebook displays the result.
countries = users.country.unique()
countries.compute()

0      US
1     NaN
2      UA
3      RU
4      ES
5      JP
6      IT
7      DE
8      FR
9      UK
10     NO
11     SE
12     AU
13     NL
14     RO
15     PL
16     IR
17     TR
18     CA
19     BR
Name: country, dtype: object

In [18]:
# Distinct gender values among the sampled users (lazy); the second line
# materialises them so the notebook displays the result.
genders = users.gender.unique()
genders.compute()

0      m
1      n
2      f
3    NaN
Name: gender, dtype: object

In [19]:
# Replace missing gender values with 'n', a value that already occurs in the column.
users['gender'] = users.gender.fillna(value='n')

In [20]:
# Display the distinct gender values after filling the missing ones.
users.gender.unique().compute()

0    m
1    n
2    f
Name: gender, dtype: object

In [21]:
# Build the LightFM dataset mappings: user and item ids come from the filtered
# listening counts, and the user-feature vocabulary is every distinct country
# value followed by every distinct gender value.
dataset = Dataset()
all_user_features = list(countries) + list(genders)
dataset.fit(
    users=listening_counts.user_id,
    items=listening_counts.track_id,
    user_features=all_user_features,
)

In [22]:
# Report how many distinct users and items the dataset mappings contain.
interactions_shape = dataset.interactions_shape()
num_users, num_items = interactions_shape
print('Num users: {}, num_items {}.'.format(*interactions_shape))

Num users: 120, num_items 309395.


In [23]:
# Preview the first five rows of the filtered listening counts.
listening_counts.head(n=5)

Unnamed: 0,user_id,track_id,count
1749589,20981,17852200,3
1749590,20981,47207788,1
1749591,20981,25391108,1
1749592,20981,37278939,1
1749593,20981,40530567,1


In [None]:
# build_interactions only accepts (user_id, item_id) or (user_id, item_id, weight)
# tuples, so select exactly those three columns by name instead of relying on the
# frame containing nothing else (the head() preview shows a possible extra
# 'Unnamed: 0' field).
interaction_rows = listening_counts[['user_id', 'track_id', 'count']].itertuples(
    index=False, name=None
)
# `interactions` marks which (user, track) pairs occur; `weights` carries the
# listening count of each pair.
(interactions, weights) = dataset.build_interactions(interaction_rows)

In [None]:
# Preview the first five rows of the sampled users.
users.head(n=5)

In [None]:
# Build the user-feature matrix from each sampled user's country and gender.
# Columns are selected by name rather than by tuple position, and missing values
# (country is NaN for many users) are skipped: NaN never compares equal to
# itself, so using it as a feature name makes the mapping lookup unreliable,
# and "missing" is not a meaningful feature anyway.
user_feature_rows = users[['user_id', 'country', 'gender']].itertuples(
    index=False, name=None
)
user_features = dataset.build_user_features(
    (user_id, [feature for feature in (country, gender) if pd.notna(feature)])
    for user_id, country, gender in user_feature_rows
)

In [None]:
from lightfm import LightFM

# The loss must be passed by keyword: LightFM's first positional parameter is
# `no_components`, so LightFM('warp') would set the embedding size to a string
# instead of selecting the WARP loss.
# random_state pins the model's random initialisation so repeated runs match
# (same seed as the user sampling above).
model = LightFM(loss='warp', random_state=42)
model.fit(interactions, user_features=user_features)