In [1]:
import csv
import os
import pickle

import dask.dataframe as dd
import numpy as np
import pandas as pd
from lightfm import LightFM
from lightfm.cross_validation import random_train_test_split
from lightfm.data import Dataset
from lightfm.evaluation import auc_score, precision_at_k, recall_at_k, reciprocal_rank

# --- Run configuration --------------------------------------------------------
# Thread count passed to LightFM for both training and evaluation.
num_threads = 12
# Share of the interactions held out by the train/test split.
test_percentage = 0.25
# Number of training epochs.
epochs = 2
# Seed used for the train/test split and for the model.
seed = 42

# --- Interaction filters ------------------------------------------------------
# Users with fewer than `min_tracks_for_user` listening-count rows are removed.
min_tracks_for_user = 200
# Tracks with fewer than `min_listenings_for_track` listening-count rows are removed.
min_listenings_for_track = 2000

# --- File locations -----------------------------------------------------------
# Prefix of the pickle files written after training.
base_file_name = 'lightfm-filtered-split-userfeatures2'
tracks_path = 'lfm-b2/tracks.tsv'
users_path = 'lfm-b2/users.tsv'

In [2]:
# Load the tab-separated user table and display it.
users = pd.read_csv(
    users_path,
    sep='\t',
)
users

Unnamed: 0,user_id,country,age,gender,creation_time
0,0,UK,31,m,2002-12-28 01:00:00
1,1,US,43,m,2003-04-15 02:00:00
2,2,UK,35,m,2002-10-29 01:00:00
3,3,BR,31,m,2003-07-20 02:00:00
4,4,,51,m,2003-07-21 02:00:00
...,...,...,...,...,...
120317,120317,RU,19,m,2012-07-21 18:15:05
120318,120318,,-1,n,2012-07-23 21:17:42
120319,120319,BY,19,f,2012-07-19 22:07:27
120320,120320,,-1,m,2012-07-23 19:25:08


In [3]:
# Distinct raw gender values.
# NOTE(review): this is captured before the missing genders are replaced in the
# next cell, so the array still contains NaN, and it is later reused as part of
# the user-feature list. Confirm downstream usage before dropping the NaN here.
genders = users.gender.unique()
genders

array(['m', 'n', nan, 'f'], dtype=object)

In [4]:
# Replace missing genders with the existing 'n' category.
gender_filled = users['gender'].fillna('n')
users['gender'] = gender_filled

In [5]:
# Sanity check: list the distinct gender values after the fill.
users.gender.unique()

array(['m', 'n', 'f'], dtype=object)

In [6]:
# Distinct country codes; rows without a country are dropped first, so the
# resulting array contains no NaN entry.
known_countries = users['country'].dropna()
countries = known_countries.unique()
countries

array(['UK', 'US', 'BR', 'ES', 'AT', 'IT', 'AU', 'NL', 'IE', 'DE', 'CA',
       'DK', 'RU', 'CH', 'JP', 'SE', 'FI', 'BE', 'NO', 'AQ', 'HU', 'CZ',
       'KH', 'LV', 'IL', 'NZ', 'ZA', 'UA', 'PL', 'FR', 'CC', 'RS', 'MX',
       'IS', 'BG', 'PT', 'ST', 'AR', 'NI', 'KY', 'SI', 'LU', 'UM', 'TW',
       'MG', 'CX', 'SJ', 'TR', 'AZ', 'TH', 'CL', 'LB', 'EE', 'AF', 'LI',
       'BY', 'HR', 'LT', 'GR', 'CR', 'IN', 'ME', 'NR', 'FM', 'BF', 'PE',
       'AE', 'SK', 'NU', 'ID', 'CO', 'PH', 'RO', 'CY', 'HM', 'SG', 'VE',
       'KI', 'RE', 'MK', 'MY', 'VA', 'KR', 'MW', 'KZ', 'HT', 'VN', 'KP',
       'MU', 'HK', 'CN', 'TT', 'AS', 'EH', 'NC', 'DZ', 'PA', 'BB', 'OM',
       'EG', 'GU', 'GT', 'IR', 'IO', 'PS', 'MV', 'JO', 'ZW', 'GH', 'UY',
       'PR', 'FJ', 'BA', 'DO', 'GS', 'TJ', 'VG', 'BW', 'WS', 'VI', 'AD',
       'PF', 'AM', 'GE', 'MT', 'BO', 'CU', 'KG', 'BN', 'UZ', 'TV', 'CI',
       'AL', 'BD', 'SC', 'TG', 'EC', 'LA', 'SV', 'VU', 'HN', 'MD', 'GP',
       'BI', 'TN', 'KE', 'ZM', 'LK', 'BT', 'SR', 'J

In [7]:
# Load the tab-separated track table.
# - quoting=csv.QUOTE_NONE: quote characters are read as ordinary text
#   (presumably because track names contain literal quotes; confirm).
# - na_filter=False: missing-value detection is switched off, so empty fields
#   stay empty strings; `na_values` has no effect while it is off.
# - on_bad_lines='warn': malformed rows are skipped and reported. This replaces
#   the deprecated `error_bad_lines=False` (removed in pandas 2.0), which mapped
#   to the same skip-and-warn behaviour.
tracks = pd.read_csv(
    tracks_path,
    sep='\t',
    na_values='',
    na_filter=False,
    on_bad_lines='warn',
    quoting=csv.QUOTE_NONE,
)
tracks



  tracks = pd.read_csv(


Unnamed: 0,track_id,artist_name,track_name
0,0,Mika Miko,
1,1,Comm,
2,2,deadmau5,
3,3,Rasheed Chappell,
4,4,rck,
...,...,...,...
50813368,50813368,ЗупиниЛося,"🦌 ЗупиниЛося №60. Бійки, стрілянина і табуни д..."
50813369,50813369,ЗупиниЛося,🦌 ЗупиниЛося №69. Повний паркувальний бардак н...
50813370,50813370,Lost Cascades,🧝‍♀️ 𝒜 𝒮𝓉𝓇𝒶𝓃𝑔𝑒𝓇 𝒜𝓅𝓅𝓇𝑜𝒶𝒸𝒽𝑒𝓈: '𝒟𝓇𝒶𝓊⻕
50813371,50813371,Lost Cascades,🧝♀️ 𝒜 𝒮𝓉𝓇𝒶𝓃𝑔𝑒𝓇 𝒜𝓅𝓅𝓇𝑜𝒶𝒸𝒽𝑒𝓈: '𝒟𝓇𝒶𝓊𝑔𝒽𝓉 𝑜𝒻 𝐵𝓁𝑒𝓈𝓈𝑒𝒹...


In [8]:
# Load the tab-separated listening counts (columns: user_id, track_id, count).
listening_counts_path = 'lfm-b2/listening-counts.tsv'
listening_counts = pd.read_csv(listening_counts_path, sep='\t')
listening_counts

Unnamed: 0,user_id,track_id,count
0,92915,26719936,1
1,92915,4271407,1
2,92915,4606511,1
3,92915,8966085,1
4,92915,12290897,1
...,...,...,...
519293328,119080,44144633,1
519293329,119080,45141923,1
519293330,119080,46171716,1
519293331,119080,46245572,1


In [9]:
# Count the listening-count rows of every user; the result is a one-column
# frame ('size') indexed by user_id.
rows_per_user = listening_counts.groupby('user_id').size()
listening_count_sizes_by_user = rows_per_user.to_frame('size')
listening_count_sizes_by_user

Unnamed: 0_level_0,size
user_id,Unnamed: 1_level_1
0,17612
1,775
2,8658
3,9435
4,15471
...,...
120317,3970
120318,3559
120319,14021
120320,676


In [10]:
# Inspect the users that fall below the minimum and will be filtered out.
has_too_few_tracks = listening_count_sizes_by_user['size'] < min_tracks_for_user
listening_count_sizes_by_user[has_too_few_tracks]

Unnamed: 0_level_0,size
user_id,Unnamed: 1_level_1
85,45
149,188
391,160
451,90
472,170
...,...
120305,37
120308,84
120311,69
120312,36


In [11]:
# Keep only the users with at least `min_tracks_for_user` listening-count rows.
# The ids are matched against the `user_id` column. The previous
# `users.filter(..., axis='index')` matched them against the row index instead,
# which is only correct while the index happens to be identical to `user_id`.
active_user_ids = listening_count_sizes_by_user.index[
    listening_count_sizes_by_user['size'] >= min_tracks_for_user
]
users_with_enough_interactions = users[users['user_id'].isin(active_user_ids)]
users_with_enough_interactions

Unnamed: 0,user_id,country,age,gender,creation_time
0,0,UK,31,m,2002-12-28 01:00:00
1,1,US,43,m,2003-04-15 02:00:00
2,2,UK,35,m,2002-10-29 01:00:00
3,3,BR,31,m,2003-07-20 02:00:00
4,4,,51,m,2003-07-21 02:00:00
...,...,...,...,...,...
120317,120317,RU,19,m,2012-07-21 18:15:05
120318,120318,,-1,n,2012-07-23 21:17:42
120319,120319,BY,19,f,2012-07-19 22:07:27
120320,120320,,-1,m,2012-07-23 19:25:08


In [12]:
# Count the listening-count rows of every track; the result is a one-column
# frame ('size') indexed by track_id.
rows_per_track = listening_counts.groupby('track_id').size()
listening_count_sizes_by_track = rows_per_track.to_frame('size')
listening_count_sizes_by_track

Unnamed: 0_level_0,size
track_id,Unnamed: 1_level_1
0,1
1,1
2,1
3,1
4,1
...,...
50813368,1
50813369,1
50813370,2
50813371,1


In [13]:
# Inspect the tracks that fall below the minimum and will be filtered out.
has_too_few_listenings = listening_count_sizes_by_track['size'] < min_listenings_for_track
listening_count_sizes_by_track[has_too_few_listenings]

Unnamed: 0_level_0,size
track_id,Unnamed: 1_level_1
0,1
1,1
2,1
3,1
4,1
...,...
50813368,1
50813369,1
50813370,2
50813371,1


In [14]:
# Keep only the tracks with at least `min_listenings_for_track` listening-count
# rows. The ids are matched against the `track_id` column. The previous
# `tracks.filter(..., axis='index')` matched them against the row index, which
# stops being equal to `track_id` as soon as a malformed line is skipped while
# reading the track file.
popular_track_ids = listening_count_sizes_by_track.index[
    listening_count_sizes_by_track['size'] >= min_listenings_for_track
]
tracks_with_enough_interactions = tracks[tracks['track_id'].isin(popular_track_ids)]
tracks_with_enough_interactions

Unnamed: 0,track_id,artist_name,track_name
4774,4774,The Gaslight Anthem,"""45"""
24587,24587,David Bowie,"""Heroes"""
53266,53266,Fall Out Boy,"""The Take Over, The Breaks Over"""
60677,60677,Queens of the Stone Age,"""You Got a Killer Scene There, Man..."""
74470,74470,Animal Collective,#1
...,...,...,...
48014274,48014274,Ólafur Arnalds,Þú ert sólin
48333764,48333764,Сплин,Выхода нет
49707344,49707344,Queens of the Stone Age,…Like Clockwork
49721633,49721633,Baths,♥


In [15]:
# Keep an interaction only when both its user and its track passed the filters.
from_kept_user = listening_counts['user_id'].isin(users_with_enough_interactions['user_id'])
on_kept_track = listening_counts['track_id'].isin(tracks_with_enough_interactions['track_id'])
selected_listening_counts = listening_counts[from_kept_user & on_kept_track]
selected_listening_counts

Unnamed: 0,user_id,track_id,count
1,92915,4271407,1
2,92915,4606511,1
5,92915,19127241,1
6,92915,24076341,1
24,92915,20415914,1
...,...,...,...
519292335,119080,28543631,1
519292626,119080,12440840,1
519292741,119080,28127652,1
519293157,119080,6956607,1


In [16]:
# Drop the names of the frames that are not used again below (presumably to
# free memory). `users_with_enough_interactions`, `countries` and `genders`
# are still needed and are kept.
del (
    listening_counts,
    tracks_with_enough_interactions,
    listening_count_sizes_by_track,
    listening_count_sizes_by_user,
    users,
    tracks,
)

In [17]:
# Every feature name a user can carry: all country codes followed by all
# gender values.
# NOTE(review): `genders` was captured before the missing genders were filled,
# so this list also contains NaN. Presumably that entry is what allows users
# with a missing country to be accepted when the user features are built;
# confirm before removing it.
all_user_features = list(countries) + list(genders)

In [18]:
dataset = Dataset()
%time
dataset.fit(selected_listening_counts.user_id, selected_listening_counts.track_id, user_features=all_user_features)

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 8.58 µs


In [19]:
# Each row is passed as a plain (user_id, track_id, count) tuple, without the
# index and without a namedtuple wrapper.
interaction_rows = selected_listening_counts.itertuples(index=False, name=None)
interactions, weights = dataset.build_interactions(interaction_rows)

In [20]:
# Report how many users and items ended up in the dataset.
interactions_shape = dataset.interactions_shape()
num_users, num_items = interactions_shape
print('Num users: {}, Num items {}.'.format(*interactions_shape))

Num users: 107949, Num items 20098.


In [21]:
# Filtering out tracks can leave some users without any interaction; keep only
# the users that still appear in the selected interactions.
still_has_interactions = users_with_enough_interactions['user_id'].isin(
    selected_listening_counts['user_id']
)
users_in_dataset = users_with_enough_interactions[still_has_interactions]

# One (user_id, [country, gender]) entry per remaining user.
# NOTE(review): `country` can be missing (NaN) for some users; verify how that
# value is resolved against the fitted feature names.
user_features = dataset.build_user_features([
    (user_id, [country, gender])
    for user_id, country, gender in zip(
        users_in_dataset['user_id'],
        users_in_dataset['country'],
        users_in_dataset['gender'],
    )
])

In [22]:
# The filtered frames are not used after the matrices have been built.
del users_with_enough_interactions, selected_listening_counts

In [23]:
# Split the interaction matrix and the weight matrix with identical settings.
# NOTE(review): both calls share the same seed, presumably so that the two
# splits select the same entries; confirm this holds for the library version
# in use.
split_options = dict(test_percentage=test_percentage, random_state=seed)

train_interactions, test_interactions = random_train_test_split(interactions, **split_options)
train_weights, test_weights = random_train_test_split(weights, **split_options)

In [24]:
# Train a WARP-loss model on the training split, using the user features and
# the training weights as per-interaction sample weights.
model = LightFM(loss='warp', random_state=seed)

fit_options = dict(
    user_features=user_features,
    sample_weight=train_weights,
    epochs=epochs,
    num_threads=num_threads,
    verbose=True,
)
model.fit(train_interactions, **fit_options)

Epoch: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [01:06<00:00, 33.45s/it]


<lightfm.lightfm.LightFM at 0x7fe96ce001f0>

In [25]:
def dump(var, name):
    """Pickle `var` to ./pickle/<base_file_name>.<name>.pickle.

    Args:
        var: Any picklable object.
        name: Label inserted into the file name to tell the artifacts apart.

    The target directory is created when it does not exist yet, so a fresh
    checkout does not fail with FileNotFoundError after a long training run.
    """
    target_path = f'./pickle/{base_file_name}.{name}.pickle'
    os.makedirs(os.path.dirname(target_path), exist_ok=True)
    with open(target_path, 'wb') as handle:
        pickle.dump(var, handle)

# Persist the trained model together with the matrices needed to evaluate it.
for artifact_name, artifact in (
    ('model', model),
    ('test_interactions', test_interactions),
    ('train_interactions', train_interactions),
    ('user_features', user_features),
):
    dump(artifact, artifact_name)

In [26]:
%%time
recall_at_k(
    model,
    test_interactions,
    train_interactions,
    k=3,
    user_features=user_features,
    num_threads=num_threads,
).mean()

CPU times: user 19min 35s, sys: 1.42 s, total: 19min 36s
Wall time: 2min 9s


0.004211642693356601

In [27]:
%%time
recall_at_k(
    model,
    test_interactions,
    train_interactions,
    k=5,
    user_features=user_features,
    num_threads=num_threads,
).mean()

CPU times: user 19min 39s, sys: 1.6 s, total: 19min 41s
Wall time: 2min 10s


0.006325640825753525

In [28]:
%%time
recall_at_k(
    model,
    test_interactions,
    train_interactions,
    k=10,
    user_features=user_features,
    num_threads=num_threads,
).mean()

CPU times: user 19min 43s, sys: 2.19 s, total: 19min 45s
Wall time: 2min 8s


0.010928274023149637

In [29]:
%%time
precision_at_k(
    model,
    test_interactions,
    train_interactions,
    k=3,
    user_features=user_features,
    num_threads=num_threads,
).mean()

CPU times: user 19min 25s, sys: 2.15 s, total: 19min 28s
Wall time: 2min 7s


0.20164616

In [30]:
%%time
precision_at_k(
    model,
    test_interactions,
    train_interactions,
    k=5,
    user_features=user_features,
    num_threads=num_threads,
).mean()

CPU times: user 19min 36s, sys: 2.25 s, total: 19min 38s
Wall time: 2min 7s


0.18863027

In [31]:
%%time
precision_at_k(
    model,
    test_interactions,
    train_interactions,
    k=10,
    user_features=user_features,
    num_threads=num_threads,
).mean()

CPU times: user 19min 30s, sys: 1.97 s, total: 19min 32s
Wall time: 2min


0.17024226

In [32]:
%%time
reciprocal_rank(
    model,
    test_interactions,
    train_interactions,
    user_features=user_features,
    num_threads=num_threads,
).mean()

CPU times: user 19min 29s, sys: 2.48 s, total: 19min 32s
Wall time: 2min 9s


0.3338196

In [33]:
%%time
auc_score(
    model,
    test_interactions,
    train_interactions,
    user_features=user_features,
    num_threads=num_threads,
).mean()

CPU times: user 19min 36s, sys: 1.84 s, total: 19min 38s
Wall time: 2min 9s


0.7942246