# Эксперименты с LightFM

In [1]:
import warnings

warnings.filterwarnings("ignore")

import lightgbm
import matplotlib.pyplot as plt
%matplotlib notebook
import numpy as np
import pandas as pd
import plotly.express as px
import scipy
import seaborn as sns
from lightfm import LightFM
from scipy.sparse import coo_matrix, csr_matrix, save_npz
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from pandas.api.types import CategoricalDtype
from scipy import sparse



import pickle

## Подготовка данных

Загружаем датасет

In [2]:
df = pd.read_csv('spotify_dataset.csv.zip', skiprows=1,
                 usecols = [0, 1, 2],
                 names=['user_id', 'artistname', 'trackname'],
                 on_bad_lines='skip')
df.head(2)

Unnamed: 0,user_id,artistname,trackname
0,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,(The Angels Wanna Wear My) Red Shoes
1,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,"(What's So Funny 'Bout) Peace, Love And Unders..."


Собираем пары артист_трек

In [3]:
df['artistname'].fillna('', inplace=True)
df['trackname'].fillna('', inplace=True)

df['song'] = df['artistname'] + '__' + df['trackname']
df.drop(columns=['artistname', 'trackname'], inplace = True)

In [4]:
df.shape

(12901979, 2)

Включаем в рабочий датасет только 10 тысяч самых популярных пар

In [5]:
songs = df['song'].value_counts()
most_popular = set(songs[:10000].index)
df = df[df['song'].isin(most_popular)]
df.shape

(2670918, 2)

Делим данные на треин и тест

In [6]:
train, test = train_test_split(df, test_size=0.15)

In [7]:
train.shape, test.shape

((2270280, 2), (400638, 2))

In [8]:
train.head(2)

Unnamed: 0,user_id,song
8177716,5d94a3a81df06c4bbd821bedef0e6897,The Cardigans__Lovefool
12055012,e11595d68e90f9bec6c9ac461ec38272,The Flaming Lips__Do You Realize??


In [9]:
len(set(train['user_id'])), len(set(test['user_id']))

(14930, 13640)

Уберем холодных пользователей из обеих выборок так, что бы пользователи были и там и там

In [10]:
len(list(set(train['user_id'])-set(test['user_id']))) # нет в тесте из треина

1345

In [11]:
len(list(set(test['user_id'])-set(train['user_id'])))# нет в треине из теста

55

In [12]:
len(set(train['user_id'])& set(test['user_id'])) # пересечение мгожеств

13585

In [13]:
train = train[train['user_id'].isin(list(set(train['user_id'])& set(test['user_id'])))]
train.shape

(2264373, 2)

In [14]:
test = test[test['user_id'].isin(list(set(train['user_id'])& set(test['user_id'])))]
test.shape

(400572, 2)

In [44]:
train.to_csv("train.csv", index=False)
test.to_csv("test.csv", index=False)

Закодируем информацию

In [15]:
users = set(train['user_id'])

items = set(test.song.unique()).union(set(train.song.unique()))

In [16]:
user_encoder, item_encoder = LabelEncoder(), LabelEncoder()
user_encoder.fit(list(users))
item_encoder.fit(list(items))

train['new_user_id'] = user_encoder.transform(train['user_id'])
test['new_user_id'] = user_encoder.transform(test['user_id'])

train['song_id'] = item_encoder.transform(train['song'])
test['song_id'] = item_encoder.transform(test['song'])

In [17]:
train['connect'] = 1
test['connect'] = 1

In [18]:
train

Unnamed: 0,user_id,song,new_user_id,song_id,connect
8177716,5d94a3a81df06c4bbd821bedef0e6897,The Cardigans__Lovefool,5057,8379,1
12055012,e11595d68e90f9bec6c9ac461ec38272,The Flaming Lips__Do You Realize??,11940,8522,1
3634214,a81dca2220186ff0cefd814a9ba35241,Meghan Trainor__All About That Bass,8915,5641,1
411150,1e54539c9134fd0a7d94ac7996a101bf,Justin Timberlake__Mirrors,1622,4346,1
1306636,99deafd9b792af8e6a535483088faef2,Conor Maynard__Turn Around - feat. Ne-Yo,8183,1708,1
...,...,...,...,...,...
5599811,57c1c8ee39d783a4aa5a7494932af9ea,Lady Gaga__Yoü And I,4718,4870,1
2180517,29b928527a8ee8f94a7540b90874a436,Eminem__Stan,2239,2677,1
9281224,e778fef0bdffe382118fd5ce0ebbb60b,Of Monsters and Men__Little Talks,12272,6374,1
1618189,c1cc8215fe821df3e148d8f8f1a81cac,The White Stripes__The Denial Twist,10283,9203,1


In [None]:
### del df

In [19]:
matrix_shape = len(user_encoder.classes_), len(item_encoder.classes_)
matrix_shape

train_sparse = coo_matrix((list(train.connect.astype(np.int8)), 
                           (list(train.new_user_id.astype(np.int64)), 
                            list(train.song_id.astype(np.int64)))), shape=matrix_shape)


test_sparse = coo_matrix((list(test.connect.astype(np.int8)),
                          (list(test.new_user_id.astype(np.int64)), 
                           list(test.song_id.astype(np.int64)))), shape=matrix_shape)

Получили готовые разряженные спарс матрицы для дальнейшей работы

In [20]:
train_sparse.shape, test_sparse.shape


((13585, 10000), (13585, 10000))

In [43]:
sparse.save_npz("train_sparse.npz", train_sparse)
sparse.save_npz("test_sparse.npz", test_sparse)

# Напишем основные функции

In [90]:
def metric_lightfm(model, test_sparse, indices, total, k=20, user_features = None, item_features = None):
    
    ranks = model.predict_rank(test_sparse, 
                               num_threads=60, 
                               check_intersections=True,
                               user_features=user_features,
                               item_features = item_features)
    mask = ranks.copy()
    mask.data = np.less(mask.data, k, mask.data)
    ranks.data += 1
    ranks.data = ranks.data * mask.data
    ranks.eliminate_zeros()
    ranks = ranks.tolil().data
    average_precision_sum = 0.0
    for x in indices:
        n_correct_items = 0
        precision = 0
        for y in sorted(ranks[x]):
            n_correct_items += 1
            precision += n_correct_items / y
        average_precision_sum += precision / min(total[x], k)
    average_precision_sum /= len(indices)
    return average_precision_sum

In [91]:
def lightFM_eval(no_components = 10, 
                 loss='warp', 
                 random_state = 42,
                 learning_rate = 0.1,
                 test_sparse = test_sparse,
                 train_sparse = train_sparse,
                 sample_weight = train_sparse,
                 epochs = 10,
                 num_threads = 1,
                 user_features = None,
                 item_features = None,
                 k = 20):
    
    model = LightFM(no_components=no_components,
                    loss=loss, 
                    random_state=random_state, 
                    learning_rate=learning_rate)

    total = test_sparse.getnnz(axis=1)
    indices = np.nonzero(total)[0]

    maps = []
    epochs = epochs
    rounds = 10
    for rounds in tqdm(range(rounds)): 
        model.fit_partial(train_sparse, 
                          sample_weight = sample_weight, 
                          epochs = epochs,
                          num_threads = num_threads)
        
        curr_metric = metric_lightfm(model, test_sparse, indices, total, k, user_features, item_features)
        maps.append(curr_metric)
        print(curr_metric)
    
    return model

# Модель без фичей

Модель на 100 эпох (тк 10 раундов по 10 эпох)

In [34]:
lightFM_eval(no_components = 100, loss='warp',
             random_state = 42, learning_rate = 0.01,
             test_sparse = test_sparse, train_sparse = train_sparse,
             sample_weight = train_sparse, epochs = 10, num_threads = 6)

 10%|████▍                                       | 1/10 [00:58<08:44, 58.23s/it]

0.01930496133089331


 20%|████████▊                                   | 2/10 [01:55<07:41, 57.72s/it]

0.024246494361567283


 30%|█████████████▏                              | 3/10 [02:51<06:38, 56.87s/it]

0.027786423854878523


 40%|█████████████████▌                          | 4/10 [03:46<05:37, 56.31s/it]

0.030427042460293995


 50%|██████████████████████                      | 5/10 [04:41<04:38, 55.69s/it]

0.032812505270650334


 60%|██████████████████████████▍                 | 6/10 [05:35<03:40, 55.13s/it]

0.03480654363114252


 70%|██████████████████████████████▊             | 7/10 [06:29<02:44, 54.69s/it]

0.036196768993656196


 80%|███████████████████████████████████▏        | 8/10 [07:22<01:48, 54.27s/it]

0.03770969648989674


 90%|███████████████████████████████████████▌    | 9/10 [08:15<00:53, 53.91s/it]

0.03875880684557411


100%|███████████████████████████████████████████| 10/10 [09:08<00:00, 54.86s/it]

0.040012353736988304





<lightfm.lightfm.LightFM at 0x7f87ea6344c0>

Следущую ячейку не перезапускаю, писал ее еще до того как функцию написал, но там лучшие результаты precision@20 = 0.048

In [143]:
model_lfm_without_features = LightFM(no_components=100, 
                             loss='warp', 
                             random_state=42, 
                             learning_rate=0.01)

total = test_sparse.getnnz(axis=1)
indices = np.nonzero(total)[0]

maps = []
epochs = 100
rounds = 10
for rounds in tqdm(range(rounds)): 
    model_lfm_without_features.fit_partial(train_sparse, 
                                           sample_weight=train_sparse, 
                                           epochs=epochs, 
                                           num_threads=6)
    curr_metric = metric_lightfm(model_lfm_without_features, test_sparse, None, indices, total, k=20)
    maps.append(curr_metric)
    print(curr_metric)

 10%|████                                     | 1/10 [07:48<1:10:15, 468.38s/it]

0.03993001657528792


 20%|████████▏                                | 2/10 [15:07<1:00:09, 451.23s/it]

0.04581503782412398


 30%|████████████▉                              | 3/10 [22:11<51:10, 438.65s/it]

0.048318488526773555


 40%|█████████████████▏                         | 4/10 [29:05<42:54, 429.07s/it]

0.049109086088865


 50%|█████████████████████▌                     | 5/10 [35:51<35:04, 420.85s/it]

0.049319722799460626


 60%|█████████████████████████▊                 | 6/10 [42:36<27:41, 415.30s/it]

0.04929618440978424


 70%|██████████████████████████████             | 7/10 [49:13<20:28, 409.41s/it]

0.04917116343980187


 80%|██████████████████████████████████▍        | 8/10 [55:38<13:22, 401.48s/it]

0.04891423134014395


 90%|████████████████████████████████████▉    | 9/10 [1:01:50<06:32, 392.33s/it]

0.04875800736384691


100%|████████████████████████████████████████| 10/10 [1:08:00<00:00, 408.05s/it]

0.04820628570725821





# Займемся фичами песен


In [36]:
track_info = pd.read_csv('tracks_feats.csv', sep = '\t')
track_info.head(10)

Unnamed: 0,artist,song_name,album,genre,mood,style,theme,duration
0,M83,Midnight City,"Hurry Up, We're Dreaming",Synthpop,Happy,Electronic,...,243
1,Imagine Dragons,Radioactive,Night Visions,Indie,,,,186
2,Of Monsters and Men,Little Talks,My Head Is an Animal,Indie,Relaxed,Folk,...,266
3,Avicii,Wake Me Up,True,House,Gentle,Electronic,Club,249
4,Lorde,Royals,The Love Club EP,Pop,,,,190
5,The Lumineers,Ho Hey,The Lumineers,Rock,,,,163
6,Bastille,Pompeii,Bad Blood,Alternative Rock,Relaxed,Rock/Pop,...,214
7,Robin Thicke,Blurred Lines,Blurred Lines EP,Soul,Sensual,Urban/R&B,...,263
8,Foster the People,Pumped Up Kicks,Torches,Alternative Rock,,,,240
9,Nirvana,Smells Like Teen Spirit,Nevermind,Grunge,Sad,Rock/Pop,...,301


In [37]:
track_info['track'] = track_info['artist'] + '__'+track_info['song_name']

In [38]:
track_info.describe(include = 'all')

Unnamed: 0,artist,song_name,album,genre,mood,style,theme,duration,track
count,248134,248134,248134,152152,7148,7148,6947,248134.0,248134
unique,16168,174752,45129,140,47,24,26,,238817
top,Johnny Cash,Intro,Greatest Hits,Alternative Rock,...,Rock/Pop,...,,Édith Piaf__Hymne à l'amour
freq,598,160,372,15368,3052,4065,6764,,5
mean,,,,,,,,232.734083,
std,,,,,,,,119.091457,
min,,,,,,,,0.0,
25%,,,,,,,,185.0,
50%,,,,,,,,226.0,
75%,,,,,,,,272.0,


In [39]:
track_info.shape

(248134, 9)

In [40]:
track_info['genre'] = track_info['genre'].fillna('NaN')
track_info = track_info.replace(['...'],['NaN'])

In [41]:
len(set(track_info['track'])&items)

6346

In [42]:
len(items)

10000

пупупупупупупууууу

Получилось так, что у нас апишка не все топ 10к фичи айтемов выгрузила и Было принято решение сократить датасет

In [43]:
new_track_info = track_info[track_info['track'].isin(items)]
new_track_info['track_id'] = item_encoder.transform(new_track_info['track'])
new_track_info = new_track_info.drop_duplicates()#.shape
new_track_info.head(2)

Unnamed: 0,artist,song_name,album,genre,mood,style,theme,duration,track,track_id
0,M83,Midnight City,"Hurry Up, We're Dreaming",Synthpop,Happy,Electronic,,243,M83__Midnight City,5305
1,Imagine Dragons,Radioactive,Night Visions,Indie,,,,186,Imagine Dragons__Radioactive,3658


Надо хешануть жанры

In [44]:
genre_encoder = LabelEncoder()

genre_encoder.fit(list(set(new_track_info['genre'])))

new_track_info['genre_id'] = genre_encoder.transform(new_track_info['genre'])

In [45]:
aaa = new_track_info[['track_id', 'duration', 'artist', 'genre_id']]
aaa

Unnamed: 0,track_id,duration,artist,genre_id
0,5305,243,M83,82
1,3658,186,Imagine Dragons,41
2,6374,266,Of Monsters and Men,41
3,541,249,Avicii,40
4,5168,190,Lorde,58
...,...,...,...,...
8286,9326,223,Toby Keith,52
8287,6266,278,Nine Inch Nails,44
8288,5713,245,Metronomy,54
8291,5760,338,Michael Jackson,58


In [46]:
aaa_pivot = aaa.pivot(index = 'track_id', columns = 'genre_id', values = 'duration')

In [47]:
shape = (len(aaa['track_id'].unique()), len(aaa['genre_id'].unique()))
shape

(6346, 88)

In [48]:
nuka = scipy.sparse.csr_matrix(aaa_pivot.values)

# Обрежем данные под те, что есть в фичах

In [49]:
tracks_in_features = set(aaa.track_id)

In [50]:
len(tracks_in_features)

6346

In [51]:
cut_train = train[train['song_id'].isin(tracks_in_features)]

cut_test = test[test['song_id'].isin(tracks_in_features)]

cut_train = cut_train[cut_train['new_user_id'].isin(set(cut_train['new_user_id']) & set(cut_test['new_user_id']))]
cut_test = cut_test[cut_test['new_user_id'].isin(set(cut_train['new_user_id']) & set(cut_test['new_user_id']))]

In [52]:
cut_test.head(2)

Unnamed: 0,user_id,song,new_user_id,song_id,connect
547000,656541f8c9c6919b36a9a20a71b1bfe1,Cat Power__Sea of Love,5454,1426,1
3615951,1ce89904757d749a9caa9c128471779e,"Jeff Buckley__Lover, You Should've Come Over",1542,4030,1


Проверяем данные

In [53]:
len(set(cut_train['new_user_id']) & set(cut_test['new_user_id']))

12994

In [54]:
len(set(cut_train['new_user_id'])), len(set(cut_test['new_user_id']))

(12994, 12994)

In [55]:
len(set(cut_train['song_id'])), len(set(cut_test['song_id']))

(6346, 6346)

In [56]:
from lightfm.data import Dataset # тут некоторая магия создания разряженной матрицы

In [57]:
cut_train_dataset = Dataset()
cut_train_dataset.fit(cut_train['new_user_id'],
                      cut_train['song_id'])

In [58]:
cut_train.iloc[:, [2, 3]]

Unnamed: 0,new_user_id,song_id
8177716,5057,8379
12055012,11940,8522
3634214,8915,5641
411150,1622,4346
2560609,12157,7620
...,...,...
9040656,989,9589
2712393,12238,9105
2180517,2239,2677
9281224,12272,6374


In [59]:
(cut_train_coo, weights) = cut_train_dataset.build_interactions((x[1],x[0]) for x in cut_train.iloc[:, [3, 2]].to_numpy())

print(repr(cut_train_coo))

<12994x6346 sparse matrix of type '<class 'numpy.int32'>'
	with 1493445 stored elements in COOrdinate format>


In [60]:
cut_test_dataset = Dataset()
cut_test_dataset.fit(cut_test['new_user_id'],
                     cut_test['song_id'])

In [61]:
(cut_test_coo, weights) = cut_test_dataset.build_interactions((x[1],x[0]) for x in cut_test.iloc[:, [3, 2]].to_numpy())

print(repr(cut_test_coo))


<12994x6346 sparse matrix of type '<class 'numpy.int32'>'
	with 264562 stored elements in COOrdinate format>


In [76]:
# items_dataset = Dataset() # решение оказалось не верным, нужно вместе с датасетом формировать, оставил, что бы помнить
# items_dataset.fit(aaa['track_id'], aaa['genre_id'])

# (items_coo, weights) = items_dataset.build_interactions((x[1],x[0]) for x in aaa.iloc[:, [2, 0]].to_numpy())

# print(repr(items_coo))

Опробуем модель на новых данных

In [62]:
lightFM_eval(no_components = 100, loss='warp',
             random_state = 42, learning_rate = 0.01,
             test_sparse = cut_test_coo, train_sparse = cut_train_coo,
             sample_weight = cut_train_coo, epochs = 10, num_threads = 6)

 10%|████▍                                       | 1/10 [00:36<05:24, 36.02s/it]

0.0012030438900552892


 20%|████████▊                                   | 2/10 [01:11<04:44, 35.62s/it]

0.0012583158310268251


 30%|█████████████▏                              | 3/10 [01:46<04:07, 35.30s/it]

0.0012005356306129104


 40%|█████████████████▌                          | 4/10 [02:20<03:30, 35.01s/it]

0.001201241035691643


 50%|██████████████████████                      | 5/10 [02:55<02:53, 34.71s/it]

0.0011859762815851398


 60%|██████████████████████████▍                 | 6/10 [03:28<02:17, 34.43s/it]

0.0012031714057291725


 70%|██████████████████████████████▊             | 7/10 [04:02<01:42, 34.16s/it]

0.0012100148419135954


 80%|███████████████████████████████████▏        | 8/10 [04:36<01:07, 33.96s/it]

0.0012110383893866987


 90%|███████████████████████████████████████▌    | 9/10 [05:09<00:33, 33.75s/it]

0.0011996443657855294


100%|███████████████████████████████████████████| 10/10 [05:42<00:00, 34.24s/it]

0.0011760394602754291





<lightfm.lightfm.LightFM at 0x7f87e4ee7dc0>

Качество сильно упало((((

Но все равно попробуем добавить фичи, иначе все было зря)

# Модель c фичами

Для этого пришлось несного подшаманить данные, с имеющимися не работало (писал в законмментированной ячейке)

## немного правлю данные

In [64]:
df = cut_train.iloc[:, 2:].copy(deep = True)

In [65]:
df_test = cut_test.iloc[:, 2:].copy(deep = True)

In [67]:
features = aaa.iloc[:, [0, 2, 3]]
# features

In [68]:
true_solution = Dataset()

In [69]:
uf = []
col = ['genre']*len(features.genre_id.unique()) + ['artist']*len(features.artist.unique())
unique_f1 = list(features.genre_id.unique()) + list(features.artist.unique()) 
#print('f1:', unique_f1)
for x,y in zip(col, unique_f1):
    res = str(x)+ ":" +str(y)
    uf.append(res)
#     print(res)

In [70]:
uf[:3]

['genre:82', 'genre:41', 'genre:40']

In [71]:
true_solution.fit(
        df['new_user_id'].unique(), # all the users
        df['song_id'].unique(), # all the items
        item_features = uf # additional item features
)

In [72]:
(interactions, weights) = true_solution.build_interactions([(x[0], x[1], x[2]) for x in df.values])

In [78]:
def feature_colon_value(my_list):

    result = []
    ll = ['genre:', 'artist:']
    aa = my_list
    for x,y in zip(ll,aa):
        res = str(x) +""+ str(y)
        result.append(res)
    return result

In [79]:
ad_subset = features[["genre_id", "artist"]] 
ad_list = [list(x) for x in ad_subset.values]
feature_list = []
for item in ad_list:
    feature_list.append(feature_colon_value(item))
# print(f'Final output: {feature_list}')

In [80]:
item_tuple = list(zip(features.track_id, feature_list))

In [81]:
item_tuple[:3]

[(5305, ['genre:82', 'artist:M83']),
 (3658, ['genre:41', 'artist:Imagine Dragons']),
 (6374, ['genre:41', 'artist:Of Monsters and Men'])]

In [82]:
item_features = true_solution.build_item_features(item_tuple, normalize= False)

In [83]:
true_test_dataset = Dataset()
true_test_dataset.fit(
        df_test['new_user_id'].unique(), # all the users
        df_test['song_id'].unique(), # all the items
        item_features = uf # additional user features
)

(test_interactions, test_weights) = true_test_dataset.build_interactions([(x[0], x[1], x[2]) for x in df_test.values])

Теперь все имеется, можно тестить модель

In [93]:
interactions

<12994x6346 sparse matrix of type '<class 'numpy.float32'>'
	with 1493445 stored elements in COOrdinate format>

In [95]:
item_features.T

<8130x6346 sparse matrix of type '<class 'numpy.float32'>'
	with 19038 stored elements in Compressed Sparse Column format>

## Тестируем модель с фичами

In [97]:
model_with_features = lightFM_eval(no_components = 100, loss='warp',
                                 random_state = 42, learning_rate = 0.01,
                                 test_sparse = test_interactions, train_sparse = interactions,
                                 sample_weight = interactions, epochs = 10, num_threads = 6,
                                 item_features = item_features.T)

 10%|████▍                                       | 1/10 [00:35<05:20, 35.65s/it]

0.0012030438900552892


 20%|████████▊                                   | 2/10 [01:11<04:44, 35.56s/it]

0.0012583158310268251


 30%|█████████████▏                              | 3/10 [01:46<04:07, 35.29s/it]

0.0012005356306129104


 40%|█████████████████▌                          | 4/10 [02:20<03:30, 35.01s/it]

0.001201241035691643


 50%|██████████████████████                      | 5/10 [02:55<02:53, 34.79s/it]

0.0011859762815851398


 60%|██████████████████████████▍                 | 6/10 [03:28<02:17, 34.49s/it]

0.0012031714057291725


 70%|██████████████████████████████▊             | 7/10 [04:02<01:42, 34.20s/it]

0.0012100148419135954


 80%|███████████████████████████████████▏        | 8/10 [04:36<01:07, 33.95s/it]

0.0012110383893866987


 90%|███████████████████████████████████████▌    | 9/10 [05:09<00:33, 33.76s/it]

0.0011996443657855294


100%|███████████████████████████████████████████| 10/10 [05:42<00:00, 34.25s/it]

0.0011760394602754291





Качество относительно последней модели не изменилось

Дальше с этой моделью будем пытаться подбирать гиперпараметры, крутить фичи и добавлять новые

Возможно, получится достать фичи тех песен, которые у нас не подтянулись, что бы не сокращать датасет (А именно из-за этого и упало качество)

Как итог -- лучшая модель на данный момент это модель на наборе данных с 10к самых популярных песен без доп фичей на 1 тысяче эпох.

precision at 20 = 0.048

Так же в дальнейшем планируется подготовить алгоритмы аггрегирования предсказаний для высчитывания других метрик (они уже частично разработаны в других файлах)