In [1]:
import requests
import json
import pandas as pd
import tqdm
import time
import numpy as np
from scipy import spatial
import scipy as sc
from scipy.sparse import csr_matrix

In [2]:
from lightfm import LightFM

from lightfm.evaluation import precision_at_k
from lightfm.evaluation import recall_at_k
from lightfm.evaluation import reciprocal_rank
from lightfm.evaluation import auc_score



In [3]:
# from platform import python_version
# print(python_version())

In [3]:
import matplotlib.pyplot as plt 
import seaborn as sns           
%matplotlib inline

Загрузим датасет

In [4]:
%%time

pdf = pd.read_csv('pivot_df.csv')
pdf.head()

CPU times: user 108 ms, sys: 16.1 ms, total: 124 ms
Wall time: 131 ms


Unnamed: 0,rating,user_id,song_id,artist_id
0,1.0,0,0,0
1,0.98,0,2,0
2,0.97,0,3,0
3,0.96,0,4,0
4,0.94,0,6,0


## Создам датасет и разобью на трейн/тест

In [6]:
%%time 

from lightfm.data import Dataset
from lightfm.cross_validation import random_train_test_split

# Создаем Dataset для того, чтобы впоследствии корректно разбить данные на трейн и тест 
dataset = Dataset()
dataset.fit((x for x in pdf['user_id']), 
            (x for x in pdf['song_id']))

# Создаем матрицу взаимодействия и веса 
(interactions, weights) = dataset.build_interactions([(x[0], x[1], x[2]) for x in pdf[['user_id', 'song_id', 'rating']].values])

# Разбиваем interactions и веса на train и test
train, test = random_train_test_split(interactions, test_percentage=0.2, random_state = 42)
train_weights, test_weights = random_train_test_split(weights, test_percentage=0.2, random_state = 42)

print('Размер матрицы interactions: ', repr(interactions))
print()
print('Размер train: ', repr(train))
print('Размер test: ', repr(test))
print()
print('Размер train_weights: ', repr(train_weights))
print('Размер test_weights: ', repr(test_weights))


# https://making.lyst.com/lightfm/docs/cross_validation.html
# lightfm.cross_validation.random_train_test_split

Размер матрицы interactions:  <9874x265360 sparse matrix of type '<class 'numpy.int32'>'
	with 493700 stored elements in COOrdinate format>

Размер train:  <9874x265360 sparse matrix of type '<class 'numpy.int32'>'
	with 394960 stored elements in COOrdinate format>
Размер test:  <9874x265360 sparse matrix of type '<class 'numpy.int32'>'
	with 98740 stored elements in COOrdinate format>

Размер train_weights:  <9874x265360 sparse matrix of type '<class 'numpy.float32'>'
	with 394960 stored elements in COOrdinate format>
Размер test_weights:  <9874x265360 sparse matrix of type '<class 'numpy.float32'>'
	with 98740 stored elements in COOrdinate format>
CPU times: user 1.58 s, sys: 46.1 ms, total: 1.63 s
Wall time: 1.66 s


## Качество на 25 компонентах, 10 эпох

In [7]:
%%time

# Показатели на train'е

model_25 = LightFM(loss='warp', random_state=2023, no_components=25)
model_25.fit(train, epochs=10)

train_precision_25 = precision_at_k(model_25, train, k=5).mean()
train_recall_25 = recall_at_k(model_25, train, k=5).mean()
train_auc_25 = auc_score(model_25, train).mean()
train_reciprocal_rank_25 = reciprocal_rank(model_25, train).mean()

print('Precision: train %.2f' % (train_precision_25))
print("Recall: train %.2f" % (train_recall_25))

print('AUC: train %.2f' % (train_auc_25))
print("Reciprocal_rank: train %.2f" % (train_reciprocal_rank_25))

Precision: train 0.33
Recall: train 0.04
AUC: train 1.00
Reciprocal_rank: train 0.50
CPU times: user 10min 32s, sys: 1.33 s, total: 10min 34s
Wall time: 10min 34s


In [8]:
%%time 

# Показатели на test'е

test_precision_25 = precision_at_k(model_25, test_interactions = test, train_interactions = train, k=5, check_intersections=False).mean()
test_recall_25 = recall_at_k(model_25, test_interactions = test, train_interactions = train, k=5, check_intersections=False).mean()
test_auc_25 = auc_score(model_25, test_interactions = test, train_interactions = train, check_intersections=False).mean()
test_reciprocal_rank_25 = reciprocal_rank(model_25, test_interactions = test, train_interactions = train, check_intersections=False).mean()

print('Precision: test %.2f' % (test_precision_25))
print("Recall: test %.2f" % (test_recall_25))

print('AUC: test %.2f' % (test_auc_25))
print("Reciprocal_rank: test %.2f" % (test_reciprocal_rank_25))


Precision: test 0.03
Recall: test 0.01
AUC: test 0.59
Reciprocal_rank: test 0.08
CPU times: user 15min 49s, sys: 1.53 s, total: 15min 50s
Wall time: 53min 24s


## Качество на 25 компонентах, 20 эпох

In [9]:
%%time

# Показатели на train'е

model_25_20 = LightFM(loss='warp', random_state=2023, no_components=25)
model_25_20.fit(train, epochs=20)

train_precision_25_20 = precision_at_k(model_25_20, train, k=5).mean()
train_recall_25_20 = recall_at_k(model_25_20, train, k=5).mean()
train_auc_25_20 = auc_score(model_25_20, train).mean()
train_reciprocal_rank_25_20 = reciprocal_rank(model_25_20, train).mean()

print('Precision: train %.2f' % (train_precision_25_20))
print("Recall: train %.2f" % (train_recall_25_20))

print('AUC: train %.2f' % (train_auc_25_20))
print("Reciprocal_rank: train %.2f" % (train_reciprocal_rank_25_20))

Precision: train 0.38
Recall: train 0.05
AUC: train 1.00
Reciprocal_rank: train 0.55
CPU times: user 10min 33s, sys: 883 ms, total: 10min 34s
Wall time: 14min 34s


In [10]:
%%time 

# Показатели на test'е

test_precision_25_20 = precision_at_k(model_25_20, test_interactions = test, train_interactions = train, k=5, check_intersections=False).mean()
test_recall_25_20 = recall_at_k(model_25_20, test_interactions = test, train_interactions = train, k=5, check_intersections=False).mean()
test_auc_25_20 = auc_score(model_25_20, test_interactions = test, train_interactions = train, check_intersections=False).mean()
test_reciprocal_rank_25_20 = reciprocal_rank(model_25_20, test_interactions = test, train_interactions = train, check_intersections=False).mean()

print('Precision: test %.2f' % (test_precision_25_20))
print("Recall: test %.2f" % (test_recall_25_20))

print('AUC: test %.2f' % (test_auc_25_20))
print("Reciprocal_rank: test %.2f" % (test_reciprocal_rank_25_20))

Precision: test 0.03
Recall: test 0.01
AUC: test 0.60
Reciprocal_rank: test 0.07
CPU times: user 15min 51s, sys: 1.83 s, total: 15min 53s
Wall time: 19min 43s


## Качество на 75 компонентах, 10 эпох

In [11]:
%%time

# Показатели на train'е

model_75 = LightFM(loss='warp', random_state=2023, no_components=75)
model_75.fit(train, epochs=10)

train_precision_75 = precision_at_k(model_75, train, k=5).mean()
train_recall_75 = recall_at_k(model_75, train, k=5).mean()
train_auc_75 = auc_score(model_75, train).mean()
train_reciprocal_rank_75 = reciprocal_rank(model_75, train).mean()

print('Precision: train %.2f' % (train_precision_75))
print("Recall: train %.2f" % (train_recall_75))

print('AUC: train %.2f' % (train_auc_75))
print("Reciprocal_rank: train %.2f" % (train_reciprocal_rank_75))

Precision: train 0.78
Recall: train 0.10
AUC: train 1.00
Reciprocal_rank: train 0.89
CPU times: user 13min 4s, sys: 1.49 s, total: 13min 5s
Wall time: 13min 6s


In [12]:
%%time 

# Показатели на test'е

test_precision_75 = precision_at_k(model_75, test_interactions = test, train_interactions = train, k=5, check_intersections=False).mean()
test_recall_75 = recall_at_k(model_75, test_interactions = test, train_interactions = train, k=5, check_intersections=False).mean()
test_auc_75 = auc_score(model_75, test_interactions = test, train_interactions = train, check_intersections=False).mean()
test_reciprocal_rank_75 = reciprocal_rank(model_75, test_interactions = test, train_interactions = train, check_intersections=False).mean()

print('Precision: test %.2f' % (test_precision_75))
print("Recall: test %.2f" % (test_recall_75))

print('AUC: test %.2f' % (test_auc_75))
print("Reciprocal_rank: test %.2f" % (test_reciprocal_rank_75))


Precision: test 0.05
Recall: test 0.02
AUC: test 0.60
Reciprocal_rank: test 0.13
CPU times: user 18min 5s, sys: 4.53 s, total: 18min 10s
Wall time: 18min 10s


## Качество на 75 компонентах, 20 эпох

In [15]:
%%time

# Показатели на train'е

model_75_20 = LightFM(loss='warp', random_state=2023, no_components=75)
model_75_20.fit(train, epochs=20)

train_precision_75_20 = precision_at_k(model_75_20, train, k=5).mean()
train_recall_75_20 = recall_at_k(model_75_20, train, k=5).mean()
train_auc_75_20 = auc_score(model_75_20, train).mean()
train_reciprocal_rank_75_20 = reciprocal_rank(model_75_20, train).mean()

print('Precision: train %.2f' % (train_precision_75_20))
print("Recall: train %.2f" % (train_recall_75_20))

print('AUC: train %.2f' % (train_auc_75_20))
print("Reciprocal_rank: train %.2f" % (train_reciprocal_rank_75_20))

Precision: train 0.87
Recall: train 0.11
AUC: train 1.00
Reciprocal_rank: train 0.94
CPU times: user 13min 15s, sys: 2.38 s, total: 13min 17s
Wall time: 13min 18s


In [16]:
%%time 

# Показатели на test'е

test_precision_75_20 = precision_at_k(model_75_20, test_interactions = test, train_interactions = train, k=5, check_intersections=False).mean()
test_recall_75_20 = recall_at_k(model_75_20, test_interactions = test, train_interactions = train, k=5, check_intersections=False).mean()
test_auc_75_20 = auc_score(model_75_20, test_interactions = test, train_interactions = train, check_intersections=False).mean()
test_reciprocal_rank_75_20 = reciprocal_rank(model_75_20, test_interactions = test, train_interactions = train, check_intersections=False).mean()

print('Precision: test %.2f' % (test_precision_75_20))
print("Recall: test %.2f" % (test_recall_75_20))

print('AUC: test %.2f' % (test_auc_75_20))
print("Reciprocal_rank: test %.2f" % (test_reciprocal_rank_75_20))

Precision: test 0.05
Recall: test 0.03
AUC: test 0.60
Reciprocal_rank: test 0.14
CPU times: user 18min 4s, sys: 3.09 s, total: 18min 8s
Wall time: 18min 8s


## Качество на 75 компонентах, 20 эпох, с использованием weights

In [17]:
%%time

# Показатели на train'е

model_75_20_weights = LightFM(loss='warp', random_state=2023, no_components=75)
model_75_20_weights.fit(train,sample_weight = train_weights, epochs=20)

train_precision_75_20_weights = precision_at_k(model_75_20_weights, train, k=5).mean()
train_recall_75_20_weights = recall_at_k(model_75_20_weights, train, k=5).mean()
train_auc_75_20_weights = auc_score(model_75_20_weights, train).mean()
train_reciprocal_rank_75_20_weights = reciprocal_rank(model_75_20_weights, train).mean()

print('Precision: train %.2f' % (train_precision_75_20_weights))
print("Recall: train %.2f" % (train_recall_75_20_weights))

print('AUC: train %.2f' % (train_auc_75_20_weights))
print("Reciprocal_rank: train %.2f" % (train_reciprocal_rank_75_20_weights))

Precision: train 0.85
Recall: train 0.11
AUC: train 1.00
Reciprocal_rank: train 0.93
CPU times: user 13min 18s, sys: 2.36 s, total: 13min 20s
Wall time: 13min 20s


In [18]:
%%time 

# Показатели на test'е

test_precision_75_20_weights = precision_at_k(model_75_20_weights, test_interactions = test, train_interactions = train, k=5, check_intersections=False).mean()
test_recall_75_20_weights = recall_at_k(model_75_20_weights, test_interactions = test, train_interactions = train, k=5, check_intersections=False).mean()
test_auc_75_20_weights = auc_score(model_75_20_weights, test_interactions = test, train_interactions = train, check_intersections=False).mean()
test_reciprocal_rank_75_20_weights = reciprocal_rank(model_75_20_weights, test_interactions = test, train_interactions = train, check_intersections=False).mean()

print('Precision: test %.2f' % (test_precision_75_20_weights))
print("Recall: test %.2f" % (test_recall_75_20_weights))

print('AUC: test %.2f' % (test_auc_75_20_weights))
print("Reciprocal_rank: test %.2f" % (test_reciprocal_rank_75_20_weights))

Precision: test 0.05
Recall: test 0.03
AUC: test 0.61
Reciprocal_rank: test 0.15
CPU times: user 18min 2s, sys: 3.27 s, total: 18min 5s
Wall time: 18min 6s


**Вывод:**   
Ключевая метрика качества — precision (k=5) — составляет на тесте 0.05.

## Черновик (не смотреть)

In [95]:
# %%time
# predict = model_75_20.predict(user_ids = np.array([0,1,2,5,100, 9873]), item_ids = np.array([0,1,2,100, 265300,265359]))
# predict

CPU times: user 2.58 ms, sys: 660 µs, total: 3.24 ms
Wall time: 3 ms


array([ 1.178301 , -1.3454835, -2.786327 , -2.1459653, -2.1937795,
       -2.9295583], dtype=float32)

In [59]:
# %%time
# rank = model_75_20.predict_rank(test_interactions = test, train_interactions = train, check_intersections=False)
# rank

CPU times: user 4min 35s, sys: 748 ms, total: 4min 36s
Wall time: 4min 36s


<9874x265360 sparse matrix of type '<class 'numpy.float32'>'
	with 98578 stored elements in Compressed Sparse Row format>

In [60]:
# print(rank)

  (0, 5)	197082.0
  (0, 13)	186105.0
  (0, 15)	233153.0
  (0, 17)	185810.0
  (0, 21)	150706.0
  (0, 97)	178554.0
  (0, 102)	78134.0
  (0, 241)	217716.0
  (0, 242)	224642.0
  (0, 387)	3656.0
  (0, 578)	881.0
  (0, 731)	213203.0
  (1, 5050)	26.0
  (1, 5079)	2591.0
  (1, 34820)	14913.0
  (1, 36879)	159908.0
  (1, 38154)	52407.0
  (1, 49075)	32345.0
  (1, 99604)	156827.0
  (1, 110329)	242273.0
  (1, 136145)	199362.0
  (2, 16)	215149.0
  (2, 21818)	180728.0
  (2, 21819)	78735.0
  (2, 67454)	218106.0
  :	:
  (9871, 229377)	156332.0
  (9871, 229380)	8850.0
  (9871, 229383)	79048.0
  (9871, 229390)	102313.0
  (9872, 230559)	146262.0
  (9872, 230564)	189312.0
  (9872, 230568)	245559.0
  (9872, 230571)	145029.0
  (9872, 230577)	216783.0
  (9872, 230585)	179128.0
  (9872, 230595)	133387.0
  (9873, 260322)	195346.0
  (9873, 260330)	182312.0
  (9873, 260332)	169059.0
  (9873, 260335)	127417.0
  (9873, 260340)	244398.0
  (9873, 260341)	206269.0
  (9873, 260349)	169627.0
  (9873, 260352)	180980.0
  (