In [1]:
import sys
import os

import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# import scrapbook as sb

# import lightfm
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm import cross_validation

# Import LightFM's evaluation metrics
from lightfm.evaluation import precision_at_k as lightfm_prec_at_k
from lightfm.evaluation import recall_at_k as lightfm_recall_at_k



In [2]:
df = pd.read_json('datasets/spot_final_explode.json')
df = df.drop(columns=['itemID','userID'])

In [3]:
df.head(3)

Unnamed: 0,track_uri,name,artists_names,popularity,album_type,playlist_uris,danceability,energy,key,loudness,...,artists_genres,release_year,artists_mean_popularities,artists_mean_followers,lyrics,n_playlist,tt_score,song_by,rating,playlist_mean_tt
0,spotify:track:41MOCUNOgWtaYBFUsGnpZ5,The Safety Dance,Men Without Hats,37,album,spotify:playlist:519N2pOOQrNuVt3hJ7GF7S,0.572,0.837,5,-7.876,...,"['wave', 'pop', 'romantic', 'synthpop', 'new',...",1982,55.0,208619.0,We can dance if we want to We can leave your ...,1,5.418839,The Safety Dance by Men Without Hats,1.550904,3.493987
1,spotify:track:24kirWnsEEyGgbl2NuhSVz,Endless Summer,Grizfolk,45,single,spotify:playlist:1YKxkgvu0TEfOfsXQz7EyX,0.498,0.727,4,-6.116,...,"['modern', 'stomp', 'indie', 'poptimism', 'and...",2018,52.0,143241.0,"Take it back, it's over time Ooh, you know it...",1,4.115897,Endless Summer by Grizfolk,1.171918,3.512102
2,spotify:track:5PNcJn4oFNvlRfrZBHfqWh,Castaway,Zac Brown Band,0,album,spotify:playlist:60RvK7JfGXL4zFVFxmmzOT,0.611,0.782,8,-4.989,...,"['country', 'modern', 'contemporary', 'road', ...",2015,73.0,3498692.0,Castaway Ride the waves like we’re young 'Cau...,2,4.085013,Castaway by Zac Brown Band,0.914342,4.467707


In [4]:
from sklearn.preprocessing import LabelEncoder
import ast

In [5]:
def tt_score(df):
  num = df['danceability'] + df['speechiness'] + 1/abs(df['loudness']) + df['energy'] + df['valence'] + np.log(df['popularity'] + 1) + np.log(df['artists_mean_popularities'] + 1) + np.log10(df['artists_mean_followers'] + 1)
  den = df['instrumentalness'] + df['acousticness'] + (df['duration_ms']/60000)
  return num/den

In [6]:
light_df = df.copy()
light_df = light_df.sample(n = 10000, random_state = 42)

In [7]:
light_df['item'] = LabelEncoder().fit_transform(light_df['track_uri'])
light_df['user'] = LabelEncoder().fit_transform(light_df['playlist_uris'])
light_df['tt_score'] = light_df.apply(lambda x : tt_score(x), axis = 1)
light_df['playlist_mean_tt'] = light_df.groupby('user')['tt_score'].transform('mean')
light_df['label'] = light_df['tt_score']/light_df['playlist_mean_tt']
light_df['artists_names'] = light_df['artists_names'].astype(str)
light_df['name'] = light_df['name'].astype(str)

In [8]:
track_genre = light_df['artists_genres'].apply(ast.literal_eval).to_list()

all_track_genre = sorted(list(set(itertools.chain.from_iterable(track_genre))))
print(f'Total number of unique genres: {len(all_track_genre)}')

Total number of unique genres: 1016


In [9]:
light_df.head(3)

Unnamed: 0,track_uri,name,artists_names,popularity,album_type,playlist_uris,danceability,energy,key,loudness,...,artists_mean_followers,lyrics,n_playlist,tt_score,song_by,rating,playlist_mean_tt,item,user,label
64243,spotify:track:0K2WjMLZYr09LKwurGRYRE,Theme From New York New York,Frank Sinatra,62,compilation,spotify:playlist:2jogTm047kymef0a52SbjW,0.312,0.497,2,-6.95,...,6195935.0,"Start spreading the news, I'm leaving today I...",2,4.199985,Theme From New York New York by Frank Sinatra,1.161704,3.817785,395,259,1.10011
35778,spotify:track:2Umhe37twJAgIZgYRkql6U,We Nah Listen,Various Artists,16,compilation,spotify:playlist:0mPufH8QFp0N8TEjRhZF3b,0.623,0.52,4,-6.995,...,1873341.0,,1,3.783177,We Nah Listen by Various Artists,0.823727,4.818398,2985,70,0.785153
48452,spotify:track:4c9MCYlfiTkfcJYlvLPt0M,Adi Shakti,Matthew Schoening,42,album,spotify:playlist:1NacC68KsHTOlkAUZiM4av,0.513,0.184,9,-15.497,...,23238.33,,1,1.61775,Adi Shakti by Matthew Schoening,1.138389,1.062285,5416,128,1.522897


In [10]:
playlist_df = pd.read_csv('datasets/spot_playlists_mean_numeric_data.csv')
playlist_df.head(3)

Unnamed: 0,playlist_uris,tracks_count,mean_popularity,mean_danceability,mean_energy,mean_key,mean_loudness,mean_mode,mean_speechiness,mean_acousticness,mean_instrumentalness,mean_liveness,mean_valence,mean_tempo,mean_duration_ms,mean_time_signature,mean_release_year,artists_mean_popularities,artists_mean_followers
0,spotify:playlist:57uLNiwAaVFJxEMkSq0GeQ,100,66.39,0.73277,0.64473,5.29,-5.93997,0.44,0.180749,0.169474,0.003714,0.193195,0.483453,127.86138,191024.08,4.0,2017.93,78.345,14771290.0
1,spotify:playlist:37i9dQZF1DWUMIjnZuaulx,100,35.27,0.44872,0.6873,4.38,-6.19532,0.78,0.092676,0.223017,0.000874,0.276295,0.438595,119.41352,310215.35,3.81,2021.98,38.043333,230718.9
2,spotify:playlist:0tkRbaSwTy9lwAw66vCCIq,100,31.9,0.51413,0.79106,5.3,-6.82274,0.67,0.052334,0.089875,0.04026,0.21514,0.60894,126.28446,231490.64,3.99,2001.05,62.52,3702412.0


In [11]:
merged_df = light_df.merge(playlist_df, left_on='playlist_uris', right_on='playlist_uris', how='outer')

In [12]:
all_playlist_track_count = sorted(list(set(merged_df['tracks_count'])))

In [13]:
print(f'length of unique track count: {len(all_playlist_track_count)}')

length of unique track count: 100


In [14]:
column_name_mapping = {'playlist_uris': 'userID', 'track_uri': 'itemID'}

# Use the rename method to rename columns
merged_df.rename(columns=column_name_mapping, inplace=True)
merged_df = merged_df[['userID','itemID','artists_genres','tracks_count','label']]

In [15]:
dataset = Dataset()
dataset.fit(merged_df['userID'], 
            merged_df['itemID'], 
            item_features=all_track_genre,
            user_features=all_playlist_track_count)

In [16]:
item_features = dataset.build_item_features((x, y) for x,y in zip(merged_df.itemID, track_genre))


In [17]:
user_features = dataset.build_user_features((x, [y]) for x,y in zip(merged_df.userID, merged_df['tracks_count']))


In [18]:
merged_df.head(2)

Unnamed: 0,userID,itemID,artists_genres,tracks_count,label
0,spotify:playlist:2jogTm047kymef0a52SbjW,spotify:track:0K2WjMLZYr09LKwurGRYRE,"['easy', 'standards', 'adult', 'lounge', 'list...",100,1.10011
1,spotify:playlist:2jogTm047kymef0a52SbjW,spotify:track:4dt6XKr0xKdPzjFhwB8dBm,"['easy', 'standards', 'adult', 'lounge', 'list...",100,1.215333


In [19]:
interactions, weights = dataset.build_interactions(merged_df.loc[:, ['userID','itemID','label']].values)

train_interactions, test_interactions = cross_validation.random_train_test_split(
    interactions, 
    test_percentage=0.2,
    random_state=np.random.RandomState(42)
)

In [20]:
# default number of recommendations
K = 10
# percentage of data used for testing
TEST_PERCENTAGE = 0.2
# model learning rate
LEARNING_RATE = 0.05
# no of latent factors
NO_COMPONENTS = 5
# no of epochs to fit model
NO_EPOCHS = 10
# no of threads to fit model
NO_THREADS = 8
# regularisation for both user and item features
ITEM_ALPHA = 1e-6
USER_ALPHA = 1e-6

# seed for pseudonumber generations
SEED = 42

In [21]:
model = LightFM(loss='logistic', 
                 no_components=NO_COMPONENTS, 
                 learning_rate=LEARNING_RATE, 
                 item_alpha=ITEM_ALPHA,
                 user_alpha=USER_ALPHA,
                 random_state=np.random.RandomState(SEED),
                 k=5, n=5
                )

In [22]:
model.fit(interactions=train_interactions,
           user_features=user_features,
           item_features=item_features,
           epochs=NO_EPOCHS
           )

<lightfm.lightfm.LightFM at 0x2573fa8eeb0>

Prepare model evaluation data

In [23]:
uids, iids, interaction_data = cross_validation._shuffle(
    interactions.row, 
    interactions.col, 
    interactions.data, 
    random_state=np.random.RandomState(SEED)
)

uid_map, ufeature_map, iid_map, ifeature_map = dataset.mapping()

In [24]:
cutoff = int((1.0 - TEST_PERCENTAGE) * len(uids))
test_idx = slice(cutoff, None)

In [25]:
from recommenders.utils.timer import Timer
from recommenders.datasets import movielens
from recommenders.models.lightfm.lightfm_utils import (
    track_model_metrics, prepare_test_df, prepare_all_predictions,
    compare_metric, similar_users, similar_items)

In [26]:
with Timer() as test_time:
    test_df = prepare_test_df(test_idx, uids, iids, uid_map, iid_map, weights)
print(f"Took {test_time.interval:.1f} seconds for prepare and predict test data.")  

Took 0.9 seconds for prepare and predict test data.


In [27]:
with Timer() as test_time:
    all_predictions2 = prepare_all_predictions(merged_df, uid_map, iid_map, 
                                              interactions=train_interactions,
                                               user_features=user_features,
                                               item_features=item_features,
                                               model=model,
                                               num_threads=4)

print(f"Took {test_time.interval:.1f} seconds for prepare and predict all data.")

Took 5534.8 seconds for prepare and predict all data.


In [28]:
from recommenders.evaluation.python_evaluation import precision_at_k, recall_at_k


In [29]:
eval_precision2 = precision_at_k(rating_true=test_df, 
                                rating_pred=all_predictions2, k=K)
eval_recall2 = recall_at_k(test_df, all_predictions2, k=K)

print(
    "\n------ Using both implicit and explicit ratings ------",
    f"Precision@K:\t{eval_precision2:.6f}",
    f"Recall@K:\t{eval_recall2:.6f}",
    sep='\n')


------ Using both implicit and explicit ratings ------
Precision@K:	0.058153
Recall@K:	0.580589
