In [70]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
# from lightfm import LightFM
from tqdm import tqdm
# from lightfm.evaluation import precision_at_k
# from sklearn.model_selection import LeavePOut
from sklearn.model_selection import train_test_split
from lightfm.data import Dataset
from scipy.sparse import save_npz

In [21]:
df = pd.read_csv('spotify_dataset.csv.zip', skiprows=1,
                 usecols = [0, 1, 2],
                 names=['user_id', 'artistname', 'trackname'],
                 on_bad_lines='skip')

In [22]:
df['artistname'].fillna('', inplace=True)
df['trackname'].fillna('', inplace=True)

df['song'] = df['artistname'] + '__' + df['trackname']

In [23]:
df.drop(columns=['artistname', 'trackname'], inplace = True)

In [24]:
df

Unnamed: 0,user_id,song
0,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello__(The Angels Wanna Wear My) Red...
1,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions__(What's So F...
2,9cc0cfd4d7d7885102480dd99e7a90d6,Tiffany Page__7 Years Too Late
3,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions__Accidents Wi...
4,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello__Alison
...,...,...
12901974,2302bf9c64dc63d88a750215ed187f2c,Mötley Crüe__Wild Side
12901975,2302bf9c64dc63d88a750215ed187f2c,John Lennon__Woman
12901976,2302bf9c64dc63d88a750215ed187f2c,Tom Petty__You Don't Know How It Feels
12901977,2302bf9c64dc63d88a750215ed187f2c,Tom Petty__You Wreck Me


In [25]:
songs = df['song'].value_counts()

In [26]:
# df[df['songs'] == songs.index[:9999]]

In [27]:
most_popular = set(songs[:10000].index)

In [28]:
df = df[df['song'].isin(most_popular)]

In [29]:
df

Unnamed: 0,user_id,song
4,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello__Alison
13,9cc0cfd4d7d7885102480dd99e7a90d6,Crowded House__Don't Dream It's Over
15,9cc0cfd4d7d7885102480dd99e7a90d6,Crowded House__Fall At Your Feet
25,9cc0cfd4d7d7885102480dd99e7a90d6,Joshua Radin__I'd Rather Be With You [Radio Edit]
33,9cc0cfd4d7d7885102480dd99e7a90d6,Paul McCartney__Live And Let Die
...,...,...
12901972,2302bf9c64dc63d88a750215ed187f2c,U2__When Love Comes To Town
12901973,2302bf9c64dc63d88a750215ed187f2c,Stone Temple Pilots__Wicked Garden
12901974,2302bf9c64dc63d88a750215ed187f2c,Mötley Crüe__Wild Side
12901976,2302bf9c64dc63d88a750215ed187f2c,Tom Petty__You Don't Know How It Feels


In [52]:
len(set(df['user_id']))

14985

In [36]:
train, test = train_test_split(df, test_size=0.15)

In [44]:
len(set(train['user_id'].unique())-set(test['user_id'].unique()))

1288

In [50]:
len(set(test['user_id'])&set(train['user_id']))

13619

In [57]:
test[test['user_id'].isin(set(train['user_id']))]

Unnamed: 0,user_id,song
7861575,bdafbf940f0e1f283feab1e39d1199af,Coldplay__Christmas Lights
6382075,05cd5a5d37f5c53c277e471177fce94f,MGMT__Electric Feel
12187177,2be3f4ae729911f567f8409bd4158499,Sara Bareilles__Love Song
6715534,b4eb611886dbb3272b07b449a681b6c9,Beck__Loser
3934572,f853ce21c490a78a5ac6fd60d431b88e,Supertramp__The Logical Song - 2010 Remastered
...,...,...
11278664,8eae7b06f7ea4c19b82ba1b82fe6d130,The Black Keys__These Days
8796487,e7e5985a7a6d9d8404614fa4fdfe2418,Drake__Marvins Room
8756857,729872f82d51739a9c898f113787ce52,The 1975__Girls
11261024,ff07dd0a6facfba39bca491ec9dd5068,Sex Pistols__God Save The Queen


In [59]:
test = test[test['user_id'].isin(set(train['user_id']))]

In [60]:
train.to_csv("train.csv", index = False)
test.to_csv("test.csv", index = False)

In [63]:
train_dataset = Dataset()
train_dataset.fit(train['user_id'],
                  train['song'])


test_dataset = Dataset()
test_dataset.fit(test['user_id'],
                 test['song'])

In [64]:
num_users, num_items = train_dataset.interactions_shape()
print('Num train users: {}, num_train_items {}.'.format(num_users, num_items))

num_users, num_items = test_dataset.interactions_shape()
print('Num test users: {}, num_test_items {}.'.format(num_users, num_items))

Num train users: 14907, num_train_items 10000.
Num test users: 13619, num_test_items 10000.


In [65]:
def df_to_tuple_iterator(df):
    return zip(*df.values.T)

In [67]:
train_mat, train_mat_weights = train_dataset.build_interactions(df_to_tuple_iterator(train[['user_id',
                                                                                            'song']]))
train_mat

<14907x10000 sparse matrix of type '<class 'numpy.int32'>'
	with 2270280 stored elements in COOrdinate format>

In [68]:
test_mat, test_mat_weights = test_dataset.build_interactions(df_to_tuple_iterator(test[['user_id',
                                                                                        'song']]))
test_mat

<13619x10000 sparse matrix of type '<class 'numpy.int32'>'
	with 400555 stored elements in COOrdinate format>

In [71]:
save_npz('train_sparse.npz', train_mat)
save_npz('test_sparse.npz', test_mat)