# In[None]:

# ## The first step is to transform all of the .hdf5 files into a usable dataframe. This first cell iterates over the directory and places the data into a Dask dataframe.
# 

# In[1]:


import time
import dask.dataframe as dd
import os
import sys
import time
import glob
import datetime
import sqlite3
import numpy as np 
import pandas as pd
# we define this very useful function to iterate the files
def list_all_files(filelist,basedir,func=lambda x: x,ext='.h5'):
    """
    Collect every file under 'basedir' whose name ends with 'ext'.

    Walks 'basedir' and all of its subdirectories, calls 'func' on each
    matching path and appends the path to 'filelist'.
    INPUT
       filelist - list the matching paths are appended to (mutated in place)
       basedir  - base directory of the dataset
       func     - function called once per matching path; its return value
                  is ignored. The default is the identity, i.e. a no-op.
       ext      - extension, .h5 by default
    RETURN
       the same 'filelist' object, extended with the matching paths
    """
    # iterate over all files in all subdirectories
    for root, _dirs, _files in os.walk(basedir):
        # Escape the directory part so glob metacharacters in a folder name
        # ('[', ']', '*', '?') are matched literally instead of being read
        # as a pattern, which would silently skip that folder.
        pattern = os.path.join(glob.escape(root), '*' + ext)
        for path in glob.glob(pattern):
            func(path)
            filelist.append(path)

    return filelist


# Build one lazy Dask dataframe out of the three song tables stored in every
# .h5 file, and time how long the setup plus a first peek at the data takes.
start = time.time()
mylist = list()
songdir = 'pathto/MillionSongSubset' # to run this locally, download the dataset and point this path at it
songlist = list_all_files(mylist, songdir)
# NOTE(review): the first three discovered paths are discarded; the reason is
# not visible in this file - confirm it is still wanted.
del songlist[:3]
print(songlist)
df = dd.read_hdf(songlist,'/metadata/songs')
df2 = dd.read_hdf(songlist, '/analysis/songs')
df4 = dd.read_hdf(songlist, '/musicbrainz/songs')
# dd.concat is the public entry point; dd.multi is an internal module.
df3 = dd.concat([df,df2,df4], axis=1)
df3.head()
end = time.time()
# Report the measurement instead of computing it and throwing it away.
print(end-start)


## Time - 2.5 minutes - scale to MSD... ~ 4 hours. Not bad


# Materialise the lazy Dask dataframe as an in-memory pandas dataframe and
# print how many seconds that took.
start = time.time()
df = df3.compute()
end = time.time()
elapsed = end - start
print(elapsed)
#time - 7.5 minutes - Scale to MSD... ~ 12 hours. Wish this was faster.


# ## Now, we have to use Spotipy to fill in some missing audio analysis features.

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
# Spotify client authenticated through SpotifyClientCredentials.
# Replace the two placeholders with your own application credentials.
cid = 'your_client_id'
secret = 'your_client_secret'
credential_kwargs = {'client_id': cid, 'client_secret': secret}
client_credentials_manager = SpotifyClientCredentials(**credential_kwargs)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)


import time
def get_sp_track_id(this_track):
    """
    Look up the Spotify track id for one song.

    Searches Spotify (through the module-level client 'sp') for a track
    matching the song's artist and title and returns the id of the first hit.
    INPUT
       this_track - mapping / dataframe row with 'artist_name' and 'title'
    RETURN
       the id of the first search result, or None when the search fails or
       returns no items
    """
    artist = this_track['artist_name']
    song = this_track['title']
    try:
        search_result = sp.search(q='artist:' + artist + ' track:' + song,type='track')
        return search_result['tracks']['items'][0]['id']
    except Exception:
        # Best-effort lookup: any failure means "no id for this song".
        # 'Exception' rather than a bare 'except' so that Ctrl-C
        # (KeyboardInterrupt) and SystemExit still stop a long-running loop.
        print('no such track found')
        return None
start_time = time.time()

# One Spotify lookup per dataframe row; the results become a new column.
sp_track_ids = [get_sp_track_id(row) for _, row in df.iterrows()]
df['sp_track_id'] = sp_track_ids

end_time = time.time()

print("Time elapsed: ", end_time - start_time, "seconds")
#time - 24 minutes - Scale to MSD... ~ 40 hours T_T


# Drop the dataset's own copies of these columns and keep only the songs that
# got a Spotify id; the features are re-fetched from Spotify below.
dropped_df = df.drop(['danceability','energy','loudness','mode','tempo','key','key_confidence','time_signature'],axis=1)
dropped_df = dropped_df[~dropped_df['sp_track_id'].isna()]
import numpy as np
import time
start = time.time()
feature_list = ['danceability','energy','key','loudness','mode','speechiness','acousticness','instrumentalness','liveness','valence','tempo','time_signature']
features_list = []
track_ids = dropped_df['sp_track_id'] #column of all track_ids
for x in track_ids: # for each track id
    try:
        # The request itself is inside the try so a failed call costs one
        # row instead of aborting the whole run.
        results = sp.audio_features(x) #get its audio features
        track_features = results[0] if results else None
    except Exception:
        # 'Exception', not a bare 'except', so Ctrl-C still stops the loop.
        track_features = None
    if track_features is None:
        print("No audio features for this track")
        # Full-width placeholder row: keeps features_df rectangular and in
        # step with dropped_df, one row per track.
        feature_values = [None] * len(feature_list)
    else:
        feature_values = [track_features.get(y) for y in feature_list]
    features_list.append(feature_values)
features_df = pd.DataFrame(features_list,columns= feature_list)

end = time.time()
print(end-start)
# Time -  11 minutes, scaled to MSD... ~ 18 hours

#Total time for all operations - 


# features_df holds one row per row of dropped_df, in the same order, so the
# two frames are paired by position. Joining on dropped_df's index *values*
# (the previous 'on=dropped_df.index') multiplies rows whenever those labels
# repeat and adds a stray 'key_0' column; resetting the index avoids both.
full_df = pd.merge(
    features_df,
    dropped_df.reset_index(drop=True),
    left_index=True,
    right_index=True,
)


full_df.head()

import pandas as pd
# Load the finished dataset and the headerless style file, then attach a
# style to every track (tracks without one end up with 0).
df = pd.read_csv('pathto/finished_dataset.csv')
styles = pd.read_csv('pathto/Styles (1).csv',header=None)

styles.rename(columns={0: 'track_id', 1: 'Style'}, inplace=True)
styles.head()
merged_with_styles = pd.merge(df, styles, on='track_id', how='left')
df_merged = merged_with_styles.fillna(0)
df_merged


pd.DataFrame(df_merged.Style.unique())


#Let's set up a smaller df and preprocess it for a random forest genre classification model.
import numpy as np
model_columns = ['title','artist_name','track_id','song_id','sp_track_id','Style','danceability','energy','key','loudness','speechiness','acousticness','instrumentalness','valence','tempo','time_signature','year']
# .copy() makes this an independent frame, so the edits below neither touch
# df_merged nor depend on pandas' view-vs-copy behaviour.
df_with_genre = df_merged.loc[df_merged['Style'] != 0, model_columns].copy()


# Replace 0 with the column median. The result is assigned back to the
# column: calling replace(..., inplace=True) on a column selection is chained
# assignment and does not update the frame under Copy-on-Write.
# NOTE(review): the median is computed with the zeros still included, as
# before - confirm that is intended.
for col in ('year', 'key', 'instrumentalness'):
    df_with_genre[col] = df_with_genre[col].replace(0, df_with_genre[col].median())
# Every remaining 0 is treated as missing and the affected rows are dropped.
df_with_genre.replace(0, np.nan, inplace=True)
df_rf_dropped=df_with_genre.dropna()
# Independent copy: later cells add and overwrite columns on this frame.
kMeansFrame = df_rf_dropped[['track_id','Style','danceability','energy','key','loudness','speechiness','acousticness','instrumentalness','valence','tempo','time_signature','year']].copy()


df_rf_dropped.head()


from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint

# Baseline genre classifier: predict 'Style' from the numeric columns only.
x = df_rf_dropped.drop(['Style','title','artist_name','track_id','song_id','sp_track_id'],axis=1)
y = df_rf_dropped['Style']
# Fixed seeds (same value as the KMeans cells) so the reported accuracy is
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=123)
rf = RandomForestClassifier(random_state=123)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

## This is pretty bad; maybe it is not possible to detect genre from audio features like these. Let's try clustering instead, which may provide better recommendations.


from sklearn.cluster import KMeans
from sklearn.preprocessing import OrdinalEncoder,LabelEncoder # for encoding categorical features from strings to number arrays

km = KMeans(n_clusters=10, random_state=123)
le = LabelEncoder()
# Turn the style labels into integer codes so they can be used as a feature.
kMeansFrame['Style'] = le.fit_transform(kMeansFrame['Style'])
# Normalize the data: divide every column by its largest absolute value.
features_only = kMeansFrame.drop(columns='track_id')
column_scale = features_only.abs().max()
kMeansNorm = features_only / column_scale
display(kMeansNorm)

#Let's tune our model
import matplotlib.pyplot as plt
# Elbow curve: fit one model per cluster count from 2 to 24 and record its
# within-cluster sum of squares (inertia).
x = list(range(2,25))
wcss = []
for n_clusters in x:
    candidate = KMeans(n_clusters=n_clusters, random_state=123)
    candidate.fit_predict(kMeansNorm)
    wcss.append(candidate.inertia_)
plt.plot(x,wcss)
# Final model with 10 clusters; store each track's cluster label.
kModel = KMeans(n_clusters=10, random_state=123)
clusters = kModel.fit_predict(kMeansNorm)
kMeansFrame['cluster'] = clusters
kMeansFrame


# Looks like 10 is good, so we will keep it at that.
# Attach the cluster label to the full merged dataset; tracks that were not
# clustered get no label.
cluster_labels = kMeansFrame[['track_id','cluster']]
df_with_clusters = df_merged.merge(cluster_labels, on='track_id', how='left')
df_with_clusters


# The score file has no header row: name its first three columns.
userScores = pd.read_csv('pathto/UserScores.csv',header=None)
userScores.rename(columns={0: 'user_id', 1: 'song_id', 2: 'plays'}, inplace=True)
userScores.head()
# One-column frames holding each distinct user and each distinct song.
uniqueUser = pd.DataFrame({'user_id': userScores['user_id'].unique()})
uniqueSongs = pd.DataFrame({'song_id': userScores['song_id'].unique()})
uniqueUser


# Number of score rows per user, least active users first.
user_counts = userScores['user_id'].value_counts(ascending=True)
subsetUserList = pd.DataFrame(user_counts)
# Keep users with at least 20 rows. The frame is built straight from the
# counts Series because the column name value_counts() produces differs
# between pandas versions ('user_id' before 2.0, 'count' from 2.0 on), and
# because renaming a filtered slice in place is chained assignment.
frequent = user_counts[user_counts >= 20]
subsetUserList_over20 = pd.DataFrame(
    {'count': frequent.values, 'user_id': frequent.index.values},
    index=frequent.index.values,
)


subsetUserList_over20


# Restrict both the user list and the score rows to the frequent users.
frequent_user_ids = subsetUserList_over20['user_id']
subsetUniqueUsers = uniqueUser.loc[uniqueUser['user_id'].isin(frequent_user_ids)]
subsetUniqueUsers
subsetUserScores = userScores.loc[userScores['user_id'].isin(frequent_user_ids)]
subsetUserScores


# User x song table of play counts; songs a user has no row for become 0.
userMatrix = pd.pivot_table(
    subsetUserScores,
    index='user_id',
    columns='song_id',
    values='plays',
    fill_value=0,
)


from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
# Brute-force nearest-neighbour model over the user rows, using cosine distance.
knn_settings = {'metric': 'cosine', 'algorithm': 'brute'}
kNNMatrix = csr_matrix(userMatrix.values)
model_knn = NearestNeighbors(**knn_settings).fit(kNNMatrix)


# Walk-through for the first user in userMatrix: find the 6 nearest rows,
# then take from each of the last 5 the most-played song the user has no
# score row for.
collab_songRecFrame = pd.DataFrame(subsetUniqueUsers['user_id'])
distances, indices = model_knn.kneighbors(userMatrix.iloc[0,:].values.reshape(1,-1), n_neighbors=6)
nearNeighbors_idx = pd.DataFrame(indices)
for i, user_indices in enumerate(indices):
    print(f"User {i} nearest neighbors:", user_indices)
# Drop the first neighbour column (presumably the query user itself, at distance 0).
nearNeighbors_idx.drop(columns=nearNeighbors_idx.columns[0], axis=1,  inplace=True)
nn_idx_t = nearNeighbors_idx.T
nn_idx_t.rename(columns = {0:'user_idx'}, inplace = True)

# Score rows of the target user; identical for every neighbour, so it is
# looked up once instead of once per iteration.
user = userScores[userScores['user_id']==userMatrix.index[0]]
sr = []
for i in nn_idx_t['user_idx']:
    nn1 = userScores[userScores['user_id']==userMatrix.index[i]]
    nn_novel = nn1[~nn1['song_id'].isin(user['song_id'])]
    # sort_values returns a new frame; the result has to be kept, otherwise
    # head(1) just takes the first row in file order.
    nn_novel = nn_novel.sort_values('plays',ascending=False)
    sr.append(nn_novel['song_id'].head(1))

sr


# Same procedure for the first (up to) 10 users in userMatrix: one
# recommendation from each of the 5 nearest neighbours.
all_sr = []
userlist= []
import time
start = time.time()
for user in range(min(10, len(userMatrix))):
    user_id = userMatrix.index[user]
    distances, indices = model_knn.kneighbors(userMatrix.iloc[user,:].values.reshape(1,-1), n_neighbors=6)# get indices for 6 NN
    # Songs this user already has a score row for; the same for all neighbours.
    user1 = userScores[userScores['user_id']==user_id]
    heard = user1['song_id']
    sr = []
    # Skip the first neighbour (presumably the user themselves), keep the other 5.
    for i in indices[0, 1:]:
        nn1 = userScores[userScores['user_id']==userMatrix.index[i]]
        nn_novel = nn1[~nn1['song_id'].isin(heard)]
        # Keep the sorted frame: sort_values does not sort in place.
        nn_novel = nn_novel.sort_values('plays',ascending=False)
        # A neighbour may have nothing new to offer; record None instead of
        # failing on .iloc[0] of an empty frame.
        sr.append(nn_novel['song_id'].iloc[0] if not nn_novel.empty else None)
    all_sr.append(sr)
    userlist.append(user_id)
end= time.time()
print(end-start)

# One row per user, one column per recommended song_id.
songRecs = pd.DataFrame(all_sr)
rec_columns = list(songRecs.columns)
songRecs['users'] = userlist
# song_id -> title lookup. Assigning a filtered 'title' Series directly would
# align on df_rf_dropped's row labels rather than on the recommended ids, so
# the titles would land on the wrong rows or come out as NaN.
title_by_song = df_rf_dropped.drop_duplicates('song_id').set_index('song_id')['title']
for rank in rec_columns:
    # Ids that are not present in df_rf_dropped get NaN as their title.
    songRecs['song' + str(rank)] = songRecs[rank].map(title_by_song)
songRecs