### Model Preparation

In [101]:
# importing from libraries
import config
import spotipy
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler
from pyjarowinkler import distance
from IPython.display import IFrame
from spotipy.oauth2 import SpotifyClientCredentials

#Initialize SpotiPy with user credentias
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id= config.c_id,
                                                           client_secret= config.c_se))

In [102]:
# loading top 100 chart (top_chart_df) and song database (upsampled_df)
top_chart_df = pd.read_csv('top_chart_df.csv')
upsampled_df = pd.read_csv('upsampled_df.csv')

In [103]:
# function for loading files with pickle
def load(filename = "filename.pickle"): 
    try: 
        with open(filename, "rb") as f: 
            return pickle.load(f) 
        
    except FileNotFoundError: 
        print("File not found!") 

In [104]:
# loading scaler
scaler = load("Model/scaler.pickle")

In [105]:
# loading kmeans
kmeans = load("Model/kmeans.pickle")

### Function For Song Recommender

In [106]:
# function for recommending similar songs
def song_recommender():
    user_input = input("Please input a song: ")   ### asking user for an input
    song_list_cleaned = [each_string.replace(" ","").lower() for each_string in list(top_chart_df.title)]   ### getting generic song titles by converting to lowercase and removing space
    artist_list_cleaned = [each_string.replace(" ","").lower() for each_string in list(top_chart_df.artist)]   ### getting generic artist names by converting to lowercase and removing space
    user_input_cleaned = user_input.replace(" ", "").lower()   ### getting generic user input of song by converting to lowercase and removing space
    
    duplicate_check = top_chart_df[top_chart_df.title.duplicated(keep=False)]   ### checking for tracks in top chart having the same title
    duplicate_list = [each_string.replace(" ","").lower() for each_string in list(duplicate_check.title)]   ### getting generic duplicate song titles by converting to lowercase and removing space
    
    similarity_count = 0
    similarity_song_list = []
    similarity_score_list = []
    for i in range(0, len(top_chart_df.title)):   ### computing the text similarity between 0 and 1 based on the cleaned user input with each cleaned song title from the top chart
        similarity_score = distance.get_jaro_distance(user_input_cleaned, song_list_cleaned[i], winkler=True, scaling=0.1)
        similarity_song_list.append(top_chart_df.title[i])
        similarity_score_list.append(similarity_score)
        if similarity_score > 0.9:   ### a song is matched with the user input of song if their similarity score is larger than 0.9
            similarity_count = similarity_count+1
        else:
            similarity_count = similarity_count   ### if no song from the top chart has a similarity score more than 0.9, then user input of song is not in top chart and similarity count will be zero
            
    similarity_df = pd.DataFrame({"title":similarity_song_list, "similarity":similarity_score_list})   ### saving the similarity score results in a dataframe
    
    user_input_corrected = similarity_df.loc[similarity_df.similarity == max(similarity_df.similarity), "title"].values[0]   ### correcting the user input of song with the most similar song title from the top chart, given similarity score is larger than 0.9
    user_input_corrected_and_cleaned = user_input_corrected.replace(" ","").lower()   ### getting generic corrected user input of song by converting to lowercase and removing space
    
    ## when there is a song match from the top chart
    if similarity_count>0:
        while True:
            try:
                suggestion = top_chart_df[top_chart_df["title"]!=user_input_corrected].sample(1)   ### looking for another track in the top chart as track suggestion
                suggestion_title = suggestion.iloc[0][0]   ### locating the track suggestion title
                suggestion_artist = suggestion.iloc[0][1]   ### locating the track suggestion artist
                suggestion_id = sp.search(q=suggestion_title+" "+suggestion_artist ,limit=1,market="GB")['tracks']['items'][0]['id']   ### looking for the spotify id of the track suggestion
            except:
                continue
            else:
                break
        
        while True:
            if user_input_corrected_and_cleaned in duplicate_list:   ### checking if there are duplicate titles in the top chart for the user input of song
                print(top_chart_df[top_chart_df.title == user_input_corrected])   ### displaying the duplicate song titles and its artists
                print('\n')
                artist_user_input = input("Which song do you mean? Please input the artist: ")   ### asking user to input the artist
                artist_user_input_cleaned = artist_user_input.replace(" ", "").lower()   ### getting generic user input of song by converting to lowercase and removing space

                suggestion = top_chart_df[(top_chart_df["title"]!=user_input_corrected) | (top_chart_df["artist"]!=artist_user_input)].sample(1)   ### looking for another track in the top chart as track suggestion
                suggestion_title = suggestion.iloc[0][0]   ### locating the track suggestion title
                suggestion_artist = suggestion.iloc[0][1]   ### locating the track suggestion artist
                try:
                    suggestion_id = sp.search(q=suggestion_title+" "+suggestion_artist ,limit=1,market="GB")['tracks']['items'][0]['id']   ### looking for the spotify id of the track suggestion
                    if artist_user_input_cleaned in artist_list_cleaned:
                        break
                except:
                    continue
            else:
                break

        print("\nYour song is in the top 100 chart! Here is another one from the top chart for you.")
        return IFrame(src="https://open.spotify.com/embed/track/"+suggestion_id, width="420", height="80", frameborder="0", allowtransparency="true", allow="encrypted-media",)   ### displaying the embedded spotify player for the recommended song
    
    ## when there is no song match from the top chart
    else:
        print('The song you input is not in the top 100 chart.')
    
        result = sp.search(q=user_input ,limit=5,market="GB")   ### searching with user input of song on spotify and returning top 5 search results
        
        while True:
            if len(result['tracks']['items']) == 0:   ### when there is no result for the search of song
                user_input = input('There is no result for your input, please try again: ')   ### asking user for a new input
                result = sp.search(q=user_input ,limit=5,market="GB")   ### searching with user input of song on spotify and returning top 5 search results
            else:
                for i in range(0,len(result['tracks']['items'])):   ### displaying the top 5 search results
                    print(str(i+1)+". "+(result['tracks']['items'][i]['artists'][0]['name'])+" ("+result['tracks']['items'][i]['name']+")")
                print('\n0. The song I want is not on the list.')
                try:
                    number_input = int(input("Which song and artists do you mean? Please input the number: "))   ### asking user to choose from the top search results
                    print('\n')
                except:
                    print('\nPlease input a vaild number.')
                    continue
                else:
                    if int(number_input) == 0:
                        user_input = input('Please input a song again, artist name can be included for a better search result: ')   ### asking user for a new input
                        result = sp.search(q=user_input ,limit=5,market="GB")   ### searching with user input of song on spotify and returning top 5 search results
                        continue
                    elif int(number_input) in range(1, len(result['tracks']['items'])+1):
                        break
                    else:
                        print('Please input a vaild number.')
                        continue
            
        new_input_uri = result['tracks']['items'][number_input-1]['uri']   ### locating the track based on user input
        my_dict = sp.audio_features(new_input_uri)[0]   ### getting the audio features of the chosen track
        my_dict_new = { key: [my_dict[key]] for key in list(my_dict.keys()) }   ### converting the extracted audio features for dataframe format
        audio_feature_df = pd.DataFrame(my_dict_new)   ### saving the extracted audio features in a dataframe

        audio_feature_df.drop(columns=['type','uri','track_href','analysis_url','duration_ms','time_signature','id'],inplace=True)   ### dropping irrelvant columns from audio feature dataframe

        audio_feature_tf_df = scaler.transform(audio_feature_df)   ### scaling the audio feature data with the scaler from the model (refer to Jupyter Notebook 2_Clustering) 
        user_input_cluster = kmeans.predict(audio_feature_tf_df)[0]   ### predicting the cluster of the user in put of song based on the K-means model (refer to Jupyter Notebook 2_Clustering)

        track_id = upsampled_df[upsampled_df['cluster']==user_input_cluster].sample(1)['track_id'].values[0]   ### looking for the track id from the song database, sampling from tracks in the same cluster as the predicted cluster
        print('\n')
        print('Here is another similar song for you.')
        return IFrame(src="https://open.spotify.com/embed/track/"+track_id, width="420", height="80", frameborder="0", allowtransparency="true", allow="encrypted-media",)   ### displaying the embedded spotify player for the recommended song

### Song Recommender

Mechanism: if the song input is on the current Billboard top 100 list, another song from the top chart will be recommended. Otherwise, a similar song will be recommended. Try it out!

In [125]:
song_recommender()

Please input a song: freed from desire
The song you input is not in the top 100 chart.
1. Gala (Freed From Desire)
2. Fun[k]House (Freed from Desire - Club Mix)
3. Drenchill (Freed from Desire (feat. Indiiana))
4. Gala (Freed from Desire - Xtm Remix Edit)
5. Madism (Freed From Desire)

0. The song I want is not on the list.
Which song and artists do you mean? Please input the number: 1




Here is another similar song for you.




In [108]:
upsampled_df[upsampled_df['title']=="I Feel Like I'm Drowning"]

Unnamed: 0,title,artist,track_id,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,cluster
2969,I Feel Like I'm Drowning,Two Feet,3aauaXWRgwCMoykMbI0Jq1,rock,0.717,0.577,8,-5.436,1,0.0491,0.708,0.00428,0.0899,0.235,104.927,12


In [99]:
for i in range(1, max(upsampled_df['cluster'])):
    genre_count = upsampled_df[upsampled_df['cluster']==i].genre.value_counts()
    print(genre_count)

jazz         164
classical      6
house          1
pop            1
metal          1
Name: genre, dtype: int64
jazz         84
rnb          22
classical    21
pop          19
rock          6
country       6
metal         4
Name: genre, dtype: int64
house     150
techno     66
dance      49
jazz       18
pop         2
rock        2
metal       1
Name: genre, dtype: int64
pop        104
house      101
country     80
kpop        76
rock        57
dance       57
techno      42
hiphop      30
rnb         24
metal        8
jazz         1
Name: genre, dtype: int64
rnb       61
hiphop    52
kpop      26
pop       16
house     10
techno     6
jazz       6
rock       4
dance      3
Name: genre, dtype: int64
metal      106
rock        51
pop         11
rnb          6
dance        4
jazz         3
kpop         3
techno       2
country      1
Name: genre, dtype: int64
dance      96
pop        83
house      79
kpop       74
rnb        68
hiphop     34
rock       33
techno     23
country    13
jazz  

In [98]:
max(upsampled_df['cluster'])

36

In [100]:
upsampled_df.cluster.value_counts()

4     580
18    568
7     514
14    441
21    433
35    421
28    357
19    348
11    333
13    328
8     327
20    322
9     318
0     313
15    300
23    293
3     288
22    280
32    278
26    276
36    263
25    262
16    235
17    227
12    225
27    224
31    205
30    193
34    189
6     187
5     184
1     173
10    169
2     162
24    150
29     96
33     75
Name: cluster, dtype: int64