In [1]:
# Import 
import numpy as np
import pandas as pd
from textblob import TextBlob
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Filter Warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Read Data
song_df = pd.read_csv('feature-data.csv')
song_df.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,artist_pop,genres,track_pop,pos,artist_name,artist_uri,track_name,album_uri,duration_ms_y,album_name
0,0.675,0.927,1,-5.494,0,0.07,0.0136,0.00969,0.124,0.968,...,55,pop_r&b r&b teen_pop,0,100,SoMo,spotify:artist:5AtWO8IEnjhRLzaVVQxFUs,Letters,spotify:album:1UGKxHlPsnPJd4gXhxJCXC,163977,My Life
1,0.832,0.87,8,-4.2,0,0.167,0.267,0.0,0.412,0.566,...,80,dance_pop,69,101,Akon,spotify:artist:0z4gvV4rjIZ9wHck67ucSV,Right Now (Na Na Na),spotify:album:7EZP6OSbWvMZTrEmlBt1HX,240746,Freedom
2,0.635,0.808,5,-5.961,0,0.0438,0.00665,0.0,0.134,0.669,...,80,dance_pop,18,102,Akon,spotify:artist:0z4gvV4rjIZ9wHck67ucSV,Real Niggas,spotify:album:5OxggY6YGvJiYWskRN5T7W,229693,Konvict Allstars
3,0.582,0.729,3,-8.141,0,0.62,0.314,0.0,0.418,0.554,...,80,dance_pop,27,103,Akon,spotify:artist:0z4gvV4rjIZ9wHck67ucSV,Get By,spotify:album:7BuPO7yqJYGScpH2Qq6ZSu,204589,KonKrete Jungle
4,0.857,0.781,7,-4.256,0,0.104,0.13,0.0,0.402,0.835,...,80,dance_pop,0,104,Akon,spotify:artist:0z4gvV4rjIZ9wHck67ucSV,Troublemaker,spotify:album:2EPsQyatLP7uIoT7sOEaWm,237173,Freedom


In [4]:
song_df.columns

Index(['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'type', 'id', 'track_uri', 'track_href', 'analysis_url',
       'duration_ms_x', 'time_signature', 'artist_pop', 'genres', 'track_pop',
       'pos', 'artist_name', 'artist_uri', 'track_name', 'album_uri',
       'duration_ms_y', 'album_name'],
      dtype='object')

In [5]:
# Removing Redundant Columns
def select_cols(df):
    '''
    Select useful columns
    '''
    
    return df[['artist_name','id', 'track_name', 'danceability', 'energy', 'key',
             'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
             'liveness', 'valence', 'tempo', 'artist_pop', 'genres', 'track_pop']]
song_df = select_cols(song_df)

In [6]:
# Creating genre list
def genre_list(df):
    '''
    Preprocess genre data
    '''
    
    return df['genres'].apply(lambda x: list(x.split()))
song_df['genre_list'] = genre_list(song_df)

In [7]:
# Create Features
def get_subjectivity(text):
    '''
    Returns subjectivity of text
    Input:
    text (str): String to be analyzed
    
    Output:
    str
    '''
    
    return TextBlob(text).sentiment.subjectivity

def get_polarity(text):
    '''
    Returns polarity of text
    Input:
    text (str): String to be analyzed
    
    Output:
    str
    '''
    
    return TextBlob(text).sentiment.polarity

def get_analysis(x, task = 'polarity'):
    '''
    Categorize scores
    '''
    
    if task == 'subjectivity':
        if x > 1/3:
            return 'high'
        elif x < 1/3:
            return 'low'
        else:
            return 'medium'
    else:
        if x > 0:
            return 'positive'
        elif x < 0:
            return 'negative'
        else:
            return 'neutral'

In [8]:
def sentiment_analysis(df, col):
    '''
    Creates sentiment columns
    Input: 
    df (pandas dataframe): Spotify Dataframe
    col (str): Column to be processed
        
    Output: 
    df (pandas dataframe): Sentiment features
    '''
    
    df['subjectivity'] = df[col].apply(lambda x: get_analysis(get_subjectivity(x), 'subjectivity'))
    df['polarity'] = df[col].apply(lambda x: get_analysis(get_polarity(x)))
    return df

In [9]:
# Results
sentiment_df = sentiment_analysis(song_df, 'track_name')
sentiment_df.head()

Unnamed: 0,artist_name,id,track_name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,artist_pop,genres,track_pop,genre_list,subjectivity,polarity
0,SoMo,0WylixnDwAtnmK8CxOuldn,Letters,0.675,0.927,1,-5.494,0,0.07,0.0136,0.00969,0.124,0.968,149.994,55,pop_r&b r&b teen_pop,0,"[pop_r&b, r&b, teen_pop]",low,neutral
1,Akon,1ARJhjuI6TNYZCxYygFQ4F,Right Now (Na Na Na),0.832,0.87,8,-4.2,0,0.167,0.267,0.0,0.412,0.566,138.003,80,dance_pop,69,[dance_pop],high,positive
2,Akon,41zw0MQZTYTdS6QhutwA3v,Real Niggas,0.635,0.808,5,-5.961,0,0.0438,0.00665,0.0,0.134,0.669,145.024,80,dance_pop,18,[dance_pop],low,positive
3,Akon,3VzZRoYHpQiU8AJZ0Lz4ME,Get By,0.582,0.729,3,-8.141,0,0.62,0.314,0.0,0.418,0.554,80.196,80,dance_pop,27,[dance_pop],low,neutral
4,Akon,6CaMlPv0MvfRsHE1cQAHEZ,Troublemaker,0.857,0.781,7,-4.256,0,0.104,0.13,0.0,0.402,0.835,122.059,80,dance_pop,0,[dance_pop],low,neutral


In [10]:
# One-hot Encoder
def ohe(df, col, new_name):
    ''' 
    Create One Hot Encoded features of a specific column
    ---
    Input: 
    df (pandas dataframe): Dataframe
    col (str): Column to be processed
    new_name (str): new column name to be used
        
    Output: 
    dummies (pandas dataframe): One-hot encoded features 
    '''
    
    dummies = pd.get_dummies(df[col])
    columns = dummies.columns
    dummies.columns = [new_name + '|' + str(name) for name in columns]
    return dummies

In [11]:
subject_ohe = ohe(sentiment_df, 'subjectivity', 'subject')
subject_ohe.columns

Index(['subject|high', 'subject|low', 'subject|medium'], dtype='object')

In [12]:
# Vectorize column
def vectorize(df, col, new_name):
    ''' 
    Create TFIDF-Vectorized features of a specific column
    ---
    Input: 
    df (pandas dataframe): Dataframe
    col (str): Column to be processed
    new_name (str): new column name to be used
        
    Output: 
    vector_df (pandas dataframe): TFIDF features 
    '''
    
    tfidf = TfidfVectorizer()
    vector = tfidf.fit_transform(df[col])
    vector_df = pd.DataFrame(vector.toarray())
    vector_df.columns = [new_name + '|' + i for i in tfidf.get_feature_names_out()]
    return vector_df

In [13]:
genre_tfidf = vectorize(sentiment_df, 'genres', 'genre')
genre_tfidf.iloc[0]

genre|432hz           0.0
genre|48g             0.0
genre|_brasileira     0.0
genre|_hip_hop        0.0
genre|_indie          0.0
                     ... 
genre|zenonesque      0.0
genre|zilizopendwa    0.0
genre|zolo            0.0
genre|zouk            0.0
genre|zouk_riddim     0.0
Name: 0, Length: 2131, dtype: float64

In [14]:
# Normalize
def normalize(column):
    ''' 
    Create Normalized features of a specific column
    ---
    Input: 
    column (pandas dataframe): Dataframe
   
    Output: 
    norm_feature (pandas dataframe): Normalized features 
    '''
    
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(column)
    norm_feature = pd.DataFrame(scaled, columns = column.columns)
    return norm_feature

In [15]:
norm_pop = normalize(sentiment_df[['artist_pop', 'track_pop']])

In [16]:
def preprocessing_pipeline(df):
    '''
    Preprocesses Spotify-dataset for recommendation system
    ---
    Input:
    df (pandas dataframe): Spotify-dataset DataFrame
    
    Output:
    final_df (pandas dataframe): Preprocessed DataFrame
    '''
    
    # Select Columns
    df = select_cols(df)
    
    # Float columns
    float_cols = df.dtypes[df.dtypes == 'float64'].index.values
    
    # Sentiment Analysis
    df = sentiment_analysis(df, 'track_name')
    
    # OHE
    subject_ohe = ohe(df, 'subjectivity', 'subject')
    polar_ohe = ohe(df, 'polarity', 'polar')
    mode_ohe = ohe(df, 'mode', 'mode')
    key_ohe = ohe(df, 'key', 'key')
    
    # TFIDF
    genre_vector = vectorize(df, 'genres', 'genre')
    
    # Normalization
    norm_pop = normalize(df[['artist_pop', 'track_pop']])
    norm_floats = normalize(df[float_cols])
    
    # Concatenate
    final_df = pd.concat([subject_ohe, polar_ohe, mode_ohe, key_ohe,
                          genre_vector, norm_pop, norm_floats],
                         axis = 1)
    
    # Give ID col
    final_df['id'] = df['id']
    
    return final_df

In [17]:
songs_db = preprocessing_pipeline(song_df)
songs_db.head()

Unnamed: 0,subject|high,subject|low,subject|medium,polar|negative,polar|neutral,polar|positive,mode|0,mode|1,key|0,key|1,...,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,id
0,0,1,0,0,1,0,1,0,0,1,...,0.683198,0.927928,0.88099,0.072917,0.013655,0.009739,0.124498,0.970913,0.680838,0WylixnDwAtnmK8CxOuldn
1,1,0,0,0,0,1,1,0,0,0,...,0.842105,0.870871,0.901906,0.173958,0.268072,0.0,0.413655,0.567703,0.626409,1ARJhjuI6TNYZCxYygFQ4F
2,0,1,0,0,0,1,1,0,0,0,...,0.642713,0.808809,0.873442,0.045625,0.006677,0.0,0.134538,0.671013,0.658278,41zw0MQZTYTdS6QhutwA3v
3,0,1,0,0,1,0,1,0,0,0,...,0.589069,0.72973,0.838207,0.645833,0.315261,0.0,0.419679,0.555667,0.364018,3VzZRoYHpQiU8AJZ0Lz4ME
4,0,1,0,0,1,0,1,0,0,0,...,0.867409,0.781782,0.901001,0.108333,0.130522,0.0,0.403614,0.837513,0.554038,6CaMlPv0MvfRsHE1cQAHEZ


In [18]:
songs_db.to_csv('Feature-data-clean.csv', index = False)

In [19]:
# Test Playlist
test_playlist = pd.read_csv('test-playlist.csv')

In [20]:
def playlist_to_feature(complete_feature_set, playlist_df):
    '''
    Creates Feature Vector of the Playlist
    ---
    Input:
    complete_feature_set (pandas DataFrame): Song features data
    playlist_df (pandas DataFrame): Song features data in the playlist
    
    Output:
    in_playlist (pandas Series): Playlist Vector
    non_playlist (pandas DataFrame): Set of all the song features not in the playlist
    '''
    
    # Create a list of the ids of all the songs in the playlist
    ids = list(playlist_df['id'])
    
    # Find song features in the playlist
    in_playlist = complete_feature_set[complete_feature_set['id'].isin(ids)]
    # Remove those ids from the superset
    non_playlist = complete_feature_set[~complete_feature_set['id'].isin(ids)]
    
    # Drop the id
    in_playlist.drop(['id'], axis = 'columns', inplace = True)
    
    return in_playlist.sum(axis = 0), non_playlist

In [21]:
# Generate the features
playlist_vector, non_playlist_vector = playlist_to_feature(songs_db, test_playlist)

In [22]:
playlist_vector

subject|high        13.000000
subject|low         45.000000
subject|medium       0.000000
polar|negative       4.000000
polar|neutral       43.000000
                      ...    
acousticness         6.809764
instrumentalness     0.062629
liveness            10.034739
valence             33.382146
tempo               32.646813
Length: 2162, dtype: float64

In [23]:
non_playlist_vector.head()

Unnamed: 0,subject|high,subject|low,subject|medium,polar|negative,polar|neutral,polar|positive,mode|0,mode|1,key|0,key|1,...,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,id
0,0,1,0,0,1,0,1,0,0,1,...,0.683198,0.927928,0.88099,0.072917,0.013655,0.009739,0.124498,0.970913,0.680838,0WylixnDwAtnmK8CxOuldn
1,1,0,0,0,0,1,1,0,0,0,...,0.842105,0.870871,0.901906,0.173958,0.268072,0.0,0.413655,0.567703,0.626409,1ARJhjuI6TNYZCxYygFQ4F
2,0,1,0,0,0,1,1,0,0,0,...,0.642713,0.808809,0.873442,0.045625,0.006677,0.0,0.134538,0.671013,0.658278,41zw0MQZTYTdS6QhutwA3v
3,0,1,0,0,1,0,1,0,0,0,...,0.589069,0.72973,0.838207,0.645833,0.315261,0.0,0.419679,0.555667,0.364018,3VzZRoYHpQiU8AJZ0Lz4ME
4,0,1,0,0,1,0,1,0,0,0,...,0.867409,0.781782,0.901001,0.108333,0.130522,0.0,0.403614,0.837513,0.554038,6CaMlPv0MvfRsHE1cQAHEZ


In [24]:
# Generate Recommendations
def generate_recommendation(df, features, non_playlist):
    '''
    Generates Recommendation Based on a Given Playlist
    ---
    Input:
    df (pandas DataFrame): All songs DataFrame
    features (pandas DataFrame): Playlist Feature Vector
    non_playlist (pandas DataFrame): Song Features not in the Playlist
    
    Output:
    non_playlist_top_40 (pandas DataFrame): Top 40 Recommendations
    '''
    
    non_playlist_data = df[df['id'].isin(non_playlist['id'].values)]
    
    # Find Cosine Similarity
    non_playlist_data['sim'] = cosine_similarity(non_playlist.drop(['id'], axis = 'columns').values,
                                                [features.values.reshape(-1, 1)[:,0]])
    # Select Top 40
    non_playlist_top_40 = non_playlist_data.sort_values('sim',ascending = False).head(40)
    
    return non_playlist_top_40[['artist_name', 'track_name', 'track_uri']]

In [25]:
generate_recommendation(song_df, playlist_vector, non_playlist_vector)

KeyError: "['track_uri'] not in index"

In [None]:
test_playlist[['artist_name', 'track_name', 'track_uri']]