# GA Capstone - Sortify

##### Aziz Maredia | DSIR-1019 | 01.27.21

### Problem Statement

I am going to build a music recommender system using the Spotify and Genius APIs. The app will sort an album entered by the user into the order of songs the user will most likely enjoy, by analyzing the audio features and lyrics of the user's recently saved tracks, recently played tracks, and top tracks.

### Library/Packages Import

In [1]:
import pandas as pd
import numpy as np
from math import floor

import spotipy
from spotipy.oauth2 import SpotifyOAuth

from PIL import Image
from requests import get

from flask import Flask, render_template, redirect, request, session, make_response,session,redirect

pd.set_option('display.max_rows', None)

### API User Authorization

In [2]:
import os

# app credentials come from the environment so the client secret never lives in the notebook
# these are the same variable names spotipy itself falls back to when an argument is None
# NOTE(review): the secret that used to be hard-coded in this cell should be rotated in the Spotify dashboard
cid = os.environ.get('SPOTIPY_CLIENT_ID')
secret = os.environ.get('SPOTIPY_CLIENT_SECRET')

# redirect uri must match the one registered for the app; the previous value is kept as the default
uri = os.environ.get('SPOTIPY_REDIRECT_URI', 'http://localhost:9999/lab/workspaces/auto-c')

# file the oauth token is cached in, and the permissions requested from the user
cache = '.spotipyoauthcache'
scope = 'user-library-read user-library-modify'

In [3]:
# oauth manager built from the settings defined in the cell above
auth = SpotifyOAuth(
    scope = scope,
    cache_path = cache,
    redirect_uri = uri,
    client_id = cid,
    client_secret = secret,
)

# spotify client used by every api call in this notebook
sp = spotipy.Spotify(oauth_manager = auth)

In [4]:
# fetch the signed-in user's profile once and keep the two fields used in the greeting
user = sp.current_user()
user_name, user_profile = user['display_name'], user['external_urls']['spotify']

In [5]:
# first whitespace-separated word of the display name
name = user['display_name'].split(maxsplit = 1)[0]

In [6]:
# greet the user by first name and link to their profile ("Spotify" was misspelt in the message)
print(f'Hello {user_name.split()[0]}! Click the following link to view your Spotify profile: {user_profile}')

Hello Aziz! Click the following link to view your Spoitfy profile: https://open.spotify.com/user/1282060363


### Collect All User Saved Tracks

In [7]:
def get_current_saved(var = None):
    """Collect every track saved in the signed-in user's library.

    Pages through the saved-tracks endpoint with the module-level ``sp`` client
    until the api reports there is no further page.

    Parameters
    ----------
    var : unused
        Ignored; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row per saved track with the columns ``track``, ``artist``,
        ``track_id``, ``artist_id``, ``release_date`` and ``popularity_song``.
        The frame is empty (same columns) when nothing is saved.
    """

    columns = ['track', 'artist', 'track_id', 'artist_id', 'release_date', 'popularity_song']

    # the endpoint accepts up to 50 tracks per request; the default of 20 needs 2.5x more round trips
    page_size = 50

    rows = []
    offset = 0

    while True:

        # offset equals the index of the first song being pulled on this page
        results = sp.current_user_saved_tracks(limit = page_size, offset = offset)
        items = results['items']

        for item in items:
            track = item['track']
            # only the first listed artist is kept for each track
            lead_artist = track['artists'][0]
            rows.append({'track': track['name'],
                         'artist': lead_artist['name'],
                         'track_id': track['id'],
                         'artist_id': lead_artist['id'],
                         'release_date': track['album']['release_date'],
                         'popularity_song': track['popularity']})

        # stop on an empty page or when the paging object has no 'next' link,
        # instead of relying on a fixed 10,000 track ceiling
        if not items or results.get('next') is None:
            break

        offset += len(items)

    # build the dataframe once rather than re-concatenating it for every page
    return pd.DataFrame(rows, columns = columns)

### Collect Audio Features of Tracks

In [8]:
def get_audio_features(all_tracks_df):
    """Pull Spotify audio features for every track id in ``all_tracks_df``.

    Parameters
    ----------
    all_tracks_df : pandas.DataFrame
        Must contain a ``track_id`` column.

    Returns
    -------
    pandas.DataFrame
        One row per track the api returned features for, with the columns
        listed in ``keep_features`` (the track id is in the ``id`` column).
        Empty, with the same columns, when there is nothing to return.
    """

    # don't need all information. specifying which keys are kept from the results
    keep_features = ['danceability', 'energy', 'key', 'loudness', 'mode',
                     'speechiness', 'acousticness', 'instrumentalness', 'liveness',
                     'valence', 'tempo', 'id', 'duration_ms', 'time_signature']

    # api can only pull audio features for 100 tracks max per request
    batch_size = 100

    track_ids = list(all_tracks_df['track_id'])
    frames = []

    # stepping by batch_size never issues an empty request, even when the
    # number of tracks is an exact multiple of the batch size
    for start in range(0, len(track_ids), batch_size):

        results = sp.audio_features(tracks = track_ids[start:start + batch_size])

        # the api can return null for a track it has no features for; skip those entries
        found = [x for x in (results or []) if x is not None]

        if found:
            frames.append(pd.DataFrame({feature: [x[feature] for x in found] for feature in keep_features}))

    if not frames:
        return pd.DataFrame(columns = keep_features)

    # single concat with a fresh 0..n-1 index
    return pd.concat(frames, ignore_index = True)

### Collect Track Artist Information

In [9]:
def get_genres(all_tracks_df):
    """Pull popularity and genre tags for every distinct artist in ``all_tracks_df``.

    Parameters
    ----------
    all_tracks_df : pandas.DataFrame
        Must contain an ``artist_id`` column.

    Returns
    -------
    pandas.DataFrame
        One row per artist the api returned, with the columns ``id``,
        ``popularity_artist`` and ``genres``. Empty, with the same columns,
        when there are no artists.
    """

    # don't need all information. specifying which keys are kept from the results
    keep_features = ['id', 'popularity', 'genres']

    # artist endpoint only gives info on 50 artists at a time
    batch_size = 50

    # unique artist ids (first-seen order) so the same information isn't pulled twice
    artists = list(dict.fromkeys(all_tracks_df['artist_id']))

    frames = []

    # stepping by batch_size never issues an empty request, even when the
    # number of artists is an exact multiple of the batch size
    for start in range(0, len(artists), batch_size):

        results = sp.artists(artists = artists[start:start + batch_size])

        # skip null entries so one unknown artist doesn't fail the whole pull
        found = [x for x in results['artists'] if x is not None]

        if found:
            frames.append(pd.DataFrame({feature: [x[feature] for x in found] for feature in keep_features}))

    if frames:
        df = pd.concat(frames, ignore_index = True)
    else:
        df = pd.DataFrame(columns = keep_features)

    # distinguish artist popularity from the track-level popularity column
    df.rename(columns = {'popularity': 'popularity_artist'}, inplace = True)

    return df

### Collect All Tracks & Information Calling Functions Above

In [10]:
def get_user_tracks_w_audio_features(var = None):
    """Build the feature table for the user's saved tracks.

    Combines the saved-track list with per-track audio features (inner join on
    ``track_id``) and per-artist popularity/genres (left join on ``artist_id``),
    tags every row as ``user_library`` and returns the columns in a fixed order.
    ``var`` is not used.
    """

    column_order = ['track', 'artist', 'track_id', 'data_type','popularity_artist', 'popularity_song',
                    'danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness',
                    'liveness', 'valence', 'tempo', 'mode', 'key', 'time_signature', 'release_date', 'genres']

    # saved tracks first; everything else is keyed off this frame
    saved = get_current_saved()

    # audio features joined on track id; tracks without features drop out
    features = get_audio_features(saved).rename(columns = {'id':'track_id'})
    tracks = saved.merge(features, on = 'track_id', how = 'inner')

    # artist popularity and genres joined on artist id; tracks are kept even without a match
    artist_info = get_genres(saved).rename(columns = {'id':'artist_id'})
    tracks = tracks.merge(artist_info, on = 'artist_id', how = 'left')

    # mark these rows as coming from the user's saved library
    tracks['data_type'] = 'user_library'

    return tracks[column_order]

In [11]:
%%time
# build the feature table (audio features + artist popularity/genres) for the user's saved tracks
user_tracks_df = get_user_tracks_w_audio_features()

CPU times: user 887 ms, sys: 78.8 ms, total: 966 ms
Wall time: 25.7 s


In [12]:
user_tracks_df.shape

(1524, 20)

In [13]:
user_tracks_df.head()

Unnamed: 0,track,artist,track_id,data_type,popularity_artist,popularity_song,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,mode,key,time_signature,release_date,genres
0,JustYourSoul - Tchami Remix,Valentino Khan,5NJi42d7VDo8KPVksb7dS8,user_library,64,48.0,0.749,0.846,-3.441,0.0932,0.00403,0.00525,0.0382,0.532,124.935,1,2,4,2020-01-10,"[bass house, brostep, edm, electro house, elec..."
1,Moving Men,Myd,3y4I9VECfNbDXYN2bXh9hV,user_library,58,59.0,0.815,0.661,-8.208,0.0996,0.106,0.266,0.284,0.699,124.991,1,10,3,2020-10-28,"[filter house, french indie pop, french indiet..."
2,Sonate Pacifique,L'Impératrice,49PyCCLOdJi0jHkGyyY2vv,user_library,63,62.0,0.558,0.538,-8.853,0.0327,0.629,0.0198,0.424,0.254,99.992,1,5,4,2014-09-22,"[french indie pop, french indietronica, french..."
3,Casio,Jungle,44ZKnfWEkp7wPs035j4Tua,user_library,66,66.0,0.816,0.492,-7.464,0.0752,0.431,0.0896,0.0995,0.765,115.989,0,10,4,2018-09-14,"[indie soul, uk contemporary r&b]"
4,"Music Is My Hot, Hot Sex",CSS,5hzBRLhHC29XTml7gEEK6o,user_library,46,0.0,0.742,0.723,-5.62,0.0441,0.0137,0.00127,0.061,0.927,100.016,0,8,4,2007-01-22,"[alternative dance, dance-punk, electroclash, ..."


### Collect Album User Would Like to Sort

In [14]:
def _format_album_duration(total_ms):
    # turn a duration in milliseconds into the 'H hr M min' / 'M min S sec' text shown in the web application
    # integer arithmetic avoids float rounding dropping a second (e.g. 58.4 minutes showing 23 sec)
    minutes, seconds = divmod(int(total_ms) // 1000, 60)

    if minutes >= 60:
        hours, minutes = divmod(minutes, 60)
        return f'{hours} hr {minutes} min'

    return f'{minutes} min {seconds} sec'


def get_album_tracks(album_to_search):
    """Look up an album by name and collect its tracks with features.

    Parameters
    ----------
    album_to_search : str
        Search text; the first album returned by the search endpoint is used.

    Returns
    -------
    tuple
        ``(df, album_final, album_dict)`` where ``df`` is the per-track feature
        table in the same column layout used for modeling (``data_type`` is
        ``'album'``), ``album_final`` holds ``track``, ``track_id``,
        ``duration_ms``, ``spotify_link`` and ``preview`` for the final results,
        and ``album_dict`` holds album-level info for the web application.

    Raises
    ------
    IndexError
        If the search returns no album.
    """

    # search for item and return album id
    results = sp.search(q = album_to_search, limit = 1, offset = 0, type = 'album', market = None)
    matches = results['albums']['items']

    if not matches:
        # same exception type as the bare [0] lookup raised before, with a usable message
        raise IndexError(f'no album found on Spotify for search term: {album_to_search!r}')

    album_id = matches[0]['id']

    # use album id on another endpoint to collect album and track information
    results2 = sp.album(album_id)

    # the album response embeds only the first page of tracks; follow 'next' so long albums aren't truncated
    page = results2['tracks']
    track_items = list(page['items'])

    while page.get('next'):
        page = sp.next(page)
        track_items.extend(page['items'])

    df = pd.DataFrame({'track': [t['name'] for t in track_items],
                       'track_id': [t['id'] for t in track_items],
                       'artist': [t['artists'][0]['name'] for t in track_items],
                       'artist_id': [t['artists'][0]['id'] for t in track_items],
                       'spotify_link': [t['external_urls']['spotify'] for t in track_items],
                       'preview': [t['preview_url'] for t in track_items]})

    # prefer the second image in the list (as before); fall back to the first, or None when the album has no artwork
    images = results2['images']

    if len(images) > 1:
        album_image = images[1]['url']
    elif images:
        album_image = images[0]['url']
    else:
        album_image = None

    # creating dictionary to store album info which will be used in the web application
    album_dict = {}

    album_dict['album_name'] = results2['name'] # album title
    album_dict['album_image'] = album_image # album cover
    album_dict['artist_name'] = results2['artists'][0]['name'] # artist name
    album_dict['release_year'] = results2['release_date'][:4] # release year
    album_dict['total_tracks'] = results2['total_tracks'] # total tracks
    album_dict['album_link'] = results2['external_urls']['spotify']
    album_dict['artist_link'] = results2['artists'][0]['external_urls']['spotify']

    # converting total album duration into a special format for web application
    album_dict['album_duration'] = _format_album_duration(sum(t['duration_ms'] for t in track_items))

    # every track gets the album's release date
    df['release_date'] = results2['release_date']

    # collecting popularity from tracks endpoint because it is not available through the album endpoint
    # requested in batches of 50 ids
    track_ids = list(df['track_id'])
    popularity = []

    for start in range(0, len(track_ids), 50):
        results3 = sp.tracks(tracks = track_ids[start:start + 50])
        popularity.extend(t['popularity'] for t in results3['tracks'])

    df['popularity_song'] = popularity

    # collect audio features for album tracks
    df_audio = get_audio_features(df)
    df = pd.merge(df, df_audio.rename(columns = {'id':'track_id'}), on = 'track_id', how = 'inner')

    # collect artist/genre info for album tracks
    df_genres = get_genres(df)
    df = pd.merge(df, df_genres.rename(columns = {'id':'artist_id'}), on = 'artist_id', how = 'left')

    df['data_type'] = 'album'

    # creating album df in specific format for final model results
    album_final = df[['track', 'track_id', 'duration_ms', 'spotify_link', 'preview']]

    # creating another album df which will be used with the user tracks in the actual modeling process
    df = df[['track', 'track_id', 'artist', 'data_type', 'popularity_artist', 'popularity_song',
             'danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness',
             'liveness', 'valence', 'tempo', 'mode', 'key', 'time_signature', 'release_date', 'genres']]

    return df, album_final, album_dict

In [15]:
%%time
# returns, in order: the modeling feature table, the slim table for final results, and the album info dict
album_tracks_df, album_final_model_df, album_info_dict = get_album_tracks('Man on the Moon III: The Chosen')

CPU times: user 44.2 ms, sys: 4.36 ms, total: 48.5 ms
Wall time: 391 ms


In [16]:
album_info_dict

{'album_name': 'Man On The Moon III: The Chosen',
 'album_image': 'https://i.scdn.co/image/ab67616d00001e026f43a625b608f7177caa18c9',
 'artist_name': 'Kid Cudi',
 'release_year': '2020',
 'total_tracks': 18,
 'album_link': 'https://open.spotify.com/album/64nbgEEIcY4g1ElVLONJ0w',
 'artist_link': 'https://open.spotify.com/artist/0fA0VVWsXO9YnASrzqfmYu',
 'album_duration': '58 min 24 sec'}

In [17]:
album_info_dict['album_image']

'https://i.scdn.co/image/ab67616d00001e026f43a625b608f7177caa18c9'

In [18]:
album_tracks_df.shape

(18, 20)

In [19]:
album_tracks_df.head()

Unnamed: 0,track,track_id,artist,data_type,popularity_artist,popularity_song,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,mode,key,time_signature,release_date,genres
0,Beautiful Trip,4IIuCotvqijraSdnVLaFnM,Kid Cudi,album,89,69,0.331,0.513,-15.392,0.632,0.972,0.953,0.882,0.42,133.971,0,11,4,2020-12-11,"[hip hop, ohio hip hop, rap]"
1,Tequila Shots,30KctD1WsHKTIYczXjip5a,Kid Cudi,album,89,83,0.712,0.556,-7.214,0.0531,0.084,5e-05,0.527,0.22,90.494,0,5,4,2020-12-11,"[hip hop, ohio hip hop, rap]"
2,Another Day,6myUpr3GDR80Dg3zqNTmmG,Kid Cudi,album,89,76,0.646,0.758,-7.75,0.0708,0.556,0.00316,0.335,0.0642,172.995,0,4,4,2020-12-11,"[hip hop, ohio hip hop, rap]"
3,She Knows This,1xzUQMiCoY5pdego0pHMeV,Kid Cudi,album,89,78,0.39,0.724,-7.607,0.0889,0.162,0.0,0.393,0.312,165.945,0,6,4,2020-12-11,"[hip hop, ohio hip hop, rap]"
4,Dive,7Hc3YL8oDiAzbiAW32KXrw,Kid Cudi,album,89,73,0.64,0.621,-8.269,0.0679,0.67,0.0,0.36,0.332,123.105,0,9,4,2020-12-11,"[hip hop, ohio hip hop, rap]"


In [20]:
album_final_model_df

Unnamed: 0,track,track_id,duration_ms,spotify_link,preview
0,Beautiful Trip,4IIuCotvqijraSdnVLaFnM,37013,https://open.spotify.com/track/4IIuCotvqijraSd...,https://p.scdn.co/mp3-preview/05edcc06fc085a58...
1,Tequila Shots,30KctD1WsHKTIYczXjip5a,193293,https://open.spotify.com/track/30KctD1WsHKTIYc...,https://p.scdn.co/mp3-preview/eea2485b714dffab...
2,Another Day,6myUpr3GDR80Dg3zqNTmmG,199787,https://open.spotify.com/track/6myUpr3GDR80Dg3...,https://p.scdn.co/mp3-preview/0270d8150fcbd742...
3,She Knows This,1xzUQMiCoY5pdego0pHMeV,216560,https://open.spotify.com/track/1xzUQMiCoY5pdeg...,https://p.scdn.co/mp3-preview/29dc43127abc4fde...
4,Dive,7Hc3YL8oDiAzbiAW32KXrw,148707,https://open.spotify.com/track/7Hc3YL8oDiAzbiA...,https://p.scdn.co/mp3-preview/89e0c1d79d7d17f4...
5,Damaged,2n7Ao4nyESBa5ti8gcAbBt,150853,https://open.spotify.com/track/2n7Ao4nyESBa5ti...,https://p.scdn.co/mp3-preview/393f2cf8af310e83...
6,Heaven On Earth,2koUj1Fn5TKFEkChSmBPIb,201093,https://open.spotify.com/track/2koUj1Fn5TKFEkC...,https://p.scdn.co/mp3-preview/2835c7cd7c80c80c...
7,Show Out (with Skepta & Pop Smoke),5CFJRZRq6sdKKtRwNPWbYv,174960,https://open.spotify.com/track/5CFJRZRq6sdKKtR...,https://p.scdn.co/mp3-preview/7b310f6822a15aea...
8,"Solo Dolo, Pt. III",27oVCAziETRbNuo5A8LNpg,242267,https://open.spotify.com/track/27oVCAziETRbNuo...,https://p.scdn.co/mp3-preview/c461629a7f1acf31...
9,Sad People,4nuAslShoN77tq12fzwjUq,176027,https://open.spotify.com/track/4nuAslShoN77tq1...,https://p.scdn.co/mp3-preview/a2e8a2ffade8b892...


### Clean Genres Column

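Each artist comes back with a list of fine-grained genre labels. The function below collapses those labels into a fixed set of broad genre keywords and language/region groups (using `other` when nothing matches) and replaces the `genres` column with one indicator column per tag.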

In [21]:
def clean_genres(df):
    """Replace the ``genres`` list column with one indicator column per broad tag.

    Each row's genre labels are joined into one string and searched for the
    keywords in ``genres`` and ``languages``. Matching is by substring, so
    'pop' also matches 'electropop'. A row that matches nothing gets the tag
    ``'other'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``genres`` column holding a list of label strings per
        row. A value that is not a list or tuple (e.g. a missing value) is
        treated as having no labels. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``genres`` dropped and one 0/1 column appended for
        every tag that occurs at least once.
    """

    # 'dubstep' and 'rhythm' were previously misspelt and could never match a label
    genres = ['folk', 'gothic', 'emo', 'metal', 'rock', 'punk', 'alternative', 'grunge', 'pop', 'hip hop',
              'country', 'bluegrass', 'swing', 'blues', 'jazz', 'gospel', 'soul', 'piano', 'rhythm', 'reggae',
              'rap', 'r&b', 'edm', 'dubstep', 'techno', 'house', 'trance', 'electro', 'dance', 'disco',
              'classical', 'singer songwriter', 'musical', 'african', 'hawaiian', 'jam band', 'psychedelic']

    languages = {'asian' : ['chinese', 'japanese', 'korean', 'korean pop', 'taiwanese', 'vietnamese', 'malaysian', 'indonesian', 'thai', 'tibetan'],
                 'baltic_slavic' : ['croatian', 'czech', 'latvian', 'polish', 'serbian', 'russian', 'lithuanian', 'ukrainian', 'slovenian', 'bulgarian'],
                 'celtic' : ['irish', 'scottish', 'celtic'],
                 'english' : ['australian', 'uk', 'british', 'canadian'],
                 'germanic' : ['german', 'norwegian', 'swedish', 'dutch', 'icelandic', 'austrian', 'danish', 'belgian'],
                 'indian_pakistani' : ['indian', 'pakistani', 'punjabi', 'hindustani'],
                 'middle_eastern' : ['israeli', 'kurdish', 'hebrew', 'arab', 'turkish'],
                 'romance' : ['spanish','french', 'italian', 'romanian', 'latin', 'portuguese'],
                 'south_american' : ['brazilian', 'venezuelan', 'argentine', 'peruvian', 'chilean'],
                 'uralic' : ['finnish', 'estonian', 'hungarian']}

    genres_list = []

    for labels in df['genres']:

        # rows with no usable label list fall through to 'other' instead of raising
        joined = ' '.join(labels) if isinstance(labels, (list, tuple)) else ''

        tags = [g for g in genres if g in joined]

        # add each language group at most once so the indicator columns stay 0/1
        # (e.g. 'korean' and 'korean pop' both matching used to count 'asian' twice)
        for group, keywords in languages.items():
            if any(keyword in joined for keyword in keywords):
                tags.append(group)

        if not tags:
            tags.append('other')

        genres_list.append(tags)

    # one row per (track, tag), one-hot encoded, then collapsed back to one row per track
    # groupby(level = 0).sum() replaces DataFrame.sum(level = 0), which pandas 2.0 removed
    tag_series = pd.Series(genres_list, index = df.index, dtype = object)
    dummies = pd.get_dummies(tag_series.explode()).groupby(level = 0).sum()

    # build a new frame so the caller's dataframe keeps its original genres column
    return pd.concat([df.drop(columns = ['genres']), dummies], axis = 1)

In [22]:
# stack the user's library rows on top of the album rows and renumber the index from 0
combined_tracks_df = pd.concat([user_tracks_df, album_tracks_df], axis = 0, ignore_index = True)

In [23]:
# swap the raw genres list column for one indicator column per broad genre / language group
combined_tracks_df = clean_genres(combined_tracks_df)

In [24]:
combined_tracks_df.shape

(1542, 51)

### Exporting/Saving Dataframes

In [25]:
# write both tables to the datasets folder without the index column
for frame, csv_path in ((album_final_model_df, '.././datasets/album_to_sort.csv'),
                        (combined_tracks_df, '.././datasets/combined_tracks.csv')):
    frame.to_csv(csv_path, index = False)