# Sortify - GA Capstone

##### Aziz Maredia | DSIR-1019 | 01.27.21

### Problem Statement

I am going to build a music recommender system using the Spotify and Genius APIs. The app will sort an album entered by the user into the order in which the user will most likely enjoy its songs, by analyzing the audio features and lyrics of the user's recently saved tracks, recently played tracks, and top tracks.

### Library/Packages Import

In [1]:
import pandas as pd
import numpy as np
from math import floor

import spotipy
from spotipy.oauth2 import SpotifyOAuth

from PIL import Image
from requests import get

from flask import Flask, render_template, redirect, request, session, make_response,session,redirect

# Do not truncate rows when a dataframe is displayed in the notebook.
pd.set_option('display.max_rows', None)

### API User Authorization

In [2]:
# Spotify application credentials and OAuth settings passed to SpotifyOAuth.
# NOTE(review): the client id and secret are hard-coded in this notebook, so anyone who
# can read the file can use them. Consider rotating them and loading the values from the
# environment instead (spotipy falls back to SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET /
# SPOTIPY_REDIRECT_URI when the arguments are omitted).
cid = '01e5c3c39a334ae78bf5becf053ad2d5'
secret = '3c481468946e43dc9da43ed6b5c16bc8'
# redirect target after login; presumably must match the URI registered for the app - confirm
uri = 'http://localhost:9999/lab/workspaces/auto-c'
# file in which the OAuth token is cached between runs
cache = '.spotipyoauthcache'
# permissions requested from the user: read and modify their saved library
scope = 'user-library-read user-library-modify'

In [3]:
# Build the OAuth manager from the settings defined in the previous cell.
auth = SpotifyOAuth(client_id = cid,
                    client_secret = secret,
                    redirect_uri = uri,
                    cache_path = cache,
                    scope = scope)

# Spotify API client; the data-collection functions in this notebook use it as a global.
sp = spotipy.Spotify(oauth_manager = auth)

In [4]:
# Fetch the authenticated user's profile and keep the display name and public profile URL.
user = sp.current_user()
user_name = user['display_name']
user_profile = user['external_urls']['spotify']

In [5]:
# First whitespace-separated word of the display name (presumably the user's first name).
name = user['display_name'].split()[0]

In [6]:
# Greet the user by the first word of their display name and link to their profile.
print(f'Hello {user_name.split()[0]}! Click the following link to view your Spotify profile: {user_profile}')

Hello Aziz! Click the following link to view your Spoitfy profile: https://open.spotify.com/user/1282060363


### Collect All User Saved Tracks

In [7]:
def get_current_saved(var = None):
    """Collect the tracks saved in the authenticated user's Spotify library.

    Pages through the saved-tracks endpoint with the module-level ``sp`` client
    until an empty page is returned or 10,000 tracks have been requested.

    Parameters
    ----------
    var : optional
        Unused; kept so existing callers that pass an argument keep working.

    Returns
    -------
    pandas.DataFrame
        One row per saved track with the columns ``track``, ``artist``,
        ``track_id``, ``artist_id``, ``release_date`` and ``popularity_song``.
        The frame is empty (but has these columns) when nothing is saved.
    """

    # columns of the returned dataframe, in order
    columns = ['track', 'artist', 'track_id', 'artist_id', 'release_date', 'popularity_song']

    # the endpoint returns at most 50 tracks per call
    page_size = 50

    # same upper bound on the number of tracks requested as before (10,000)
    max_saved = 10000

    # rows are gathered in a plain list and converted once at the end, instead of
    # re-concatenating an ever growing dataframe after every page
    rows = []

    # offset is the index of the first track of the page being pulled
    for offset in range(0, max_saved, page_size):

        results = sp.current_user_saved_tracks(limit = page_size, offset = offset)
        items = results['items']

        # an empty page means every saved track has already been collected
        if not items:
            break

        for item in items:
            track = item['track']
            # only the first listed artist of a track is kept
            first_artist = track['artists'][0]
            rows.append({'track': track['name'],
                         'artist': first_artist['name'],
                         'track_id': track['id'],
                         'artist_id': first_artist['id'],
                         'release_date': track['album']['release_date'],
                         'popularity_song': track['popularity']})

    return pd.DataFrame(rows, columns = columns)

### Collect Audio Features of Tracks

In [8]:
def get_audio_features(all_tracks_df):
    """Fetch Spotify audio features for every track id in ``all_tracks_df``.

    Uses the module-level ``sp`` client and requests the ids in batches,
    because the endpoint accepts at most 100 track ids per call.

    Parameters
    ----------
    all_tracks_df : pandas.DataFrame
        Must contain a ``track_id`` column. Missing (null) ids are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per track the API returned features for, with the columns
        listed in ``keep_features`` (``id`` holds the track id). Tracks the API
        has no features for are left out. The frame is empty (but has these
        columns) when there are no ids, and no API call is made in that case.
    """

    keep_features = ['danceability', 'energy', 'key', 'loudness', 'mode',
                     'speechiness', 'acousticness', 'instrumentalness', 'liveness',
                     'valence', 'tempo', 'id', 'duration_ms', 'time_signature']

    # api can only pull audio features for 100 tracks max
    batch_size = 100

    track_ids = list(all_tracks_df['track_id'].dropna())

    rows = []

    # stepping over the id list directly never produces an empty batch, so no request is
    # sent when there are no ids or when the count is an exact multiple of the batch size
    for start in range(0, len(track_ids), batch_size):

        results = sp.audio_features(tracks = track_ids[start:start + batch_size])

        for track_features in results or []:
            # the api returns None in place of a track it has no audio features for
            if track_features is None:
                continue
            rows.append({feature: track_features[feature] for feature in keep_features})

    return pd.DataFrame(rows, columns = keep_features)

### Collect Track Artist Information

In [9]:
def get_genres(all_tracks_df):
    """Fetch popularity and genres for every distinct artist in ``all_tracks_df``.

    Uses the module-level ``sp`` client and requests the artists in batches of
    50 ids per call.

    Parameters
    ----------
    all_tracks_df : pandas.DataFrame
        Must contain an ``artist_id`` column. Missing (null) ids are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per artist returned by the API with the columns ``id``,
        ``popularity_artist`` and ``genres``. Rows follow the order in which
        the artists first appear in ``all_tracks_df``. The frame is empty (but
        has these columns) when there are no ids, and no API call is made in
        that case.
    """

    keep_features = ['id', 'popularity', 'genres']

    # number of artist ids sent per request
    batch_size = 50

    # de-duplicate while keeping first-appearance order so the output is deterministic
    artists = list(dict.fromkeys(all_tracks_df['artist_id'].dropna()))

    rows = []

    # stepping over the id list directly never produces an empty batch
    for start in range(0, len(artists), batch_size):

        results = sp.artists(artists = artists[start:start + batch_size])

        for artist in results['artists']:
            # skip placeholders for ids the api could not resolve
            if artist is None:
                continue
            rows.append({feature: artist[feature] for feature in keep_features})

    df = pd.DataFrame(rows, columns = keep_features)

    # distinguishes artist popularity from the per-track 'popularity_song' column
    df.rename(columns = {'popularity': 'popularity_artist'}, inplace = True)

    return df

### Collect All Tracks & Information Calling Functions Above

In [10]:
def get_user_tracks_w_audio_features(var = None):
    """Build the user-library dataframe: saved tracks, audio features and artist info.

    The ``var`` argument is not used. Saved tracks are inner-joined with their
    audio features on ``track_id`` and left-joined with the artist information
    on ``artist_id``; every row is labelled ``data_type = 'user_library'``.
    """

    # final column selection and order of the returned dataframe
    output_columns = ['track', 'artist', 'data_type', 'popularity_artist', 'popularity_song',
                      'danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
                      'instrumentalness', 'liveness', 'valence', 'tempo', 'mode', 'key',
                      'time_signature', 'release_date', 'genres']

    saved_df = get_current_saved()

    # both lookups key their rows on 'id'; rename it to the matching join column
    features_df = get_audio_features(saved_df).rename(columns = {'id': 'track_id'})
    artists_df = get_genres(saved_df).rename(columns = {'id': 'artist_id'})

    merged = saved_df.merge(features_df, on = 'track_id', how = 'inner')
    merged = merged.merge(artists_df, on = 'artist_id', how = 'left')

    merged['data_type'] = 'user_library'

    return merged[output_columns]

In [11]:
%%time
# Build the user-library dataframe (saved tracks + audio features + artist info).
user_tracks_df = get_user_tracks_w_audio_features()

CPU times: user 912 ms, sys: 83.3 ms, total: 995 ms
Wall time: 17.2 s


In [12]:
# (rows, columns) of the user-library dataframe
user_tracks_df.shape

(1524, 19)

In [13]:
# Preview the first rows of the user-library dataframe.
user_tracks_df.head()

Unnamed: 0,track,artist,data_type,popularity_artist,popularity_song,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,mode,key,time_signature,release_date,genres
0,JustYourSoul - Tchami Remix,Valentino Khan,user_library,64,48.0,0.749,0.846,-3.441,0.0932,0.00403,0.00525,0.0382,0.532,124.935,1,2,4,2020-01-10,"[bass house, brostep, edm, electro house, elec..."
1,Moving Men,Myd,user_library,57,60.0,0.815,0.661,-8.208,0.0996,0.106,0.266,0.284,0.699,124.991,1,10,3,2020-10-28,"[filter house, french indie pop, french indiet..."
2,Sonate Pacifique,L'Impératrice,user_library,63,63.0,0.558,0.538,-8.853,0.0327,0.629,0.0198,0.424,0.254,99.992,1,5,4,2014-09-22,"[french indie pop, french indietronica, french..."
3,Casio,Jungle,user_library,66,67.0,0.816,0.492,-7.464,0.0752,0.431,0.0896,0.0995,0.765,115.989,0,10,4,2018-09-14,"[indie soul, uk contemporary r&b]"
4,"Music Is My Hot, Hot Sex",CSS,user_library,46,0.0,0.742,0.723,-5.62,0.0441,0.0137,0.00127,0.061,0.927,100.016,0,8,4,2007-01-22,"[alternative dance, dance-punk, electroclash, ..."


### Collect Album User Would Like to Sort

In [14]:
def _format_album_duration(total_ms):
    # Render a duration in milliseconds as 'H hr M min' (one hour or longer) or 'M min S sec'.
    # Integer arithmetic avoids the float rounding that could drop a second.
    minutes, seconds = divmod(int(total_ms) // 1000, 60)
    if minutes >= 60:
        hours, minutes = divmod(minutes, 60)
        return f'{hours} hr {minutes} min'
    return f'{minutes} min {seconds} sec'


def get_album_tracks(album_to_search):
    """Look up an album on Spotify and collect its tracks and summary information.

    Uses the module-level ``sp`` client. The first album returned by the search
    is used. Track rows are enriched with popularity, audio features (inner join
    on ``track_id``) and artist information (left join on ``artist_id``).

    Parameters
    ----------
    album_to_search : str
        Search query for the album.

    Returns
    -------
    tuple
        ``(df, album_dict)`` where ``df`` has one row per album track, labelled
        ``data_type = 'album'``, and ``album_dict`` holds ``album_name``,
        ``artist_name``, ``release_year``, ``total_tracks``, ``album_link``,
        ``artist_link``, ``album_image`` (a 300x300 PIL image) and
        ``album_duration`` (formatted string).

    Raises
    ------
    IndexError
        If the search returns no album.
    """

    results = sp.search(q = album_to_search, limit = 1, offset = 0, type = 'album', market = None)
    matches = results['albums']['items']

    # same exception type as indexing the empty result list, but with a usable message
    if not matches:
        raise IndexError(f'No album found on Spotify for search term: {album_to_search!r}')

    album = sp.album(matches[0]['id'])

    # NOTE(review): only the tracks embedded in the album response are used; presumably
    # that is the first page only, so very long albums may be truncated - confirm
    tracks = album['tracks']['items']

    # only the first listed artist of each track is kept
    df = pd.DataFrame({'track': [t['name'] for t in tracks],
                       'track_id': [t['id'] for t in tracks],
                       'artist': [t['artists'][0]['name'] for t in tracks],
                       'artist_id': [t['artists'][0]['id'] for t in tracks],
                       'spotify_link': [t['external_urls']['spotify'] for t in tracks],
                       'preview': [t['preview_url'] for t in tracks]})

    album_artist = album['artists'][0]

    album_dict = {}
    album_dict['album_name'] = album['name'] # album title
    album_dict['artist_name'] = album_artist['name'] # artist name
    album_dict['release_year'] = album['release_date'][:4] # release_year
    album_dict['total_tracks'] = album['total_tracks'] # total tracks
    album_dict['album_link'] = album['external_urls']['spotify']
    album_dict['artist_link'] = album_artist['external_urls']['spotify']

    # second image in the list; presumably the mid-sized cover rendition - confirm
    image_url = album['images'][1]['url']

    # resize() reads the whole image, so the connection can be released right afterwards
    with get(image_url, stream = True) as response:
        album_dict['album_image'] = Image.open(response.raw).resize((300, 300))

    album_dict['album_duration'] = _format_album_duration(sum(t['duration_ms'] for t in tracks))

    df['release_date'] = album['release_date']

    # the album's track objects carry no popularity, so the full track objects are fetched
    track_details = sp.tracks(tracks = list(df['track_id']))
    df['popularity_song'] = [t['popularity'] for t in track_details['tracks']]

    df_audio = get_audio_features(df)
    df = pd.merge(df, df_audio.rename(columns = {'id':'track_id'}), on = 'track_id', how = 'inner')

    df_genres = get_genres(df)
    df = pd.merge(df, df_genres.rename(columns = {'id':'artist_id'}), on = 'artist_id', how = 'left')

    df['data_type'] = 'album'

    df = df[['track', 'artist', 'track_id', 'spotify_link', 'preview', 'data_type', 'duration_ms', 'popularity_artist',
             'popularity_song', 'danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
             'instrumentalness', 'liveness', 'valence', 'tempo', 'mode', 'key', 'time_signature', 'release_date', 'genres']]

    return df, album_dict

In [15]:
%%time
# Fetch the album to sort: per-track dataframe plus a dict of album-level information.
album_tracks_df, album_info_dict = get_album_tracks('Man on the Moon III: The Chosen')

CPU times: user 80.1 ms, sys: 14.9 ms, total: 95 ms
Wall time: 736 ms


In [16]:
# Album-level information returned alongside the track dataframe.
album_info_dict

{'album_name': 'Man On The Moon III: The Chosen',
 'artist_name': 'Kid Cudi',
 'release_year': '2020',
 'total_tracks': 18,
 'album_link': 'https://open.spotify.com/album/64nbgEEIcY4g1ElVLONJ0w',
 'artist_link': 'https://open.spotify.com/artist/0fA0VVWsXO9YnASrzqfmYu',
 'album_image': <PIL.Image.Image image mode=RGB size=300x300 at 0x7FC48EC76550>,
 'album_duration': '58 min 24 sec'}

In [17]:
# (rows, columns) of the album track dataframe
album_tracks_df.shape

(18, 23)

In [18]:
# Show up to the first 50 album tracks.
album_tracks_df.head(50)

Unnamed: 0,track,artist,track_id,spotify_link,preview,data_type,duration_ms,popularity_artist,popularity_song,danceability,...,acousticness,instrumentalness,liveness,valence,tempo,mode,key,time_signature,release_date,genres
0,Beautiful Trip,Kid Cudi,4IIuCotvqijraSdnVLaFnM,https://open.spotify.com/track/4IIuCotvqijraSd...,https://p.scdn.co/mp3-preview/05edcc06fc085a58...,album,37013,90,70,0.331,...,0.972,0.953,0.882,0.42,133.971,0,11,4,2020-12-11,"[hip hop, ohio hip hop, rap]"
1,Tequila Shots,Kid Cudi,30KctD1WsHKTIYczXjip5a,https://open.spotify.com/track/30KctD1WsHKTIYc...,https://p.scdn.co/mp3-preview/eea2485b714dffab...,album,193293,90,84,0.712,...,0.084,5e-05,0.527,0.22,90.494,0,5,4,2020-12-11,"[hip hop, ohio hip hop, rap]"
2,Another Day,Kid Cudi,6myUpr3GDR80Dg3zqNTmmG,https://open.spotify.com/track/6myUpr3GDR80Dg3...,https://p.scdn.co/mp3-preview/0270d8150fcbd742...,album,199787,90,77,0.646,...,0.556,0.00316,0.335,0.0642,172.995,0,4,4,2020-12-11,"[hip hop, ohio hip hop, rap]"
3,She Knows This,Kid Cudi,1xzUQMiCoY5pdego0pHMeV,https://open.spotify.com/track/1xzUQMiCoY5pdeg...,https://p.scdn.co/mp3-preview/29dc43127abc4fde...,album,216560,90,79,0.39,...,0.162,0.0,0.393,0.312,165.945,0,6,4,2020-12-11,"[hip hop, ohio hip hop, rap]"
4,Dive,Kid Cudi,7Hc3YL8oDiAzbiAW32KXrw,https://open.spotify.com/track/7Hc3YL8oDiAzbiA...,https://p.scdn.co/mp3-preview/89e0c1d79d7d17f4...,album,148707,90,75,0.64,...,0.67,0.0,0.36,0.332,123.105,0,9,4,2020-12-11,"[hip hop, ohio hip hop, rap]"
5,Damaged,Kid Cudi,2n7Ao4nyESBa5ti8gcAbBt,https://open.spotify.com/track/2n7Ao4nyESBa5ti...,https://p.scdn.co/mp3-preview/393f2cf8af310e83...,album,150853,90,73,0.625,...,0.553,0.00111,0.254,0.267,142.04,1,8,4,2020-12-11,"[hip hop, ohio hip hop, rap]"
6,Heaven On Earth,Kid Cudi,2koUj1Fn5TKFEkChSmBPIb,https://open.spotify.com/track/2koUj1Fn5TKFEkC...,https://p.scdn.co/mp3-preview/2835c7cd7c80c80c...,album,201093,90,74,0.786,...,0.379,0.00082,0.0765,0.228,130.036,1,0,4,2020-12-11,"[hip hop, ohio hip hop, rap]"
7,Show Out (with Skepta & Pop Smoke),Kid Cudi,5CFJRZRq6sdKKtRwNPWbYv,https://open.spotify.com/track/5CFJRZRq6sdKKtR...,https://p.scdn.co/mp3-preview/7b310f6822a15aea...,album,174960,90,84,0.619,...,0.43,0.000392,0.378,0.158,143.896,1,1,4,2020-12-11,"[hip hop, ohio hip hop, rap]"
8,"Solo Dolo, Pt. III",Kid Cudi,27oVCAziETRbNuo5A8LNpg,https://open.spotify.com/track/27oVCAziETRbNuo...,https://p.scdn.co/mp3-preview/c461629a7f1acf31...,album,242267,90,75,0.589,...,0.239,0.0277,0.164,0.52,152.058,0,10,4,2020-12-11,"[hip hop, ohio hip hop, rap]"
9,Sad People,Kid Cudi,4nuAslShoN77tq12fzwjUq,https://open.spotify.com/track/4nuAslShoN77tq1...,https://p.scdn.co/mp3-preview/a2e8a2ffade8b892...,album,176027,90,75,0.636,...,0.247,0.00123,0.113,0.416,158.073,0,5,4,2020-12-11,"[hip hop, ohio hip hop, rap]"


In [19]:
# Keep only the track name, duration, id and links for the album being sorted.
album_final_model_df = album_tracks_df[['track', 'duration_ms', 'track_id', 'spotify_link', 'preview']]

In [20]:
# Save the album track list without the index column; the path is relative to the
# notebook's working directory.
album_final_model_df.to_csv('.././datasets/album_to_sort.csv', index = False)

In [21]:
# Display the album track list that was written to disk.
album_final_model_df

Unnamed: 0,track,duration_ms,track_id,spotify_link,preview
0,Beautiful Trip,37013,4IIuCotvqijraSdnVLaFnM,https://open.spotify.com/track/4IIuCotvqijraSd...,https://p.scdn.co/mp3-preview/05edcc06fc085a58...
1,Tequila Shots,193293,30KctD1WsHKTIYczXjip5a,https://open.spotify.com/track/30KctD1WsHKTIYc...,https://p.scdn.co/mp3-preview/eea2485b714dffab...
2,Another Day,199787,6myUpr3GDR80Dg3zqNTmmG,https://open.spotify.com/track/6myUpr3GDR80Dg3...,https://p.scdn.co/mp3-preview/0270d8150fcbd742...
3,She Knows This,216560,1xzUQMiCoY5pdego0pHMeV,https://open.spotify.com/track/1xzUQMiCoY5pdeg...,https://p.scdn.co/mp3-preview/29dc43127abc4fde...
4,Dive,148707,7Hc3YL8oDiAzbiAW32KXrw,https://open.spotify.com/track/7Hc3YL8oDiAzbiA...,https://p.scdn.co/mp3-preview/89e0c1d79d7d17f4...
5,Damaged,150853,2n7Ao4nyESBa5ti8gcAbBt,https://open.spotify.com/track/2n7Ao4nyESBa5ti...,https://p.scdn.co/mp3-preview/393f2cf8af310e83...
6,Heaven On Earth,201093,2koUj1Fn5TKFEkChSmBPIb,https://open.spotify.com/track/2koUj1Fn5TKFEkC...,https://p.scdn.co/mp3-preview/2835c7cd7c80c80c...
7,Show Out (with Skepta & Pop Smoke),174960,5CFJRZRq6sdKKtRwNPWbYv,https://open.spotify.com/track/5CFJRZRq6sdKKtR...,https://p.scdn.co/mp3-preview/7b310f6822a15aea...
8,"Solo Dolo, Pt. III",242267,27oVCAziETRbNuo5A8LNpg,https://open.spotify.com/track/27oVCAziETRbNuo...,https://p.scdn.co/mp3-preview/c461629a7f1acf31...
9,Sad People,176027,4nuAslShoN77tq12fzwjUq,https://open.spotify.com/track/4nuAslShoN77tq1...,https://p.scdn.co/mp3-preview/a2e8a2ffade8b892...


### Clean Genres Column

In [22]:
def clean_genres(df):
    """Replace the raw ``genres`` column with one indicator column per broad genre label.

    Each row's genre strings are joined into one text and searched for the
    keywords in ``genres`` and for the language/region terms in ``languages``.
    Matching is by substring, so e.g. 'pop' also matches 'electropop'. A matching
    genre keyword adds that keyword as a label; a matching language term adds the
    name of its group, at most once per row. Rows that match nothing, or that have
    no genre list at all (e.g. NaN after a left merge), get the label 'other'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``genres`` column whose values are lists of genre strings.
        The dataframe passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``genres`` plus one 0/1 integer column per label that
        occurs in the data, in alphabetical order.
    """

    # NOTE: 'rhythm' and 'dubstep' were previously misspelled ('rythm', 'dupstep') and
    # therefore could never match a genre string
    genres = ['folk', 'gothic', 'emo', 'metal', 'rock', 'punk', 'alternative', 'grunge', 'pop', 'hip hop',
              'country', 'bluegrass', 'swing', 'blues', 'jazz', 'gospel', 'soul', 'piano', 'rhythm', 'reggae',
              'rap', 'r&b', 'edm', 'dubstep', 'techno', 'house', 'trance', 'electro', 'dance', 'disco',
              'classical', 'singer songwriter', 'musical', 'african', 'hawaiian', 'jam band', 'psychedelic']

    languages = {'asian' : ['chinese', 'japanese', 'korean', 'korean pop', 'taiwanese', 'vietnamese', 'malaysian', 'indonesian', 'thai', 'tibetan'],
                 'baltic_slavic' : ['croatian', 'czech', 'latvian', 'polish', 'serbian', 'russian', 'lithuanian', 'ukrainian', 'slovenian', 'bulgarian'],
                 'celtic' : ['irish', 'scottish', 'celtic'],
                 'english' : ['australian', 'uk', 'british', 'canadian'],
                 'germanic' : ['german', 'norwegian', 'swedish', 'dutch', 'icelandic', 'austrian', 'danish', 'belgian'],
                 'indian_pakistani' : ['indian', 'pakistani', 'punjabi', 'hindustani'],
                 'middle_eastern' : ['israeli', 'kurdish', 'hebrew', 'arab', 'turkish'],
                 'romance' : ['spanish','french', 'italian', 'romanian', 'latin', 'portuguese'],
                 'south_american' : ['brazilian', 'venezuelan', 'argentine', 'peruvian', 'chilean'],
                 'uralic' : ['finnish', 'estonian', 'hungarian']}

    genres_list = []

    for raw_genres in df['genres']:

        # anything that is not a list of strings (e.g. NaN) is treated as "no genres"
        if isinstance(raw_genres, (list, tuple)):
            joined = ' '.join(raw_genres)
        else:
            joined = ''

        labels = [g for g in genres if g in joined]

        # add each language group once, however many of its terms match, so the
        # resulting indicator columns stay 0/1
        for group, terms in languages.items():
            if any(term in joined for term in terms):
                labels.append(group)

        if not labels:
            labels.append('other')

        genres_list.append(labels)

    # explode on a positional index so rows are matched by position; every row has at
    # least one label, so each position survives the groupby
    labels_by_row = pd.Series(genres_list, dtype = object)
    dummies = pd.get_dummies(labels_by_row.explode(), dtype = int).groupby(level = 0).sum()

    # realign with the input rows (works for any input index)
    dummies.index = df.index

    return pd.concat([df.drop(columns = ['genres']), dummies], axis = 1)

In [23]:
# List the user-library columns; the album frame is cut down to the same set before stacking.
user_tracks_df.columns

Index(['track', 'artist', 'data_type', 'popularity_artist', 'popularity_song',
       'danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'mode', 'key',
       'time_signature', 'release_date', 'genres'],
      dtype='object')

In [24]:
# Select from the album frame the same columns, in the same order, as user_tracks_df
# so the two frames can be stacked row-wise.
album_concat = album_tracks_df[['track', 'artist', 'data_type', 'popularity_artist', 'popularity_song', 
                 'danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness',
                'liveness', 'valence', 'tempo', 'mode', 'key', 'time_signature', 'release_date', 'genres']]

In [25]:
# Stack the user-library rows on top of the album rows; the 'data_type' column tells them apart.
combined_tracks_df = pd.concat([user_tracks_df, album_concat], axis = 0)
# Renumber the rows 0..n-1 so the index is unique after stacking.
combined_tracks_df.reset_index(drop = True, inplace = True)

In [26]:
# Replace the raw 'genres' lists with one indicator column per genre / language-group label.
combined_tracks_df = clean_genres(combined_tracks_df)

In [27]:
# (rows, columns) after the genre columns were added
combined_tracks_df.shape

(1542, 50)

In [28]:
# Save the combined dataset without the index column; the path is relative to the
# notebook's working directory.
combined_tracks_df.to_csv('.././datasets/combined_tracks.csv', index = False)