# Sortify - GA Capstone

##### Aziz Maredia | DSIR-1019 | 01.27.21

### Problem Statement

I am going to build a music recommender system using the Spotify and Genius APIs. The app will sort the tracks of an album entered by the user in the order the user is most likely to enjoy them, by analyzing the audio features and lyrics of the user's recently saved tracks, recently played tracks, and top tracks.

### Library/Packages Import

In [135]:
import pandas as pd
import numpy as np
from math import floor, ceil

import spotipy
from spotipy.oauth2 import SpotifyOAuth

from time import strftime, gmtime

from PIL import Image
import requests

pd.set_option('display.max_rows', None)

### API User Authorization

In [136]:
import os

# SECURITY NOTE(review): the client id and secret below are committed in plain
# text. They should be rotated and supplied through the environment; the
# literals are kept only as a backward-compatible fallback. The environment
# variable names are the ones spotipy itself recognises.
cid = os.environ.get('SPOTIPY_CLIENT_ID') or '01e5c3c39a334ae78bf5becf053ad2d5'
secret = os.environ.get('SPOTIPY_CLIENT_SECRET') or '3c481468946e43dc9da43ed6b5c16bc8'
uri = os.environ.get('SPOTIPY_REDIRECT_URI') or 'http://localhost:9999/lab/workspaces/auto-c'

# File in which the OAuth token is cached between runs.
cache = '.spotipyoauthcache'

# Space-separated OAuth scopes requested from the user:
#   'user-read-recently-played' --> Get Current User's Recently Played Tracks
#   'user-library-read'         --> Get a User's Saved Tracks
#   'user-top-read'             --> Get a User's Top Artists and Tracks
scope = 'user-read-recently-played user-library-read user-top-read'

In [137]:
# OAuth manager built from the app credentials, redirect URI, requested scopes
# and the on-disk token cache configured above.
auth = SpotifyOAuth(
    client_id = cid,
    client_secret = secret,
    redirect_uri = uri,
    scope = scope,
    cache_path = cache,
)

# Authenticated API client used by every helper in this notebook.
sp = spotipy.Spotify(oauth_manager = auth)

In [138]:
# Profile of the authenticated user: keep the display name and the public
# profile link for the greeting below.
user = sp.current_user()
user_name, user_profile = user['display_name'], user['external_urls']['spotify']

In [139]:
# Greet the user by first name. Guard against a profile without a usable
# display name (None or blank), which would otherwise fail on `.split()[0]`.
name_parts = (user_name or '').split()
first_name = name_parts[0] if name_parts else 'there'
print(f'Hello {first_name}! Click the following link to view your Spotify profile: {user_profile}')

Hello Aziz! Click the following link to view your Spoitfy profile: https://open.spotify.com/user/1282060363


### User Top Tracks

In [140]:
def get_top_tracks(var = None):
    """Collect the current user's top tracks into a single DataFrame.

    Three pages are requested (offsets 0, 20 and 40) with the client's default
    page size, and stacked in request order under a fresh 0..n-1 index.

    Parameters
    ----------
    var : unused; kept so existing callers can still pass an argument.

    Returns
    -------
    pandas.DataFrame with one row per track and the columns 'track', 'artist',
    'album', 'track_id', 'artist_id', 'spotify_link', 'preview',
    'album_image_large', 'album_image_small', 'release_date', 'popularity_song'.
    """
    # Output column -> how to read it from one track object of the response.
    # Only the first listed artist is kept; images[1] / images[2] are used as
    # the "large" / "small" album artwork.
    extractors = {
        'track': lambda t: t['name'],
        'artist': lambda t: t['artists'][0]['name'],
        'album': lambda t: t['album']['name'],
        'track_id': lambda t: t['id'],
        'artist_id': lambda t: t['artists'][0]['id'],
        'spotify_link': lambda t: t['external_urls']['spotify'],
        'preview': lambda t: t['preview_url'],
        'album_image_large': lambda t: t['album']['images'][1]['url'],
        'album_image_small': lambda t: t['album']['images'][2]['url'],
        'release_date': lambda t: t['album']['release_date'],
        'popularity_song': lambda t: t['popularity'],
    }

    pages = []

    for offset in (0, 20, 40):
        tracks = sp.current_user_top_tracks(offset = offset)['items']
        pages.append(pd.DataFrame({column: [pick(t) for t in tracks]
                                   for column, pick in extractors.items()}))

    return pd.concat(pages, ignore_index = True)

### User Current Saved

In [141]:
def get_current_saved(var = None):
    """Return every track in the current user's saved library as a DataFrame.

    Pages through the saved-tracks endpoint until the response reports no
    further page, so the whole library is read, however large it is.

    Parameters
    ----------
    var : unused; kept so existing callers can still pass an argument.

    Returns
    -------
    pandas.DataFrame with one row per saved track, a 0..n-1 index and the
    columns 'track', 'artist', 'track_id', 'artist_id', 'release_date',
    'popularity_song'. Album name, links, preview and artwork are deliberately
    not collected here. An empty library yields an empty frame with the same
    columns.
    """
    columns = ['track', 'artist', 'track_id', 'artist_id', 'release_date', 'popularity_song']
    page_size = 50  # largest page the saved-tracks endpoint accepts

    rows = []
    offset = 0

    while True:
        results = sp.current_user_saved_tracks(limit = page_size, offset = offset)
        items = results['items']

        for item in items:
            track = item['track']
            rows.append({'track': track['name'],
                         'artist': track['artists'][0]['name'],   # first listed artist only
                         'track_id': track['id'],
                         'artist_id': track['artists'][0]['id'],
                         'release_date': track['album']['release_date'],
                         'popularity_song': track['popularity']})

        # The paging object carries the URL of the next page, or None on the
        # last one; an empty page is treated as the end as well.
        if not items or not results.get('next'):
            break

        offset += len(items)

    # Build the frame once instead of concatenating page by page.
    return pd.DataFrame(rows, columns = columns)

### Get Tracks w/ Audio Features

In [142]:
def get_audio_features(all_tracks_df):
    """Fetch Spotify audio features for every track id in ``all_tracks_df``.

    Parameters
    ----------
    all_tracks_df : pandas.DataFrame with a 'track_id' column.

    Returns
    -------
    pandas.DataFrame with a 0..n-1 index and the columns listed in
    ``keep_features`` ('id' holds the track id). Tracks for which the API
    returns no feature object are omitted. An input without track ids yields
    an empty frame with the same columns and triggers no request.
    """
    keep_features = ['danceability', 'energy', 'key', 'loudness', 'mode',
                     'speechiness', 'acousticness', 'instrumentalness', 'liveness',
                     'valence', 'tempo', 'id', 'duration_ms', 'time_signature']
    batch_size = 100  # ids sent per audio-features request

    track_ids = list(all_tracks_df['track_id'].dropna())
    rows = []

    # Stepping by batch_size never produces an empty batch, so no request is
    # sent with an empty id list when the count is an exact multiple of 100.
    for start in range(0, len(track_ids), batch_size):
        results = sp.audio_features(tracks = track_ids[start:start + batch_size])

        # Entries can be None when a track has no audio analysis; skip them
        # instead of failing on the subscript.
        rows.extend({feature: item[feature] for feature in keep_features}
                    for item in (results or []) if item is not None)

    return pd.DataFrame(rows, columns = keep_features)

### Get Track Genres

In [143]:
def get_genres(all_tracks_df):
    """Fetch popularity and genre tags for every distinct artist in ``all_tracks_df``.

    Parameters
    ----------
    all_tracks_df : pandas.DataFrame with an 'artist_id' column.

    Returns
    -------
    pandas.DataFrame with a 0..n-1 index and the columns 'id' (artist id),
    'popularity_artist' and 'genres' (list of genre strings), one row per
    distinct artist in order of first appearance. Artists the API returns no
    object for are omitted. An input without artist ids yields an empty frame
    with the same columns and triggers no request.
    """
    keep_features = ['id', 'popularity', 'genres']
    batch_size = 50  # ids sent per artists request

    # De-duplicate while keeping first-seen order so the output is deterministic.
    artists = list(dict.fromkeys(all_tracks_df['artist_id'].dropna()))
    rows = []

    # Stepping by batch_size never produces an empty batch, so no request is
    # sent with an empty id list when the count is an exact multiple of 50.
    for start in range(0, len(artists), batch_size):
        results = sp.artists(artists = artists[start:start + batch_size])

        # Skip None entries instead of failing on the subscript.
        rows.extend({feature: artist[feature] for feature in keep_features}
                    for artist in results['artists'] if artist is not None)

    df = pd.DataFrame(rows, columns = keep_features)

    # Renamed to avoid clashing with the per-track popularity column.
    return df.rename(columns = {'popularity': 'popularity_artist'})

### Get Tracks

In [189]:
def get_user_tracks_w_audio_features(var = None):
    """Assemble the user's library dataset: tracks + audio features + artist info.

    Parameters
    ----------
    var : unused; kept so existing callers can still pass an argument.

    Returns
    -------
    pandas.DataFrame restricted to ``output_columns``, with 'data_type' set to
    'user_library' on every row. Tracks without audio features are dropped
    (inner join); tracks without artist info are kept (left join).
    """
    # Album name, Spotify link, preview and artwork columns are not part of
    # this dataset.
    output_columns = ['track', 'artist', 'track_id', 'artist_id',
                      'data_type', 'duration_ms', 'popularity_artist', 'popularity_song',
                      'danceability', 'energy', 'loudness', 'mode', 'speechiness',
                      'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
                      'key', 'time_signature', 'release_date', 'genres']

    # Only the saved library is used; top tracks are currently not merged in.
    saved_tracks = get_current_saved()

    audio_features = get_audio_features(saved_tracks).rename(columns = {'id': 'track_id'})
    with_features = saved_tracks.merge(audio_features, on = 'track_id', how = 'inner')

    artist_info = get_genres(saved_tracks).rename(columns = {'id': 'artist_id'})
    combined = with_features.merge(artist_info, on = 'artist_id', how = 'left')

    combined['data_type'] = 'user_library'

    return combined[output_columns]

In [190]:
%%time
# Build the saved-library dataset (tracks joined with audio features and artist info).
user_tracks_df = get_user_tracks_w_audio_features()

CPU times: user 936 ms, sys: 75.8 ms, total: 1.01 s
Wall time: 15.5 s


In [191]:
# (rows, columns) of the library dataset.
user_tracks_df.shape

(1542, 22)

In [192]:
# Preview the first rows of the library dataset.
user_tracks_df.head()

Unnamed: 0,track,artist,track_id,artist_id,data_type,duration_ms,popularity_artist,popularity_song,danceability,energy,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,key,time_signature,release_date,genres
0,Moving Men,Myd,3y4I9VECfNbDXYN2bXh9hV,3QFiymmbJlVBPpnrOatEAk,user_library,167787,56,62.0,0.815,0.661,...,0.0996,0.106,0.266,0.284,0.699,124.991,10,3,2020-10-28,"[filter house, french indie pop, french indiet..."
1,Sonate Pacifique,L'Impératrice,49PyCCLOdJi0jHkGyyY2vv,4PwlsrN0t5mLN0C827cbEU,user_library,348945,62,65.0,0.558,0.538,...,0.0327,0.629,0.0198,0.424,0.254,99.992,5,4,2014-09-22,"[french indie pop, french indietronica, french..."
2,Casio,Jungle,44ZKnfWEkp7wPs035j4Tua,59oA5WbbQvomJz2BuRG071,user_library,234370,65,69.0,0.816,0.492,...,0.0752,0.431,0.0896,0.0995,0.765,115.989,10,4,2018-09-14,"[indie soul, uk contemporary r&b]"
3,"Music Is My Hot, Hot Sex",CSS,5hzBRLhHC29XTml7gEEK6o,2K13AVg3bFpHSxDM1vJ0qA,user_library,187027,45,0.0,0.742,0.723,...,0.0441,0.0137,0.00127,0.061,0.927,100.016,8,4,2007-01-22,"[alternative dance, dance-punk, electroclash, ..."
4,Nanã,Polo & Pan,0Psz3az3RIYfJpnsajBT8N,45yEuthJ9yq1rNXAOpBnqM,user_library,190827,67,70.0,0.685,0.735,...,0.0334,0.191,0.038,0.188,0.385,94.035,4,3,2017-05-19,"[dark disco, electro-pop francais, french indi..."


In [193]:
# Keywords searched for (as substrings) in each track's artist genre strings;
# every keyword that matches becomes a genre tag for the track.
# 'rhythm' and 'dubstep' were previously misspelled ('rythm', 'dupstep') and
# therefore could never match.
# NOTE(review): 'singer songwriter' only matches if the genre strings spell it
# with a space rather than a hyphen -- confirm against the data.
genres = ['folk', 'gothic', 'emo', 'metal', 'rock', 'punk', 'alternative', 'grunge', 'pop', 'hip hop',
'country', 'bluegrass', 'swing', 'blues', 'jazz', 'gospel', 'soul', 'piano', 'rhythm', 'reggae',
'rap', 'r&b', 'edm', 'dubstep', 'techno', 'house', 'trance', 'electro', 'dance', 'disco',
'classical', 'urban', 'modern', 'traditional', 'progressive', 'contemporary', 'industrial',
'experimental', 'tropical', 'ambient', 'melodic', 'underground', 'singer songwriter', 'musical',
'christian', 'islamic', 'african', 'hawaiian']

In [194]:
# Language/region group tag -> keywords searched for (as substrings) in each
# track's artist genre strings. A match on any keyword tags the track with
# the group name.
languages = {
    'asian': [
        'chinese', 'japanese', 'korean', 'korean pop', 'taiwanese',
        'vietnamese', 'malaysian', 'indonesian', 'thai', 'tibetan',
    ],
    'baltic_slavic': [
        'croatian', 'czech', 'latvian', 'polish', 'serbian',
        'russian', 'lithuanian', 'ukrainian', 'slovenian', 'bulgarian',
    ],
    'celtic': ['irish', 'scottish', 'celtic'],
    'english': ['australian', 'uk', 'british', 'canadian'],
    'germanic': [
        'german', 'norwegian', 'swedish', 'dutch',
        'icelandic', 'austrian', 'danish', 'belgian',
    ],
    'indian_pakistani': ['indian', 'pakistani', 'punjabi', 'hindustani'],
    'middle_eastern': ['israeli', 'kurdish', 'hebrew', 'arab', 'turkish'],
    'romance': ['spanish', 'french', 'italian', 'romanian', 'latin', 'portuguese'],
    'south_american': ['brazilian', 'venezuelan', 'argentine', 'peruvian', 'chilean'],
    'uralic': ['finnish', 'estonian', 'hungarian'],
}

# Not assigned to any group yet: 'greek', 'albanian', 'armenian'

In [195]:
# Collapse each track's raw artist genre strings into a short list of tags:
# every keyword from `genres` found in the strings, followed by every
# language group from `languages` with at least one matching keyword, or
# ['other'] when nothing matches.
genre_lists = []

for raw_genres in user_tracks_df['genres']:

    # Rows without artist info carry NaN after the left merge rather than a
    # list; treat them as having no genre text.
    joined = ' '.join(raw_genres) if isinstance(raw_genres, (list, tuple)) else ''

    tags = [keyword for keyword in genres if keyword in joined]

    for group, keywords in languages.items():
        # Add each group at most once, even when several of its keywords
        # match (e.g. 'korean' and 'korean pop').
        if any(keyword in joined for keyword in keywords):
            tags.append(group)

    genre_lists.append(tags or ['other'])

In [196]:
# Replace the raw artist genre strings with the tag lists built above.
user_tracks_df['genres'] = genre_lists

### Get Album

In [197]:
def _format_album_duration(total_ms):
    """Render a millisecond total as 'H hr M min' (one hour or more) or 'M min S sec'."""
    # Integer arithmetic avoids the float rounding that could drop a second.
    minutes, seconds = divmod(int(total_ms) // 1000, 60)

    if minutes >= 60:
        hours, minutes = divmod(minutes, 60)
        return f'{hours} hr {minutes} min'

    return f'{minutes} min {seconds} sec'


def get_album_tracks(album_to_search):
    """Look up an album by search term and return its tracks plus album metadata.

    Parameters
    ----------
    album_to_search : search string; the first album hit is used.

    Returns
    -------
    (df, album_dict)
        df : pandas.DataFrame with one row per album track, carrying track and
            artist fields, audio features, artist popularity/genres and
            'data_type' == 'album'.
        album_dict : dict with the keys 'album_name', 'artist_name',
            'release_year', 'total_tracks', 'album_link', 'album_image'
            (PIL image resized to 300x300) and 'album_duration'.

    Raises
    ------
    IndexError : if the search returns no album.
    """
    # --- Find the album ------------------------------------------------------
    results = sp.search(q = album_to_search, limit = 1, offset = 0, type = 'album', market = None)
    matches = results['albums']['items']

    if not matches:
        raise IndexError(f'No album found on Spotify for search term {album_to_search!r}')

    album_id = matches[0]['id']

    # --- Album tracks and album-level metadata -------------------------------
    album = sp.album(album_id)

    # NOTE(review): only the track items embedded in the album response are
    # used; presumably a very long album would need its track list paged.
    tracks = album['tracks']['items']

    df = pd.DataFrame({
        'track': [t['name'] for t in tracks],
        'artist': [t['artists'][0]['name'] for t in tracks],      # first listed artist only
        'artist_id': [t['artists'][0]['id'] for t in tracks],
        'track_id': [t['id'] for t in tracks],
        'spotify_link': [t['external_urls']['spotify'] for t in tracks],
        'preview': [t['preview_url'] for t in tracks],
    })

    album_dict = {}

    album_dict['album_name'] = album['name']
    album_dict['artist_name'] = album['artists'][0]['name']
    album_dict['release_year'] = album['release_date'][:4]       # leading 'YYYY' of the date string
    album_dict['total_tracks'] = album['total_tracks']
    album_dict['album_link'] = album['external_urls']['spotify']

    # Download the second listed cover image and normalise it to 300x300.
    album_image = Image.open(requests.get(album['images'][1]['url'], stream = True).raw)
    album_dict['album_image'] = album_image.resize((300, 300))

    album_dict['album_duration'] = _format_album_duration(sum(t['duration_ms'] for t in tracks))

    df['album'] = album_dict['album_name']
    # NOTE(review): this column holds the PIL image object itself, not a URL,
    # so it will not survive a CSV export in usable form.
    df['album_image_large'] = album_dict['album_image']
    df['release_date'] = album['release_date']

    # --- Track popularity (not included in the album's track items) ----------
    track_details = sp.tracks(tracks = list(df['track_id']))
    df['popularity_song'] = [t['popularity'] for t in track_details['tracks']]

    # --- Audio features and artist info --------------------------------------
    df_audio = get_audio_features(df)
    df = pd.merge(df, df_audio.rename(columns = {'id':'track_id'}), on = 'track_id', how = 'inner')

    df_genres = get_genres(df)
    df = pd.merge(df, df_genres.rename(columns = {'id':'artist_id'}), on = 'artist_id', how = 'left')

    # --- Final shape ---------------------------------------------------------
    df['data_type'] = 'album'

    df = df[['track', 'artist', 'album', 'track_id', 'artist_id', 'spotify_link', 'preview', 'album_image_large', 'data_type',
             'duration_ms', 'popularity_artist', 'popularity_song', 'danceability', 'energy', 'loudness', 'mode', 'speechiness',
             'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'key', 'time_signature', 'release_date', 'genres']]

    return df, album_dict

In [198]:
%%time
# Fetch the tracks and album-level metadata for the album to be sorted.
album_df, album_dict = get_album_tracks('Man on the Moon III: The Chosen')

CPU times: user 64.7 ms, sys: 6.85 ms, total: 71.6 ms
Wall time: 858 ms


In [199]:
# Inspect the album-level metadata.
album_dict

{'album_name': 'Man On The Moon III: The Chosen',
 'artist_name': 'Kid Cudi',
 'release_year': '2020',
 'total_tracks': 18,
 'album_link': 'https://open.spotify.com/album/64nbgEEIcY4g1ElVLONJ0w',
 'album_image': <PIL.Image.Image image mode=RGB size=300x300 at 0x7FD09B27E7D0>,
 'album_duration': '58 min 24 sec'}

In [200]:
# (rows, columns) of the album dataset.
album_df.shape

(18, 26)

In [201]:
# Preview the first rows of the album dataset.
album_df.head()

Unnamed: 0,track,artist,album,track_id,artist_id,spotify_link,preview,album_image_large,data_type,duration_ms,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,key,time_signature,release_date,genres
0,Beautiful Trip,Kid Cudi,Man On The Moon III: The Chosen,4IIuCotvqijraSdnVLaFnM,0fA0VVWsXO9YnASrzqfmYu,https://open.spotify.com/track/4IIuCotvqijraSd...,https://p.scdn.co/mp3-preview/05edcc06fc085a58...,<PIL.Image.Image image mode=RGB size=300x300 a...,album,37013,...,0.632,0.972,0.953,0.882,0.42,133.971,11,4,2020-12-11,"[hip hop, ohio hip hop, pop rap, rap]"
1,Tequila Shots,Kid Cudi,Man On The Moon III: The Chosen,30KctD1WsHKTIYczXjip5a,0fA0VVWsXO9YnASrzqfmYu,https://open.spotify.com/track/30KctD1WsHKTIYc...,https://p.scdn.co/mp3-preview/eea2485b714dffab...,<PIL.Image.Image image mode=RGB size=300x300 a...,album,193293,...,0.0531,0.084,5e-05,0.527,0.22,90.494,5,4,2020-12-11,"[hip hop, ohio hip hop, pop rap, rap]"
2,Another Day,Kid Cudi,Man On The Moon III: The Chosen,6myUpr3GDR80Dg3zqNTmmG,0fA0VVWsXO9YnASrzqfmYu,https://open.spotify.com/track/6myUpr3GDR80Dg3...,https://p.scdn.co/mp3-preview/0270d8150fcbd742...,<PIL.Image.Image image mode=RGB size=300x300 a...,album,199787,...,0.0708,0.556,0.00316,0.335,0.0642,172.995,4,4,2020-12-11,"[hip hop, ohio hip hop, pop rap, rap]"
3,She Knows This,Kid Cudi,Man On The Moon III: The Chosen,1xzUQMiCoY5pdego0pHMeV,0fA0VVWsXO9YnASrzqfmYu,https://open.spotify.com/track/1xzUQMiCoY5pdeg...,https://p.scdn.co/mp3-preview/29dc43127abc4fde...,<PIL.Image.Image image mode=RGB size=300x300 a...,album,216560,...,0.0889,0.162,0.0,0.393,0.312,165.945,6,4,2020-12-11,"[hip hop, ohio hip hop, pop rap, rap]"
4,Dive,Kid Cudi,Man On The Moon III: The Chosen,7Hc3YL8oDiAzbiAW32KXrw,0fA0VVWsXO9YnASrzqfmYu,https://open.spotify.com/track/7Hc3YL8oDiAzbiA...,https://p.scdn.co/mp3-preview/89e0c1d79d7d17f4...,<PIL.Image.Image image mode=RGB size=300x300 a...,album,148707,...,0.0679,0.67,0.0,0.36,0.332,123.105,9,4,2020-12-11,"[hip hop, ohio hip hop, pop rap, rap]"


### Export Dataframes

In [202]:
# Write the tagged library dataset to the datasets folder one level up,
# without the DataFrame index.
user_tracks_df.to_csv('.././datasets/aziz_tracks.csv', index = False)

In [203]:
# Write the album dataset to the datasets folder one level up, without the
# DataFrame index.
# NOTE(review): 'album_image_large' holds PIL image objects (see the preview
# above), so that column presumably ends up as object repr text in the CSV.
album_df.to_csv('.././datasets/album.csv', index = False)