### Problem Statement

I am going to build a music recommender system using the Spotify and Genius APIs. The app will sort the tracks of a user-entered album in the order the user is most likely to enjoy them, by analyzing the audio features and lyrics of the user's recently saved tracks, recently played tracks, and top tracks.

### Library/Packages Import

In [1]:
import os
from re import sub
from time import sleep

import lyricsgenius
import numpy as np
import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyOAuth

# None lifts pandas' row-display limit, so displaying a frame renders every row.
pd.set_option('display.max_rows', None)

### API User Authorization

In [2]:
# Spotify credentials are read from the environment instead of being committed
# with the notebook. Export SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET before
# starting the kernel; SPOTIPY_REDIRECT_URI is optional and falls back to the
# local JupyterLab address used so far.
# NOTE(review): the client id/secret that used to be hard-coded here were
# published with the notebook and should be rotated in the Spotify dashboard.
cid = os.environ.get('SPOTIPY_CLIENT_ID')
secret = os.environ.get('SPOTIPY_CLIENT_SECRET')
uri = os.environ.get('SPOTIPY_REDIRECT_URI', 'http://localhost:9999/lab/workspaces/auto-c')
cache = '.spotipyoauthcache'
scope = 'user-read-recently-played user-library-read user-top-read'

# 'user-read-recently-played' --> Get Current User's Recently Played Tracks
# 'user-top-read' --> Get a User's Top Artists and Tracks
# 'user-library-read' --> Get a User's Saved Tracks

In [3]:
# OAuth manager for the scopes requested above; the token is cached in the
# file named by `cache`.
auth = SpotifyOAuth(
    client_id = cid,
    client_secret = secret,
    redirect_uri = uri,
    scope = scope,
    cache_path = cache,
)

# Shared client used by every helper below.
sp = spotipy.Spotify(oauth_manager = auth)

# Displaying the signed-in profile confirms that authorization succeeded.
sp.current_user()

{'display_name': 'Aziz Maredia',
 'external_urls': {'spotify': 'https://open.spotify.com/user/1282060363'},
 'followers': {'href': None, 'total': 36},
 'href': 'https://api.spotify.com/v1/users/1282060363',
 'id': '1282060363',
 'images': [{'height': None,
   'url': 'https://scontent-ort2-1.xx.fbcdn.net/v/t1.0-1/p320x320/79640598_10158654118142079_1999166169432457216_n.jpg?_nc_cat=104&ccb=2&_nc_sid=0c64ff&_nc_ohc=IWsGI03k1KoAX_SYtbK&_nc_ht=scontent-ort2-1.xx&tp=6&oh=1ef4bb164e5474df038c5227bb32a125&oe=60200D29',
   'width': None}],
 'type': 'user',
 'uri': 'spotify:user:1282060363'}

### User Top Tracks

In [4]:
def get_top_tracks(var = None):
    """Return the current user's top tracks as a single DataFrame.

    Up to three pages of 20 tracks (offsets 0, 20 and 40) are requested through
    ``sp.current_user_top_tracks``; paging stops as soon as a page is empty.

    Parameters
    ----------
    var : any, optional
        Unused. Kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row per track with the columns track, artist, album, track_id,
        artist_id, spotify_link, preview, album_image_large,
        album_image_small, release_date and popularity_song, indexed 0..n-1.
        The two image columns are None when the album lists too few images.
    """
    columns = ['track', 'artist', 'album', 'track_id', 'artist_id',
               'spotify_link', 'preview', 'album_image_large',
               'album_image_small', 'release_date', 'popularity_song']

    def image_url(images, position):
        # An album's image list can be short or empty; indexing it blindly
        # would raise IndexError and abort the whole download.
        if images and len(images) > position:
            return images[position]['url']
        return None

    rows = []

    for offset in range(0, 60, 20):

        items = sp.current_user_top_tracks(offset = offset)['items']

        if not items:
            break

        for item in items:
            images = item['album'].get('images')
            rows.append({
                'track': item['name'],
                'artist': item['artists'][0]['name'],
                'album': item['album']['name'],
                'track_id': item['id'],
                'artist_id': item['artists'][0]['id'],
                'spotify_link': item['external_urls']['spotify'],
                'preview': item['preview_url'],
                'album_image_large': image_url(images, 1),
                'album_image_small': image_url(images, 2),
                'release_date': item['album']['release_date'],
                'popularity_song': item['popularity'],
            })

    # Build the frame once instead of re-concatenating after every page.
    return pd.DataFrame(rows, columns = columns)

### User Current Saved

In [5]:
def get_current_saved(var = None):
    """Return the tracks saved in the current user's library as a DataFrame.

    Pages through ``sp.current_user_saved_tracks`` and stops as soon as a page
    comes back empty, so small libraries no longer cost a fixed 50 requests.
    At most the first 1000 saved tracks are read, as before.

    Parameters
    ----------
    var : any, optional
        Unused. Kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row per saved track with the columns track, artist, album,
        track_id, artist_id, spotify_link, preview, album_image_large,
        album_image_small, release_date and popularity_song, indexed 0..n-1.
        The two image columns are None when the album lists too few images.
    """
    columns = ['track', 'artist', 'album', 'track_id', 'artist_id',
               'spotify_link', 'preview', 'album_image_large',
               'album_image_small', 'release_date', 'popularity_song']
    page_size = 50      # fewer round trips than the default page of 20
    max_tracks = 1000   # same ceiling as the original range(0, 1000, 20)

    def image_url(images, position):
        # An album's image list can be short or empty; indexing it blindly
        # would raise IndexError and abort the whole download.
        if images and len(images) > position:
            return images[position]['url']
        return None

    rows = []

    for offset in range(0, max_tracks, page_size):

        items = sp.current_user_saved_tracks(limit = page_size, offset = offset)['items']

        if not items:
            break

        for item in items:
            track = item['track']
            if track is None:
                # Entry without track data: nothing to record.
                continue
            images = track['album'].get('images')
            rows.append({
                'track': track['name'],
                'artist': track['artists'][0]['name'],
                'album': track['album']['name'],
                'track_id': track['id'],
                'artist_id': track['artists'][0]['id'],
                'spotify_link': track['external_urls']['spotify'],
                'preview': track['preview_url'],
                'album_image_large': image_url(images, 1),
                'album_image_small': image_url(images, 2),
                'release_date': track['album']['release_date'],
                'popularity_song': track['popularity'],
            })

    # Build the frame once instead of re-concatenating after every page.
    return pd.DataFrame(rows, columns = columns)

### Get Tracks w/ Audio Features

In [6]:
def get_audio_features(all_tracks_df):
    """Fetch Spotify audio features for every id in ``all_tracks_df['track_id']``.

    Ids are sent to ``sp.audio_features`` in batches of 100. No request is made
    for an empty batch, so an empty frame or a track count that is an exact
    multiple of 100 no longer triggers a call with no ids.

    Parameters
    ----------
    all_tracks_df : pandas.DataFrame
        Must contain a ``track_id`` column.

    Returns
    -------
    pandas.DataFrame
        One row per track for which features were returned, with the columns
        danceability, energy, key, loudness, mode, speechiness, acousticness,
        instrumentalness, liveness, valence, tempo, id and time_signature
        (duration_ms is deliberately left out). Tracks whose response entry is
        None are skipped. The index runs 0..n-1.
    """
    keep_features = ['danceability', 'energy', 'key', 'loudness', 'mode',
                     'speechiness', 'acousticness', 'instrumentalness', 'liveness',
                     'valence', 'tempo', 'id', 'time_signature'] # duration_ms removed
    batch_size = 100

    track_ids = list(all_tracks_df['track_id'])
    rows = []

    for start in range(0, len(track_ids), batch_size):

        results = sp.audio_features(tracks = track_ids[start:start + batch_size])

        for features in results or []:
            if features is None:
                # No analysis available for this id.
                continue
            rows.append({name: features[name] for name in keep_features})

    # A single constructor call also yields the right columns for zero rows.
    return pd.DataFrame(rows, columns = keep_features)

### Get Track Genres

In [7]:
def get_genres(all_tracks_df):
    """Fetch popularity and genres for every distinct artist in ``all_tracks_df``.

    Distinct, non-missing values of ``all_tracks_df['artist_id']`` are sent to
    ``sp.artists`` in batches of 50, in first-seen order. No request is made
    for an empty batch.

    Parameters
    ----------
    all_tracks_df : pandas.DataFrame
        Must contain an ``artist_id`` column.

    Returns
    -------
    pandas.DataFrame
        One row per artist returned, with the columns id, popularity_artist
        and genres, indexed 0..n-1. Artists whose response entry is None are
        skipped.
    """
    keep_features = ['id', 'popularity', 'genres']
    batch_size = 50

    # dict.fromkeys de-duplicates while keeping a deterministic order,
    # unlike set(), whose iteration order can differ between runs.
    artists = list(dict.fromkeys(all_tracks_df['artist_id'].dropna()))
    rows = []

    for start in range(0, len(artists), batch_size):

        results = sp.artists(artists = artists[start:start + batch_size])

        for artist in results['artists']:
            if artist is None:
                continue
            rows.append({name: artist[name] for name in keep_features})

    df = pd.DataFrame(rows, columns = keep_features)

    # Renamed so it cannot be confused with the per-track popularity column.
    return df.rename(columns = {'popularity': 'popularity_artist'})

### Song Lyrics - Genius API

In [8]:
# Genius credentials are read from the environment instead of being committed
# with the notebook. GENIUS_ACCESS_TOKEN must be set before this cell runs;
# the client id and secret are not used by the code in this notebook and are
# therefore optional.
# NOTE(review): the values that used to be hard-coded here were published with
# the notebook and should be regenerated on the Genius API clients page.
genius_cid = os.environ.get('GENIUS_CLIENT_ID')
genius_secret = os.environ.get('GENIUS_CLIENT_SECRET')
token = os.environ.get('GENIUS_ACCESS_TOKEN')

genius = lyricsgenius.Genius(token)

# Drop section headers from the returned lyrics and silence progress messages.
genius.remove_section_headers = True
genius.verbose = False

In [9]:
def get_lyrics(df, pause = 2):
    """Look up lyrics on Genius for every row of ``df``.

    Each row's ``track`` and ``artist`` are passed to ``genius.search_song``.
    A hit is accepted only when its title equals the track name or its artist
    equals the artist name; accepted lyrics are lower-cased and their newlines
    replaced by spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``track`` and ``artist`` columns.
    pause : int or float, optional
        Seconds to sleep after each lookup (default 2, the previous fixed
        value).

    Returns
    -------
    list
        One entry per row, in row order: the cleaned lyrics string, or
        ``np.nan`` when nothing was found, the hit did not match, or the hit
        carried no lyrics.
    """
    lyrics_lst = []

    # zip walks both columns once instead of rebuilding a list per lookup.
    for track, artist in zip(df['track'], df['artist']):

        song = genius.search_song(track, artist)

        # Explicit checks replace the former bare `except:`, which hid every
        # kind of error, not just "no result".
        if song is None or song.lyrics is None:
            lyrics_lst.append(np.nan)
        elif track == song.title or artist == song.artist:
            lyrics_lst.append(sub('\n', ' ', song.lyrics).lower())
        else:
            lyrics_lst.append(np.nan)

        sleep(pause)

    return lyrics_lst

### Get Tracks

In [10]:
def get_tracks_w_audio_features_lyrics(var = None):
    """Assemble the user-library frame: tracks, audio features and artist genres.

    Top tracks and saved tracks are stacked and exact duplicate rows removed.
    Audio features are joined on ``track_id`` with an inner join, so tracks
    without features are dropped. Artist data is joined on ``artist_id`` with
    a left join. Every row is tagged ``data_type == 'user_library'``.

    ``var`` is unused. Note that no lyrics are fetched here.
    """
    library = pd.concat([get_top_tracks(), get_current_saved()], axis = 0)
    library = library.drop_duplicates().reset_index(drop = True)

    features = get_audio_features(library).rename(columns = {'id': 'track_id'})
    combined = library.merge(features, on = 'track_id', how = 'inner')

    genres = get_genres(library).rename(columns = {'id': 'artist_id'})
    combined = combined.merge(genres, on = 'artist_id', how = 'left')

    combined['data_type'] = 'user_library'

    return combined

In [11]:
%%time
# Combined user library: top + saved tracks with audio features and artist genres.
user_tracks_df = get_tracks_w_audio_features_lyrics()

CPU times: user 623 ms, sys: 48.8 ms, total: 672 ms
Wall time: 11.8 s


In [12]:
user_tracks_df.shape

(1005, 26)

In [13]:
user_tracks_df.head()

Unnamed: 0,track,artist,album,track_id,artist_id,spotify_link,preview,album_image_large,album_image_small,release_date,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,popularity_artist,genres,data_type
0,ROXANNE,Arizona Zervas,ROXANNE,696DnlkuDOXcMAnKlTgXXK,0vRvGUQVUjytro0xpb26bs,https://open.spotify.com/track/696DnlkuDOXcMAn...,https://p.scdn.co/mp3-preview/17c8499c72603f1a...,https://i.scdn.co/image/ab67616d00001e02069a93...,https://i.scdn.co/image/ab67616d00004851069a93...,2019-10-10,...,0.148,0.0522,0.0,0.46,0.457,116.735,5,77,"[pop rap, rhode island rap]",user_library
1,Thank You,Dido,No Angel,3yUcJwYu7fXAfqMj9krY6l,2mpeljBig2IXLXRAFO9AAs,https://open.spotify.com/track/3yUcJwYu7fXAfqM...,https://p.scdn.co/mp3-preview/61c1b2d6f13f53af...,https://i.scdn.co/image/ab67616d00001e023e5cbf...,https://i.scdn.co/image/ab67616d000048513e5cbf...,1999,...,0.0424,0.3,0.000215,0.0665,0.765,79.984,4,72,"[dance pop, new wave pop, pop rock]",user_library
2,Climbing a Wall,Floral,Floral EP,2GZLWeDxVyCxDBdVzVjbMi,1FVOt1XlpnaCueBolWF92k,https://open.spotify.com/track/2GZLWeDxVyCxDBd...,https://p.scdn.co/mp3-preview/e9074cec5dbf81b0...,https://i.scdn.co/image/ab67616d00001e024ffed7...,https://i.scdn.co/image/ab67616d000048514ffed7...,2014-09-01,...,0.0436,0.0177,0.286,0.089,0.678,151.097,1,33,"[instrumental math rock, math pop]",user_library
3,BLIND (feat. Young Thug),DaBaby,BLAME IT ON BABY (DELUXE),2T5NBwKRySiCR78vVk08vr,4r63FhuTkUYltbVAg5TQnk,https://open.spotify.com/track/2T5NBwKRySiCR78...,https://p.scdn.co/mp3-preview/0d8eb3d8a247f65d...,https://i.scdn.co/image/ab67616d00001e02e818d0...,https://i.scdn.co/image/ab67616d00004851e818d0...,2020-08-04,...,0.346,0.0115,0.0,0.0971,0.754,162.973,4,94,"[north carolina hip hop, rap]",user_library
4,Tequila Shots,Kid Cudi,Man On The Moon III: The Chosen,30KctD1WsHKTIYczXjip5a,0fA0VVWsXO9YnASrzqfmYu,https://open.spotify.com/track/30KctD1WsHKTIYc...,https://p.scdn.co/mp3-preview/eea2485b714dffab...,https://i.scdn.co/image/ab67616d00001e026f43a6...,https://i.scdn.co/image/ab67616d000048516f43a6...,2020-12-11,...,0.0531,0.084,5e-05,0.527,0.22,90.494,4,90,"[hip hop, ohio hip hop, pop rap, rap]",user_library


### Get Album

In [14]:
def get_album_tracks(album_to_search):
    """Load every track of the best-matching album, with features and genres.

    The first album returned by a Spotify search for ``album_to_search`` is
    used. Its tracks are paged 50 at a time, so albums longer than one page
    are no longer cut short. Track popularity is fetched through ``sp.tracks``
    in batches of at most 50 ids.

    Parameters
    ----------
    album_to_search : str
        Free-text album query.

    Returns
    -------
    pandas.DataFrame
        One row per album track with the columns track, artist, album,
        track_id, artist_id, spotify_link, preview, album_image_large,
        album_image_small, release_date and popularity_song, followed by the
        audio-feature columns (inner join on track_id), the artist columns
        (left join on artist_id) and ``data_type == 'album'``. The image
        columns are None when the album lists too few images.

    Raises
    ------
    IndexError
        If the search returns no album. The exception type is the one the
        unguarded ``items[0]`` look-up used to raise; only the message is new.
    """
    page_size = 50

    results = sp.search(q = album_to_search, limit = 1, offset = 0, type = 'album', market = None)
    matches = results['albums']['items']
    if not matches:
        raise IndexError('no album found for %r' % (album_to_search,))

    found = matches[0]
    album_id = found['id']
    images = found.get('images') or []

    def image_url(position):
        # The album's image list can be short or empty.
        return images[position]['url'] if len(images) > position else None

    # Walk all pages of the album's track list.
    items = []
    offset = 0
    while True:
        page = sp.album_tracks(album_id, limit = page_size, offset = offset)
        items.extend(page['items'])
        if not page['items'] or not page.get('next'):
            break
        offset += page_size

    df = pd.DataFrame({
        'track': [t['name'] for t in items],
        'artist': [t['artists'][0]['name'] for t in items],
        'artist_id': [t['artists'][0]['id'] for t in items],
        'track_id': [t['id'] for t in items],
        'spotify_link': [t['external_urls']['spotify'] for t in items],
        'preview': [t['preview_url'] for t in items],
    })

    df['album'] = found['name']
    df['album_image_large'] = image_url(1)
    df['album_image_small'] = image_url(2)
    df['release_date'] = found['release_date']

    # Popularity is read from sp.tracks, in the same order as the track ids.
    track_ids = list(df['track_id'])
    popularity = []
    for start in range(0, len(track_ids), page_size):
        batch = sp.tracks(tracks = track_ids[start:start + page_size])
        popularity.extend(t['popularity'] if t else None for t in batch['tracks'])
    df['popularity_song'] = popularity

    df = df[['track', 'artist', 'album', 'track_id', 'artist_id', 'spotify_link', 'preview', 'album_image_large', 'album_image_small', 'release_date', 'popularity_song']]

    df_audio = get_audio_features(df)
    df = pd.merge(df, df_audio.rename(columns = {'id':'track_id'}), on = 'track_id', how = 'inner')

    df_genres = get_genres(df)
    df = pd.merge(df, df_genres.rename(columns = {'id':'artist_id'}), on = 'artist_id', how = 'left')

    df['data_type'] = 'album'

    return df

In [15]:
# Tracks of the first album Spotify returns for this query, with audio features and artist genres.
album = get_album_tracks('Future Nostalgia')

In [16]:
album.shape

(11, 26)

In [17]:
album.head()

Unnamed: 0,track,artist,album,track_id,artist_id,spotify_link,preview,album_image_large,album_image_small,release_date,...,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,popularity_artist,genres,data_type
0,Future Nostalgia,Dua Lipa,Future Nostalgia,6zrJLhslleCHby0wbUnvVy,6M2wZ9GZgrQXHCFfjv46we,https://open.spotify.com/track/6zrJLhslleCHby0...,https://p.scdn.co/mp3-preview/3a6c52b19be10e31...,https://i.scdn.co/image/ab67616d00001e02cda2c6...,https://i.scdn.co/image/ab67616d00004851cda2c6...,2020-03-27,...,0.00618,0.0,0.321,0.859,114.993,184882,4,93,"[dance pop, pop, pop dance, uk pop]",album
1,Don't Start Now,Dua Lipa,Future Nostalgia,3PfIrDoz19wz7qK7tYeu62,6M2wZ9GZgrQXHCFfjv46we,https://open.spotify.com/track/3PfIrDoz19wz7qK...,https://p.scdn.co/mp3-preview/ccada72f07a920a7...,https://i.scdn.co/image/ab67616d00001e02cda2c6...,https://i.scdn.co/image/ab67616d00004851cda2c6...,2020-03-27,...,0.0123,0.0,0.0951,0.679,123.95,183290,4,93,"[dance pop, pop, pop dance, uk pop]",album
2,Cool,Dua Lipa,Future Nostalgia,2nMOodYNHBAQ3Kc1QNimZU,6M2wZ9GZgrQXHCFfjv46we,https://open.spotify.com/track/2nMOodYNHBAQ3Kc...,https://p.scdn.co/mp3-preview/f126ab707eae598b...,https://i.scdn.co/image/ab67616d00001e02cda2c6...,https://i.scdn.co/image/ab67616d00004851cda2c6...,2020-03-27,...,0.133,2e-06,0.0931,0.79,89.717,209583,4,93,"[dance pop, pop, pop dance, uk pop]",album
3,Physical,Dua Lipa,Future Nostalgia,3AzjcOeAmA57TIOr9zF1ZW,6M2wZ9GZgrQXHCFfjv46we,https://open.spotify.com/track/3AzjcOeAmA57TIO...,https://p.scdn.co/mp3-preview/fa1eb265d7f5a5c0...,https://i.scdn.co/image/ab67616d00001e02cda2c6...,https://i.scdn.co/image/ab67616d00004851cda2c6...,2020-03-27,...,0.0137,0.000658,0.102,0.746,146.967,193829,4,93,"[dance pop, pop, pop dance, uk pop]",album
4,Levitating,Dua Lipa,Future Nostalgia,39LLxExYz6ewLAcYrzQQyP,6M2wZ9GZgrQXHCFfjv46we,https://open.spotify.com/track/39LLxExYz6ewLAc...,https://p.scdn.co/mp3-preview/15e65d052d726b0b...,https://i.scdn.co/image/ab67616d00001e02cda2c6...,https://i.scdn.co/image/ab67616d00004851cda2c6...,2020-03-27,...,0.0561,0.0,0.213,0.914,103.014,203808,4,93,"[dance pop, pop, pop dance, uk pop]",album


### Get Artist

In [18]:
# Exploratory track search: first page (up to 50 hits) for the query 'AC Slater'.
results = sp.search('AC Slater', limit = 50, offset = 0, type = 'track')

In [19]:
results['tracks']['items'][0]['name']

'Fly Kicks - Wax Motif Remix'

In [20]:
results['tracks']['items'][0]['id']

'30Ek6vDZ77PWTQS9HiV3xB'

In [21]:
results['tracks']['items'][0]['uri']

'spotify:track:30Ek6vDZ77PWTQS9HiV3xB'

In [22]:
def get_artist_tracks(artist_to_search):
    """Search Spotify for an artist's tracks and attach features and genres.

    Up to three pages of 50 track hits are requested for the query. Only hits
    whose primary artist name or track title contains ``artist_to_search``
    are kept. The comparison is case-insensitive and literal, so characters
    such as ``+``, ``$`` or ``(`` in an artist name are no longer read as
    regular-expression syntax. Repeated track titles are reduced to their
    first occurrence.

    Parameters
    ----------
    artist_to_search : str
        Artist name used both as the search query and as the filter text.

    Returns
    -------
    pandas.DataFrame
        One row per kept track with the columns track, artist, album,
        track_id, artist_id, spotify_link, preview, album_image_large,
        album_image_small, release_date and popularity_song, followed by the
        audio-feature columns (inner join on track_id) and the artist columns
        (left join on artist_id). The image columns are None when the album
        lists too few images.
    """
    columns = ['track', 'artist', 'album', 'track_id', 'artist_id',
               'spotify_link', 'preview', 'album_image_large',
               'album_image_small', 'release_date', 'popularity_song']

    def image_url(images, position):
        # An album's image list can be short or empty.
        if images and len(images) > position:
            return images[position]['url']
        return None

    rows = []

    for offset in range(0, 150, 50):

        results = sp.search(artist_to_search, limit = 50, offset = offset, type = 'track')
        items = results['tracks']['items']

        if not items:
            break

        for item in items:
            if item is None:
                continue
            images = item['album'].get('images')
            rows.append({
                'track': item['name'],
                'artist': item['artists'][0]['name'],
                'album': item['album']['name'],
                'track_id': item['id'],
                'artist_id': item['artists'][0]['id'],
                'spotify_link': item['external_urls']['spotify'],
                'preview': item['preview_url'],
                'album_image_large': image_url(images, 1),
                'album_image_small': image_url(images, 2),
                'release_date': item['album']['release_date'],
                'popularity_song': item['popularity'],
            })

    df = pd.DataFrame(rows, columns = columns)

    # regex = False: match the name literally; na = False: treat missing names as no match.
    by_artist = df['artist'].str.contains(artist_to_search, case = False, regex = False, na = False)
    in_title = df['track'].str.contains(artist_to_search, case = False, regex = False, na = False)

    # Chained calls return a new frame, avoiding in-place edits on a slice.
    df = (
        df.loc[by_artist | in_title]
        .drop_duplicates(subset = ['track'])
        .reset_index(drop = True)
    )

    df_audio = get_audio_features(df)

    df = pd.merge(df, df_audio.rename(columns = {'id':'track_id'}), on = 'track_id', how = 'inner')

    df_genres = get_genres(df)
    df = pd.merge(df, df_genres.rename(columns = {'id':'artist_id'}), on = 'artist_id', how = 'left')

    return df

In [23]:
# Search hits whose primary artist or title contains 'Dua Lipa', with audio features and artist genres.
artist = get_artist_tracks('Dua Lipa')

In [24]:
artist.shape

(95, 26)

In [25]:
artist.head()

Unnamed: 0,track,artist,album,track_id,artist_id,spotify_link,preview,album_image_large,album_image_small,release_date,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,popularity_artist,genres
0,Levitating (feat. DaBaby),Dua Lipa,Levitating (feat. DaBaby),463CkQjx2Zk1yXoBuierM9,6M2wZ9GZgrQXHCFfjv46we,https://open.spotify.com/track/463CkQjx2Zk1yXo...,https://p.scdn.co/mp3-preview/cc617f669fd1e3ee...,https://i.scdn.co/image/ab67616d00001e0249caa4...,https://i.scdn.co/image/ab67616d0000485149caa4...,2020-10-01,...,0.0601,0.00883,0.0,0.0674,0.915,102.977,203064,4,93,"[dance pop, pop, pop dance, uk pop]"
1,Prisoner (feat. Dua Lipa),Miley Cyrus,Prisoner (feat. Dua Lipa),5JqZ3oqF00jkT81foAFvqg,5YGY8feqx7naU7z4HrwZM6,https://open.spotify.com/track/5JqZ3oqF00jkT81...,https://p.scdn.co/mp3-preview/993e160238e38b9d...,https://i.scdn.co/image/ab67616d00001e02379725...,https://i.scdn.co/image/ab67616d00004851379725...,2020-11-20,...,0.0452,0.0103,0.0,0.0761,0.595,127.99,169333,4,91,"[dance pop, pop, pop dance, post-teen pop]"
2,Don't Start Now,Dua Lipa,Don't Start Now,6WrI0LAC5M1Rw2MnX2ZvEg,6M2wZ9GZgrQXHCFfjv46we,https://open.spotify.com/track/6WrI0LAC5M1Rw2M...,https://p.scdn.co/mp3-preview/ed151225213380a4...,https://i.scdn.co/image/ab67616d00001e028583df...,https://i.scdn.co/image/ab67616d000048518583df...,2019-10-31,...,0.0842,0.0125,0.0,0.0952,0.677,123.941,183290,4,93,"[dance pop, pop, pop dance, uk pop]"
3,One Kiss (with Dua Lipa),Calvin Harris,One Kiss (with Dua Lipa),7ef4DlsgrMEH11cDZd32M6,7CajNmpbOovFoOoasH2HaY,https://open.spotify.com/track/7ef4DlsgrMEH11c...,https://p.scdn.co/mp3-preview/34b3b95afb8e1d34...,https://i.scdn.co/image/ab67616d00001e02d09f96...,https://i.scdn.co/image/ab67616d00004851d09f96...,2018-04-06,...,0.11,0.037,2.2e-05,0.0814,0.592,123.994,214847,4,86,"[dance pop, edm, electro house, house, pop, po..."
4,Break My Heart,Dua Lipa,Future Nostalgia,017PF4Q3l4DBUiWoXk4OWT,6M2wZ9GZgrQXHCFfjv46we,https://open.spotify.com/track/017PF4Q3l4DBUiW...,https://p.scdn.co/mp3-preview/5e535e80aa3fadc2...,https://i.scdn.co/image/ab67616d00001e02cda2c6...,https://i.scdn.co/image/ab67616d00004851cda2c6...,2020-03-27,...,0.0883,0.167,1e-06,0.349,0.467,113.013,221820,4,93,"[dance pop, pop, pop dance, uk pop]"


In [26]:
# Artist track titles that do not occur in the user's library (compared by title only).
not_in_user_library = list(set(artist['track']) - set(user_tracks_df['track']))

In [27]:
# Remaining artist track titles, i.e. those that also occur in the user's library.
in_user_library = list(set(artist['track']) - set(not_in_user_library))

In [28]:
# Remove every artist track whose title is already in the user's library.
# One boolean mask replaces the former drop + reset_index pass per title.
artist = artist.loc[~artist['track'].isin(in_user_library)].reset_index(drop = True)

In [29]:
artist.shape

(92, 26)

### Export Dataframes

In [17]:
# Write the user-library frame to CSV without the index column.
user_tracks_df.to_csv('.././datasets/aziz_tracks.csv', index = False)

In [18]:
# Write the album frame to CSV without the index column.
album.to_csv('.././datasets/album.csv', index = False)

In [32]:
# Write the artist frame (library tracks already removed) to CSV without the index column.
artist.to_csv('.././datasets/artist.csv', index = False)