In [1]:
import pandas as pd
import re
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

## Load Existing Lyrics Data

In [10]:
# Load the previously collected lyrics / audio-feature table.
# The path is relative, so the CSV must sit in the notebook's working directory.
# NOTE(review): the preview below shows an "Unnamed: 0" column -- presumably a
# saved index from an earlier to_csv(); confirm before relying on it.
lyrics_data = pd.read_csv("latest_lyrics_data.csv")

In [11]:
# Preview the first rows to confirm which columns were loaded.
lyrics_data.head()

Unnamed: 0,album,artist_name,id,song_name,song_uri,track_number,accousticness,danceability,energy,instrumentalness,liveness,loudness,popularity,speechiness,tempo,uri,valence,Lyrics,lyrics_cleaned
0,Back 2 The Beach,Yung Pinch,43HRfCLhkeRRNGDGoVHoW3,Toast To Us,spotify:track:43HRfCLhkeRRNGDGoVHoW3,4.0,0.37,0.347,0.474,0.0,0.405,-7.638,0,0.0704,150.705,spotify:track:43HRfCLhkeRRNGDGoVHoW3,0.298,"[Chorus] If your curious, then ask about it Th...",If your curious then ask about it The floor i...
1,Back 2 The Beach,Yung Pinch,0ViNQ4VC3ebjc8idnu8NXB,The Truth,spotify:track:0ViNQ4VC3ebjc8idnu8NXB,7.0,0.722,0.698,0.482,0.0,0.122,-8.286,0,0.122,116.786,spotify:track:0ViNQ4VC3ebjc8idnu8NXB,0.592,"[Intro] Oh-oh woah Oh-oh, woah Beach Boy in th...",Ohoh woah Ohoh woah Beach Boy in this ho Oh m...
2,Back 2 The Beach,Yung Pinch,0CQJtgecxiDocebqiU8Iks,Big Bags,spotify:track:0CQJtgecxiDocebqiU8Iks,10.0,0.0265,0.737,0.458,0.0,0.102,-7.06,0,0.0776,141.947,spotify:track:0CQJtgecxiDocebqiU8Iks,0.377,"[Intro] Ricky Racks, I see you Ooh yeah, yeah,...",Ricky Racks I see you Ooh yeah yeah I need bi...
3,Back 2 The Beach,Yung Pinch,7FDFdjlcZzJ4CU3Ze86btY,Punk Heaven,spotify:track:7FDFdjlcZzJ4CU3Ze86btY,14.0,0.0721,0.737,0.425,4e-06,0.0818,-7.319,0,0.0365,130.043,spotify:track:7FDFdjlcZzJ4CU3Ze86btY,0.086,[Chorus] We got an idiot as president Someone ...,We got an idiot as president Someone put an e...
4,4EVERFRIDAY SZN ONE,Yung Pinch,4pNraVNwNG6pCKBDcs8ELM,"Another Day, Another Dollar",spotify:track:4pNraVNwNG6pCKBDcs8ELM,1.0,0.0926,0.631,0.649,1e-05,0.112,-6.808,36,0.036,129.997,spotify:track:4pNraVNwNG6pCKBDcs8ELM,0.259,"[Intro] Yeah, yeah Hey, hold on Hey, yeah, loo...",Yeah yeah Hey hold on Hey yeah look Another...


# Load New Song/Artist
 - Get song_uri(s) from Spotify API
 - Get audio_features from Spotify API
 - Get lyrics data from Genius API
 - Clean lyrics

### Set up Spotipy Client

In [13]:
import os

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials


# Credentials are issued on the Spotify developer dashboard
# (https://developer.spotify.com/dashboard). They are read from the
# environment so the secret is never stored in the notebook: export
# SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET before starting the kernel.
# NOTE(review): the secret that used to be hardcoded in this cell has been
# exposed and should be rotated on the dashboard.
client_id = os.environ.get('SPOTIPY_CLIENT_ID')
client_secret = os.environ.get('SPOTIPY_CLIENT_SECRET')
if not client_id or not client_secret:
    raise RuntimeError(
        "Set the SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET environment "
        "variables before running this notebook."
    )
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
# Shared Spotify client object; the helper functions in this notebook call it as `sp`.
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [14]:
# Scratch check of str.lower(); the result is displayed but not assigned to anything.
'A'.lower()

'a'

In [12]:
def get_artist_uri(artist_name):
    """Look up the Spotify URI of an artist by name.

    Runs ``sp.search(artist_name)`` and inspects the first listed artist
    of every returned track.  An exact name match wins; when there is
    none, the first match that is equal ignoring case and surrounding
    whitespace is used, so ``"yung pinch"`` still resolves to
    ``"Yung Pinch"``.

    Parameters
    ----------
    artist_name : str
        Artist name to look up.  An empty or ``None`` name is reported as
        not found without querying the API.

    Returns
    -------
    str or None
        The matching artist's URI, or ``None`` (after printing a message)
        when no returned track is credited to a matching artist.
    """
    if artist_name:
        result = sp.search(artist_name)  # search query
        # Only the first credited artist of each track is considered;
        # tracks without any artist entry are skipped instead of raising.
        lead_artists = [
            track['artists'][0]
            for track in result['tracks']['items']
            if track and track.get('artists')
        ]
        # Pass 1: exact match, so previously working lookups are unchanged.
        for artist in lead_artists:
            if artist['name'] == artist_name:
                return artist['uri']
        # Pass 2: tolerate differences in case / surrounding whitespace.
        wanted = artist_name.strip().casefold()
        for artist in lead_artists:
            if artist['name'].strip().casefold() == wanted:
                return artist['uri']
    print("Could not find uri for {}".format(artist_name))
    return None

def get_album_info_for_artist(name):
    """Return the URIs and names of every album released by an artist.

    The artist is resolved with ``get_artist_uri`` and the albums are read
    from ``sp.artist_albums(..., album_type='album')``.  The endpoint is
    paginated, so every following page is fetched with ``sp.next`` until
    the response no longer advertises a ``next`` page.

    Parameters
    ----------
    name : str
        Artist name.

    Returns
    -------
    (list, list)
        ``album_uris`` and ``album_names``, index-aligned.

    Raises
    ------
    ValueError
        If no artist URI could be found for ``name``.
    """
    # Extract the artist's uri; fail with a clear message instead of
    # sending ``None`` to the API.
    artist_uri = get_artist_uri(name)
    if artist_uri is None:
        raise ValueError("No Spotify artist found for {!r}".format(name))

    # Store the albums' names and uris in separate, index-aligned lists
    album_names = []
    album_uris = []
    page = sp.artist_albums(artist_uri, album_type='album')
    while page:
        for album in page['items']:
            album_names.append(album['name'])
            album_uris.append(album['uri'])
        # Follow pagination so artists with many albums are not truncated.
        page = sp.next(page) if page.get('next') else None

    return album_uris, album_names

def get_genres_from_artist(artist_name):
    """Return the ``'genres'`` entry of the Spotify artist record for ``artist_name``.

    The name is resolved to a URI with ``get_artist_uri`` and that URI is
    handed to ``sp.artist``; no check is made that a URI was found.
    """
    artist_record = sp.artist(get_artist_uri(artist_name))
    return artist_record['genres']

def get_album_tracks(album_uri, album_name):
    """Collect track metadata for one album as a dict of parallel lists.

    Parameters
    ----------
    album_uri : str
        Album identifier passed to ``sp.album_tracks``.
    album_name : str
        Album title, repeated once per track in the ``'album'`` list.

    Returns
    -------
    dict
        Keys ``'album'``, ``'track_number'``, ``'id'``, ``'song_name'`` and
        ``'song_uri'``; each value is a list with one entry per track in
        the ``'items'`` of the API response.
    """
    track_items = sp.album_tracks(album_uri)['items']  # one entry per song track
    return {
        'album': [album_name for _ in track_items],
        'track_number': [item['track_number'] for item in track_items],
        'id': [item['id'] for item in track_items],
        'song_name': [item['name'] for item in track_items],
        'song_uri': [item['uri'] for item in track_items],
    }

def get_song_metadata_from_album(album_uris, album_names):
    """Build one DataFrame holding the track metadata of several albums.

    Parameters
    ----------
    album_uris, album_names : sequences
        Index-aligned album URIs and titles; they are paired with ``zip``,
        so extra entries in the longer sequence are ignored.

    Returns
    -------
    pandas.DataFrame
        The per-album frames built from ``get_album_tracks`` stacked in
        order (each album keeps its own 0..n-1 row labels), or an empty
        DataFrame when no albums were given.
    """
    # Keyed by URI, so a URI listed twice contributes a single frame.
    spotify_albums = dict()
    for uri, name in zip(album_uris, album_names):
        spotify_albums[uri] = get_album_tracks(uri, name)

    if not spotify_albums:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()
    # One concat instead of DataFrame.append in a loop: append was removed
    # in pandas 2.0 and copied the accumulated frame on every iteration.
    return pd.concat([pd.DataFrame(tracks) for tracks in spotify_albums.values()])

def build_artist_genre_data(artist_list):
    """Build a long-format artist/genre table for a list of artists.

    Parameters
    ----------
    artist_list : iterable of str
        Artist names; each one is looked up with ``get_genres_from_artist``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'artist'`` and ``'genres'`` with one row per
        (artist, genre) pair, or an empty DataFrame when nothing could be
        collected.  Artists whose lookup raises are skipped and reported
        on stdout instead of aborting the whole run.
    """
    genre_frames = []
    for artist in tqdm(artist_list):
        try:
            genres = get_genres_from_artist(artist)
            # The scalar artist name is broadcast against the genre list.
            genre_frames.append(pd.DataFrame({'artist': artist, 'genres': genres}))
        except Exception as err:
            # Best-effort collection: keep going, but say what was skipped
            # rather than swallowing the failure silently.
            print("Failed to collect genres for {}: {!r}".format(artist, err))
    if not genre_frames:
        return pd.DataFrame()
    # Single concat; DataFrame.append no longer exists in pandas 2.0.
    return pd.concat(genre_frames)

def build_artist_song_dataset(artist_list):
    """Collect album-track metadata and genres for a list of artists.

    Parameters
    ----------
    artist_list : iterable of str
        Artist names to collect.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``song_metadata``: the frames from ``get_song_metadata_from_album``
        for every artist, stacked, with an added ``'artist_name'`` column.
        ``artist_genres``: one row per (artist, genre) pair in columns
        ``'artist'`` and ``'genres'`` -- the same layout that
        ``build_artist_genre_data`` produces.
        Either frame is empty when nothing was collected.

    Notes
    -----
    Collection is best-effort: an artist whose lookup raises is reported on
    stdout (with the error) and skipped.  If the genre lookup is what
    fails, the artist's songs gathered just before it are still kept.
    """
    song_frames = []
    genre_frames = []
    for artist in tqdm(artist_list):
        try:
            album_uris, album_names = get_album_info_for_artist(artist)
            curr_song_metadata = get_song_metadata_from_album(album_uris, album_names).assign(artist_name=artist)
            song_frames.append(curr_song_metadata)
            # Keep the artist next to each genre; appending the bare genre
            # list lost that association.
            genre_frames.append(pd.DataFrame({'artist': artist, 'genres': get_genres_from_artist(artist)}))
            print("Data for {} albums and genres has been collected".format(artist))
        except Exception as err:
            # Show the cause as well, so systematic failures are not hidden.
            print("Failed to collect data for {} albums and genres".format(artist), repr(err))
    # Single concat per table; DataFrame.append no longer exists in pandas 2.0.
    song_metadata = pd.concat(song_frames) if song_frames else pd.DataFrame()
    artist_genres = pd.concat(genre_frames) if genre_frames else pd.DataFrame()
    return song_metadata, artist_genres

def audio_features(uris, checkpoint_path="subsampled_audio_features_p3.csv", checkpoint_every=100):
    """Fetch Spotify audio features and popularity for a collection of tracks.

    Parameters
    ----------
    uris : iterable of str
        Track URIs / ids.  Each is passed to ``sp.audio_features`` and, when
        features exist, to ``sp.track`` for the popularity value.
    checkpoint_path : str, optional
        CSV file that receives the rows collected so far.
    checkpoint_every : int or None, optional
        Write the checkpoint each time this many tracks have been
        collected; ``None`` or ``0`` disables checkpointing.

    Returns
    -------
    pandas.DataFrame
        One row per track that has audio features, with a fresh 0..n-1
        index and the columns ``uri``, ``danceability``, ``energy``,
        ``accousticness``, ``instrumentalness``, ``liveness``,
        ``loudness``, ``speechiness``, ``tempo``, ``valence`` and
        ``popularity``.  Tracks for which the API returns no features are
        left out, so every row's ``uri`` matches its feature values.
    """
    # (key in the API payload, output column).  'accousticness' keeps the
    # spelling used by the column of the same name in latest_lyrics_data.csv.
    feature_columns = [
        ('danceability', 'danceability'),
        ('energy', 'energy'),
        ('acousticness', 'accousticness'),
        ('instrumentalness', 'instrumentalness'),
        ('liveness', 'liveness'),
        ('loudness', 'loudness'),
        ('speechiness', 'speechiness'),
        ('tempo', 'tempo'),
        ('valence', 'valence'),
    ]
    columns = ['uri'] + [column for _, column in feature_columns] + ['popularity']

    rows = []
    for track in tqdm(uris):
        # Pull audio features per track
        features = sp.audio_features(track)
        if not features or features[0] is None:
            # No features for this track: skip the whole row so the uri
            # column never gets out of step with the feature columns.
            continue
        row = {'uri': track}
        for key, column in feature_columns:
            row[column] = features[0][key]
        # Popularity is stored on the track object, not in the audio features
        row['popularity'] = sp.track(track)['popularity']
        rows.append(row)
        # Periodic checkpoint so a long run can be recovered if it dies.
        if checkpoint_every and len(rows) % checkpoint_every == 0:
            pd.DataFrame(rows, columns=columns).to_csv(checkpoint_path)
    return pd.DataFrame(rows, columns=columns)