# Purpose

This notebook enriches the Last FM music files with audio-feature data from Spotify. See https://developer.spotify.com/ for more information.

# Loading Libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import swifter
import os
import gc
import glob
import shutil

In [2]:
import re
def DropAllNullColumns(data):
    """Remove, in place, every column of ``data`` whose values are all null.

    The labels of the columns about to be removed are printed first.
    Nothing is returned; ``data`` itself is modified.
    """
    emptyColumns = [name for name in data.columns if data[name].isnull().values.all()]

    print('Dropping all the following columns since all NaN values')
    print(emptyColumns)
    data.drop(emptyColumns, axis = 1, inplace = True)
    
def UpperCaseStringColumns(data):
    """Upper-case, in place, every column that pandas infers to be of string type.

    A line naming the column is printed for each column that is converted.
    Columns of any other inferred type are left untouched.
    """
    for name in data.columns:
        # Anything pandas does not classify as 'string' is skipped.
        if pd.api.types.infer_dtype(data[name]) != 'string':
            continue

        print(name + ': Upper Casing')
        data[name] = data[name].str.upper()
            
def CompressIntegerColumns(data):
    """Downcast, in place, each integer column to the narrowest integer dtype that fits.

    Columns whose minimum is non-negative are tried against the unsigned
    types (8 to 64 bit); all others against the signed types. The first type
    whose range covers both the column minimum and maximum is used, and a
    line naming the column and the chosen type is printed.
    """
    unsignedTypes = (np.uint8, np.uint16, np.uint32, np.uint64)
    signedTypes = (np.int8, np.int16, np.int32, np.int64)

    for name in data.columns:
        if not np.issubdtype(data[name].dtype, np.integer):
            continue

        lowest = data[name].min()
        highest = data[name].max()
        candidates = unsignedTypes if lowest >= 0 else signedTypes

        for candidate in candidates:
            limits = np.iinfo(candidate)
            if limits.min <= lowest and highest <= limits.max:
                print(str(name) + ': Converting to ' + candidate.__name__)
                data[name] = data[name].astype(candidate)
                break
                
def ConvertFloatColumnsToIntegerIfNoDataLoss(data):
    """Convert, in place, float columns to int64 when every value is a whole number.

    A column is converted only if it has a NumPy floating dtype, contains no
    NaN or infinite values, and casting to int64 and comparing back to the
    original yields equality for every row. A line naming the column is
    printed for each conversion. All other columns are left untouched.
    """
    for column in data.columns:
        # getattr: a duplicated column label yields a DataFrame, which has no .dtype.
        dtype = getattr(data[column], 'dtype', None)

        # Extension dtypes (category, nullable types, ...) are not np.dtype
        # instances and cannot be handed to np.issubdtype, so filter them first.
        # np.floating replaces the np.float alias that newer NumPy releases removed.
        if not (isinstance(dtype, np.dtype) and np.issubdtype(dtype, np.floating)):
            continue

        values = data[column]

        # NaN and +/-inf have no integer representation.
        if not np.isfinite(values).all():
            continue

        try:
            temp = values.astype(np.int64)
        except (ValueError, TypeError, OverflowError):
            # Best effort: a column that cannot be cast stays as it is.
            continue

        # Values outside the int64 range do not survive the round trip and
        # therefore fail this comparison.
        if ((temp == values).all()):
            print(str(column) + ': Converting to ' + str(temp.dtype))
            data[column] = temp

def ConvertStringColumnsToInt(data):
    """Convert, in place, string columns that hold integers to int64.

    A column qualifies when pandas infers it as 'string', it has no nulls and
    every value is an optional leading minus sign followed by digits, with
    commas allowed only after the first digit (thousands separators). Commas
    are removed before the cast. A line naming the column is printed for each
    conversion attempt; a column whose cast fails is left unchanged.
    """
    # Stricter than "any mix of digits, commas and dashes": strings such as
    # '2020-01-01', '-' or ',' must not be selected, because int() cannot parse them.
    integerPattern = re.compile(r'^-?[0-9][0-9,]*$')

    for column in data.columns:
        if pd.api.types.infer_dtype(data[column]) != 'string':
            continue

        if data[column].isnull().values.any():
            continue

        if (data[column].apply(lambda x: integerPattern.match(x) is not None).all()):
            print(str(column) + ': Converting to int')
            try:
                # Single assignment: the column is either fully converted or not touched.
                data[column] = data[column].str.replace(',', '', regex = False).astype(np.int64)
            except (ValueError, OverflowError) as error:
                # e.g. more digits than int64 can hold
                print(str(column) + ': Left unchanged (' + str(error) + ')')

def ConvertStringColumnsToFloat(data):
    """Convert, in place, string columns that hold decimal numbers to float64.

    A column qualifies when pandas infers it as 'string', it has no nulls and
    every value is an optional leading minus sign followed by either digits
    (commas allowed after the first digit) with an optional fractional part,
    or a bare fractional part such as '.5'. Commas are removed before the
    cast. A line naming the column is printed for each conversion attempt; a
    column whose cast fails is left unchanged.
    """
    # Raw string with an explicit grammar. At most one decimal point is
    # accepted, so dates ('2020-01-01') and dotted identifiers ('192.168.0.1')
    # are no longer selected and then rejected by float().
    floatPattern = re.compile(r'^-?(?:[0-9][0-9,]*(?:\.[0-9]*)?|\.[0-9]+)$')

    for column in data.columns:
        if pd.api.types.infer_dtype(data[column]) != 'string':
            continue

        if data[column].isnull().values.any():
            continue

        if (data[column].apply(lambda x: floatPattern.match(x) is not None).all()):
            print(str(column) + ': Converting to float')
            try:
                # Single assignment: the column is either fully converted or not touched.
                data[column] = data[column].str.replace(',', '', regex = False).astype(np.float64)
            except (ValueError, OverflowError) as error:
                print(str(column) + ': Left unchanged (' + str(error) + ')')
                                
def InspectColumnValues(data):
    """Print, for every column, its number of distinct values and the first ten of them.

    When the distinct values of a column cannot be computed, an
    'Error with: <column>' line is printed and the next column is processed.
    Nothing is returned.
    """
    for column in data.columns:
        # Column labels are not necessarily strings (e.g. integer labels), so
        # convert once before concatenating.
        label = str(column)

        try:
            values = data[column].unique()
        except Exception:
            # Best effort: report the column and keep going.
            print('Error with: ' + label)
            continue

        print(label + ': ' + str(len(values)))
        print(values[0:10])
        print()
            
def SaveData(data, name):
    """Write ``data`` to ``../../data/<name>.gzip.parquet`` and return the file's contents.

    The frame is written with a fresh 0..n-1 index and with column names in
    which underscores are replaced by spaces and words are title-cased. The
    output directory is created when it does not exist. The returned frame is
    the one read back from the file that was just written.
    """
    outputDirectory = '../../data/'

    frame = data.reset_index(drop = True)
    frame.columns = frame.columns.str.replace('_', ' ').str.title()

    if not os.path.exists(outputDirectory):
        os.makedirs(outputDirectory)

    outputPath = outputDirectory + name + '.gzip.parquet'
    frame.to_parquet(outputPath, compression = 'gzip')
    return pd.read_parquet(outputPath)

# Loading Data

## List of music we want to get details from Spotify 

Combining all the age group files into one dataset to work with

In [3]:
# Only the three columns that identify a track are needed to build the lookup list.
userSessionLFM = pd.read_parquet('../../data/LastFM1bKidListeningEventsWithUsers', columns = ['Artist', 'Album',  'Track'])
# 'Partition' was not requested above. Depending on the parquet engine/version it may
# still be appended to the result or not, so tolerate its absence instead of raising KeyError.
userSessionLFM.drop(columns = 'Partition', inplace = True, errors = 'ignore')
# One row per distinct (Artist, Album, Track) combination.
userSessionLFM.drop_duplicates(inplace = True, ignore_index = True)
userSessionLFM.head(5)

Unnamed: 0,Artist,Album,Track
0,I BREAK HORSES,CHIAROSCURO,FAITH
1,TAYLOR SWIFT,FEARLESS,FEARLESS
2,HIM,TEARS ON TAPE,W.L.S.T.D.
3,AMARANTHE,THE NEXUS,INFINITY
4,AMARANTHE,THE NEXUS,INVINCIBLE


# Mass Loading Spotify Data

"Exact" search match only; the data is not modified in any way before searching.

**Note:** Using spoti**p**y library, not spoti**f**y library

In [4]:
import spotipy
def GetSpotifyClient():
    """Create a Spotify Web API client that authenticates with the client-credentials flow.

    The client id and secret are taken from the SPOTIPY_CLIENT_ID and
    SPOTIPY_CLIENT_SECRET environment variables so that real credentials do
    not have to be stored in the notebook. When a variable is not set, the
    placeholder literal is used, which keeps the edit-in-place workflow working.

    Returns:
        spotipy.Spotify: a client bound to the credentials manager.
    """
    clientId = os.environ.get('SPOTIPY_CLIENT_ID', '{Your Client ID}')
    clientSecret = os.environ.get('SPOTIPY_CLIENT_SECRET', 'Your Client Secret')
    credentials = spotipy.oauth2.SpotifyClientCredentials(client_id = clientId, client_secret = clientSecret)

    # Pass the credentials manager rather than a one-off token string, so the
    # client can obtain a new access token itself once the current one expires.
    return spotipy.Spotify(client_credentials_manager = credentials)

def UpdateDataWithAnyTemporaryDataFromPreviousRun(data, dataFileName, tempDirectory):
    """Fold chunk files found in ``tempDirectory`` into ``data`` and delete the directory.

    When the directory does not exist, ``data`` is returned as is. Otherwise
    every ``*.gzip.parquet`` file in it is read; if at least one was found,
    the chunks are concatenated with ``data``, duplicate rows are dropped and
    the result is written to ``dataFileName``. The directory is removed in
    either case.

    Returns:
        The combined frame, or ``data`` itself when there was nothing to merge.
    """
    if not os.path.exists(tempDirectory):
        return data

    chunkPattern = os.path.join(tempDirectory, '*.gzip.parquet')
    recovered = [pd.read_parquet(path) for path in glob.glob(chunkPattern)]

    if recovered:
        # Same order as before: recovered chunks first, the existing data last.
        data = pd.concat(recovered + [data]).drop_duplicates()
        data.to_parquet(dataFileName, compression = 'gzip')

    shutil.rmtree(tempDirectory)
    return data

# Notebook-wide client; the lookup functions declare it `global` and replace it after a failed request.
spotifyClient = GetSpotifyClient()

## Finding Spotify Track URIs

### Loading previous cached data

In [5]:
tempDirectory = 'temp/tracks'
spotifyTrackFileName = 'spotifyTracks.gzip.parquet'

# First run: seed an empty cache file. Later runs: resume from the saved one.
if not os.path.exists(spotifyTrackFileName):
    spotifyTrack = pd.DataFrame(columns = ['Artist', 'Album', 'Track', 'Spotify Track Uri'])
    spotifyTrack.to_parquet(spotifyTrackFileName, compression = 'gzip')
else:
    spotifyTrack = pd.read_parquet(spotifyTrackFileName)

# Merge chunk files left in the temp directory (the helper then deletes that
# directory), and recreate it empty for the chunks of this run.
spotifyTrack = UpdateDataWithAnyTemporaryDataFromPreviousRun(spotifyTrack, spotifyTrackFileName, tempDirectory)
os.makedirs(tempDirectory)

### Filtering out any data already found

In [6]:
def IsInTrackCache(artist, album, track):
    """Tell whether ``track`` is already recorded for the (artist, album) pair.

    Looks the pair up in the notebook-level ``cache`` dictionary, whose keys
    are (artist, album) tuples and whose values are collections of track
    names. Returns False when the pair has no entry.
    """
    knownTracks = cache.get((artist, album))
    if knownTracks is None:
        return False

    return track in knownTracks
    
# Dictionary keyed by (Artist, Album) whose value is the list of tracks already present in spotifyTrack.
cache = spotifyTrack.groupby(['Artist', 'Album'], observed = True)['Track'].apply(list).to_dict()
# Keep only the rows for which IsInTrackCache reports no cached entry; evaluated row by row through swifter.
needToLookUp = userSessionLFM[userSessionLFM.swifter.apply(lambda x: IsInTrackCache(x['Artist'], x['Album'], x['Track']) == False, axis = 'columns')]

Pandas Apply:   0%|          | 0/3105184 [00:00<?, ?it/s]

### Looking up missing data

In [7]:
def FindSpotifyTrack(artistName, albumName, trackName):
    """Search Spotify for a track and report the URI of the first hit.

    The query is 'artist:<artist> album:<album> <track>'. The album filter is
    left out when ``albumName`` is an empty string or not a string at all
    (None / NaN), so a missing album no longer ends up as an 'Error' row.

    A failed request is retried, up to three attempts in total. After every
    failure the notebook-level ``spotifyClient`` is replaced with a new one.

    Returns:
        list: ``[artistName, albumName, trackName, outcome]`` where outcome is
        the URI of the first search hit, 'No Spotify Match' when the search
        returned nothing, or 'Error' when the query could not be built or all
        attempts failed.
    """
    global spotifyClient

    try:
        query = 'artist:' + artistName
        if isinstance(albumName, str) and albumName != '':
            query += ' album:' + albumName
        query += ' ' + trackName
    except TypeError as e:
        # A non-string artist or track cannot be fixed by retrying, so skip
        # the retry loop and its client refreshes.
        print('Error')
        print(e)
        return [artistName, albumName, trackName, 'Error']

    for _ in range(3):
        try:
            result = spotifyClient.search(query, type='track')
            items = result['tracks']['items']

            if (len(items) == 0):
                return [artistName, albumName, trackName, 'No Spotify Match']

            return [artistName, albumName, trackName, items[0]['uri']]

        except Exception as e:
            print('Error')
            print(e)
            # Start the next attempt (or the next lookup) with a new client.
            spotifyClient = GetSpotifyClient()

    return [artistName, albumName, trackName, 'Error']

# Process the outstanding tracks in slices of 1000 rows; each slice is written to its own
# file in tempDirectory, named after the round number.
lookUpRound = 1
needToLookUpChunks = [needToLookUp[i:i + 1000] for i in range(0, needToLookUp.shape[0], 1000)]
# NOTE(review): maxRounds is assigned here but not read anywhere in this cell.
maxRounds = len(needToLookUpChunks)
for lookupThisRound in needToLookUpChunks:
    print('Working on round ' + str(lookUpRound) + ' of ' + str(len(needToLookUpChunks)))
    # A new client is created at the start of every round.
    spotifyClient = GetSpotifyClient()
    # One FindSpotifyTrack call per row; each call returns a 4-item list.
    foundThisRound = pd.DataFrame(lookupThisRound).apply(lambda row: FindSpotifyTrack(row['Artist'], row['Album'], row['Track']), axis = 1)
    foundThisRound = pd.DataFrame.from_records(list(foundThisRound), columns=['Artist', 'Album', 'Track', 'Spotify Track Uri'])
    foundThisRound.to_parquet(os.path.join(tempDirectory, str(lookUpRound) + '.gzip.parquet'), compression = 'gzip')
    lookUpRound = lookUpRound + 1

In [8]:
    
# Merge the per-round files from tempDirectory into spotifyTrack (and its parquet file); the helper removes the directory afterwards.
spotifyTrack = UpdateDataWithAnyTemporaryDataFromPreviousRun(spotifyTrack, spotifyTrackFileName, tempDirectory)
# Release the reference to the (Artist, Album) lookup dictionary.
cache = None

## Finding Spotify Track Audio Features

### Loading previous cached data

In [9]:
tempDirectory = 'temp/audioFeatures'
spotifyAudioFeaturesFileName = 'spotifyAudioFeatures.gzip.parquet'

# Resume from the saved audio-feature file, or seed an empty one on the first run.
if (os.path.exists(spotifyAudioFeaturesFileName)):
    spotifyAudioFeatures = pd.read_parquet(spotifyAudioFeaturesFileName)
else:
    spotifyAudioFeatures = pd.DataFrame(columns = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness'
                                                   , 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo'
                                                   , 'type', 'id', 'uri', 'track_href', 'analysis_url', 'duration_ms'
                                                   , 'time_signature'])
    spotifyAudioFeatures.to_parquet(spotifyAudioFeaturesFileName, compression = 'gzip')

spotifyAudioFeatures = UpdateDataWithAnyTemporaryDataFromPreviousRun(spotifyAudioFeatures, spotifyAudioFeaturesFileName, tempDirectory)
# The helper deletes tempDirectory whenever it exists, and the per-round chunk files are
# written into it later, so make sure the directory is there (same as for the track lookup).
os.makedirs(tempDirectory, exist_ok = True)

### Filtering out any data already found

In [10]:
# Distinct track URIs, as a one-column DataFrame; rows whose value does not contain 'spotify:track:'
# (the 'No Spotify Match' / 'Error' markers) are left out.
needToLookUp = spotifyTrack[spotifyTrack['Spotify Track Uri'].str.contains('spotify:track:')]['Spotify Track Uri'].drop_duplicates().to_frame()
# Remove the URIs that already have a row in spotifyAudioFeatures.
needToLookUp = needToLookUp[(needToLookUp['Spotify Track Uri'].isin(spotifyAudioFeatures['uri']) == False)]

### Looking up missing data

In [11]:
def FindSpotifyAudioFeatures(trackUris):
    """Fetch the audio features for ``trackUris`` and return them as a DataFrame.

    Entries for which the client returned nothing (falsy items) are left out.
    A failed request is retried, up to three attempts in total; after every
    failure the notebook-level ``spotifyClient`` is replaced with a new one
    and the error is printed. When all attempts fail an empty DataFrame is
    returned.
    """
    global spotifyClient

    for _ in range(3):
        try:
            features = spotifyClient.audio_features(trackUris)
            return pd.DataFrame.from_records(filter(None, features))

        except Exception as e:
            spotifyClient = GetSpotifyClient()
            print('Error')
            print(e)

    return pd.DataFrame()

# Process the outstanding URIs in slices of 100 rows; every non-empty result is written to
# its own file in tempDirectory and merged into spotifyAudioFeatures at the end.
lookUpRound = 1
needToLookUpChunks = [needToLookUp[i:i + 100] for i in range(0, needToLookUp.shape[0], 100)]
maxRounds = str(len(needToLookUpChunks))
# The chunk files are written into tempDirectory, so it has to exist before the first write.
os.makedirs(tempDirectory, exist_ok = True)
for lookupThisRound in needToLookUpChunks:
    if (lookUpRound % 100 == 0):
        print('Working on round ' + str(lookUpRound) + ' of ' + maxRounds)

    # Each chunk is a one-column DataFrame. Iterating a DataFrame yields its column labels,
    # not its rows, so pass the URI strings themselves.
    temp = FindSpotifyAudioFeatures(lookupThisRound['Spotify Track Uri'].tolist())
    if (temp.empty == False):
        temp.to_parquet(os.path.join(tempDirectory, str(lookUpRound) + '.gzip.parquet'), compression = 'gzip')
        
    lookUpRound = lookUpRound + 1

spotifyAudioFeatures = UpdateDataWithAnyTemporaryDataFromPreviousRun(spotifyAudioFeatures, spotifyAudioFeaturesFileName, tempDirectory)

Working on round 100 of 736
Working on round 200 of 736
Working on round 300 of 736
Working on round 400 of 736
Working on round 500 of 736
Working on round 600 of 736
Working on round 700 of 736


### Compressing Audio Features

In [12]:
# Column dtypes and non-null counts, followed by the first five rows.
spotifyAudioFeatures.info()
spotifyAudioFeatures.head(5)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 800754 entries, 0 to 1087150
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   danceability      800754 non-null  float64
 1   energy            800754 non-null  float64
 2   key               800754 non-null  int64  
 3   loudness          800754 non-null  float64
 4   mode              800754 non-null  int64  
 5   speechiness       800754 non-null  float64
 6   acousticness      800754 non-null  float64
 7   instrumentalness  800754 non-null  float64
 8   liveness          800754 non-null  float64
 9   valence           800754 non-null  float64
 10  tempo             800754 non-null  float64
 11  type              800754 non-null  object 
 12  id                800754 non-null  object 
 13  uri               800754 non-null  object 
 14  track_href        800754 non-null  object 
 15  analysis_url      800754 non-null  object 
 16  duration_ms       8

Unnamed: 0_level_0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,0.803,0.876,6,-8.186,1,0.0385,0.0238,0.617,0.0432,0.881,114.983,audio_features,3LgIR6H0HUXu0HiTLSF1zj,spotify:track:3LgIR6H0HUXu0HiTLSF1zj,https://api.spotify.com/v1/tracks/3LgIR6H0HUXu...,https://api.spotify.com/v1/audio-analysis/3LgI...,277133,4
1,0.711,0.893,9,-6.059,1,0.0781,0.00232,0.196,0.642,0.629,120.751,audio_features,32TXsOUIWtTQDkE72Cvafe,spotify:track:32TXsOUIWtTQDkE72Cvafe,https://api.spotify.com/v1/tracks/32TXsOUIWtTQ...,https://api.spotify.com/v1/audio-analysis/32TX...,351093,4
2,0.256,0.895,2,-4.86,1,0.0707,0.0131,0.000106,0.0821,0.555,191.307,audio_features,000G1xMMuwxNHmwVsBdtj1,spotify:track:000G1xMMuwxNHmwVsBdtj1,https://api.spotify.com/v1/tracks/000G1xMMuwxN...,https://api.spotify.com/v1/audio-analysis/000G...,182347,4
4,0.797,0.898,1,-5.922,0,0.52,0.469,0.0,0.0824,0.146,89.926,audio_features,000GyYHG4uWmlXieKLij8u,spotify:track:000GyYHG4uWmlXieKLij8u,https://api.spotify.com/v1/tracks/000GyYHG4uWm...,https://api.spotify.com/v1/audio-analysis/000G...,180160,4
5,0.201,0.886,0,-7.337,1,0.118,3e-05,0.189,0.677,0.497,153.811,audio_features,000H1qKRnRjZDH1NcG2OsL,spotify:track:000H1qKRnRjZDH1NcG2OsL,https://api.spotify.com/v1/tracks/000H1qKRnRjZ...,https://api.spotify.com/v1/audio-analysis/000H...,228000,4


In [13]:
# Downcast every integer column to the narrowest integer dtype that holds its value range.
CompressIntegerColumns(spotifyAudioFeatures)
# Drop the identifier / link columns; 'uri' is kept because the merge into the listening events joins on it.
spotifyAudioFeatures.drop(columns = ['type', 'id', 'track_href', 'analysis_url'], inplace = True)
spotifyAudioFeatures.info()

key: Converting to uint8
mode: Converting to uint8
duration_ms: Converting to uint32
time_signature: Converting to uint8
<class 'pandas.core.frame.DataFrame'>
Int64Index: 800754 entries, 0 to 1087150
Data columns (total 14 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   danceability      800754 non-null  float64
 1   energy            800754 non-null  float64
 2   key               800754 non-null  uint8  
 3   loudness          800754 non-null  float64
 4   mode              800754 non-null  uint8  
 5   speechiness       800754 non-null  float64
 6   acousticness      800754 non-null  float64
 7   instrumentalness  800754 non-null  float64
 8   liveness          800754 non-null  float64
 9   valence           800754 non-null  float64
 10  tempo             800754 non-null  float64
 11  uri               800754 non-null  object 
 12  duration_ms       800754 non-null  uint32 
 13  time_signature    800754 non-null  uint8  

## Building and Saving Enriched Dataset

In [14]:
# Read the listening events again, this time without restricting the columns.
musicEnriched = pd.read_parquet('../../data/LastFM1bKidListeningEventsWithUsers')
musicEnriched.drop(columns = 'Partition', inplace = True)
musicEnriched.head(5)

Unnamed: 0,Age,Education Level,Artist,Album,Track,User Id,Artist Id,Album Id,Track Id,Event Unixtime
0,16,HS,I BREAK HORSES,CHIAROSCURO,FAITH,15397460,15610,33142,99816,1391098195
1,16,HS,TAYLOR SWIFT,FEARLESS,FEARLESS,15397460,3744,30727,99833,1374858428
2,16,HS,HIM,TEARS ON TAPE,W.L.S.T.D.,15397460,10028,33156,99851,1369547268
3,16,HS,AMARANTHE,THE NEXUS,INFINITY,15397460,104,33147,99875,1365511561
4,16,HS,AMARANTHE,THE NEXUS,INVINCIBLE,15397460,104,33147,99823,1365509450


In [15]:
# Left join: every listening event is kept. Only spotifyTrack rows whose value contains
# 'spotify:track:' take part, which leaves out the 'No Spotify Match' / 'Error' markers.
# NOTE(review): if spotifyTrack holds more than one URI for the same (Artist, Album, Track),
# this join would duplicate listening events — confirm the key is unique.
musicEnriched = musicEnriched.merge(spotifyTrack[spotifyTrack['Spotify Track Uri'].str.contains('spotify:track:')], on = ['Artist', 'Album', 'Track'], how = 'left')
musicEnriched.head(5)

Unnamed: 0,Age,Education Level,Artist,Album,Track,User Id,Artist Id,Album Id,Track Id,Event Unixtime,Spotify Track Uri
0,16,HS,I BREAK HORSES,CHIAROSCURO,FAITH,15397460,15610,33142,99816,1391098195,spotify:track:30jk1zNmGlphOtsns69775
1,16,HS,TAYLOR SWIFT,FEARLESS,FEARLESS,15397460,3744,30727,99833,1374858428,spotify:track:2CYVETnhM9aytqrazYYwrK
2,16,HS,HIM,TEARS ON TAPE,W.L.S.T.D.,15397460,10028,33156,99851,1369547268,spotify:track:2FU6qFtXFKN7OO1sMDf2un
3,16,HS,AMARANTHE,THE NEXUS,INFINITY,15397460,104,33147,99875,1365511561,spotify:track:5hD3s7lP20vLae5jeFED01
4,16,HS,AMARANTHE,THE NEXUS,INVINCIBLE,15397460,104,33147,99823,1365509450,spotify:track:0R3mu0dOKOe2r1EDOca39U


In [16]:
# Left join on the track URI: events without matching audio features keep NaN in the feature columns.
musicEnriched = musicEnriched.merge(spotifyAudioFeatures, left_on = ['Spotify Track Uri'], right_on = ['uri'], how = 'left')
# Both URI columns served only as join keys.
musicEnriched.drop(columns = ['Spotify Track Uri', 'uri'], inplace = True)
# Store the three text columns as categoricals.
musicEnriched['Artist'] = musicEnriched['Artist'].astype('category')
musicEnriched['Album'] = musicEnriched['Album'].astype('category')
musicEnriched['Track'] = musicEnriched['Track'].astype('category')
musicEnriched.info()
musicEnriched.head(5)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37096515 entries, 0 to 37096514
Data columns (total 23 columns):
 #   Column            Dtype   
---  ------            -----   
 0   Age               uint8   
 1   Education Level   category
 2   Artist            category
 3   Album             category
 4   Track             category
 5   User Id           uint32  
 6   Artist Id         uint32  
 7   Album Id          uint32  
 8   Track Id          uint32  
 9   Event Unixtime    uint32  
 10  danceability      float64 
 11  energy            float64 
 12  key               float64 
 13  loudness          float64 
 14  mode              float64 
 15  speechiness       float64 
 16  acousticness      float64 
 17  instrumentalness  float64 
 18  liveness          float64 
 19  valence           float64 
 20  tempo             float64 
 21  duration_ms       float64 
 22  time_signature    float64 
dtypes: category(4), float64(13), uint32(5), uint8(1)
memory usage: 5.1 GB


Unnamed: 0,Age,Education Level,Artist,Album,Track,User Id,Artist Id,Album Id,Track Id,Event Unixtime,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,16,HS,I BREAK HORSES,CHIAROSCURO,FAITH,15397460,15610,33142,99816,1391098195,...,,,,,,,,,,
1,16,HS,TAYLOR SWIFT,FEARLESS,FEARLESS,15397460,3744,30727,99833,1374858428,...,,,,,,,,,,
2,16,HS,HIM,TEARS ON TAPE,W.L.S.T.D.,15397460,10028,33156,99851,1369547268,...,-5.619,1.0,0.0384,0.0148,0.154,0.368,0.112,134.019,252427.0,4.0
3,16,HS,AMARANTHE,THE NEXUS,INFINITY,15397460,104,33147,99875,1365511561,...,-4.815,0.0,0.174,6.6e-05,0.0,0.105,0.407,180.104,185400.0,4.0
4,16,HS,AMARANTHE,THE NEXUS,INVINCIBLE,15397460,104,33147,99823,1365509450,...,-4.482,0.0,0.228,1.8e-05,0.0,0.333,0.552,139.956,191200.0,4.0


## Saving the Enriched Music Data

In [17]:
# SaveData title-cases the column names, writes ../../data/KidListeningEventsWithAudioFeatures.gzip.parquet
# and returns the frame read back from that file.
musicEnriched = SaveData(musicEnriched, 'KidListeningEventsWithAudioFeatures')
musicEnriched.head(5)

Unnamed: 0,Age,Education Level,Artist,Album,Track,User Id,Artist Id,Album Id,Track Id,Event Unixtime,...,Loudness,Mode,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Duration Ms,Time Signature
0,16,HS,I BREAK HORSES,CHIAROSCURO,FAITH,15397460,15610,33142,99816,1391098195,...,,,,,,,,,,
1,16,HS,TAYLOR SWIFT,FEARLESS,FEARLESS,15397460,3744,30727,99833,1374858428,...,,,,,,,,,,
2,16,HS,HIM,TEARS ON TAPE,W.L.S.T.D.,15397460,10028,33156,99851,1369547268,...,-5.619,1.0,0.0384,0.0148,0.154,0.368,0.112,134.019,252427.0,4.0
3,16,HS,AMARANTHE,THE NEXUS,INFINITY,15397460,104,33147,99875,1365511561,...,-4.815,0.0,0.174,6.6e-05,0.0,0.105,0.407,180.104,185400.0,4.0
4,16,HS,AMARANTHE,THE NEXUS,INVINCIBLE,15397460,104,33147,99823,1365509450,...,-4.482,0.0,0.228,1.8e-05,0.0,0.333,0.552,139.956,191200.0,4.0
