### Importing required libraries and Join Data

I need to join all the json files that hold my personal Spotify listening history. This data can be requested from Spotify and they will send it to you within 20-30 days.

In [1]:
import pandas as pd
import os
import json
import requests
import glob
import pytz

In [2]:
directory = r'/Users/DataAnalyst/Desktop/Spotify Data'
json_pattern = os.path.join(directory, '*.json')
file_list = glob.glob(json_pattern)

dfs = []
for file in file_list:
    with open(file) as f:
        json_data = pd.json_normalize(json.loads(f.read()))
    dfs.append(json_data)
df = pd.concat(dfs)

In [3]:
df.shape

(289116, 21)

In [4]:
df.head()

Unnamed: 0,ts,username,platform,ms_played,conn_country,ip_addr_decrypted,user_agent_decrypted,master_metadata_track_name,master_metadata_album_artist_name,master_metadata_album_album_name,...,episode_name,episode_show_name,spotify_episode_uri,reason_start,reason_end,shuffle,skipped,offline,offline_timestamp,incognito_mode
0,2022-09-11T06:11:50Z,21m2pxzlxap2wdp2fedgl2sva,"iOS 15.6 (iPhone12,1)",2600,AU,49.185.171.212,unknown,Crazy Rap (Colt 45 & 2 Zig Zags),Afroman,The Good Times,...,,,,fwdbtn,fwdbtn,True,,False,1662876706854,False
1,2018-02-13T12:17:23Z,21m2pxzlxap2wdp2fedgl2sva,"iOS 9.3.2 (iPhone7,2)",2066,AU,110.174.101.203,unknown,Meet Me in the Hallway,Harry Styles,Harry Styles,...,,,,fwdbtn,fwdbtn,False,,False,1518524240589,False
2,2021-02-09T03:07:57Z,21m2pxzlxap2wdp2fedgl2sva,"iOS 14.0 (iPhone12,1)",1002,AU,49.184.69.137,unknown,Lies - Remastered,The Rolling Stones,Some Girls,...,,,,fwdbtn,fwdbtn,True,,False,1612840075329,False
3,2020-10-14T03:56:14Z,21m2pxzlxap2wdp2fedgl2sva,"iOS 14.0 (iPhone12,1)",111317,AU,59.102.23.219,unknown,Insane,Flume,Flume,...,,,,fwdbtn,fwdbtn,True,,False,1602647662251,False
4,2022-08-08T22:32:47Z,21m2pxzlxap2wdp2fedgl2sva,"iOS 15.6 (iPhone12,1)",171024,AU,49.185.103.252,unknown,Give It To Me Baby,Rick James,Street Songs,...,,,,trackdone,fwdbtn,True,,False,1659997794638,False


### Dropping irrelevant columns

In [5]:
df.columns

Index(['ts', 'username', 'platform', 'ms_played', 'conn_country',
       'ip_addr_decrypted', 'user_agent_decrypted',
       'master_metadata_track_name', 'master_metadata_album_artist_name',
       'master_metadata_album_album_name', 'spotify_track_uri', 'episode_name',
       'episode_show_name', 'spotify_episode_uri', 'reason_start',
       'reason_end', 'shuffle', 'skipped', 'offline', 'offline_timestamp',
       'incognito_mode'],
      dtype='object')

In [6]:
columns_to_drop = ['username', 
 'ip_addr_decrypted', 
 'user_agent_decrypted', 
 'episode_name',
 'episode_show_name',
 'spotify_episode_uri',
 'offline',
 'offline_timestamp',
 'incognito_mode',
 'skipped',
 'shuffle'
 ]
df = df.drop(columns=columns_to_drop)

### Updating the column names 

In [7]:
column_list = []
for column in df.columns:
    column_list.append(f"'{column}':'',")
print('\n'.join(column_list))

'ts':'',
'platform':'',
'ms_played':'',
'conn_country':'',
'master_metadata_track_name':'',
'master_metadata_album_artist_name':'',
'master_metadata_album_album_name':'',
'spotify_track_uri':'',
'reason_start':'',
'reason_end':'',


In [8]:
updated_col_names = {
'ts':'timestamp',
'ms_played':'duration_ms',
'conn_country':'country_played',
'master_metadata_track_name':'track_name',
'master_metadata_album_artist_name':'artist_name',
'master_metadata_album_album_name':'album_name',
'spotify_track_uri':'track_uri',  
}
df = df.rename(columns=updated_col_names)

In [9]:
df.head(5)

Unnamed: 0,timestamp,platform,duration_ms,country_played,track_name,artist_name,album_name,track_uri,reason_start,reason_end
0,2022-09-11T06:11:50Z,"iOS 15.6 (iPhone12,1)",2600,AU,Crazy Rap (Colt 45 & 2 Zig Zags),Afroman,The Good Times,spotify:track:1ACZpHI5vZ5Ea4xGlkdGWM,fwdbtn,fwdbtn
1,2018-02-13T12:17:23Z,"iOS 9.3.2 (iPhone7,2)",2066,AU,Meet Me in the Hallway,Harry Styles,Harry Styles,spotify:track:4u9Y4Z9i9VaAbBK7rW1Rha,fwdbtn,fwdbtn
2,2021-02-09T03:07:57Z,"iOS 14.0 (iPhone12,1)",1002,AU,Lies - Remastered,The Rolling Stones,Some Girls,spotify:track:7hDQerpfE5FgYtVKEVemwl,fwdbtn,fwdbtn
3,2020-10-14T03:56:14Z,"iOS 14.0 (iPhone12,1)",111317,AU,Insane,Flume,Flume,spotify:track:3SJelKECIakrYtG0YU24Ow,fwdbtn,fwdbtn
4,2022-08-08T22:32:47Z,"iOS 15.6 (iPhone12,1)",171024,AU,Give It To Me Baby,Rick James,Street Songs,spotify:track:13v3siPyvy5TTEZYmGPPse,trackdone,fwdbtn


In [10]:
df.dtypes

timestamp         object
platform          object
duration_ms        int64
country_played    object
track_name        object
artist_name       object
album_name        object
track_uri         object
reason_start      object
reason_end        object
dtype: object

### Filter for desired Years & Handle Empty Values

As I am doing this analysis for 2019 - 2022 I need to filter out the years.

I need to create a new row for the Year based off the timestamp (I will breakdown the timestamp futher later on
once I handle the timezone conversions).

In [11]:
# Converting the timestamp column to datetime
df['timestamp'] = pd.to_datetime(df['timestamp'], utc=True)
df['year'] = df['timestamp'].dt.year
years_to_keep = [2019, 2020, 2021, 2022]
df = df[df['year'].isin(years_to_keep)]

In [12]:
df.isnull().sum()

timestamp            0
platform             0
duration_ms          0
country_played       0
track_name        1934
artist_name       1934
album_name        1934
track_uri         1934
reason_start         0
reason_end           0
year                 0
dtype: int64

In [13]:
nan_rows = df['track_name'].isnull()
nan_rows.head(10)

0     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
Name: track_name, dtype: bool

These were the rows where I had podcast data. I can drop these rows as I am only doing analysis on songs

In [14]:
df.dropna(subset=['track_name'], inplace=True)
df.isnull().sum()

timestamp         0
platform          0
duration_ms       0
country_played    0
track_name        0
artist_name       0
album_name        0
track_uri         0
reason_start      0
reason_end        0
year              0
dtype: int64

### Clean track_uri column

I need to clean the rows so that it just the track uri value, this will be used later on when I pull data from spotify's API to give me more data on the tracks

In [15]:
df['uri'] = df['track_uri'].str.split(':',2).str[2]
df.drop('track_uri', axis=1,inplace=True)

### Clean Platform Type Column

In [16]:
unique_platforms = df['platform'].unique()
print('\n'.join(unique_platforms))

iOS 15.6 (iPhone12,1)
iOS 14.0 (iPhone12,1)
iOS 13.3.1 (iPhone12,1)
iOS 14.4.1 (iPhone12,1)
OS X 11.4.0 [arm 2]
iOS 11.4 (iPhone9,3)
ios
iOS 13.4.1 (iPhone12,1)
iOS 16.0 (iPhone15,2)
OS X 11.4.0 [x86 4]
iOS 13.5.1 (iPhone12,1)
iOS 13.6.1 (iPhone12,1)
OS X 10.13.3 [x86 8]
iOS 13.7 (iPhone12,1)
OS X 11.2.3 [x86 4]
Windows 10 (10.0.19042; x64)
OS X 10.13.6 [x86 8]
windows
iOS 13.6 (iPhone12,1)
osx
Partner ios_sdk Apple;iPhone9.3;1b41c83f9d3f4f61b805277798862a85;11.4
web_player osx 10.13.3;chrome 86.0.4240.111;desktop
OS X 10.16.0 [x86 4]
Partner ios_sdk Apple;iPhone9.3;1bfd2066b99e4ddbb6c28a761e96c4f9;11.4
web_player osx 10.13.3;chrome 78.0.3904.97;desktop
web_player osx 10.13.3;chrome 70.0.3538.102;desktop
web_player osx 10.13.3;chrome 73.0.3683.86;desktop
web_player ios 14.0;safari 14.0;mobile
Partner Tiny StreamUnlimited;DRX-5;;
web_player osx 10.13.3;chrome 74.0.3729.169;embed


In [17]:
unique_counts = df['platform'].value_counts()
print(unique_counts)

iOS 11.4 (iPhone9,3)                                                     79390
iOS 14.4.1 (iPhone12,1)                                                  60569
iOS 14.0 (iPhone12,1)                                                    27056
iOS 13.3.1 (iPhone12,1)                                                  11187
OS X 10.13.3 [x86 8]                                                     11048
iOS 15.6 (iPhone12,1)                                                    10009
OS X 11.4.0 [arm 2]                                                       9236
ios                                                                       7956
iOS 13.5.1 (iPhone12,1)                                                   4401
iOS 13.4.1 (iPhone12,1)                                                   4314
OS X 11.4.0 [x86 4]                                                       4280
iOS 16.0 (iPhone15,2)                                                     3694
OS X 10.13.6 [x86 8]                                

In [18]:
df.loc[df['platform'].str.contains('iOS|ios'), 'platform'] = 'iPhone'
df.loc[~df['platform'].str.contains('iPhone'), 'platform'] = 'Computer'
unique_counts = df['platform'].value_counts()
print(unique_counts)

iPhone      213330
Computer     29595
Name: platform, dtype: int64


In [19]:
unique_counts = df['platform'].value_counts()
percentage = unique_counts / unique_counts.sum() * 100
percentage_formatted = percentage.apply(lambda x: f"{x:.1f}%")
print(percentage_formatted)

iPhone      87.8%
Computer    12.2%
Name: platform, dtype: object


In [20]:
df.rename(columns={'platform': 'platform_played_on'}, inplace=True)

### Filtereting out songs skipped

I often skip songs after a few seconds of it coming on shuffle. For the purpose of this anlysis I will assume any track duration of less than one minute will not count as a stream. This is a fair assumption as it is very uncommon for a song to have a total duration of less than one minute (After looking through all 3,200 liked songs on my Spotify I only had a duration of less than one minute - it was the 42 second song - The Lovley Linda by Paul McCartney. So this song will be excluded from the data set (Sorry Paul!)

First, I need to format the duration_ms column into minutes, then drops all rows with a duration > 1 minute

In [21]:
df['duration_listened'] = (df['duration_ms'] / 60000)
df = df.drop(columns='duration_ms')

In [22]:
less_than_1_min = (df['duration_listened'] < 1.00).sum()
greater_than_1_min = (df['duration_listened'] > 1.00).sum()

print("Counts of rows less than 1 minute:", less_than_1_min)
print("Counts of rows greater than 1 minute:", greater_than_1_min)

Counts of rows less than 1 minute: 153912
Counts of rows greater than 1 minute: 89013


In [23]:
df = df[df['duration_listened'] >= 1.00]
df.reset_index(drop=True, inplace=True)
df.head(5)

Unnamed: 0,timestamp,platform_played_on,country_played,track_name,artist_name,album_name,reason_start,reason_end,year,uri,duration_listened
0,2020-10-14 03:56:14+00:00,iPhone,AU,Insane,Flume,Flume,fwdbtn,fwdbtn,2020,3SJelKECIakrYtG0YU24Ow,1.855283
1,2022-08-08 22:32:47+00:00,iPhone,AU,Give It To Me Baby,Rick James,Street Songs,trackdone,fwdbtn,2022,13v3siPyvy5TTEZYmGPPse,2.8504
2,2022-01-11 21:25:51+00:00,Computer,US,Texas Sun,Khruangbin,Texas Sun,trackdone,trackdone,2022,24ntSW3QVJzR79lHAAOTaY,4.213517
3,2019-09-24 11:45:06+00:00,iPhone,AU,Ride,Twenty One Pilots,Blurryface,backbtn,trackdone,2019,2Z8WuEywRWYTKe1NybPQEW,3.5751
4,2022-12-28 16:47:39+00:00,iPhone,ES,Because - Remastered 2009,The Beatles,Abbey Road,trackdone,trackdone,2022,1rxoyGj1QuPoVi8fOft1Kt,2.7611


### Breakding down/Converting the timestamp

Because I want to do analysis on listens by the hour of the day, I need to ensure I convert the timestamps into the appropaite timezone based on where I played the track. In the data provided by Spotify it provides a timestamp in the UTC timezone so I must convert this the relevant country timezone the song was streamed in.

First, I will breakdown the timestamp into Year, so I can see the percentage of how many songs I played in each country

In [24]:
grouped_counts = df.groupby('country_played').size()

percentage_counts = grouped_counts.div(grouped_counts.sum()) * 100
percentage_formatted = percentage_counts.apply(lambda x: f"{x:.1f}%")
print(percentage_formatted)

country_played
AE     0.1%
AU    90.9%
BE     0.0%
DE     0.1%
ES     0.2%
FR     0.2%
GB     0.6%
HK     0.1%
HU     0.1%
ID     1.2%
JP     0.4%
NL     0.0%
PT     0.1%
SE     0.1%
US     5.8%
ZZ     0.0%
dtype: object


Then I converted the timestamps to the appropriate timezone based on the country the track was played in

In [25]:
unique_countries = df['country_played'].unique()
country_list = []
for country in unique_countries:
    country_list.append(f"'{country}':'',")
print('\n'.join(country_list))

'AU':'',
'US':'',
'ES':'',
'ID':'',
'GB':'',
'JP':'',
'HK':'',
'FR':'',
'HU':'',
'SE':'',
'PT':'',
'NL':'',
'AE':'',
'BE':'',
'DE':'',
'ZZ':'',


In [26]:
# copy pasting the outfrom from above to save time, I then mapped each country code to its respective timezone
country_timezones = {
    'AU': 'Australia/Melbourne',
    'US': 'America/New_York',
    'ES': 'Europe/Madrid',
    'ID': 'Asia/Jakarta',
    'GB': 'Europe/London',
    'JP': 'Asia/Tokyo',
    'HK': 'Asia/Hong_Kong',
    'FR': 'Europe/Paris',
    'HU': 'Europe/Budapest',
    'SE': 'Europe/Stockholm',
    'PT': 'Europe/Lisbon',
    'NL': 'Europe/Amsterdam',
    'AE': 'Asia/Dubai',
    'BE': 'Europe/Brussels',
    'DE': 'Europe/Berlin',
    'ZZ': 'UTC'
}

df['timestamp'] = df.apply(lambda row: row['timestamp'].astimezone(pytz.timezone(country_timezones.get(row['country_played']))), axis=1)
df.head(5)


Unnamed: 0,timestamp,platform_played_on,country_played,track_name,artist_name,album_name,reason_start,reason_end,year,uri,duration_listened
0,2020-10-14 14:56:14+11:00,iPhone,AU,Insane,Flume,Flume,fwdbtn,fwdbtn,2020,3SJelKECIakrYtG0YU24Ow,1.855283
1,2022-08-09 08:32:47+10:00,iPhone,AU,Give It To Me Baby,Rick James,Street Songs,trackdone,fwdbtn,2022,13v3siPyvy5TTEZYmGPPse,2.8504
2,2022-01-11 16:25:51-05:00,Computer,US,Texas Sun,Khruangbin,Texas Sun,trackdone,trackdone,2022,24ntSW3QVJzR79lHAAOTaY,4.213517
3,2019-09-24 21:45:06+10:00,iPhone,AU,Ride,Twenty One Pilots,Blurryface,backbtn,trackdone,2019,2Z8WuEywRWYTKe1NybPQEW,3.5751
4,2022-12-28 17:47:39+01:00,iPhone,ES,Because - Remastered 2009,The Beatles,Abbey Road,trackdone,trackdone,2022,1rxoyGj1QuPoVi8fOft1Kt,2.7611


In [27]:
df.dtypes

timestamp              object
platform_played_on     object
country_played         object
track_name             object
artist_name            object
album_name             object
reason_start           object
reason_end             object
year                    int64
uri                    object
duration_listened     float64
dtype: object

The timestamp values were converted to the correct timezone, but when I converted them to datetime they went back to the UTC timezone. To deal with this is converted the data type to string and then pulled my desired values for the year, month, day and hour columns using string indexing.

In [28]:
df['timestamp'] = df['timestamp'].astype(str)

df['year'] = df['timestamp'].str[:4]
df['month'] = df['timestamp'].str[5:7]
df['day'] = df['timestamp'].str[8:10]
df['hour'] = df['timestamp'].str[11:13]
df.head(5)

Unnamed: 0,timestamp,platform_played_on,country_played,track_name,artist_name,album_name,reason_start,reason_end,year,uri,duration_listened,month,day,hour
0,2020-10-14 14:56:14+11:00,iPhone,AU,Insane,Flume,Flume,fwdbtn,fwdbtn,2020,3SJelKECIakrYtG0YU24Ow,1.855283,10,14,14
1,2022-08-09 08:32:47+10:00,iPhone,AU,Give It To Me Baby,Rick James,Street Songs,trackdone,fwdbtn,2022,13v3siPyvy5TTEZYmGPPse,2.8504,8,9,8
2,2022-01-11 16:25:51-05:00,Computer,US,Texas Sun,Khruangbin,Texas Sun,trackdone,trackdone,2022,24ntSW3QVJzR79lHAAOTaY,4.213517,1,11,16
3,2019-09-24 21:45:06+10:00,iPhone,AU,Ride,Twenty One Pilots,Blurryface,backbtn,trackdone,2019,2Z8WuEywRWYTKe1NybPQEW,3.5751,9,24,21
4,2022-12-28 17:47:39+01:00,iPhone,ES,Because - Remastered 2009,The Beatles,Abbey Road,trackdone,trackdone,2022,1rxoyGj1QuPoVi8fOft1Kt,2.7611,12,28,17


### Moving year column & removing timestamp column

In [29]:
year_column = df.pop('year')
df.insert(df.columns.get_loc('duration_listened') + 1, 'year', year_column)
df = df.drop(columns=['timestamp'])
df.rename_axis('index', inplace=True)
df.head(5)

Unnamed: 0_level_0,platform_played_on,country_played,track_name,artist_name,album_name,reason_start,reason_end,uri,duration_listened,year,month,day,hour
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,iPhone,AU,Insane,Flume,Flume,fwdbtn,fwdbtn,3SJelKECIakrYtG0YU24Ow,1.855283,2020,10,14,14
1,iPhone,AU,Give It To Me Baby,Rick James,Street Songs,trackdone,fwdbtn,13v3siPyvy5TTEZYmGPPse,2.8504,2022,8,9,8
2,Computer,US,Texas Sun,Khruangbin,Texas Sun,trackdone,trackdone,24ntSW3QVJzR79lHAAOTaY,4.213517,2022,1,11,16
3,iPhone,AU,Ride,Twenty One Pilots,Blurryface,backbtn,trackdone,2Z8WuEywRWYTKe1NybPQEW,3.5751,2019,9,24,21
4,iPhone,ES,Because - Remastered 2009,The Beatles,Abbey Road,trackdone,trackdone,1rxoyGj1QuPoVi8fOft1Kt,2.7611,2022,12,28,17


### Saving my dataset

In [30]:
file_path = "/Users/DataAnalyst/Desktop/Spotify Listening History 2019 - 2022.csv"
df.to_csv(file_path, index=True)

### Create new datafram for track URI's

In [30]:
unique_uris = df['uri'].unique()
df_uris = pd.DataFrame({'uri': unique_uris})

In [31]:
df_uris.shape

(8397, 1)

### Gathering additional data ultilizing Spotify's API

I need to using Spotify's API to gather additional data on audo features, song duration and release date which I will use in my dashboard

In [32]:
url = 'https://accounts.spotify.com/api/token'

auth_response = requests.post(url, {
    'grant_type': 'client_credentials',
    'client_id': 'client_id_number',
    'client_secret': 'client_secret_number',
})

auth_response_data = auth_response.json()

access_token = auth_response_data['access_token']

In [33]:
headers = {'Authorization': 'Bearer {token}'.format(token=access_token)}

In [34]:
base_url = 'https://api.spotify.com/v1/'

In [36]:
def get_audio_features(uri):
    url = f'{base_url}audio-features/{uri}'
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        audio_features = response.json()
        selected_features = {feature: audio_features[feature] for feature in ['acousticness', 
                                                                              'danceability', 
                                                                              'energy', 
                                                                              'instrumentalness',
                                                                              'key', 
                                                                              'liveness', 
                                                                              'loudness', 
                                                                              'mode',
                                                                              'speechiness',
                                                                              'tempo', 
                                                                              'time_signature',
                                                                              'valence']}
        return selected_features
    else:
        print(f"Error getting audio features for URI '{uri}': {response.content}")
        return None

# A dictionary to store the audio features for each uri
audio_features_dict = {}

# Iterate over each uri in the df_uris
for uri in df_uris['uri']:
    audio_features = get_audio_features(uri)
    if audio_features:
        audio_features_dict[uri] = audio_features
        
# I need to convert dictionary into dataframe with uri as the first column
audio_features_dict = pd.DataFrame.from_dict(audio_features_dict, orient='index')
audio_features_dict.insert(0, 'uri', audio_features_dict.index)
audio_features_dict.reset_index(inplace=True, drop=True)

audio_features_dict.head(5)


Error getting audio features for URI '7chFaRyJHV6q3kZdZpsYO3': b'{\n  "error" : {\n    "status" : 404,\n    "message" : "analysis not found"\n  }\n}'
Error getting audio features for URI '6MFQeWtk7kxWGydnJB2y36': b'{\n  "error" : {\n    "status" : 404,\n    "message" : "analysis not found"\n  }\n}'


Unnamed: 0,uri,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,3SJelKECIakrYtG0YU24Ow,0.0507,0.487,0.535,6e-05,5,0.121,-8.257,0,0.0328,93.984,4,0.0485
1,13v3siPyvy5TTEZYmGPPse,0.114,0.706,0.727,7.7e-05,11,0.174,-8.714,1,0.0456,119.55,4,0.965
2,24ntSW3QVJzR79lHAAOTaY,0.0364,0.636,0.634,0.147,7,0.0998,-10.01,1,0.0342,111.563,4,0.599
3,2Z8WuEywRWYTKe1NybPQEW,0.00835,0.645,0.713,0.0,6,0.113,-5.355,1,0.0393,74.989,4,0.566
4,1rxoyGj1QuPoVi8fOft1Kt,0.754,0.328,0.249,6e-06,2,0.134,-11.777,1,0.0272,81.257,4,0.162


### EDA & Cleaning of Additional Dataset

In [37]:
audio_features_dict.isnull().sum()

uri                 0
acousticness        0
danceability        0
energy              0
instrumentalness    0
key                 0
liveness            0
loudness            0
mode                0
speechiness         0
tempo               0
time_signature      0
valence             0
dtype: int64

In [38]:
audio_features_dict.head(5)

Unnamed: 0,uri,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,3SJelKECIakrYtG0YU24Ow,0.0507,0.487,0.535,6e-05,5,0.121,-8.257,0,0.0328,93.984,4,0.0485
1,13v3siPyvy5TTEZYmGPPse,0.114,0.706,0.727,7.7e-05,11,0.174,-8.714,1,0.0456,119.55,4,0.965
2,24ntSW3QVJzR79lHAAOTaY,0.0364,0.636,0.634,0.147,7,0.0998,-10.01,1,0.0342,111.563,4,0.599
3,2Z8WuEywRWYTKe1NybPQEW,0.00835,0.645,0.713,0.0,6,0.113,-5.355,1,0.0393,74.989,4,0.566
4,1rxoyGj1QuPoVi8fOft1Kt,0.754,0.328,0.249,6e-06,2,0.134,-11.777,1,0.0272,81.257,4,0.162


In [39]:
audio_features_dict.dtypes

uri                  object
acousticness        float64
danceability        float64
energy              float64
instrumentalness    float64
key                   int64
liveness            float64
loudness            float64
mode                  int64
speechiness         float64
tempo               float64
time_signature        int64
valence             float64
dtype: object

I only want to assign one Key to the key value, so when there are enharmonic equivalents such as in key value 1, I will for the purpose of my analysis just use the sharp keys

pitch_class_dict = {
    0: 'C',
    1: 'C♯, D♭',
    2: 'D',
    3: 'D♯, E♭',
    4: 'E',
    5: 'F',
    6: 'F♯, G♭',
    7: 'G',
    8: 'G♯, A♭',
    9: 'A',
    10: 'A♯, B♭',
    11: 'B'
}

In [40]:
pitch_class_dict = {
    0: 'C',
    1: 'C♯',
    2: 'D',
    3: 'D♯',
    4: 'E',
    5: 'F',
    6: 'F♯',
    7: 'G',
    8: 'G♯',
    9: 'A',
    10: 'A♯',
    11: 'B'
}

audio_features_dict['key'] = audio_features_dict['key'].map(pitch_class_dict)

The time_signature column represents the notational convention to specify how many beats are in each bar.  The time signature ranges from 3 to 7 indicating time signatures of "3/4", to "7/4". I want to change these numercial values to the actual time signatures eg 4 = 4/4 for ease of readability.

In [41]:
time_signature_dict = {
    3: '3/4',
    4: '4/4',
    5: '5/4',
    6: '6/4',
    7: '7/4'
}
audio_features_dict['time_signature'] = audio_features_dict['time_signature'].map(time_signature_dict)

The mode column indicates the modality (major or minor) of a track and in the data from Spotify major is represented by 1 and minor is 0. I want to change the numercial encoding to the catergory names. 

In [42]:
audio_features_dict['mode'] = audio_features_dict['mode'].replace({0:'minor', 1:'major'})

In [43]:
audio_features_dict.head(5)

Unnamed: 0,uri,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,3SJelKECIakrYtG0YU24Ow,0.0507,0.487,0.535,6e-05,F,0.121,-8.257,minor,0.0328,93.984,4/4,0.0485
1,13v3siPyvy5TTEZYmGPPse,0.114,0.706,0.727,7.7e-05,B,0.174,-8.714,major,0.0456,119.55,4/4,0.965
2,24ntSW3QVJzR79lHAAOTaY,0.0364,0.636,0.634,0.147,G,0.0998,-10.01,major,0.0342,111.563,4/4,0.599
3,2Z8WuEywRWYTKe1NybPQEW,0.00835,0.645,0.713,0.0,F♯,0.113,-5.355,major,0.0393,74.989,4/4,0.566
4,1rxoyGj1QuPoVi8fOft1Kt,0.754,0.328,0.249,6e-06,D,0.134,-11.777,major,0.0272,81.257,4/4,0.162


### Saving the Audio Features Dataframe

In [None]:
# Specify the file path and name for the CSV file
file_path = "/Users/DataAnalyst/Desktop/track_audio_features.csv"

# Save the DataFrame as a CSV file
audio_features_dict.to_csv(file_path, index=False)

In [None]:
audio_features_dict.shape

In [45]:
audio_features_dict.dtypes

uri                  object
acousticness        float64
danceability        float64
energy              float64
instrumentalness    float64
key                  object
liveness            float64
loudness            float64
mode                 object
speechiness         float64
tempo               float64
time_signature       object
valence             float64
dtype: object

### Using Spotify's API to get dataframe for song duration & release date

In [49]:
def get_track_info(uri):
    url = f'{base_url}tracks/{uri}'
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        track_info = response.json()
        selected_info = {'duration_ms': track_info['duration_ms'],
                        'release_date': track_info['album']['release_date']}
        return selected_info
    else:
        print(f"Error getting track_info for URI '{uri}': {response.content, response.status_code}")
        return None

# Dictionary to store the selected audio features for each URI
track_info_dict = {}

# Loop to iterate over each URI in the df_uris DataFrame
for uri in df_uris['uri']:
    track_info = get_track_info(uri)
    if track_info:
        track_info_dict[uri] = track_info
    
# convert dictionary into dataframe with uri as the first column
track_info_dict = pd.DataFrame.from_dict(track_info_dict, orient='index')
track_info_dict.insert(0, 'uri', track_info_dict.index)
track_info_dict.reset_index(inplace=True, drop=True)

track_info_dict.head(20)

Unnamed: 0,uri,duration_ms,release_date
0,3SJelKECIakrYtG0YU24Ow,213883,2012-11-09
1,13v3siPyvy5TTEZYmGPPse,248133,1981-04-07
2,24ntSW3QVJzR79lHAAOTaY,252811,2020-02-07
3,2Z8WuEywRWYTKe1NybPQEW,214506,2015-05-15
4,1rxoyGj1QuPoVi8fOft1Kt,165666,1969-09-26
5,4ceIM3gqPDi0hUhaM3xCDw,264595,2018-04-27
6,29HSkfe5ITejb0MXhroHtG,248590,2017-09-29
7,0GN3fXUdsTHeUg50xfDS0V,270918,2017-05-05
8,0gEaeqVRHPzRc7HMXtOKc7,685880,1977-01-23
9,7d1GPc45c9wtmYxNSwLXTy,284000,2011-01-11


In [None]:
file_path = "/Users/DataAnalyst/Desktop/track_year_duration.csv"

track_info_dict.to_csv(file_path, index=False)

### Using Spotify's API to get dataframe for external URLs 

these URLs will be used as embedded links in my dashboard to play previews of the selected songs.

In [35]:
def get_link_info(uri):
    url = f'{base_url}tracks/{uri}'
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        link_info = response.json()
        link_info = {'Spotify URL': link_info['external_urls']}
        return link_info
    else:
        print(f"Error getting track_info for URI '{uri}': {response.content, response.status_code}")
        return None

# Dictionary to store the selected audio features for each URI
link_info_dict = {}

# Loop to iterate over each URI in the df_uris DataFrame
for uri in df_uris['uri']:
    link_info = get_link_info(uri)
    if link_info:
        link_info_dict[uri] = link_info
    
# Convert dictionary into dataframe with uri as the first column
link_info_dict = pd.DataFrame.from_dict(link_info_dict, orient='index')
link_info_dict.insert(0, 'uri', link_info_dict.index)
link_info_dict.reset_index(inplace=True, drop=True)

link_info_dict.head(20)

Unnamed: 0,uri,Spotify URL
0,003drELkl4KdItemQ8HyAX,{'spotify': 'https://open.spotify.com/track/00...
1,009G971wsvioMi84zENJax,{'spotify': 'https://open.spotify.com/track/00...
2,00GoNYdTeQrKuuluGlL60Q,{'spotify': 'https://open.spotify.com/track/00...
3,00OIc0La1fUi1jQ0fjJvYZ,{'spotify': 'https://open.spotify.com/track/00...
4,00P2EyY2Ygge1Azu5Rx8lm,{'spotify': 'https://open.spotify.com/track/00...
5,00RhMZwmPZEZY3bbFV5VJK,{'spotify': 'https://open.spotify.com/track/00...
6,00VbFhxSgbyf0AReigRWX0,{'spotify': 'https://open.spotify.com/track/00...
7,00f4k37anxSU9cspbLj0xe,{'spotify': 'https://open.spotify.com/track/00...
8,00gLisxOrLruNwJFdpkHc2,{'spotify': 'https://open.spotify.com/track/00...
9,00k9dNOfAYx7009RPx31B6,{'spotify': 'https://open.spotify.com/track/00...


### Merging the dataframes

In [36]:
track_info_dict = pd.read_csv(r'/Users/DataAnalyst/Desktop/Spotify Data For Tableau/track_info(Year & Duration).csv')

In [37]:
merged_df = pd.merge(track_info_dict, link_info_dict, on='uri', how='inner')
merged_df.rename(columns={'Spotify URL': 'url'}, inplace=True)
merged_df['url'] = merged_df['url'].astype(str)
merged_df['url'] = merged_df['url'].str.extract(r'(h.*).{2}$')
merged_df['url'] = merged_df['url'].str.replace('com', 'com/embed')
pd.set_option('display.max_colwidth', None)
merged_df.head(5)

Unnamed: 0,uri,duration_ms,release_date,url
0,3SJelKECIakrYtG0YU24Ow,213883,2012-11-09,https://open.spotify.com/embed/track/3SJelKECIakrYtG0YU24Ow
1,13v3siPyvy5TTEZYmGPPse,248133,1981-04-07,https://open.spotify.com/embed/track/13v3siPyvy5TTEZYmGPPse
2,24ntSW3QVJzR79lHAAOTaY,252811,2020-02-07,https://open.spotify.com/embed/track/24ntSW3QVJzR79lHAAOTaY
3,2Z8WuEywRWYTKe1NybPQEW,214506,2015-05-15,https://open.spotify.com/embed/track/2Z8WuEywRWYTKe1NybPQEW
4,1rxoyGj1QuPoVi8fOft1Kt,165666,1969-09-26,https://open.spotify.com/embed/track/1rxoyGj1QuPoVi8fOft1Kt


In [38]:
file_path = "/Users/DataAnalyst/Desktop/track_info_url.csv"

merged_df.to_csv(file_path, index=False)

In [None]:
merged_df = df.merge(audio_features_dict, on='uri')

I needed to create a column with the Track Name & Artist name as the uri is different for the same tracks that are released as a single vs the album version. So this new column will be used as the unique identifier instead of the uri column.

In [None]:
merged_df['track_artist_name'] = merged_df['track_name'] + "--" + merged_df['artist_name']

In [None]:
df_new = pd.read_csv(r'/Users/DataAnalyst/Desktop/Spotify Data For Tableau/Spotify Listening History 2019 - 2022.csv')                                 

In [None]:
df_new['track_artist_name'] = df_new['track_name'] + "--" + df_new['artist_name']

In [None]:
df_new.to_csv(r'/Users/DataAnalyst/Desktop/Spotify Data For Tableau/Spotify Listening History 2019 - 2022.csv', index=False)

### Adjusting Audio Features Data

For each unique identifier (Track Name & Artist Name) I need to filter out a unique value for each audio feature, as when creating the dashboard I found that the Spotify algorithm had assigned multiple values for certain audio features for the same song eg. Gvae 2 tempo values for the same Track Name & Artist Name. 

The code belowe assigns the orginial value of the audio feature if there is only one unique value. Otherwise, it assigns the minimum audio feature value for each Track Name & Artist Name.

In [None]:
# calculated the minimum audio feature for each unique track_artist_name
min_acousticness_by_artist = merged_df.groupby('track_artist_name')['acousticness'].transform('min')

# used boolean indexing to set adjusted column based on track_artist_name uniqueness
merged_df['Acousticness_Adjusted'] = np.where(merged_df['track_artist_name'].nunique() < 2,
                                              merged_df['acousticness'],
                                              min_acousticness_by_artist)

min_speechiness_by_artist = merged_df.groupby('track_artist_name')['speechiness'].transform('min')

merged_df['Speechiness_Adjusted'] = np.where(merged_df['track_artist_name'].nunique() < 2,
                                             merged_df['speechiness'],
                                             min_speechiness_by_artist)

min_liveness_by_artist = merged_df.groupby('track_artist_name')['liveness'].transform('min')

merged_df['Liveness_Adjusted'] = np.where(merged_df['track_artist_name'].nunique() < 2,
                                          merged_df['liveness'],
                                          min_liveness_by_artist)

min_energy_by_artist = merged_df.groupby('track_artist_name')['energy'].transform('min')

merged_df['Energy_Adjusted'] = np.where(merged_df['track_artist_name'].nunique() < 2,
                                        merged_df['energy'],
                                        min_energy_by_artist)

min_danceability_by_artist = merged_df.groupby('track_artist_name')['danceability'].transform('min')

merged_df['Danceability_Adjusted'] = np.where(merged_df['track_artist_name'].nunique() < 2,
                                              merged_df['danceability'],
                                              min_danceability_by_artist)

min_valence_by_artist = merged_df.groupby('track_artist_name')['valence'].transform('min')

merged_df['Valence_Adjusted'] = np.where(merged_df['track_artist_name'].nunique() < 2,
                                         merged_df['valence'],
                                         min_valence_by_artist)

min_instrumentalness_by_artist = merged_df.groupby('track_artist_name')['instrumentalness'].transform('min')

merged_df['Instrumentalness_Adjusted'] = np.where(merged_df['track_artist_name'].nunique() < 2,
                                         merged_df['instrumentalness'],
                                         min_instrumentalness_by_artist)

min_tempo_by_artist = merged_df.groupby('track_artist_name')['tempo'].transform('min')

merged_df['Tempo_Adjusted'] = np.where(merged_df['track_artist_name'].nunique() < 2,
                                         merged_df['tempo'],
                                         min_tempo_by_artist)

Now I need to drop the old columns

In [None]:
merged_df=merged_df.drop(columns=['acousticness',
                                  'speechiness',
                                  'liveness',
                                  'energy',
                                  'danceability', 
                                  'valence', 
                                  'instrumentalness',
                                  'tempo'])

Now I can drop the columns I do not need, as I will be joining these datasets in Tableau with the unique identifier track_artist_name column

In [None]:
column_number = merged_df.columns.get_loc('key')
merged_df = merged_df.iloc[:, column_number:]

In [None]:
merged_df = merged_df.drop_duplicates(subset='track_artist_name', keep='first')

Now I can save this dataset. I will join all the the saved databases from this code using relationships in the Tableau data model.

In [None]:
file_path = "/Users/DataAnalyst/Desktop/merged_audio_features.csv"

merged_df.to_csv(file_path, index=False)