In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Read the Spotify dataset from disk, decoding the file as Latin-1.
csv_path = 'data/spotifyAnalysis.csv'
spotify_data = pd.read_csv(csv_path, encoding='latin1')

# Preview the first rows to sanity-check the load.
print(spotify_data.head())


                             song_name       artist_name  artist_count  \
0  Seven (feat. Latto) (Explicit Ver.)  Latto, Jung Kook             2   
1                                 LALA       Myke Towers             1   
2                              vampire    Olivia Rodrigo             1   
3                         Cruel Summer      Taylor Swift             1   
4                       WHERE SHE GOES         Bad Bunny             1   

   in_spotify_playlists  in_spotify_charts    streams  in_apple_playlists  \
0                   553                147  141381703                  43   
1                  1474                 48  133716286                  48   
2                  1397                113  140003974                  94   
3                  7858                100  800840817                 116   
4                  3133                 50  303236322                  84   

   in_apple_charts in_deezer_playlists  in_deezer_charts  ... key   mode  \
0              2

In [3]:
# Report the frame's dimensions as (rows, columns).
spotify_data.shape

(100, 22)

In [4]:
# List the dtype pandas inferred for each column when the CSV was read.
print(spotify_data.dtypes)

song_name                    object
artist_name                  object
artist_count                  int64
in_spotify_playlists          int64
in_spotify_charts             int64
streams                       int64
in_apple_playlists            int64
in_apple_charts               int64
in_deezer_playlists          object
in_deezer_charts              int64
in_shazam_charts             object
bpm                           int64
key                          object
mode                         object
danceability_percent        float64
valence_percent             float64
energy_percent              float64
acousticness_percent        float64
instrumentalness_percent    float64
liveness_percent            float64
speechiness_percent         float64
released_date                object
dtype: object


In [5]:
# Inspect the Apple playlist-count column on its own.
spotify_data['in_apple_playlists']

0      43
1      48
2      94
3     116
4      84
     ... 
95     69
96     49
97     58
98     89
99     20
Name: in_apple_playlists, Length: 100, dtype: int64

**Remove Irrelevant Variables**
-

Since I will only analyze Spotify, I will remove all the variables that are irrelevant to the Spotify platform.

In [6]:
# Select the contiguous run of non-Spotify platform columns, from
# 'in_apple_playlists' through 'in_shazam_charts' (label slices are inclusive).
non_spotify_span = slice('in_apple_playlists', 'in_shazam_charts')
columes_to_drop = spotify_data.loc[:, non_spotify_span]

# Show which column names were captured by the slice.
columes_to_drop.columns

Index(['in_apple_playlists', 'in_apple_charts', 'in_deezer_playlists',
       'in_deezer_charts', 'in_shazam_charts'],
      dtype='object')

In [7]:
# Build the Spotify-only frame by dropping the selected platform columns;
# drop() returns a new frame, so spotify_data itself is not modified.
clean_spotify_data = spotify_data.drop(columns=columes_to_drop.columns)

In [8]:
# Display the frame that remains after the non-Spotify columns were dropped.
clean_spotify_data

Unnamed: 0,song_name,artist_name,artist_count,in_spotify_playlists,in_spotify_charts,streams,bpm,key,mode,danceability_percent,valence_percent,energy_percent,acousticness_percent,instrumentalness_percent,liveness_percent,speechiness_percent,released_date
0,Seven (feat. Latto) (Explicit Ver.),"Latto, Jung Kook",2,553,147,141381703,125,B,Major,80.0,89.0,83.0,31.0,0.0,8.0,4.0,2023-7-14
1,LALA,Myke Towers,1,1474,48,133716286,92,C#,Major,71.0,61.0,74.0,7.0,0.0,10.0,4.0,2023-3-23
2,vampire,Olivia Rodrigo,1,1397,113,140003974,138,F,Major,51.0,32.0,53.0,17.0,0.0,31.0,6.0,2023-6-30
3,Cruel Summer,Taylor Swift,1,7858,100,800840817,170,A,Major,55.0,58.0,72.0,11.0,0.0,11.0,15.0,2019-8-23
4,WHERE SHE GOES,Bad Bunny,1,3133,50,303236322,144,A,Minor,65.0,23.0,80.0,14.0,63.0,11.0,6.0,2023-5-18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,All My Life (feat. J. Cole),"J. Cole, Lil Durk",2,2175,23,144565150,143,D#,Major,83.0,69.0,44.0,15.0,0.0,10.0,33.0,2023-5-12
96,Say Yes To Heaven,Lana Del Rey,1,2000,46,127567540,100,F#,Minor,49.0,17.0,35.0,71.0,9.0,11.0,3.0,2023-3-17
97,Snooze,SZA,1,2839,25,399686758,143,F,Major,56.0,39.0,55.0,14.0,0.0,11.0,13.0,2022-12-9
98,Summertime Sadness,Lana Del Rey,1,20333,52,983637508,112,C#,Minor,56.0,24.0,66.0,7.0,0.0,12.0,3.0,2011-1-1


In [9]:
# Songs with a non-zero instrumentalness score, showing only title and streams.
has_instrumental = clean_spotify_data['instrumentalness_percent'] > 0
clean_spotify_data.loc[has_instrumental, ['song_name', 'streams']]

Unnamed: 0,song_name,streams
4,WHERE SHE GOES,303236322
15,Kill Bill,1163093654
22,I Wanna Be Yours,1297026226
53,(It Goes Like) Nanana - Edit,57876440
60,TÃ¯Â¿Â½Ã¯Â¿,111947664
72,golden hour,751134527
73,Sweater Weather,2282771485
74,"Quevedo: Bzrp Music Sessions, Vol. 52",1356565093
85,El Merengue,223633238
88,Makeba,165484133


**Missing Data**
-

In [10]:
# Normalise empty-string cells to NaN so they are counted as missing values.
clean_spotify_data = clean_spotify_data.replace(to_replace='', value=np.nan)

In [11]:
# Count the NaN entries in every column.
missing_mask = clean_spotify_data.isna()
count_missing_values = missing_mask.sum()

print(count_missing_values)

song_name                   0
artist_name                 0
artist_count                0
in_spotify_playlists        0
in_spotify_charts           0
streams                     0
bpm                         0
key                         8
mode                        0
danceability_percent        0
valence_percent             0
energy_percent              0
acousticness_percent        0
instrumentalness_percent    0
liveness_percent            0
speechiness_percent         0
released_date               0
dtype: int64


One column, `key`, has 8 missing values. Next, check which songs are missing a key.

In [12]:
# Rows whose 'key' value is missing (NaN).
key_is_missing = clean_spotify_data['key'].isna()
clean_spotify_data[key_is_missing]

Unnamed: 0,song_name,artist_name,artist_count,in_spotify_playlists,in_spotify_charts,streams,bpm,key,mode,danceability_percent,valence_percent,energy_percent,acousticness_percent,instrumentalness_percent,liveness_percent,speechiness_percent,released_date
12,Flowers,Miley Cyrus,1,12211,115,1316855716,118,,Major,71.0,65.0,68.0,6.0,0.0,3.0,7.0,2023-1-12
17,What Was I Made For? [From The Motion Picture ...,Billie Eilish,1,873,104,30546883,78,,Major,44.0,14.0,9.0,96.0,0.0,10.0,3.0,2023-7-13
22,I Wanna Be Yours,Arctic Monkeys,1,12859,110,1297026226,135,,Minor,48.0,44.0,42.0,12.0,2.0,11.0,3.0,2013-1-1
35,Los del Espacio,"Big One, Duki, Lit Killah, Maria Becerra, FMK,...",8,1150,31,123122413,120,,Major,81.0,63.0,68.0,11.0,0.0,11.0,4.0,2023-6-1
44,Barbie World (with Aqua) [From Barbie The Album],"Nicki Minaj, Aqua, Ice Spice",3,1117,80,65156199,144,,Major,77.0,75.0,58.0,52.0,0.0,23.0,25.0,2023-6-23
46,I Ain't Worried,OneRepublic,1,8431,76,1085685420,140,,Major,71.0,82.0,81.0,11.0,0.0,6.0,5.0,2022-5-13
58,S91,Karol G,1,525,41,16011326,128,,Minor,86.0,42.0,72.0,59.0,0.0,9.0,19.0,2023-7-14
59,cardigan,Taylor Swift,1,7923,29,812019557,130,,Minor,61.0,53.0,58.0,55.0,0.0,27.0,4.0,2020-7-24


In [13]:
# Manually fill in 'key' for the eight rows where it is missing, addressed
# by row label.
# NOTE(review): the notebook does not record where these key values were
# looked up -- confirm the source.
missing_key_fixes = {
    12: 'A',
    17: 'C',
    22: 'C',
    35: 'C',
    44: 'C',
    46: 'F',
    58: 'G',
    59: 'C',
}
for row_label, key_value in missing_key_fixes.items():
    clean_spotify_data.at[row_label, 'key'] = key_value

In [14]:
# Confirm the manually filled keys. The rows are looked up by label (.loc)
# rather than by position (.iloc) because the values were written by label;
# the two only coincide while the frame keeps its default 0..n-1 index.
print(clean_spotify_data.loc[[12, 17, 22, 35, 44, 46, 58, 59], 'key'])

12    A
17    C
22    C
35    C
44    C
46    F
58    G
59    C
Name: key, dtype: object
