In [34]:
import pandas as pd
import plotly.express as px   #importing plotly
from scipy.stats import pearsonr 
import plotly.graph_objects as go   

▶ **Pre-processing the data**

In [43]:
data = pd.read_csv("/content/drive/MyDrive/spotify/tracks.csv")
data_artist = pd.read_csv("/content/drive/MyDrive/spotify/artists.csv")
data
data_artist

Unnamed: 0,id,followers,genres,name,popularity
0,0DheY5irMjBUeLybbCUEZ2,0.0,[],Armid & Amir Zare Pashai feat. Sara Rouzbehani,0
1,0DlhY15l3wsrnlfGio2bjU,5.0,[],ปูนา ภาวิณี,0
2,0DmRESX2JknGPQyO15yxg7,0.0,[],Sadaa,0
3,0DmhnbHjm1qw6NCYPeZNgJ,0.0,[],Tra'gruda,0
4,0Dn11fWM7vHQ3rinvWEl4E,2.0,[],Ioannis Panoutsopoulos,0
...,...,...,...,...,...
1104344,6rJIG42vcWAf1UBdRFlQxB,3345.0,[],Cody Longo,8
1104345,1ljurfXKPlGncNdW3J8zJ8,2123.0,['deep acoustic pop'],Right the Stars,18
1104346,2vnT9YhKIvjVo9LnVjWmr2,26.0,[],Jesse Giddings,0
1104347,3ID0E5XCvnJIYZEq043ZoB,406.0,[],The Boy Band Project,0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 586672 entries, 0 to 586671
Data columns (total 20 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   id                586672 non-null  object 
 1   name              586601 non-null  object 
 2   popularity        586672 non-null  int64  
 3   duration_ms       586672 non-null  int64  
 4   explicit          586672 non-null  int64  
 5   artists           586672 non-null  object 
 6   id_artists        586672 non-null  object 
 7   release_date      586672 non-null  object 
 8   danceability      586672 non-null  float64
 9   energy            586672 non-null  float64
 10  key               586672 non-null  int64  
 11  loudness          586672 non-null  float64
 12  mode              586672 non-null  int64  
 13  speechiness       586672 non-null  float64
 14  acousticness      586672 non-null  float64
 15  instrumentalness  586672 non-null  float64
 16  liveness          58

In [5]:
# missing data  
data.isnull().sum()

id                   0
name                71
popularity           0
duration_ms          0
explicit             0
artists              0
id_artists           0
release_date         0
danceability         0
energy               0
key                  0
loudness             0
mode                 0
speechiness          0
acousticness         0
instrumentalness     0
liveness             0
valence              0
tempo                0
time_signature       0
dtype: int64

In [6]:
# summary
data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
popularity,586672.0,27.570053,18.370642,0.0,13.0,27.0,41.0,100.0
duration_ms,586672.0,230051.167286,126526.087418,3344.0,175093.0,214893.0,263867.0,5621218.0
explicit,586672.0,0.044086,0.205286,0.0,0.0,0.0,0.0,1.0
danceability,586672.0,0.563594,0.166103,0.0,0.453,0.577,0.686,0.991
energy,586672.0,0.542036,0.251923,0.0,0.343,0.549,0.748,1.0
key,586672.0,5.221603,3.519423,0.0,2.0,5.0,8.0,11.0
loudness,586672.0,-10.206067,5.089328,-60.0,-12.891,-9.243,-6.482,5.376
mode,586672.0,0.658797,0.474114,0.0,0.0,1.0,1.0,1.0
speechiness,586672.0,0.104864,0.179893,0.0,0.034,0.0443,0.0763,0.971
acousticness,586672.0,0.449863,0.348837,0.0,0.0969,0.422,0.785,0.996


✔  we should change the release_date to a date type and then put months and years into separate columns.

In [14]:
data[["year", "month", "day"]] = data["release_date"].str.split("-", expand = True)
data[["year", "month", "day"]]

Unnamed: 0,year,month,day
0,1922,02,22
1,1922,06,01
2,1922,03,21
3,1922,03,21
4,1922,,
...,...,...,...
586667,2020,09,26
586668,2020,10,21
586669,2020,09,02
586670,2021,03,05



✅ **What are the most popular songs right now?**

In [15]:
most_popular = data.query('popularity>90', inplace=False).sort_values('popularity', ascending=False)
most_popular[:10]

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,day,month,year
93802,4iJyoBOLtHqaGxP12qzhQI,Peaches (feat. Daniel Caesar & Giveon),100,198082,1,"['Justin Bieber', 'Daniel Caesar', 'Giveon']","['1uNFoZAHBGtllmzznpCI3s', '20wkVLutqVOYrc0kxF...",2021-03-19,0.677,0.696,...,0.119,0.321,0.0,0.42,0.464,90.03,4,19,3,2021
93803,7lPN2DXiMsVn7XUKtOW1CS,drivers license,99,242014,1,['Olivia Rodrigo'],['1McMsnEElThX1knmY4oliG'],2021-01-08,0.585,0.436,...,0.0601,0.721,1.3e-05,0.105,0.132,143.874,4,8,1,2021
93804,3Ofmpyhv5UAQ70mENzB277,Astronaut In The Ocean,98,132780,0,['Masked Wolf'],['1uU7g3DNSbsu0QjSEqZtEd'],2021-01-06,0.778,0.695,...,0.0913,0.175,0.0,0.15,0.472,149.996,4,6,1,2021
92810,5QO79kh1waicV47BqGRL3g,Save Your Tears,97,215627,1,['The Weeknd'],['1Xyo4u8uXC1ZmMpatF05PJ'],2020-03-20,0.68,0.826,...,0.0309,0.0212,1.2e-05,0.543,0.644,118.051,4,20,3,2020
92811,6tDDoYIxWvMLTdKpjFkc1B,telepatía,97,160191,0,['Kali Uchis'],['1U1el3k54VvEUzo3ybLPlM'],2020-12-04,0.653,0.524,...,0.0502,0.112,0.0,0.203,0.553,83.97,4,4,12,2020
92813,0VjIjW4GlUZAMYd2vXMi3b,Blinding Lights,96,200040,0,['The Weeknd'],['1Xyo4u8uXC1ZmMpatF05PJ'],2020-03-20,0.514,0.73,...,0.0598,0.00146,9.5e-05,0.0897,0.334,171.005,4,20,3,2020
93805,7MAibcTli4IisCtbHKrGMh,Leave The Door Open,96,242096,0,"['Bruno Mars', 'Anderson .Paak', 'Silk Sonic']","['0du5cEVh5yTK9QJze8zA0C', '3jK9MiCrA42lLAdMGU...",2021-03-05,0.586,0.616,...,0.0324,0.182,0.0,0.0927,0.719,148.088,4,5,3,2021
92814,6f3Slt0GbA2bPZlz0aIFXN,The Business,95,164000,0,['Tiësto'],['2o5jDhtHVPhrJdv3cEQ99Z'],2020-09-16,0.798,0.62,...,0.232,0.414,0.0192,0.112,0.235,120.031,4,16,9,2020
91866,60ynsPSSKe6O3sfwRnIBRf,Streets,94,226987,1,['Doja Cat'],['5cj0lLjcoR7YOSnhnX0Po5'],2019-11-07,0.749,0.463,...,0.0828,0.208,0.0371,0.337,0.19,90.028,4,7,11,2019
92816,3FAJ6O0NOHQV8Mc5Ri6ENp,Heartbreak Anniversary,94,198371,0,['Giveon'],['4fxd5Ee7UefO4CUXgwJ7IP'],2020-03-27,0.449,0.465,...,0.0791,0.524,1e-06,0.303,0.543,89.087,3,27,3,2020


✔ Sort the filtered values and show the columns of interest

In [16]:
pop_date = most_popular.sort_values('release_date', ascending=False)
pop_date[['name', 'popularity', 'explicit','release_date']][:20]

Unnamed: 0,name,popularity,explicit,release_date
93802,Peaches (feat. Daniel Caesar & Giveon),100,1,2021-03-19
93805,Leave The Door Open,96,0,2021-03-05
93815,What’s Next,91,1,2021-03-05
93811,Hold On,92,0,2021-03-05
93816,We're Good,91,0,2021-02-11
93813,911,91,1,2021-02-05
93809,Up,92,1,2021-02-05
93806,Fiel,94,0,2021-02-04
93808,Ella No Es Tuya - Remix,92,0,2021-02-03
93812,Wellerman - Sea Shanty / 220 KID x Billen Ted ...,92,0,2021-01-21


☹ March 2020 the world went under a complete lockdown because of the Covid-19.  
✔ We try to know songs that released in March 2020 and their popularity

In [20]:
most_popular_march_20 = data.query('(popularity > 80) and (year in ["2020"]) and (month in ["03"])')
most_popular_march_20[['id', 'name', 'explicit','popularity','year','month']][:20]

Unnamed: 0,id,name,explicit,popularity,year,month
92810,5QO79kh1waicV47BqGRL3g,Save Your Tears,1,97,2020,3
92813,0VjIjW4GlUZAMYd2vXMi3b,Blinding Lights,0,96,2020,3
92816,3FAJ6O0NOHQV8Mc5Ri6ENp,Heartbreak Anniversary,0,94,2020,3
92853,4xqrdfXkTW4T0RauPLv3WA,Heather,0,89,2020,3
92867,5nujrmhLynf4yMoMtj8AQF,Levitating (feat. DaBaby),0,89,2020,3
92927,7szuecWAPwGoV1e5vGu8tl,In Your Eyes,1,86,2020,3
92951,6KfoDhO4XUWSbnyKjNp9c4,Maniac,0,86,2020,3
92961,3PfIrDoz19wz7qK7tYeu62,Don't Start Now,0,85,2020,3
92995,5m5aY6S9ttfIG157xli2Rs,Alô Ambev (Segue Sua Vida) - Ao Vivo,0,84,2020,3
93021,527k23H0A4Q0UJN3vGs0Da,After Party,1,84,2020,3


✅**How do different features of a song impact its popularity?**

In [21]:
data1=data.groupby('popularity')['danceability'].mean().sort_values(ascending=[False]).reset_index()
data1.head()

Unnamed: 0,popularity,danceability
0,95,0.798
1,98,0.778
2,91,0.751091
3,88,0.727105
4,85,0.7126


✔ This dataframe will have the popularity for different songs grouped by the mean of the danceability score.

In [30]:
fig1 = px.scatter(data1, x="popularity", y="danceability", color="danceability",size='popularity')
fig1.show()

✔ We release that 'popularity' and 'danceability' are positively correlated, which implies that, as the popularity of the song increase, the danceability score for that song also increases.

✅ Calculate Pearson's Correlation Constant 'r' for two different features.
The following are the three conditions for the Pearson's Correlation Coefficient 'r':-
r>0, implies, positive correlation
r=0, implies, no correlation
r<0, implies, negative correlation. 

In [27]:
data_1 = data1['popularity']
data_2 = data1['danceability']

# calculate Pearson's correlation
corr, _ = pearsonr(data_1, data_2)
print('Pearsons correlation: %.3f' % corr)

Pearsons correlation: 0.899


In [35]:
matrix=data.corr()  #returns a matrix with correlation of all features
x_list=['popularity','duration_ms','explicit',
        'danceability','energy','key','loudness',
        'mode','speechiness','acousticness','instrumentalness',
        'liveness','valence','tempo','time_signature']

fig_heatmap = go.Figure(data=go.Heatmap(
                   z=matrix,
                   x=x_list,
                   y=x_list,
                   hoverongaps = False))
fig_heatmap.update_layout(margin = dict(t=200,r=200,b=200,l=200),
    width = 800, height = 650,
    autosize = False )

fig_heatmap.show()

✅  We observe that there is no significant positive correlation between popularity and a song's feature. The most positive correlation occurs between popularity, danceability, loudness, and energy.

✅**MOST POPULAR ARTIST**

In [45]:
artists_popular = data_artist.sort_values(by=['popularity'], ascending=False).reset_index()
artists_popular[:10]

Unnamed: 0,index,id,followers,genres,name,popularity
0,144481,1uNFoZAHBGtllmzznpCI3s,44606973.0,"['canadian pop', 'pop', 'post-teen pop']",Justin Bieber,100
1,115489,4q3ewBCX7sLwd24euuV69X,32244734.0,"['latin', 'reggaeton', 'trap latino']",Bad Bunny,98
2,126338,06HL4z0CvFAxyc27GXpf02,38869193.0,"['pop', 'post-teen pop']",Taylor Swift,98
3,313676,3TVXtAsR1Inumwj472S9r4,54416812.0,"['canadian hip hop', 'canadian pop', 'hip hop'...",Drake,98
4,144484,3Nrfpe0tUJi4K4DXYWgMUX,31623813.0,"['k-pop', 'k-pop boy group']",BTS,96
5,115490,4MCBfE4596Uoi2O4DtmEMz,16996777.0,"['chicago rap', 'melodic rap']",Juice WRLD,96
6,144483,1Xyo4u8uXC1ZmMpatF05PJ,31308207.0,"['canadian contemporary r&b', 'canadian pop', ...",The Weeknd,96
7,144485,66CXWjxzNUsdJxJ2JdwvnR,61301006.0,"['pop', 'post-teen pop']",Ariana Grande,95
8,144486,1vyhD5VmyZ7KMfW5gqLgo5,27286822.0,"['latin', 'reggaeton', 'reggaeton colombiano',...",J Balvin,95
9,115491,7iK8PXO48WeuP03g8YR51W,5001808.0,['trap latino'],Myke Towers,95


▶ **Analyzing the Genres**

In [None]:
data_artist[data_artist["genres"]=='[]']
df_genre=data_artist[data_artist["genres"]!='[]']
df_genre.head()

▶ We observe that the column 'genres' has a list passed as value. Let's split these lists into individual values.

In [47]:
df_sort_genres=pd.DataFrame(df_genre.assign(genres=df_genre.genres.str.split(",")).explode('genres'))
df_sort_genres.tail()

Unnamed: 0,id,followers,genres,name,popularity
1104328,1q9C5XlekzXbRLIuLCDTre,90087.0,'teen pop'],Brent Rivera,33
1104331,4fh2BIKYPFvXFsQLhaeVJp,309.0,['la indie'],Lone Kodiak,20
1104334,7akMsd2vb4xowNTehv3gsY,774.0,['indie rockism'],The Str!ke,0
1104336,35m7AJrUCtHYHyIUhCzmgi,205.0,['indie rockism'],Hunter Fraser,6
1104345,1ljurfXKPlGncNdW3J8zJ8,2123.0,['deep acoustic pop'],Right the Stars,18


In [48]:
df_sort_genres['genres']=df_sort_genres.genres.str.replace('[',' ')
df_sort_genres['genres']=df_sort_genres.genres.str.replace(']',' ')


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.



In [49]:
# get top 30 most commom genres
n = 30
top_30=pd.DataFrame(df_sort_genres['genres'].value_counts()[:n]).reset_index()
top_30.rename(columns = {'index':'Genres','genres':'Total_Count'}, inplace = True)
top_30

Unnamed: 0,Genres,Total_Count
0,'dance pop',551
1,'latin',483
2,'electro house',478
3,'pop',461
4,'hip hop',455
5,'edm',455
6,'electropop',432
7,'indie rock',411
8,'classical performance',407
9,'tropical',402


In [50]:
fig3 = px.pie(top_30, values='Total_Count', names='Genres')
fig3.show()