In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the Spotify tracks dataset from 'tracks.csv' and preview its first rows
df_spotify=pd.read_csv('tracks.csv')
df_spotify.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,['Uli'],['45tIt06XoI0Iio4LBEVpls'],1922-02-22,0.645,0.445,0,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,3
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,['Fernando Pessoa'],['14jtPCOoNZwquk5wd9DxrY'],1922-06-01,0.695,0.263,0,-22.136,1,0.957,0.797,0.0,0.148,0.655,102.009,1
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.434,0.177,1,-21.18,1,0.0512,0.994,0.0218,0.212,0.457,130.418,5
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.321,0.0946,7,-27.961,1,0.0504,0.995,0.918,0.104,0.397,169.98,3
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,['Dick Haymes'],['3BiJGZsyX9sJchTqcSA7Su'],1922,0.402,0.158,3,-16.9,0,0.039,0.989,0.13,0.311,0.196,103.22,4


In [3]:
# Count the missing values in every column of the dataset
null_counts = df_spotify.isna().sum()
print(null_counts)


id                   0
name                71
popularity           0
duration_ms          0
explicit             0
artists              0
id_artists           0
release_date         0
danceability         0
energy               0
key                  0
loudness             0
mode                 0
speechiness          0
acousticness         0
instrumentalness     0
liveness             0
valence              0
tempo                0
time_signature       0
dtype: int64


[Section title](#Spotify-Info)

In [5]:
df_spotify.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 586672 entries, 0 to 586671
Data columns (total 20 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   id                586672 non-null  object 
 1   name              586601 non-null  object 
 2   popularity        586672 non-null  int64  
 3   duration_ms       586672 non-null  int64  
 4   explicit          586672 non-null  int64  
 5   artists           586672 non-null  object 
 6   id_artists        586672 non-null  object 
 7   release_date      586672 non-null  object 
 8   danceability      586672 non-null  float64
 9   energy            586672 non-null  float64
 10  key               586672 non-null  int64  
 11  loudness          586672 non-null  float64
 12  mode              586672 non-null  int64  
 13  speechiness       586672 non-null  float64
 14  acousticness      586672 non-null  float64
 15  instrumentalness  586672 non-null  float64
 16  liveness          58

In [None]:
# Order the tracks by the popularity column, least popular first
sorted_df_pop = df_spotify.sort_values(by="popularity")
sorted_df_pop

In [None]:
# Summary statistics with one row per numeric column
df_spotify.describe().T

# Top 10 most popular songs on Spotify

In [None]:
# Keep tracks with a popularity score above 90, most popular first.
# Boolean-mask selection returns a new frame, so df_spotify itself is unchanged.
above_90 = df_spotify[df_spotify['popularity'] > 90]
most_popular = above_90.sort_values('popularity', ascending=False)
most_popular.head(10)

In [None]:
# Use the release date as the DataFrame index, parsed into datetimes.
# The guard keeps the cell re-runnable: once 'release_date' has been moved into
# the index it is no longer a column, and an unguarded set_index would raise KeyError.
if 'release_date' in df_spotify.columns:
    df_spotify = df_spotify.set_index('release_date')
    # The release dates are year-first strings (e.g. '1922-02-22'), so no
    # dayfirst hint is passed to the parser.
    # NOTE(review): some rows hold only a year (e.g. '1922'); newer pandas
    # releases may need an explicit format for such mixed-precision strings —
    # confirm against the installed version.
    df_spotify.index = pd.to_datetime(df_spotify.index)
df_spotify.head()

In [None]:
# Artists of the tracks whose release date is 1922-01-01
df_spotify.loc['1922-01-01', ['artists']]

# Convert the duration time from milliseconds to minutes

In [None]:
# Convert the track length from milliseconds to minutes: round to whole
# seconds first, then divide by 60. Done with vectorised Series arithmetic
# instead of a per-row lambda.
# The guard keeps the cell re-runnable: after the first run 'duration_ms' has
# been dropped, and an unguarded second run would raise KeyError.
if 'duration_ms' in df_spotify.columns:
    df_spotify['duration'] = df_spotify['duration_ms'].div(1000).round().div(60)
    df_spotify = df_spotify.drop(columns='duration_ms')

In [None]:
# Preview the new duration column (minutes)
df_spotify['duration'].head()

# Plot a correlation map
Note: the `corr` method raises an error if the DataFrame still contains non-numeric columns, so remove those columns before computing the correlation.

In [None]:
# Pearson correlation between the numeric track features.
# 'key', 'mode' and 'explicit' are left out explicitly; select_dtypes then
# removes the remaining non-numeric columns (ids, names, artists), which would
# otherwise make corr() fail when it tries to convert strings to floats.
numeric_features = (
    df_spotify
    .drop(['key', 'mode', 'explicit'], axis=1)
    .select_dtypes(include='number')
)
corr_df = numeric_features.corr(method='pearson')

# Annotated heatmap on a fixed [-1, 1] colour scale centred at 0
fig, ax = plt.subplots(figsize=(14, 6))
heatmap = sns.heatmap(corr_df, annot=True, fmt='.1g', vmin=-1, vmax=1, center=0,
                      cmap='viridis', linewidths=1, linecolor='Black', ax=ax)
heatmap.set_title('Correlation Heatmap Between Variable')
heatmap.set_xticklabels(heatmap.get_xticklabels(), rotation=90)
plt.show()

# Create regression plots using 0.4% of the whole data

In [None]:
# Draw a random 0.4% sample of the tracks for the regression plots.
# A fixed random_state makes the sample — and therefore the plots and the
# conclusions written below them — reproducible across kernel restarts.
sample_size = int(0.004 * len(df_spotify))
sample_df = df_spotify.sample(n=sample_size, random_state=42)
print('Length of the sample_df=', len(sample_df))
sample_df

# Regression map between Loudness and Energy

In [None]:
# Scatter plot with a fitted regression line: loudness (y) against energy (x)
plt.figure(figsize=(10, 6))
loudness_ax = sns.regplot(x='energy', y='loudness', data=sample_df, color='c')
loudness_ax.set(title='Loudness vs Energy Correlation')


The above graph denotes that the energy and loudness of the songs on Spotify are positively and strongly correlated.

# Correlation between Popularity and Acousticness

In [None]:
# Scatter plot with a fitted regression line: popularity (y) against acousticness (x)
plt.figure(figsize=(10, 6))
popularity_ax = sns.regplot(x='acousticness', y='popularity', data=sample_df, color='b')
popularity_ax.set(title='Popularity vs Acousticness Correlation')


The above graph denotes that Popularity and Acousticness of songs on Spotify are negatively and weakly correlated.

In [None]:
# Copy the release dates from the index into a regular 'dates' column
release_dates = df_spotify.index.get_level_values('release_date')
df_spotify['dates'] = pd.to_datetime(release_dates)
# Calendar year of every release date
years = df_spotify['dates'].dt.year

In [None]:
# Check what kind of object holds the extracted years
type(years)

# Total number of songs vs Years

In [None]:
# Histogram of the number of songs released in each year (one bin per year)
year_hist = sns.displot(years, kind='hist', discrete=True, height=5, aspect=2)
year_hist.set(title='Number of songs per year')

# Total Duration of Songs Vs Years

In [None]:
# Bar chart of song duration (minutes) by release year.
# NOTE: sns.barplot aggregates with its default estimator (the mean), so each
# bar shows the average duration of that year's songs rather than a sum.
total_dr = df_spotify.duration
fig_dims = (18, 7)
fig, ax = plt.subplots(figsize=fig_dims)
# ci=None switches the error bars off and skips the bootstrap behind them;
# the previous errwidth=False only set the error-bar line width to zero.
# The plot result is no longer assigned to `fig`, so `fig` stays the Figure.
sns.barplot(x=years, y=total_dr, ax=ax, ci=None)
ax.set(title='Year vs Duration')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Display the track DataFrame in its current state
df_spotify

In [None]:
# Display the random sample used for the regression plots
sample_df

# Genre dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the 'SpotifyFeatures.csv' dataset (its 'genre' column is used below)
df_genre=pd.read_csv('SpotifyFeatures.csv')

In [None]:
# Preview the first rows of the genre dataset
df_genre.head()

In [None]:
# Rank the (genre, popularity) pairs from most to least popular, then keep
# only the first row encountered for each distinct popularity score.
# NOTE(review): de-duplicating on "popularity" alone discards every other
# track that shares a score, whatever its genre — confirm this is intended.
genre_popularity = df_genre[['genre', 'popularity']]
Popular_genre = (
    genre_popularity
    .sort_values(by="popularity", ascending=False)
    .drop_duplicates(subset="popularity")
)
Popular_genre

In [None]:

# Total popularity per genre.
# Rows are grouped on the exact genre label. The previous str.contains lookup
# did substring/regex matching, so a genre whose name is contained in another
# genre's name (for example 'Reggae' inside 'Reggaeton') also picked up the
# other genre's rows and was over-counted.
genre_totals = Popular_genre.groupby('genre')['popularity'].sum()

# Initialize data: genres ordered by how many rows each has in Popular_genre
genre = Popular_genre.genre.value_counts().index.tolist()
V = genre_totals.reindex(genre).tolist()
popularity = V

# Create a pandas dataframe with one row per genre
pop_vs_genre_df = pd.DataFrame({"genre": genre,
                                "popularity": popularity})

In [None]:
# Genres ordered from lowest to highest total popularity
pop_vs_genre_df.sort_values(by='popularity', ascending=True)

In [None]:
# Horizontal bar chart of total popularity per genre, largest first.
# The figure is created before anything is drawn on it: calling plt.title()
# ahead of plt.figure() put the title on a separate, empty figure and left the
# bar chart untitled.
fig, ax = plt.subplots(figsize=(15, 8))
genre_order = pop_vs_genre_df.sort_values('popularity', ascending=False).genre
# The 'rocket' palette is passed to the plot itself; the earlier bare
# sns.color_palette(...) call only returned a palette and had no effect.
sns.barplot(y='genre', x='popularity', data=pop_vs_genre_df, order=genre_order,
            ci=None, palette='rocket', ax=ax)
ax.set_title('Popularity of the Songs in Different Genres')
ax.set_xlabel('Popularity')
ax.set_ylabel('Genre')
plt.show()