In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
# Fetch the Spotify dataset through kagglehub.
import kagglehub
# NOTE(review): presumably the returned value is the local directory holding the
# downloaded files (see the kagglehub docs) — confirm before relying on its layout.
vatsalmavani_spotify_dataset_path = kagglehub.dataset_download('vatsalmavani/spotify-dataset')

# Marker so the cell output shows that the download call returned.
print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from plotly.offline import init_notebook_mode, iplot, plot
import plotly as py
init_notebook_mode(connected=True)
import plotly.graph_objs as go
from sklearn.preprocessing import StandardScaler

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively print the full path of every file found below /kaggle/input.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
"""In this project, I analyzed Spotify song data to visualize changes in song features over the years and identify popular artists.
After cleaning and normalizing the data, I presented features like popularity and valence through various graphs and examined correlations."""

In [None]:
# Locate the folder holding the CSV files.
# On Kaggle the dataset is mounted under /kaggle/input. Anywhere else that mount
# does not exist, so fall back to the directory returned by the kagglehub
# download in the first cell (that cell must have been run in this case).
# NOTE(review): assumes the kagglehub download keeps the same 'data/' sub-folder
# as the Kaggle mount — confirm.
KAGGLE_DATA_DIR = '/kaggle/input/spotify-dataset/data'
if os.path.isdir(KAGGLE_DATA_DIR):
    DATA_DIR = KAGGLE_DATA_DIR
else:
    DATA_DIR = os.path.join(vatsalmavani_spotify_dataset_path, 'data')

# One frame per CSV file of the dataset.
df_year = pd.read_csv(os.path.join(DATA_DIR, 'data_by_year.csv'))
df_genres = pd.read_csv(os.path.join(DATA_DIR, 'data_by_genres.csv'))
df_genres_ = pd.read_csv(os.path.join(DATA_DIR, 'data_w_genres.csv'))
df_artist = pd.read_csv(os.path.join(DATA_DIR, 'data_by_artist.csv'))
data = pd.read_csv(os.path.join(DATA_DIR, 'data.csv'))

In [None]:
# Preview the first rows of the frame read from data_by_year.csv.
df_year.head()

In [None]:
# Preview the first rows of the frame read from data_by_genres.csv.
df_genres.head()

In [None]:
# Preview the first rows of the frame read from data_w_genres.csv.
df_genres_.head()

In [None]:
# Preview the first rows of the frame read from data_by_artist.csv.
df_artist.head()

In [None]:
# Preview the first rows of the main frame read from data.csv.
data.head()

In [None]:
# Drop the 'id' column, remove rows that are exact duplicates of an earlier row
# (judged after 'id' is gone), and renumber the index 0..n-1.
#
# The frame is rebound instead of mutated in place for two reasons:
#   * the original `data.reset_index(drop=True)` result was never assigned, so
#     the index kept the gaps left by the removed rows;
#   * errors='ignore' lets the cell be re-run after 'id' has already been
#     dropped, where an in-place drop would raise KeyError.
data = (
    data
    .drop(columns='id', errors='ignore')
    .drop_duplicates()
    .reset_index(drop=True)
)
data.head()

In [None]:
# Feature scaling for 'loudness', 'duration_ms' and 'tempo'.

# 'loudness' goes through scikit-learn's StandardScaler, which expects a 2-D
# input, hence the reshape to a single column.
scaler = StandardScaler()
loudness_column = data['loudness'].values.reshape(-1, 1)
data['loudness'] = scaler.fit_transform(loudness_column)

# 'duration_ms' and 'tempo' are min-max scaled column by column:
# (value - column minimum) / (column maximum - column minimum).
minmax_columns = ['duration_ms', 'tempo']
data_ = np.array(data[minmax_columns])
column_min = np.min(data_, axis=0)
column_max = np.max(data_, axis=0)
data[minmax_columns] = (data_ - column_min) / (column_max - column_min)

In [None]:
#Artist popularity ranking
# NOTE(review): artists are tokenised by splitting the raw 'artists' string on ','.
# If the column stores stringified lists (e.g. "['A', 'B']"), the tokens keep the
# brackets, quotes and leading spaces — confirm the format and parse it if so.

# Distinct artist tokens, ordered by first appearance when walking the artist
# strings from most to least frequent. dict.fromkeys removes the repeats the
# original list contained (one entry per distinct string mentioning the artist).
artist_unique = list(dict.fromkeys(
    token
    for artist_string in data['artists'].value_counts().index
    for token in artist_string.split(',')
))

# Build one row per (track, artist) pair so that a single groupby yields the mean
# popularity of every artist, instead of re-scanning the whole frame per artist.
# Tokens are de-duplicated within a track so a track counts once per artist.
track_artist_pairs = pd.DataFrame({
    'artist': data['artists'].apply(lambda s: list(dict.fromkeys(s.split(',')))),
    'popularity': data['popularity'],
}).explode('artist')
mean_popularity = track_artist_pairs.groupby('artist')['popularity'].mean()

# Aligned with artist_unique: popularity[k] is the mean popularity of artist_unique[k].
popularity = mean_popularity.reindex(artist_unique).tolist()

# Display the top of the ranking rather than dumping the full list.
mean_popularity.sort_values(ascending=False).head(20)

In [None]:
#For each artist, the song that made them the most popular and the year in which this singer was most popular
# Only tracks whose exact 'artists' string occurs at least 10 times are considered.
values_ = data['artists'].value_counts()
most_values = values_[values_ >= 10].index
new_data = data[data['artists'].isin(most_values)]

# Index the retained rows by artist token in one pass, storing row positions.
# The original rebuilt the same boolean mask over the whole frame six times per
# artist (three times for the song list, three times for the year list).
positions_by_artist = {}
for position, artist_string in enumerate(new_data['artists']):
    for token in dict.fromkeys(artist_string.split(',')):
        positions_by_artist.setdefault(token, []).append(position)

candidates = new_data[['name', 'year', 'popularity']]

# Both lists stay aligned with artist_unique. Entry k is a Series (indexed like
# new_data) of the track name(s) / year(s) where artist_unique[k] reaches their
# maximum popularity, or an empty Series when the artist has no retained track.
most_pop_song = []
most_pop_year = []
for i in artist_unique:
    artist_tracks = candidates.iloc[positions_by_artist.get(i, [])]
    at_peak = artist_tracks['popularity'] == artist_tracks['popularity'].max()
    most_pop_song.append(artist_tracks.loc[at_peak, 'name'])
    most_pop_year.append(artist_tracks.loc[at_peak, 'year'])

In [None]:
#Finding out whether this singer is more successful individually or as part of a group
def _mean_popularity_by_artist(tracks):
    """Return a Series mapping each artist token in `tracks` to its mean popularity.

    `tracks` needs 'artists' (comma-separated string) and 'popularity' columns.
    Each track counts once per distinct artist token it mentions.
    """
    pairs = pd.DataFrame({
        'artist': tracks['artists'].apply(lambda s: list(dict.fromkeys(s.split(',')))),
        'popularity': tracks['popularity'],
    }).explode('artist')
    return pairs.groupby('artist')['popularity'].mean()


# A track is "individual" when its artists string holds a single token, i.e. no
# comma; every other track is treated as a group effort.
is_solo_track = data['artists'].str.count(',') == 0

# Both lists stay aligned with artist_unique; an artist without any track of the
# given kind gets NaN. One groupby per kind replaces a full scan of the frame
# per artist.
individual = _mean_popularity_by_artist(data[is_solo_track]).reindex(artist_unique).tolist()
group = _mean_popularity_by_artist(data[~is_solo_track]).reindex(artist_unique).tolist()

In [None]:
#popular artists by year
def plot_bar_(data, top_n=20):
    """Draw one bar chart per year showing the artists with the highest mean popularity.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide 'year', 'artists' and 'popularity' columns.
    top_n : int, default 20
        Number of artists shown per year.

    Years are processed in ascending order. Artists are grouped by the exact
    value of the 'artists' column. Each figure is shown and closed before the
    next one is created, so a long year range does not accumulate open figures.
    """
    for year, year_rows in data.groupby('year'):
        # Mean popularity of every artist of this year in a single groupby,
        # instead of one boolean scan of the year's rows per artist.
        top_artists = (
            year_rows.groupby('artists')['popularity']
            .mean()
            .sort_values(ascending=False)
            .head(top_n)
            .reset_index()
        )
        fig, ax = plt.subplots(figsize=(6, 8))
        sns.barplot(data=top_artists, x='artists', y='popularity', ax=ax)
        plt.xticks(fontsize=5, rotation=90)
        ax.set_xlabel('artists')
        ax.set_ylabel('popularity')
        # NOTE(review): 'snatçılar' looks like a typo for 'sanatçılar'; the
        # displayed text is left unchanged here.
        ax.set_title('{} yılı en popüler snatçılar'.format(year))
        plt.show()
        plt.close(fig)
plot_bar_(data)

In [None]:
#graph of song characteristics by year:
features=['acousticness','danceability','energy','instrumentalness','liveness','loudness']

# Mean of every feature per year in one pass; groupby returns the years in
# ascending order, matching the sorted year list used for the x axes.
yearly_means = data.groupby('year')[features].mean()
year_ = yearly_means.index.tolist()
values_ = [yearly_means[feature].tolist() for feature in features]

# One line trace per feature, each bound to its own axis pair:
# 'x'/'y' for the first feature, then 'x2'/'y2', 'x3'/'y3', ...
data_1 = []
for position, feature in enumerate(features):
    axis_suffix = '' if position == 0 else str(position + 1)
    data_1.append(go.Scatter(
        x=year_,
        y=values_[position],
        xaxis='x' + axis_suffix,
        yaxis='y' + axis_suffix,
        name=feature,
    ))
# Individual names kept for any later cell that still refers to them.
trace1, trace2, trace3, trace4, trace5, trace6 = data_1

# 3 rows x 2 columns of panels: x domains pick the column (left / right),
# y domains pick the row (bottom / middle / top).
layout = go.Layout(
    xaxis=dict(domain=[0, 0.45]),
    xaxis2=dict(domain=[0.55, 1]),
    xaxis3=dict(domain=[0, 0.45], anchor='y3'),
    xaxis4=dict(domain=[0.55, 1], anchor='y4'),
    xaxis5=dict(domain=[0, 0.45], anchor='y5'),
    xaxis6=dict(domain=[0.55, 1], anchor='y6'),
    yaxis=dict(domain=[0, 0.27]),
    yaxis2=dict(domain=[0, 0.27], anchor='x2'),
    yaxis3=dict(domain=[0.35, 0.65], anchor='x3'),
    yaxis4=dict(domain=[0.35, 0.65], anchor='x4'),
    yaxis5=dict(domain=[0.72, 1], anchor='x5'),
    yaxis6=dict(domain=[0.72, 1], anchor='x6'),
)

fig = go.Figure(data=data_1, layout=layout)
iplot(fig)

In [None]:
#Mean valence of the tracks per window of consecutive years, shown as a bar plot:

WINDOW = 3  # number of consecutive years averaged into one bar

year_ = sorted(data['year'].value_counts().index)
valence_list = []
year_list = []
# Walk the years in consecutive windows. Slicing keeps a shorter final window,
# whereas the previous fixed i-1 / i / i+1 indexing raised IndexError when
# len(year_) % 3 == 2 and silently skipped the last year when len(year_) % 3 == 1.
for start in range(0, len(year_), WINDOW):
    window_years = year_[start:start + WINDOW]
    in_window = data['year'].isin(window_years)
    valence_list.append(data.loc[in_window, 'valence'].mean())
    # Label the bar with the middle year of the window (the second year of a
    # full three-year window, as before).
    year_list.append(window_years[len(window_years) // 2])

new_data = pd.DataFrame({'years': year_list, 'valence': valence_list})
# Rows ordered from the most recent window to the oldest.
new_data_ = new_data.sort_values(by='years', ascending=False)

plt.figure(figsize=(7, 12))
sns.barplot(x=new_data_.iloc[:, 0], y=new_data_.iloc[:, 1])
plt.xticks(rotation=45)
plt.xlabel('Years')
plt.ylabel('Valence')
plt.title('The valence value of each years')
plt.show()

In [None]:
# Number of rows for each distinct 'release_date' value, most frequent first.
data['release_date'].value_counts()

In [None]:
# Look at the first rows again, now that earlier cells have modified `data`.
data.head()

In [None]:
# List the column names currently present in `data`.
data.columns

In [None]:
# Annotated heatmap of the pairwise correlations between the selected columns.
corr_columns = [
    'valence', 'year', 'acousticness', 'danceability',
    'duration_ms', 'energy', 'explicit', 'instrumentalness', 'key',
    'liveness', 'loudness', 'mode', 'popularity',
    'speechiness', 'tempo',
]
correlation_matrix = data[corr_columns].corr()

plt.figure(figsize=(12, 15))
sns.heatmap(correlation_matrix, annot=True)
plt.show()