In [3]:
import pandas as pd
import plotly.express as px

In [4]:
# Load the raw CSV into the DataFrame that every later cell refers to as `data`.
dataset_path = "dataset.csv"
data = pd.read_csv(dataset_path)

In [5]:
# Remove the rows labelled 0 and 1 (the first two rows of the freshly loaded
# frame) together with the 'Unnamed: 0' column.
# Dropping by label with errors='ignore' keeps this cell safe to re-run: a
# positional drop would silently discard two more rows on every execution and
# deleting the column a second time would raise KeyError.
# NOTE(review): the reason the first two rows are excluded is not stated
# anywhere in the notebook — confirm it is intentional.
data = data.drop(index=[0, 1], columns=['Unnamed: 0'], errors='ignore')

# DataFrame.info() writes its report itself and returns None, so it must not be
# wrapped in print() (that appends a stray "None" to the output).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113998 entries, 2 to 113999
Data columns (total 20 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   track_id          113998 non-null  object 
 1   artists           113997 non-null  object 
 2   album_name        113997 non-null  object 
 3   track_name        113997 non-null  object 
 4   popularity        113998 non-null  int64  
 5   duration_ms       113998 non-null  int64  
 6   explicit          113998 non-null  bool   
 7   danceability      113998 non-null  float64
 8   energy            113998 non-null  float64
 9   key               113998 non-null  int64  
 10  loudness          113998 non-null  float64
 11  mode              113998 non-null  int64  
 12  speechiness       113998 non-null  float64
 13  acousticness      113998 non-null  float64
 14  instrumentalness  113998 non-null  float64
 15  liveness          113998 non-null  float64
 16  valence           11

In [6]:
# Preview the first ten rows. Leaving the frame as the cell's last expression
# uses the notebook's rich table display; print() falls back to plain text and
# wraps this wide frame across several column groups.
data.head(10)

                  track_id                               artists  \
2   1iJBSr7s7jYXzM8EGcbK5b                Ingrid Michaelson;ZAYN   
3   6lfxq3CG4xtTiEg7opyCyx                          Kina Grannis   
4   5vjLSffimiIP26QG5WcN2K                      Chord Overstreet   
5   01MVOl9KtVTNfFiBU9I7dc                          Tyrone Wells   
6   6Vc5wAMmXdKIAM7WUoEb7N  A Great Big World;Christina Aguilera   
7   1EzrEOXmMH3G43AXT1y7pA                            Jason Mraz   
8   0IktbUcnAGrvD03AWnz3Q8             Jason Mraz;Colbie Caillat   
9   7k9GuJYLp2AzqokyEdwEw2                        Ross Copperman   
10  4mzP5mHkRvGxdhdGdAH7EJ                          Zack Tabudlo   
11  5ivF4eQBqJiVL5IAE9jRyl                            Jason Mraz   

                                           album_name  \
2                                      To Begin Again   
3   Crazy Rich Asians (Original Motion Picture Sou...   
4                                             Hold On   
5                      

In [7]:
# Summary statistics for the numeric columns. Shown through the rich display
# (last expression of the cell) so the wide table is not wrapped the way
# print() wraps it.
data.describe()

          popularity   duration_ms   danceability         energy  \
count  113998.000000  1.139980e+05  113998.000000  113998.000000   
mean       33.237995  2.280298e+05       0.566800       0.641389   
std        22.304870  1.072984e+05       0.173543       0.251527   
min         0.000000  0.000000e+00       0.000000       0.000000   
25%        17.000000  1.740660e+05       0.456000       0.472000   
50%        35.000000  2.129060e+05       0.580000       0.685000   
75%        50.000000  2.615060e+05       0.695000       0.854000   
max       100.000000  5.237295e+06       0.985000       1.000000   

                 key       loudness           mode    speechiness  \
count  113998.000000  113998.000000  113998.000000  113998.000000   
mean        5.309216      -8.258895       0.637555       0.084652   
std         3.559973       5.029309       0.480708       0.105733   
min         0.000000     -49.531000       0.000000       0.000000   
25%         2.000000     -10.013000       

In [8]:
# Number of missing values in each column.
missing_per_column = data.isna().sum()
print(missing_per_column)

track_id            0
artists             1
album_name          1
track_name          1
popularity          0
duration_ms         0
explicit            0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
time_signature      0
track_genre         0
dtype: int64


In [9]:
# Number of rows that are exact repeats of an earlier row.
duplicate_row_count = data.duplicated().sum()
print(duplicate_row_count)

450


In [11]:
# Count rows per distinct value of the 'artists' column, most frequent first.
# Each distinct string is counted separately, so a combined credit such as
# "Ingrid Michaelson;ZAYN" is its own entry.
artist_counts = (
    data['artists']
    .value_counts()
    .rename_axis('artists')
    .reset_index(name='count')
)

# Keep the 20 most frequent entries for plotting.
top_artists = artist_counts.iloc[:20]
print(top_artists)

                  artists  count
0             The Beatles    279
1            George Jones    271
2           Stevie Wonder    236
3             Linkin Park    224
4         Ella Fitzgerald    222
5           Prateek Kuhad    217
6                    Feid    202
7             Chuck Berry    190
8         Håkan Hellström    183
9             OneRepublic    181
10         The Beach Boys    176
11      my little airport    171
12      Charlie Brown Jr.    169
13          Elvis Presley    169
14  Red Hot Chili Peppers    159
15            Bryan Adams    157
16                Scooter    155
17           Daddy Yankee    154
18         Arctic Monkeys    152
19                    BTS    151


In [12]:
# top_artists already holds one aggregated row per artist, so the counts are
# drawn directly as bars. px.histogram is meant for raw observations: it would
# aggregate the 'count' column again and title the y-axis as a "sum of" it.
fig = px.bar(
    top_artists,
    x='artists',
    y='count',
    labels={'artists': 'Artist', 'count': 'Frequency of Artists'},
    title='Top Artists'
)
fig.show()

In [13]:
# Dot plot of the row count for each of the top artists.
artist_axis_labels = {'artists': 'Artist', 'count': 'Number of Occurrences'}
fig = px.scatter(
    top_artists, x='artists', y='count',
    labels=artist_axis_labels,
    title='Frequency of Artists',
)
fig.show()

In [14]:
# Keep only rows whose popularity is not zero.
has_popularity = data['popularity'] != 0
data_filtered = data.loc[has_popularity]

# Count how many rows share each (popularity, track_name) pair; value_counts
# returns the pairs ordered from most to least frequent.
popular_counts = (
    data_filtered[['popularity', 'track_name']]
    .value_counts()
    .reset_index(name='count')
)
print(popular_counts)

       popularity             track_name  count
0               1                Happier     20
1               2              Halloween     20
2               2               RUMBATÓN     18
3               1          Qué Más Pues?     18
4               2           X ÚLTIMA VEZ     17
...           ...                    ...    ...
80252          33                Le club      1
80253          33         Leave It To Me      1
80254          33             Leev Marie      1
80255          33  Lembro Quando Começou      1
80256          37                   Nova      1

[80257 rows x 3 columns]


In [15]:
# Take the first 20 rows of popular_counts, which value_counts has already
# ordered by descending count (no extra sort happens here).
popular_songs = popular_counts.iloc[:20]

In [16]:
# popular_songs is already aggregated (one row per popularity/track_name pair
# with its row count), so it is drawn with px.bar instead of being re-binned
# through px.histogram. Any rows that share a track_name are stacked, which
# gives the same total bar height a summed histogram would show.
# NOTE(review): 'count' is a number of rows, not a popularity score — the axis
# label and title may deserve rewording; confirm the intended metric.
fig = px.bar(
    popular_songs,
    x='track_name',
    y='count',
    labels={'track_name': 'Track Name', 'count': 'Popularity of Track'},
    title='Popularity of a Track'
)
# Tilt the track names so long titles do not overlap.
fig.update_layout(xaxis_tickangle=-45)
fig.update_yaxes(range=[0, 50])
# Display the plot
fig.show()

In [17]:
# Dot plot of the row count for each of the 20 selected track entries.
track_axis_labels = {'track_name': 'Track Name', 'count': 'Popularity of Track'}
fig = px.scatter(
    popular_songs, x='track_name', y='count',
    labels=track_axis_labels,
    title='Popularity of a Track',
)
# Tilt the track names and pin the y-axis to a fixed 0-30 window.
fig.update_layout(xaxis_tickangle=-45)
fig.update_yaxes(range=[0, 30])
fig.show()

In [54]:
# Mean popularity of the rows in each genre, as a two-column frame
# (track_genre, average_popularity).
genre_means = data.groupby('track_genre')['popularity'].mean()
popularity_by_genre = genre_means.rename('average_popularity').reset_index()
print(popularity_by_genre.head())

   track_genre  average_popularity
0     acoustic            42.43988
1     afrobeat            24.39900
2     alt-rock            33.94300
3  alternative            24.33700
4      ambient            44.19100


In [59]:
# popularity_by_genre holds exactly one mean per genre, so a bar chart shows
# each value as it is. px.histogram would instead sum the y values per x bin,
# and an nbins hint does not belong on a categorical genre axis (it risks
# merging several genres into a single bar).
fig = px.bar(
    popularity_by_genre,
    x='track_genre',
    y='average_popularity',
    labels={'average_popularity': 'Average Popularity'},
    title='Distribution of Average Track Popularity by Genre'
)
fig.show()

In [57]:
# Dot plot of the mean popularity for every genre.
genre_axis_labels = {'track_genre': 'Genre', 'average_popularity': 'Average Popularity'}
fig = px.scatter(
    popularity_by_genre, x='track_genre', y='average_popularity',
    labels=genre_axis_labels,
    title='Average Popularity of Tracks by Genre',
)
fig.show()