In [296]:
# This project aims to help predict the success of future music productions. By analysing a compiled list of top tracks from Spotify, I look for the factors these tracks have in common. These findings can help music production and distribution companies better predict commercial success.

In [297]:
# import packages 
import pandas as pd
import plotly.express as px

In [298]:
# Read in the data set being used and give the row index a name
data = pd.read_csv("dataset.csv")
data.index.name = 'index'
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() would only append a stray "None" line.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114000 entries, 0 to 113999
Data columns (total 21 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        114000 non-null  int64  
 1   track_id          114000 non-null  object 
 2   artists           113999 non-null  object 
 3   album_name        113999 non-null  object 
 4   track_name        113999 non-null  object 
 5   popularity        114000 non-null  int64  
 6   duration_ms       114000 non-null  int64  
 7   explicit          114000 non-null  bool   
 8   danceability      114000 non-null  float64
 9   energy            114000 non-null  float64
 10  key               114000 non-null  int64  
 11  loudness          114000 non-null  float64
 12  mode              114000 non-null  int64  
 13  speechiness       114000 non-null  float64
 14  acousticness      114000 non-null  float64
 15  instrumentalness  114000 non-null  float64
 16  liveness          11

In [299]:
# Discard the first two rows and the leftover 'Unnamed: 0' column in a single call.
# NOTE(review): the reason for dropping the first two rows is not evident from
# this notebook - confirm it is intentional.
data = data.drop(index=data.index[[0, 1]], columns=['Unnamed: 0'])

In [300]:
# Convert milliseconds to seconds. Coerce to numeric first so that any
# non-numeric entry becomes NaN instead of raising during the division.
# (A plain `data['duration_ms'] / 1000` beforehand is redundant and would
# fail before the coercion ever ran.)
data['duration_sec'] = pd.to_numeric(data['duration_ms'], errors='coerce') / 1000

def convert_seconds_to_minutes(seconds):
    """Format a duration given in seconds as a 'minutes:seconds' string.

    Fractional seconds are truncated and the seconds part is zero-padded to
    two digits. A missing value (as detected by ``pd.isna``) yields 'N/A'.
    """
    if pd.isna(seconds):
        return 'N/A'
    whole_minutes, leftover = divmod(seconds, 60)
    return f"{int(whole_minutes)}:{int(leftover):02d}"
# Add the formatted minutes:seconds column, then discard the intermediate seconds column
data['duration_min_sec'] = data['duration_sec'].map(convert_seconds_to_minutes)
data = data.drop(columns='duration_sec')

In [301]:
# DataFrame.info() prints its own summary and returns None, so no print() wrapper is needed
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113998 entries, 2 to 113999
Data columns (total 21 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   track_id          113998 non-null  object 
 1   artists           113997 non-null  object 
 2   album_name        113997 non-null  object 
 3   track_name        113997 non-null  object 
 4   popularity        113998 non-null  int64  
 5   duration_ms       113998 non-null  int64  
 6   explicit          113998 non-null  bool   
 7   danceability      113998 non-null  float64
 8   energy            113998 non-null  float64
 9   key               113998 non-null  int64  
 10  loudness          113998 non-null  float64
 11  mode              113998 non-null  int64  
 12  speechiness       113998 non-null  float64
 13  acousticness      113998 non-null  float64
 14  instrumentalness  113998 non-null  float64
 15  liveness          113998 non-null  float64
 16  valence           11

In [302]:
# Preview the DataFrame by showing its first 10 rows
print(data.iloc[:10])

                     track_id                               artists  \
index                                                                 
2      1iJBSr7s7jYXzM8EGcbK5b                Ingrid Michaelson;ZAYN   
3      6lfxq3CG4xtTiEg7opyCyx                          Kina Grannis   
4      5vjLSffimiIP26QG5WcN2K                      Chord Overstreet   
5      01MVOl9KtVTNfFiBU9I7dc                          Tyrone Wells   
6      6Vc5wAMmXdKIAM7WUoEb7N  A Great Big World;Christina Aguilera   
7      1EzrEOXmMH3G43AXT1y7pA                            Jason Mraz   
8      0IktbUcnAGrvD03AWnz3Q8             Jason Mraz;Colbie Caillat   
9      7k9GuJYLp2AzqokyEdwEw2                        Ross Copperman   
10     4mzP5mHkRvGxdhdGdAH7EJ                          Zack Tabudlo   
11     5ivF4eQBqJiVL5IAE9jRyl                            Jason Mraz   

                                              album_name  \
index                                                      
2                          

In [303]:
# Summary statistics (count, mean, std, min, quartiles, max) give more detail than the 10-row preview
summary_stats = data.describe()
print(summary_stats)

          popularity   duration_ms   danceability         energy  \
count  113998.000000  1.139980e+05  113998.000000  113998.000000   
mean       33.237995  2.280298e+05       0.566800       0.641389   
std        22.304870  1.072984e+05       0.173543       0.251527   
min         0.000000  0.000000e+00       0.000000       0.000000   
25%        17.000000  1.740660e+05       0.456000       0.472000   
50%        35.000000  2.129060e+05       0.580000       0.685000   
75%        50.000000  2.615060e+05       0.695000       0.854000   
max       100.000000  5.237295e+06       0.985000       1.000000   

                 key       loudness           mode    speechiness  \
count  113998.000000  113998.000000  113998.000000  113998.000000   
mean        5.309216      -8.258895       0.637555       0.084652   
std         3.559973       5.029309       0.480708       0.105733   
min         0.000000     -49.531000       0.000000       0.000000   
25%         2.000000     -10.013000       

In [304]:
# Print the number of missing values in each column
print(data.isnull().sum())

track_id            0
artists             1
album_name          1
track_name          1
popularity          0
duration_ms         0
explicit            0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
time_signature      0
track_genre         0
duration_min_sec    0
dtype: int64


In [305]:
# Tally how many rows each unique 'artists' value has and store the result as its
# own DataFrame with the columns 'artists' and 'count'; then keep and print the
# 20 most frequent entries
artist_counts = (
    data['artists']
    .value_counts()
    .rename_axis('artists')
    .reset_index(name='count')
)
top_artists = artist_counts.iloc[:20]
print(top_artists)

                  artists  count
0             The Beatles    279
1            George Jones    271
2           Stevie Wonder    236
3             Linkin Park    224
4         Ella Fitzgerald    222
5           Prateek Kuhad    217
6                    Feid    202
7             Chuck Berry    190
8         Håkan Hellström    183
9             OneRepublic    181
10         The Beach Boys    176
11      my little airport    171
12      Charlie Brown Jr.    169
13          Elvis Presley    169
14  Red Hot Chili Peppers    159
15            Bryan Adams    157
16                Scooter    155
17           Daddy Yankee    154
18         Arctic Monkeys    152
19                    BTS    151


In [306]:
# Bar chart of the top 20 artists. The counts are already aggregated, so px.bar
# plots them directly; px.histogram would re-aggregate y and title the axis
# "sum of Frequency of Artists".
fig = px.bar(
    top_artists,
    x='artists',
    y='count',
    labels={'artists': 'Artist', 'count': 'Frequency of Artists'},
    title='Top Artists'
)
fig.show()

In [307]:
# Scatter plot of the same top 20 artists and their occurrence counts
fig_1 = px.scatter(
    data_frame=top_artists,
    x='artists',
    y='count',
    title='Frequency of Artists',
    labels=dict(artists='Artist', count='Number of Occurrences'),
)
fig_1.show()

In [308]:
# Inspect the popularity column on its own
print(data.loc[:, 'popularity'])

index
2         57
3         71
4         82
5         58
6         74
          ..
113995    21
113996    22
113997    22
113998    41
113999    22
Name: popularity, Length: 113998, dtype: int64


In [309]:
# Keep only the rows whose popularity is non-zero
has_popularity = data['popularity'].ne(0)
data_filtered = data.loc[has_popularity]
print(data_filtered)

                      track_id                               artists  \
index                                                                  
2       1iJBSr7s7jYXzM8EGcbK5b                Ingrid Michaelson;ZAYN   
3       6lfxq3CG4xtTiEg7opyCyx                          Kina Grannis   
4       5vjLSffimiIP26QG5WcN2K                      Chord Overstreet   
5       01MVOl9KtVTNfFiBU9I7dc                          Tyrone Wells   
6       6Vc5wAMmXdKIAM7WUoEb7N  A Great Big World;Christina Aguilera   
...                        ...                                   ...   
113995  2C3TZjDRiAzdyViavDJ217                         Rainy Lullaby   
113996  1hIz5L4IB9hN3WRYPOCGPw                         Rainy Lullaby   
113997  6x8ZfSoqDjuNa5SVP5QjvX                         Cesária Evora   
113998  2e6sXL2bYv4bSz6VTdnfLs                      Michael W. Smith   
113999  2hETkH7cOfqmz3LqZDHZf5                         Cesária Evora   

                                               album_name  \
in

In [310]:
# Order the filtered tracks from highest to lowest across several columns
# (popularity first; the remaining columns only break ties) and keep the first 20 rows
popular_counts = data_filtered.sort_values(
    ['popularity', 'track_name', 'duration_min_sec', 'danceability', 'energy', 'key', 'tempo'],
    ascending=False,
).iloc[:20]

In [311]:
# Count how often each unique combination of the selected columns occurs;
# the tally is stored in a new 'count' column
combo_columns = ['popularity', 'track_name', 'duration_min_sec', 'danceability', 'energy', 'key', 'tempo']
combo_tally = popular_counts.value_counts(subset=combo_columns)
popular_counts = combo_tally.reset_index(name='count')

In [312]:
# Rank tracks by popularity (highest first) and keep only the first row seen
# for each track name, i.e. its highest-ranked entry after the sort
top_songs = data_filtered.sort_values(by='popularity', ascending=False).drop_duplicates(subset=['track_name'])

# Select the top 20 rows. Take an explicit copy so that adding columns to
# top_20_songs later does not raise pandas' SettingWithCopyWarning.
top_20_songs = top_songs.head(20).copy()

print(top_20_songs[['track_name', 'popularity', 'duration_min_sec', 'danceability', 'energy', 'key', 'tempo']])

                                  track_name  popularity duration_min_sec  \
index                                                                       
81051              Unholy (feat. Kim Petras)         100             2:36   
51664  Quevedo: Bzrp Music Sessions, Vol. 52          99             3:18   
20008                        I'm Good (Blue)          98             2:55   
89411                             La Bachata          98             2:42   
68304                       Tití Me Preguntó          97             4:03   
68305                        Me Porto Bonito          97             2:58   
20000                    Under The Influence          96             3:04   
68359                                 Efecto          96             3:33   
91003                        I Ain't Worried          96             2:28   
81052                              As It Was          95             2:47   
67500                          Ojitos Lindos          95             4:18   

In [313]:
# Now that we have the top 20 songs and descriptions, we can do further analysis to find the commonalities between them.
# Find the average time and standard deviation of these top 20 songs

In [314]:
# First must convert minutes:seconds to total seconds
def min_sec_to_seconds(min_sec):
    """Convert a 'minutes:seconds' string into a total number of seconds.

    Returns NaN for a missing value or for the 'N/A' placeholder that
    convert_seconds_to_minutes emits, so such rows are skipped by mean()/std()
    instead of raising. Any other malformed string still raises ValueError.
    """
    if pd.isna(min_sec) or min_sec == 'N/A':
        return float('nan')
    minutes, seconds = map(int, min_sec.split(':'))
    return minutes * 60 + seconds

# Derive the total duration in seconds from the minutes:seconds column
top_20_songs['duration_sec'] = top_20_songs['duration_min_sec'].map(min_sec_to_seconds)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [321]:
# Mean and standard deviation of the track durations, in seconds
duration_secs = top_20_songs['duration_sec']
avg_duration_secs = duration_secs.mean()
std_dev_sec = duration_secs.std()

# Express both statistics as minutes:seconds for display
avg_duration_min_sec = convert_seconds_to_minutes(avg_duration_secs)
std_dev_min_sec = convert_seconds_to_minutes(std_dev_sec)

print('The average duration is', avg_duration_min_sec)
print('The standard deviation of duration is', std_dev_min_sec)

The average duration is 3:23
The standard deviation of duration is 0:35


In [None]:
#Find the average, and standard deviation, for danceability of these top 20 tracks
#Danceability describes how suitable a track is for dancing, a value of 0.0 is least danceable and 1.0 is most danceable

In [322]:
# Mean and standard deviation of danceability across the top tracks
danceability_scores = top_20_songs['danceability']
avg_danceability = danceability_scores.mean()
std_dev_danceability = danceability_scores.std()

print('The average danceability is', avg_danceability)
print('The standard deviation of danceability is', std_dev_danceability)

The average danceability is 0.71375
The standard deviation of danceability is 0.14936704173911572


In [326]:
# Round both danceability statistics to 2 decimal places so they are easier to read
avg_danceability_rounded, std_dev_danceability_rounded = (
    round(value, 2) for value in (avg_danceability, std_dev_danceability)
)

print(f'The average danceability rounded is {avg_danceability_rounded}')
print(f'The standard deviation of danceability rounded is {std_dev_danceability_rounded}')


The average danceability rounded is 0.71
The standard deviation of danceability rounded is 0.15


In [327]:
# Find the average and standard deviation of the energy of these top 20 tracks.
# Energy is a measure from 0.0 to 1.0 and represents a perceptual measure of intensity and activity;
# high-energy tracks feel fast, loud, and noisy.
energy_scores = top_20_songs['energy']

# Mean and standard deviation of energy across the top tracks
avg_energy = energy_scores.mean()
std_dev_energy = energy_scores.std()

print(f'The average energy  is {avg_energy}')
print(f'The standard deviation of energy is {std_dev_energy}')

The average energy  is 0.67195
The standard deviation of energy is 0.1594693420461487


In [328]:
# Round both energy statistics to 2 decimal places so they are easier to read
avg_energy_rounded, std_dev_energy_rounded = (
    round(value, 2) for value in (avg_energy, std_dev_energy)
)

print(f'The average energy rounded is {avg_energy_rounded}')
print(f'The standard deviation of energy rounded is {std_dev_energy_rounded}')

The average energy rounded is 0.67
The standard deviation of energy rounded is 0.16


In [329]:
# Find the most commonly used keys for these top 20 tracks.
# The key is a scale of notes that forms the basis of a song, and is indicated by a key signature at the beginning of the sheet music.
# Integers map to pitches using standard Pitch Class: 0 = C, 1 = C♯/D♭, 2 = D, 3 = D♯/E♭, 4 = E, 5 = F, 6 = F♯/G♭, 7 = G, 8 = G♯/A♭, 9 = A, 10 = A♯/B♭, 11 = B, and -1 = no key was detected.

# Tally the occurrences of each key among these top 20 tracks
key_counts = top_20_songs.loc[:, 'key'].value_counts()
print(key_counts)

key
7     4
2     3
5     2
1     2
10    2
9     1
0     1
6     1
3     1
8     1
4     1
11    1
Name: count, dtype: int64


In [330]:
# Calculate how frequently each key was used in these top tracks.
# Divide by the actual number of counted tracks rather than a hard-coded 20, so the
# shares still sum to 1 if the selection ever holds a different number of tracks.
key_frequencies = key_counts / key_counts.sum()
print(key_frequencies)

key
7     0.20
2     0.15
5     0.10
1     0.10
10    0.10
9     0.05
0     0.05
6     0.05
3     0.05
8     0.05
4     0.05
11    0.05
Name: count, dtype: float64


In [331]:
# Find the average and standard deviation of the tempo of these top 20 tracks.
# Tempo is the speed or pace of a given piece of music and derives directly from the average beats per minute.
tempo_values = top_20_songs['tempo']

# Mean and standard deviation of tempo across the top tracks
avg_tempo = tempo_values.mean()
std_dev_tempo = tempo_values.std()

print(f'The average tempo is {avg_tempo}')
print(f'The standard deviation for tempo is {std_dev_tempo}')

The average tempo is 121.2259
The standard deviation for tempo is 22.91692531480195


In [332]:
# Round both tempo statistics to 2 decimal places so they are easier to read
avg_tempo_rounded, std_dev_tempo_rounded = (
    round(value, 2) for value in (avg_tempo, std_dev_tempo)
)

print(f'The rounded average tempo is {avg_tempo_rounded}')
print(f'The rounded standard deviation for tempo is {std_dev_tempo_rounded}')

The rounded average tempo is 121.23
The rounded standard deviation for tempo is 22.92


In [316]:
# Bar chart of each top track's popularity. Every track already has a single
# popularity value, so px.bar plots it directly; px.histogram would re-aggregate
# y and title the axis "sum of Popularity of Track".
fig_2 = px.bar(
    top_20_songs,
    x='track_name',
    y='popularity',
    labels={'track_name': 'Track Name', 'popularity': 'Popularity of Track'},
    title='Popularity of a Track'
)
fig_2.update_layout(xaxis_tickangle=-45)
# Zoom the y-axis in on the high end of the scale to make differences visible
fig_2.update_yaxes(range=[80, 110])
# Display the plot
fig_2.show()

In [317]:
# Scatter plot of each top track's popularity
fig_3 = px.scatter(
    data_frame=top_20_songs,
    x='track_name',
    y='popularity',
    title='Popularity of a Track',
    labels=dict(track_name='Track Name', popularity='Popularity of Track'),
)
# Tilt the track names so they stay readable
fig_3.update_layout(xaxis_tickangle=-45)
# NOTE(review): the describe() output above reports a maximum popularity of 100,
# so the upper bound of 150 leaves much of the plot empty - confirm it is intended.
fig_3.update_yaxes(range=[80, 150])
fig_3.show()

In [318]:
# Average popularity of the tracks in each genre, as a two-column DataFrame
popularity_by_genre = (
    data.groupby('track_genre')['popularity']
    .mean()
    .reset_index(name='average_popularity')
)
print(popularity_by_genre.iloc[:5])

   track_genre  average_popularity
0     acoustic            42.43988
1     afrobeat            24.39900
2     alt-rock            33.94300
3  alternative            24.33700
4      ambient            44.19100


In [319]:
# Bar chart of the average popularity per genre. The means are already computed,
# so px.bar draws exactly one bar per genre; px.histogram re-aggregates (sums) the
# y values per bin, and with nbins set it may merge several genres into one bin.
fig_4 = px.bar(
    popularity_by_genre,
    x='track_genre',
    y='average_popularity',
    labels={'track_genre': 'Genre', 'average_popularity': 'Average Popularity'},
    title='Distribution of Average Track Popularity by Genre'
)
fig_4.show()

In [320]:
# Scatter plot of the average popularity for each genre
fig_5 = px.scatter(
    data_frame=popularity_by_genre,
    x='track_genre',
    y='average_popularity',
    title='Average Popularity of Tracks by Genre',
    labels=dict(track_genre='Genre', average_popularity='Average Popularity'),
)
fig_5.show()

In [None]:
# In conclusion, the factors that help predict the commercial success of future tracks are as follows:
# The average duration of the top 20 tracks is 3 minutes and 23 seconds, with a standard deviation of 35 seconds; meaning no song should be longer than 3 minutes and 58 seconds, or shorter than 2 minutes and 48 seconds.
# The average danceability score for the top 20 tracks is .71 out of 1, with a standard deviation of .15; meaning no song should have a danceability above .86 or lower than .56.
# The average energy score for the top 20 tracks is .67 out of 1, with a standard deviation of .16; meaning no song should have a higher energy level than .83 or lower than .51.
# The most commonly used keys are G (occurring 20% of the time out of the selected population of tracks) and D (occurring 15% of the time out of the selected population of tracks); meaning there will be a higher chance of success if using these two keys when constructing a new track.
# The average tempo for these top 20 tracks is 121.23 beats per minute with a standard deviation of 22.92; meaning no song should be faster than 144.15 beats per minute or slower than 98.31 beats per minute.


