## Import Datasets

In [3]:
import pandas as pd

general_df = pd.DataFrame()

# Wanted Song Features:

# ID                    The Spotify ID for the track.
# Name                  The name of the track.
# Artist                The name of the artist.
# Energy-	            A measure of intensity and activity. Typically, energetic tracks feel fast, loud, and noisy.
# Tempo-                The speed of a track, measured in beats per minute (BPM).
# Danceability-	        A score describing how suitable a track is for dancing based on tempo, rhythm stability, beat strength and overall regularity.
# Loudness-	            The overall loudness of a track in decibels (dB). Higher values indicate louder tracks overall.
# Liveness-	            The likelihood of a track being performed live. Higher values suggest more audience presence.
# Valence-	            The overall musical positiveness(emotion) of a track. High valence sounds happy; low valence sounds sad or angry.
# Speechiness-	        Measures the presence of spoken words.
# Instrumentalness-	    The likelihood a track contains no vocals. Values closer to 1.0 suggest solely instrumental tracks.
# Mode	                Indicates the modality of the track.
# Key	                The musical key, represented as an integer from 0 to 11, mapping to standard Pitch class notation.
# Duration_ms	        The length of the track in milliseconds.
# Acousticness-	        A confidence measure of whether a track is acoustic(1) or not(0).

df_1 = pd.read_csv('../data/tracks_features.csv') # Taken from: https://www.kaggle.com/datasets/rodolfofigueroa/spotify-12m-songs

# Song Features: ALL

df_2 = pd.read_csv('../data/dataset.csv') # Taken from: https://www.kaggle.com/datasets/maharshipandya/-spotify-tracks-dataset

# Song Features: ALL

df_3 = pd.read_csv('../data/high_popularity_spotify_data.csv') # Taken from: https://www.kaggle.com/datasets/solomonameh/spotify-music-dataset    

# Song Features: ALL

df_4 = pd.read_csv('../data/data.csv') # Taken from: https://www.kaggle.com/datasets/ektanegi/spotifydata-19212020

# Song Features: ALL

# df_5 = pd.read_csv('../data/genres_v2.csv') # Taken from: https://www.kaggle.com/datasets/mrmorj/dataset-of-songs-in-spotify

# Song Features: NO Artist Name

## Present Datasets

### Dataset 1

In [4]:
# df_1.head()

In [5]:
df_1 = df_1.loc[:, ['id', 'name', 'artists', 'danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']]
df_1.head()

Unnamed: 0,id,name,artists,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,7lmeHLHBe4nmXzuXc0HDjk,Testify,['Rage Against The Machine'],0.47,0.978,-5.399,0.0727,0.0261,1.1e-05,0.356,0.503,117.906
1,1wsRitfRRtWyEapl0q22o8,Guerrilla Radio,['Rage Against The Machine'],0.599,0.957,-5.764,0.188,0.0129,7.1e-05,0.155,0.489,103.68
2,1hR0fIFK2qRG3f3RF70pb7,Calm Like a Bomb,['Rage Against The Machine'],0.315,0.97,-5.424,0.483,0.0234,2e-06,0.122,0.37,149.749
3,2lbASgTSoDO7MTuLAXlTW0,Mic Check,['Rage Against The Machine'],0.44,0.967,-5.83,0.237,0.163,4e-06,0.121,0.574,96.752
4,1MQTmpYOZ6fcMQc56Hdo7T,Sleep Now In the Fire,['Rage Against The Machine'],0.426,0.929,-6.729,0.0701,0.00162,0.105,0.0789,0.539,127.059


### Dataset 2

In [6]:
# df_2.head()

In [7]:
df_2 = df_2.rename(columns={'track_id': 'id', 'track_name': 'name'})
df_2 = df_2.loc[:, ['id', 'name', 'artists', 'danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']]

df_2["artists"] = df_2["artists"].apply(
    lambda x: [artist.strip() for artist in x.split(";")] if isinstance(x, str) else x
)

df_2.head()

Unnamed: 0,id,name,artists,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,5SuOikwiRyPMVoIQDJUgSV,Comedy,[Gen Hoshino],0.676,0.461,-6.746,0.143,0.0322,1e-06,0.358,0.715,87.917
1,4qPNDBW1i3p13qLCt0Ki3A,Ghost - Acoustic,[Ben Woodward],0.42,0.166,-17.235,0.0763,0.924,6e-06,0.101,0.267,77.489
2,1iJBSr7s7jYXzM8EGcbK5b,To Begin Again,"[Ingrid Michaelson, ZAYN]",0.438,0.359,-9.734,0.0557,0.21,0.0,0.117,0.12,76.332
3,6lfxq3CG4xtTiEg7opyCyx,Can't Help Falling In Love,[Kina Grannis],0.266,0.0596,-18.515,0.0363,0.905,7.1e-05,0.132,0.143,181.74
4,5vjLSffimiIP26QG5WcN2K,Hold On,[Chord Overstreet],0.618,0.443,-9.681,0.0526,0.469,0.0,0.0829,0.167,119.949


### Dataset 3

In [8]:
# df_3.head()

In [9]:
df_3 = df_3.rename(columns={'track_name': 'name', 'track_artist': 'artists'})

df_3 = df_3.loc[:, ['id', 'name', 'artists', 'danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']]

df_3["artists"] = df_3["artists"].apply(
    lambda x: [artist.strip() for artist in x.split(",")] if isinstance(x, str) else x
)

df_3.head()

Unnamed: 0,id,name,artists,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,2plbrEY59IikOBgBGLjaoe,Die With A Smile,"[Lady Gaga, Bruno Mars]",0.521,0.592,-7.777,0.0304,0.308,0.0,0.122,0.535,157.969
1,6dOtVTDdiauQNBQEDOtlAB,BIRDS OF A FEATHER,[Billie Eilish],0.747,0.507,-10.171,0.0358,0.2,0.0608,0.117,0.438,104.978
2,7ne4VBA60CxGM75vw0EYad,That’s So True,[Gracie Abrams],0.554,0.808,-4.169,0.0368,0.214,0.0,0.159,0.372,108.548
3,1d7Ptw3qYcfpdLNL5REhtJ,Taste,[Sabrina Carpenter],0.67,0.91,-4.07,0.0634,0.0939,0.0,0.304,0.786,112.966
4,5vNRhkKd0yEAg8suGBpjeY,APT.,"[ROSÉ, Bruno Mars]",0.777,0.783,-4.477,0.26,0.0283,0.0,0.355,0.939,149.027


### Dataset 4

In [10]:
# df_4.head()

In [11]:
df_4 = df_4.loc[:, ['id', 'name', 'artists', 'danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']]

df_4.head()

Unnamed: 0,id,name,artists,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,6KbQ3uYMLKb5jDxLF7wYDD,Singende Bataillone 1. Teil,['Carl Woitschach'],0.708,0.195,-12.428,0.0506,0.995,0.563,0.151,0.779,118.469
1,6KuQTIu1KoTTkLXKrwlLPV,"Fantasiestücke, Op. 111: Più tosto lento","['Robert Schumann', 'Vladimir Horowitz']",0.379,0.0135,-28.454,0.0462,0.994,0.901,0.0763,0.0767,83.972
2,6L63VW0PibdM1HDSBoqnoM,Chapter 1.18 - Zamek kaniowski,['Seweryn Goszczyński'],0.749,0.22,-19.924,0.929,0.604,0.0,0.119,0.88,107.177
3,6M94FkXd15sOAOQYRnWPN8,Bebamos Juntos - Instrumental (Remasterizado),['Francisco Canaro'],0.781,0.13,-14.734,0.0926,0.995,0.887,0.111,0.72,108.003
4,6N6tiFZ9vLTSOIxkj8qKrd,"Polonaise-Fantaisie in A-Flat Major, Op. 61","['Frédéric Chopin', 'Vladimir Horowitz']",0.21,0.204,-16.829,0.0424,0.99,0.908,0.098,0.0693,62.149


### Dataset Lengths

In [12]:
print("df_1: ", len(df_1), "entries")
print("df_2: ", len(df_2), "entries")
print("df_3: ", len(df_3), "entries")
print("df_4: ", len(df_4), "entries")
print("")
print("Sum: ", len(df_1) + len(df_2) + len(df_3) + len(df_4), "entries")

df_1:  1204025 entries
df_2:  114000 entries
df_3:  1686 entries
df_4:  169909 entries

Sum:  1489620 entries


## Build Dataset for SQLite Database

### Remove Duplicates

In [13]:
general_df = pd.concat([df_1, df_2, df_3, df_4], ignore_index=True)

general_df = general_df.drop_duplicates(subset=['id'], keep='first')

print("Length without duplicates: ", len(general_df), "entries")

Length without duplicates:  1430573 entries


### Create URLs for Embeds

In [14]:
embed_urls = []

for i in general_df["id"]:
    embed_urls.append(f"https://open.spotify.com/embed/track/{i}?utm_source=generator")

In [15]:
general_df["embed_urls"] = embed_urls

In [16]:
len(general_df), len(embed_urls)

(1430573, 1430573)

In [17]:
general_df.columns

Index(['id', 'name', 'artists', 'danceability', 'energy', 'loudness',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'embed_urls'],
      dtype='object')

In [18]:
general_df.head(10)

Unnamed: 0,id,name,artists,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,embed_urls
0,7lmeHLHBe4nmXzuXc0HDjk,Testify,['Rage Against The Machine'],0.47,0.978,-5.399,0.0727,0.0261,1.1e-05,0.356,0.503,117.906,https://open.spotify.com/embed/track/7lmeHLHBe...
1,1wsRitfRRtWyEapl0q22o8,Guerrilla Radio,['Rage Against The Machine'],0.599,0.957,-5.764,0.188,0.0129,7.1e-05,0.155,0.489,103.68,https://open.spotify.com/embed/track/1wsRitfRR...
2,1hR0fIFK2qRG3f3RF70pb7,Calm Like a Bomb,['Rage Against The Machine'],0.315,0.97,-5.424,0.483,0.0234,2e-06,0.122,0.37,149.749,https://open.spotify.com/embed/track/1hR0fIFK2...
3,2lbASgTSoDO7MTuLAXlTW0,Mic Check,['Rage Against The Machine'],0.44,0.967,-5.83,0.237,0.163,4e-06,0.121,0.574,96.752,https://open.spotify.com/embed/track/2lbASgTSo...
4,1MQTmpYOZ6fcMQc56Hdo7T,Sleep Now In the Fire,['Rage Against The Machine'],0.426,0.929,-6.729,0.0701,0.00162,0.105,0.0789,0.539,127.059,https://open.spotify.com/embed/track/1MQTmpYOZ...
5,2LXPNLSMAauNJfnC58lSqY,Born of a Broken Man,['Rage Against The Machine'],0.298,0.848,-5.947,0.0727,0.0538,0.00152,0.201,0.194,148.282,https://open.spotify.com/embed/track/2LXPNLSMA...
6,3moeHk8eIajvUEzVocXukf,Born As Ghosts,['Rage Against The Machine'],0.417,0.976,-6.032,0.175,0.000427,0.000134,0.107,0.483,90.395,https://open.spotify.com/embed/track/3moeHk8eI...
7,4llunZfVXv3NvUzXVB3VVL,Maria,['Rage Against The Machine'],0.277,0.873,-6.571,0.0883,0.00694,5.4e-05,0.188,0.618,172.848,https://open.spotify.com/embed/track/4llunZfVX...
8,21Mq0NzFoVRvOmLTOnJjng,Voice of the Voiceless,['Rage Against The Machine'],0.441,0.882,-7.363,0.044,0.0195,0.00684,0.15,0.418,83.371,https://open.spotify.com/embed/track/21Mq0NzFo...
9,6s2FgJbnnMwFTpWJZzvb6z,New Millennium Homes,['Rage Against The Machine'],0.448,0.861,-6.12,0.0676,0.00306,0.0,0.0987,0.761,92.777,https://open.spotify.com/embed/track/6s2FgJbnn...


In [19]:
general_df.to_csv('../data/all_data.csv', index=False)

In [20]:
len(general_df)

1430573

In [21]:
# get min value from column tempo

min_tempo = general_df['loudness'].min()

min_tempo

np.float64(-60.0)