In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import os

In [2]:
# Read the two source CSV files into pandas DataFrames.
# NOTE(review): 'Resouces' looks like a misspelling of 'Resources' - confirm it
# matches the directory name on disk before changing it.
resources_dir = os.path.join('..', 'Resouces')

spotify_df = pd.read_csv(
    os.path.join(resources_dir, 'genres_v2.csv'),
    delimiter=',',
    low_memory=False,
)

# dataset-long.csv: its first column is used as the row index.
long_df = pd.read_csv(
    os.path.join(resources_dir, 'dataset-long.csv'),
    delimiter=',',
    low_memory=False,
    index_col=0,
)

In [3]:
# Columns of genres_v2.csv (loaded above as spotify_df)
spotify_df.columns

Index(['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'type', 'id', 'uri', 'track_href', 'analysis_url', 'duration_ms',
       'time_signature', 'genre', 'song_name', 'Unnamed: 0', 'title'],
      dtype='object')

In [4]:
# Columns of dataset-long.csv (loaded above as long_df; its first CSV column
# became the index, so it is not listed here)
long_df.columns

Index(['track_id', 'artists', 'album_name', 'track_name', 'popularity',
       'duration_ms', 'explicit', 'danceability', 'energy', 'key', 'loudness',
       'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'time_signature', 'track_genre'],
      dtype='object')

In [5]:
# Number of rows per genre in genres_v2.csv, most frequent genre first
spotify_df['genre'].value_counts()

Underground Rap    5875
Dark Trap          4578
Hiphop             3028
trance             2999
trap               2987
techhouse          2975
dnb                2966
psytrance          2961
techno             2956
hardstyle          2936
RnB                2099
Trap Metal         1956
Rap                1848
Emo                1680
Pop                 461
Name: genre, dtype: int64

In [6]:
# Number of rows per track_genre in the raw dataset-long.csv load (before any
# de-duplication): dataset-long.csv has 1000 songs for each genre
long_df['track_genre'].value_counts()

acoustic             1000
punk-rock            1000
progressive-house    1000
power-pop            1000
pop                  1000
                     ... 
folk                 1000
emo                  1000
electronic           1000
electro              1000
world-music          1000
Name: track_genre, Length: 114, dtype: int64

In [7]:
# De-duplicate on track_id, retaining the first row seen for each track, and
# renumber the remaining rows from 0.
# If a track_id appears under several genres, only its first row's genre survives.
long_df_dd = (
    long_df
    .drop_duplicates(subset='track_id', keep='first')
    .reset_index(drop=True)
)

In [8]:
# Check the number of rows and columns left after dropping duplicate track_ids
long_df_dd.shape

(89741, 20)

In [9]:
# Replace popularity values of 0 with 1.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the column selection: an in-place method on
# df[col] is chained assignment, which emits a FutureWarning in pandas 2.x and
# leaves the DataFrame unchanged once Copy-on-Write is enabled.
long_df_dd['popularity'] = long_df_dd['popularity'].replace(0, 1)

In [10]:
# Inspect the dtype of every column in the de-duplicated frame
long_df_dd.dtypes

track_id             object
artists              object
album_name           object
track_name           object
popularity            int64
duration_ms           int64
explicit               bool
danceability        float64
energy              float64
key                   int64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
time_signature        int64
track_genre          object
dtype: object

In [11]:
# Cast left disabled: the dtypes output above already reports popularity as int64.
# long_df_dd['popularity'] = long_df_dd['popularity'].astype('int')

In [12]:
# Median of every numeric column per genre, ordered from the highest to the
# lowest median popularity.
# GroupBy.median's only parameter is numeric_only, so median('popularity') did
# not select a column - the string was merely treated as a truthy flag.
# Spell the flag out so the intent is explicit and survives argument validation.
filter_df = (
    long_df_dd
    .groupby('track_genre')
    .median(numeric_only=True)
    .sort_values('popularity', ascending=False)
)
filter_df

Unnamed: 0_level_0,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
track_genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
pop,65.0,211995.5,0.0,0.6005,0.6160,6.0,-7.2255,0.5,0.04665,0.41200,0.000000,0.1110,0.4655,117.5995,4.0
metal,63.0,235297.5,0.0,0.4970,0.8885,5.0,-4.5050,1.0,0.05810,0.00364,0.000032,0.1590,0.3955,130.0390,4.0
k-pop,61.0,232150.0,0.0,0.6500,0.7120,5.0,-5.9020,0.0,0.05560,0.19300,0.000000,0.1270,0.5860,119.8720,4.0
pop-film,60.0,283070.0,0.0,0.5990,0.6060,5.0,-7.5660,1.0,0.04070,0.44500,0.000003,0.1220,0.5350,114.0580,4.0
hip-hop,59.0,200547.0,0.0,0.7405,0.7080,6.0,-5.8910,0.0,0.09370,0.11500,0.000000,0.1290,0.5525,108.9660,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
romance,1.0,189106.5,0.0,0.4210,0.2745,5.0,-12.6955,0.0,0.04640,0.95000,0.000099,0.1375,0.3570,106.1775,4.0
jazz,1.0,162680.0,0.0,0.4840,0.2970,5.0,-11.7810,1.0,0.03700,0.81200,0.000006,0.1205,0.5240,117.0530,4.0
rock,1.0,217906.0,0.0,0.5590,0.6795,5.0,-5.8335,1.0,0.03610,0.09155,0.000017,0.1235,0.4740,117.9160,4.0
latin,1.0,208333.0,0.0,0.7500,0.7370,6.0,-4.8100,1.0,0.07440,0.13200,0.000000,0.1230,0.6470,105.0160,4.0


In [13]:
# Names of the 10 genres with the highest median popularity.
# From here on filter_df holds a plain list of genre names (taken from the
# index of the sorted medians), which the next cell passes to isin().
# median(numeric_only=True) replaces median('popularity'): the method's only
# parameter is numeric_only, so the string was acting as a truthy flag rather
# than picking a column.
filter_df = (
    long_df_dd
    .groupby('track_genre')
    .median(numeric_only=True)
    .sort_values('popularity', ascending=False)
    .head(10)
    .index
    .tolist()
)
filter_df

['pop',
 'metal',
 'k-pop',
 'pop-film',
 'hip-hop',
 'chill',
 'edm',
 'singer-songwriter',
 'grunge',
 'british']

In [14]:
# Build a new DataFrame holding only the tracks whose genre is in the
# top-10 list (filter_df), with the rows renumbered from 0.
in_top_genres = long_df_dd['track_genre'].isin(filter_df)
long_df_filtered = long_df_dd[in_top_genres].reset_index(drop=True)

In [15]:
# Preview the filtered tracks from most to least popular.
# sort_values returns a new frame, so long_df_filtered keeps its own row order.
long_df_filtered.sort_values(by='popularity', ascending=False)

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
4022,2tTmW7RDtMQtBk7m2rYeSw,Bizarrap;Quevedo,"Quevedo: Bzrp Music Sessions, Vol. 52","Quevedo: Bzrp Music Sessions, Vol. 52",99,198937,False,0.621,0.782,2,-5.548,1,0.0440,0.0125,0.033000,0.2300,0.550,128.033,4,hip-hop
6252,4LRPiXqCikLlN15c3yImP7,Harry Styles,As It Was,As It Was,95,167303,False,0.520,0.731,6,-5.338,0,0.0557,0.3420,0.001010,0.3110,0.662,173.930,4,pop
6273,6xGruZOHLs39ZbVccQTuPZ,Joji,Glimpse of Us,Glimpse of Us,94,233456,False,0.440,0.317,8,-9.258,1,0.0531,0.8910,0.000005,0.1410,0.268,169.914,3,pop
912,3JvKfv6T31zO0ini8iNItO,Tom Odell,Long Way Down (Deluxe),Another Love,93,244360,True,0.445,0.537,4,-8.532,0,0.0400,0.6950,0.000017,0.0944,0.131,122.769,4,chill
6271,0WtM2NBVQNNJLh6scP13H8,Rema;Selena Gomez,Calm Down (with Selena Gomez),Calm Down (with Selena Gomez),92,239317,False,0.801,0.806,11,-5.206,1,0.0381,0.3820,0.000669,0.1140,0.802,106.999,4,pop
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6336,7s2OAgMDtuwq8F3DaBhunU,Ed Sheeran,Music for Rainy Days,Autumn Leaves,1,200755,False,0.555,0.257,10,-12.637,1,0.0299,0.8920,0.000001,0.1240,0.431,89.013,4,pop
6338,1euRAecw6xEwkt6lPXMbFP,Ed Sheeran,Today's Party Hits,Shivers,1,207853,False,0.788,0.859,2,-2.724,1,0.0856,0.2810,0.000000,0.0424,0.822,141.020,4,pop
6340,4JWlQ6skcm0SuztTiapWeo,Ed Sheeran,Coffee Moment,How Would You Feel (Paean),1,280533,False,0.618,0.439,9,-5.630,1,0.0269,0.4240,0.000000,0.1270,0.242,139.983,4,pop
6342,0MbP1Nr6DLXB7HFXD7QVwU,Ed Sheeran,20's Rock,Visiting Hours,1,215506,False,0.471,0.396,8,-6.654,1,0.0336,0.7700,0.000000,0.0729,0.263,149.609,4,pop


In [16]:
# Write the filtered tracks to ../Outputs/cleaned_data.csv without the row index.
cleaned_csv_path = os.path.join('..', 'Outputs', 'cleaned_data.csv')
long_df_filtered.to_csv(cleaned_csv_path, index=False)

In [17]:
# Write the same rows to ../Outputs/cleaned_data.json as a list of
# one-object-per-row records.
cleaned_json_path = os.path.join('..', 'Outputs', 'cleaned_data.json')
long_df_filtered.to_json(cleaned_json_path, orient='records')

In [18]:
import json

In [19]:
# Read the exported JSON back in as a list of records.
# The file is opened in a with-block so the handle is closed as soon as the
# data has been parsed (the bare open() call left it open).
with open(os.path.join('..', 'Outputs', 'cleaned_data.json')) as json_file:
    js = list(json.load(json_file))

In [20]:
# Print the first record of the re-loaded JSON (prints nothing if the list is empty).
first_records = js[:1]
for record in first_records:
    print(record)

{'track_id': '0DuWDLjriRPjDRoPgaCslY', 'artists': 'Adele', 'album_name': '25', 'track_name': 'Love In The Dark', 'popularity': 78, 'duration_ms': 285935, 'explicit': False, 'danceability': 0.331, 'energy': 0.341, 'key': 9, 'loudness': -6.057, 'mode': 0, 'speechiness': 0.0309, 'acousticness': 0.528, 'instrumentalness': 0.0, 'liveness': 0.109, 'valence': 0.152, 'tempo': 109.821, 'time_signature': 4, 'track_genre': 'british'}


In [25]:
# Inspect the record at position 2460 of the re-loaded JSON list
js[2460]

{'track_id': '1kwHrBWAlucOE57lWugotk',
 'artists': 'Felix Jaehn;Zoe Wees',
 'album_name': 'Do It Better (feat. Zoe Wees)',
 'track_name': 'Do It Better (feat. Zoe Wees)',
 'popularity': 76,
 'duration_ms': 170045,
 'explicit': False,
 'danceability': 0.677,
 'energy': 0.804,
 'key': 1,
 'loudness': -5.337,
 'mode': 0,
 'speechiness': 0.0677,
 'acousticness': 0.0818,
 'instrumentalness': 1.92e-05,
 'liveness': 0.158,
 'valence': 0.41,
 'tempo': 122.111,
 'time_signature': 4,
 'track_genre': 'edm'}

In [26]:
# Inspect the record at position 2461 of the re-loaded JSON list
js[2461]

{'track_id': '1JPKjo7oZkYdjh0x2x2DVf',
 'artists': 'twocolors;Kairos Grove',
 'album_name': 'EDM Gaming Music Autumn/Winter 2022',
 'track_name': 'BACK TO LIFE - TC/TC',
 'popularity': 1,
 'duration_ms': 180000,
 'explicit': False,
 'danceability': 0.589,
 'energy': 0.838,
 'key': 10,
 'loudness': -5.945,
 'mode': 0,
 'speechiness': 0.0417,
 'acousticness': 0.00587,
 'instrumentalness': 0.228,
 'liveness': 0.379,
 'valence': 0.322,
 'tempo': 126.113,
 'time_signature': 4,
 'track_genre': 'edm'}