## Task: Clean the data for modeling

Examples: dropping null values, removing unnecessary columns, removing outliers, and fixing incorrectly formatted data where needed.

In [1]:
import pandas as pd
import matplotlib as plt 
import seaborn as sns 
import numpy as np
from sklearn.preprocessing import LabelEncoder


In [22]:
# Load the raw music dataset from its relative location in the data folder.
raw_csv_path = '../data/tcc_ceds_music.csv'
audio_df = pd.read_csv(raw_csv_path)

In [23]:
# Preview the first five rows of the raw data (5 is pandas' default for head).
audio_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,dating,violence,world/life,...,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy,topic,age
0,0,mukesh,mohabbat bhi jhoothi,1950,pop,hold time feel break feel untrue convince spea...,95,0.000598,0.063746,0.000598,...,0.380299,0.117175,0.357739,0.454119,0.997992,0.901822,0.339448,0.13711,sadness,1.0
1,4,frankie laine,i believe,1950,pop,believe drop rain fall grow believe darkest ni...,51,0.035537,0.096777,0.443435,...,0.001284,0.001284,0.331745,0.64754,0.954819,2e-06,0.325021,0.26324,world/life,1.0
2,6,johnnie ray,cry,1950,pop,sweetheart send letter goodbye secret feel bet...,24,0.00277,0.00277,0.00277,...,0.00277,0.225422,0.456298,0.585288,0.840361,0.0,0.351814,0.139112,music,1.0
3,10,pérez prado,patricia,1950,pop,kiss lips want stroll charm mambo chacha merin...,54,0.048249,0.001548,0.001548,...,0.225889,0.001548,0.686992,0.744404,0.083935,0.199393,0.77535,0.743736,romantic,1.0
4,12,giorgos papadopoulos,apopse eida oneiro,1950,pop,till darling till matter know till dream live ...,48,0.00135,0.00135,0.417772,...,0.0688,0.00135,0.291671,0.646489,0.975904,0.000246,0.597073,0.394375,romantic,1.0


In [24]:
# Remove columns that are not wanted downstream: the leftover 'Unnamed: 0'
# column plus the text/categorical fields (lyrics, artist, track, genre, topic).
columns_to_drop = ['Unnamed: 0', 'lyrics', 'artist_name', 'track_name', 'genre', 'topic']

audio_clean_df = audio_df.drop(columns=columns_to_drop)


In [25]:
# Preview the first five rows after dropping columns (5 is pandas' default for head).
audio_clean_df.head(n=5)

Unnamed: 0,release_date,len,dating,violence,world/life,night/time,shake the audience,family/gospel,romantic,communication,...,like/girls,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy,age
0,1950,95,0.000598,0.063746,0.000598,0.000598,0.000598,0.048857,0.017104,0.263751,...,0.000598,0.380299,0.117175,0.357739,0.454119,0.997992,0.901822,0.339448,0.13711,1.0
1,1950,51,0.035537,0.096777,0.443435,0.001284,0.001284,0.027007,0.001284,0.001284,...,0.001284,0.001284,0.001284,0.331745,0.64754,0.954819,2e-06,0.325021,0.26324,1.0
2,1950,24,0.00277,0.00277,0.00277,0.00277,0.00277,0.00277,0.158564,0.250668,...,0.00277,0.00277,0.225422,0.456298,0.585288,0.840361,0.0,0.351814,0.139112,1.0
3,1950,54,0.048249,0.001548,0.001548,0.001548,0.0215,0.001548,0.411536,0.001548,...,0.081132,0.225889,0.001548,0.686992,0.744404,0.083935,0.199393,0.77535,0.743736,1.0
4,1950,48,0.00135,0.00135,0.417772,0.00135,0.00135,0.00135,0.46343,0.00135,...,0.00135,0.0688,0.00135,0.291671,0.646489,0.975904,0.000246,0.597073,0.394375,1.0


In [26]:
# Min-max scale every column onto the same 1-10 range so that no single feature
# dominates clustering simply because its raw values are larger.
SCALE_MIN, SCALE_MAX = 1, 10

col_min = audio_clean_df.min()
col_range = audio_clean_df.max() - col_min
# A constant column has a zero range; dividing by it would turn the whole column
# into NaN. Substituting 1 keeps the numerator at 0, so such a column maps to
# SCALE_MIN instead. Non-constant columns are unaffected.
col_range = col_range.replace(0, 1)

# Shift each column so its minimum is 0, normalise to 0-1, then stretch to 1-10.
audio_clean_df = (audio_clean_df - col_min) / col_range * (SCALE_MAX - SCALE_MIN) + SCALE_MIN

# NOTE(review): in the describe() output for the scaled frame, `age` and
# `release_date` have the same std and mirrored quartiles, which suggests one is
# a linear function of the other. Keeping both would count that information
# twice in distance-based clustering -- confirm and consider dropping one.



In [27]:
# Summary statistics of the scaled frame: the min and max rows show whether
# every column now spans the intended 1-10 range.
scaled_summary = audio_clean_df.describe()
scaled_summary

Unnamed: 0,release_date,len,dating,violence,world/life,night/time,shake the audience,family/gospel,romantic,communication,...,like/girls,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy,age
count,28372.0,28372.0,28372.0,28372.0,28372.0,28372.0,28372.0,28372.0,28372.0,28372.0,...,28372.0,28372.0,28372.0,28372.0,28372.0,28372.0,28372.0,28372.0,28372.0,28372.0
mean,6.24829,4.27402,1.28944,2.083047,2.129263,1.527926,1.310233,1.276701,1.463122,2.065003,...,1.420672,2.184279,1.288323,5.808684,6.987243,4.053111,1.722635,5.795774,6.128875,4.75171
std,2.411408,1.901356,0.728017,1.638472,1.611326,1.034841,0.736218,0.692994,1.01526,1.527166,...,0.885692,1.661624,0.672771,1.577759,0.975907,2.940429,1.906998,2.258747,2.199465,2.411408
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,4.26087,2.863636,1.008794,1.00766,1.008223,1.006868,1.012826,1.010472,1.006604,1.011898,...,1.010454,1.007886,1.006609,4.712266,6.35828,1.308121,1.0,3.962284,4.423249,2.565217
50%,6.347826,3.818182,1.016281,1.020373,1.05884,1.01535,1.023721,1.020057,1.014066,1.032635,...,1.019849,1.045669,1.013757,5.856626,7.111453,3.033124,1.000767,5.854287,6.225107,4.652174
75%,8.434783,5.181818,1.052239,2.76354,2.84809,1.606102,1.175902,1.074236,1.402073,2.838166,...,1.398932,3.15408,1.303584,6.93193,7.741231,6.692767,1.084266,7.64427,7.954891,6.73913
max,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,...,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0


In [28]:
# Persist the scaled features as a new CSV for the next step. The row index is
# written as well (pandas' default) as an unnamed first column; read the file
# back with index_col=0 to avoid picking up an extra 'Unnamed: 0' column.
cleaned_csv_path = '../data/audio_cleaned.csv'
audio_clean_df.to_csv(cleaned_csv_path, index=True)