## Task: Clean data to be used for our modeling

For example: dropping null values, removing unnecessary columns, removing outliers, and fixing incorrectly formatted data where needed.

In [1]:
import pandas as pd
import matplotlib as plt 
import seaborn as sns 
import numpy as np
from sklearn.preprocessing import LabelEncoder


In [2]:
# Load the training split from the shared data directory into a DataFrame.
audio_df = pd.read_csv(filepath_or_buffer='../data/train.csv')

In [3]:
# Preview the first five rows of the raw data to see which columns are present.
audio_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,dating,violence,world/life,...,communication,obscene,music,movement/places,light/visual perceptions,family/spiritual,sadness,feelings,topic,age
0,0,mukesh,mohabbat bhi jhoothi,1950,pop,hold time feel break feel untrue convince spea...,95,0.000598,0.063746,0.000598,...,0.263751,0.000598,0.039288,0.000598,0.000598,0.000598,0.380299,0.117175,sadness,1.0
1,4,frankie laine,i believe,1950,pop,believe drop rain fall grow believe darkest ni...,51,0.035537,0.096777,0.443435,...,0.001284,0.001284,0.118034,0.001284,0.212681,0.051124,0.001284,0.001284,world/life,1.0
2,6,johnnie ray,cry,1950,pop,sweetheart send letter goodbye secret feel bet...,24,0.00277,0.00277,0.00277,...,0.250668,0.00277,0.323794,0.00277,0.00277,0.00277,0.00277,0.225422,music,1.0
3,10,pérez prado,patricia,1950,pop,kiss lips want stroll charm mambo chacha merin...,54,0.048249,0.001548,0.001548,...,0.001548,0.001548,0.001548,0.12925,0.001548,0.001548,0.225889,0.001548,romantic,1.0
4,12,giorgos papadopoulos,apopse eida oneiro,1950,pop,till darling till matter know till dream live ...,48,0.00135,0.00135,0.417772,...,0.00135,0.00135,0.00135,0.00135,0.00135,0.029755,0.0688,0.00135,romantic,1.0


In [4]:
# Drop unnecessary columns (non-numerical columns and columns with no
# significant insights) so that only the model features remain.
audio_clean_df = audio_df.drop(
    columns=['Unnamed: 0', 'lyrics', 'artist_name', 'track_name', 'genre', 'topic']
)


In [5]:
# Preview the first five rows to confirm the dropped columns are gone.
audio_clean_df.head(n=5)

Unnamed: 0,release_date,len,dating,violence,world/life,night/time,shake the audience,family/gospel,romantic,communication,obscene,music,movement/places,light/visual perceptions,family/spiritual,sadness,feelings,age
0,1950,95,0.000598,0.063746,0.000598,0.000598,0.000598,0.048857,0.017104,0.263751,0.000598,0.039288,0.000598,0.000598,0.000598,0.380299,0.117175,1.0
1,1950,51,0.035537,0.096777,0.443435,0.001284,0.001284,0.027007,0.001284,0.001284,0.001284,0.118034,0.001284,0.212681,0.051124,0.001284,0.001284,1.0
2,1950,24,0.00277,0.00277,0.00277,0.00277,0.00277,0.00277,0.158564,0.250668,0.00277,0.323794,0.00277,0.00277,0.00277,0.00277,0.225422,1.0
3,1950,54,0.048249,0.001548,0.001548,0.001548,0.0215,0.001548,0.411536,0.001548,0.001548,0.001548,0.12925,0.001548,0.001548,0.225889,0.001548,1.0
4,1950,48,0.00135,0.00135,0.417772,0.00135,0.00135,0.00135,0.46343,0.00135,0.00135,0.00135,0.00135,0.00135,0.029755,0.0688,0.00135,1.0


In [6]:
# Min-max scale every column to the range 1-10 so that all columns are treated
# equally and no single column dominates clustering.
scale_low, scale_high = 1, 10

col_min = audio_clean_df.min()
col_range = audio_clean_df.max() - col_min
# A constant column has a range of 0, and dividing by it would turn the whole
# column into NaN. Divide by 1 instead so such a column maps to scale_low.
col_range = col_range.replace(0, 1)

# Shift each column so its minimum is 0, divide by its range to land in 0-1,
# then stretch and shift into scale_low..scale_high.
audio_clean_df = (audio_clean_df - col_min) / col_range * (scale_high - scale_low) + scale_low



In [7]:
# The minimum (1) and maximum (10) of every column showcase successful scaling.

audio_clean_df.describe()

Unnamed: 0,release_date,len,dating,violence,world/life,night/time,shake the audience,family/gospel,romantic,communication,obscene,music,movement/places,light/visual perceptions,family/spiritual,sadness,feelings,age
count,28362.0,28362.0,28362.0,28362.0,28362.0,28362.0,28362.0,28362.0,28362.0,28362.0,28362.0,28362.0,28362.0,28362.0,28362.0,28362.0,28362.0,28362.0
mean,6.24865,4.274115,1.289413,2.082816,2.129363,1.527638,1.310161,1.276696,1.463072,2.064596,1.879092,1.562378,1.665151,1.656948,1.347746,2.184392,1.288313,4.75135
std,2.411347,1.901437,0.727969,1.638237,1.611479,1.034551,0.735991,0.693037,1.015025,1.526585,1.644976,1.16042,1.292113,1.207456,0.743438,1.661683,0.672808,2.411347
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,4.26087,2.863636,1.008794,1.00766,1.008223,1.006868,1.012826,1.010472,1.006604,1.011898,1.006926,1.006449,1.009999,1.009554,1.009796,1.007886,1.006609,2.565217
50%,6.347826,3.818182,1.016281,1.020373,1.05884,1.01535,1.023721,1.020057,1.014066,1.032635,1.013842,1.014354,1.019196,1.020635,1.019816,1.045669,1.013757,4.652174
75%,8.434783,5.181818,1.052239,2.762897,2.84866,1.605505,1.17567,1.074236,1.402102,2.837824,1.803002,1.515735,1.763324,1.863159,1.367556,3.154099,1.303544,6.73913
max,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0


In [8]:
# Persist the cleaned, scaled DataFrame as a new CSV file to be used in the next step.
# NOTE(review): to_csv writes the row index by default, so the file gets an extra
# unnamed first column; confirm what the next step expects before passing index=False.
audio_clean_df.to_csv(path_or_buf='../data/audio_cleaned.csv')