In [22]:
import os
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_digits
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

from sklearn.model_selection import GridSearchCV
import pickle


In [4]:
df = pd.read_csv("data/extracted_features_df.csv")

In [29]:
df.head()

Unnamed: 0.1,Unnamed: 0,patient_id_x,file_key,Mean Zero Crossing Rate,Mean RMS,Standard Dev. RMS,Skewness RMS,Mean Spectral Centroid,Mean Spectral Bandwidth,Mean Spectral Contrast,...,AS,AR,MR,MS,N,Age,Gender,Smoker,Lives,audio_filename_base
0,0,patient_001,MD_001_sup_Mit,0.011958,0.143142,0.034833,-1.096259,34.890196,40.113631,19.738395,...,1,1,1,1,0,35,M,1,U,MD_001_sup_Mit
1,1,patient_002,MR_002_sup_Mit,0.01221,0.165092,0.027566,-0.316046,33.460782,38.87217,21.016974,...,0,0,1,0,0,37,M,1,U,MR_002_sup_Mit
2,2,patient_003,MD_003_sup_Mit,0.014092,0.173574,0.043529,0.443779,37.845913,37.87252,19.996377,...,1,1,0,0,0,19,M,0,U,MD_003_sup_Mit
3,3,patient_004,MR_004_sup_Mit,0.0117,0.108412,0.038909,-0.287181,34.718984,45.096458,20.884205,...,0,0,1,0,0,21,M,0,R,MR_004_sup_Mit
4,4,patient_005,AS_005_sup_Mit,0.012714,0.148922,0.036316,0.305797,36.790153,47.651187,20.9483,...,1,0,0,0,0,45,M,0,R,AS_005_sup_Mit


In [5]:
#Renaming some columns
df = df.rename(columns = {"patient_id_x":"patient_id"})


In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,patient_id,file_key,Mean Zero Crossing Rate,Mean RMS,Standard Dev. RMS,Skewness RMS,Mean Spectral Centroid,Mean Spectral Bandwidth,Mean Spectral Contrast,...,AS,AR,MR,MS,N,Age,Gender,Smoker,Lives,audio_filename_base
0,0,patient_001,MD_001_sup_Mit,0.011958,0.143142,0.034833,-1.096259,34.890196,40.113631,19.738395,...,1,1,1,1,0,35,M,1,U,MD_001_sup_Mit
1,1,patient_002,MR_002_sup_Mit,0.01221,0.165092,0.027566,-0.316046,33.460782,38.87217,21.016974,...,0,0,1,0,0,37,M,1,U,MR_002_sup_Mit
2,2,patient_003,MD_003_sup_Mit,0.014092,0.173574,0.043529,0.443779,37.845913,37.87252,19.996377,...,1,1,0,0,0,19,M,0,U,MD_003_sup_Mit
3,3,patient_004,MR_004_sup_Mit,0.0117,0.108412,0.038909,-0.287181,34.718984,45.096458,20.884205,...,0,0,1,0,0,21,M,0,R,MR_004_sup_Mit
4,4,patient_005,AS_005_sup_Mit,0.012714,0.148922,0.036316,0.305797,36.790153,47.651187,20.9483,...,1,0,0,0,0,45,M,0,R,AS_005_sup_Mit


In [7]:
#Binary encoding data
non_numerics = df.select_dtypes(include = ['object'])
non_numerics

Unnamed: 0,patient_id,file_key,mfcc devation,Gender,Lives,audio_filename_base
0,patient_001,MD_001_sup_Mit,[5.3091073 5.950165 3.084269 2.7014892 3.689...,M,U,MD_001_sup_Mit
1,patient_002,MR_002_sup_Mit,[37.945606 7.4402657 10.91368 8.897607...,M,U,MR_002_sup_Mit
2,patient_003,MD_003_sup_Mit,[18.070349 5.320191 6.4636636 5.592787...,M,U,MD_003_sup_Mit
3,patient_004,MR_004_sup_Mit,[7.723736 9.400514 7.1840897 4.756417 3.603...,M,R,MR_004_sup_Mit
4,patient_005,AS_005_sup_Mit,[29.825611 11.928112 10.3983345 6.070464 ...,M,R,AS_005_sup_Mit
...,...,...,...,...,...,...
858,patient_105,N_105_sit_Aor,[37.81651 14.35403 9.865765 5.9906583 ...,M,U,N_105_sit_Aor
859,patient_106,N_106_sit_Aor,[16.249163 6.0424666 7.0737386 4.5677705 ...,F,U,N_106_sit_Aor
860,patient_107,N_107_sit_Aor,[5.785018 6.274968 2.5343313 3.94651 5.539...,F,U,N_107_sit_Aor
861,patient_108,N_108_sit_Aor,[42.99479 12.664107 13.271154 8.355676 ...,F,U,N_108_sit_Aor


In [8]:
#Checking the number of columns and rows

print(f'The number of rows a in this dataframe are {df.shape[0]} and number of columns is {df.shape[1]}')

The number of rows a in this dataframe are 863 and number of columns is 58


In [10]:
#Replacing alphabetic categorical data with numeric binary values
'''
Lives - U(Urban) = 1
      - R (Rural) = 0
Gender - M(Male) = 1
        -F(Female) = 0
'''


#Lives
df['Lives'].replace('U',1,inplace=True)
df['Lives'].replace('R',0,inplace= True)
#Gender
df['Gender'].replace('M',1,inplace=True)
df['Gender'].replace('F',0,inplace= True)
df

Unnamed: 0.1,Unnamed: 0,patient_id,file_key,Mean Zero Crossing Rate,Mean RMS,Standard Dev. RMS,Skewness RMS,Mean Spectral Centroid,Mean Spectral Bandwidth,Mean Spectral Contrast,...,AS,AR,MR,MS,N,Age,Gender,Smoker,Lives,audio_filename_base
0,0,patient_001,MD_001_sup_Mit,0.011958,0.143142,0.034833,-1.096259,34.890196,40.113631,19.738395,...,1,1,1,1,0,35,1,1,1,MD_001_sup_Mit
1,1,patient_002,MR_002_sup_Mit,0.012210,0.165092,0.027566,-0.316046,33.460782,38.872170,21.016974,...,0,0,1,0,0,37,1,1,1,MR_002_sup_Mit
2,2,patient_003,MD_003_sup_Mit,0.014092,0.173574,0.043529,0.443779,37.845913,37.872520,19.996377,...,1,1,0,0,0,19,1,0,1,MD_003_sup_Mit
3,3,patient_004,MR_004_sup_Mit,0.011700,0.108412,0.038909,-0.287181,34.718984,45.096458,20.884205,...,0,0,1,0,0,21,1,0,0,MR_004_sup_Mit
4,4,patient_005,AS_005_sup_Mit,0.012714,0.148922,0.036316,0.305797,36.790153,47.651187,20.948300,...,1,0,0,0,0,45,1,0,0,AS_005_sup_Mit
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
858,858,patient_105,N_105_sit_Aor,0.011504,0.187178,0.032507,0.051966,35.285902,46.440528,21.716723,...,0,0,0,0,1,22,1,0,1,N_105_sit_Aor
859,859,patient_106,N_106_sit_Aor,0.010419,0.203321,0.034671,0.125428,34.521189,48.309507,21.559607,...,0,0,0,0,1,20,0,0,1,N_106_sit_Aor
860,860,patient_107,N_107_sit_Aor,0.009464,0.166443,0.046930,-0.038828,33.521768,45.466764,21.750754,...,0,0,0,0,1,20,0,0,1,N_107_sit_Aor
861,861,patient_108,N_108_sit_Aor,0.003866,0.262939,0.042581,0.718949,31.699348,48.340479,22.133920,...,0,0,0,0,1,20,0,0,1,N_108_sit_Aor


In [11]:
mfcc_deviation_df = df["mfcc devation"].apply(pd.Series)
mfcc_deviation_df.columns = [f"mfcc_dev_{i}" for i in mfcc_deviation_df.columns]
df = df.drop(columns=["mfcc devation"]).join(mfcc_deviation_df)
df = df.drop(columns = "mfcc_dev_0")


In [12]:
#dropping other non-numeric values
df.drop([ "file_key","audio_filename_base","patient_id"],axis=1,inplace=True)



In [13]:
#Checking for missing values
df.isnull().sum()



Unnamed: 0                          0
Mean Zero Crossing Rate             0
Mean RMS                            0
Standard Dev. RMS                   0
Skewness RMS                        0
Mean Spectral Centroid              0
Mean Spectral Bandwidth             0
Mean Spectral Contrast              0
mfcc length                         0
mean mel spectogram                 0
mel spectrogram deviation           0
CQT Mean                            0
CQT Std                             0
CQT Skew                            0
loudness_sma3_amean                 2
loudness_sma3_stddevNorm            2
loudness_sma3_percentile20.0        2
loudness_sma3_percentile50.0        2
loudness_sma3_percentile80.0        2
loudness_sma3_pctlrange0-2          2
loudness_sma3_meanRisingSlope       2
loudness_sma3_stddevRisingSlope     2
loudness_sma3_meanFallingSlope      2
loudness_sma3_stddevFallingSlope    2
spectralFlux_sma3_amean             2
spectralFlux_sma3_stddevNorm        2
mfcc1_sma3_a

In [14]:
df = df.dropna()
df.isnull().sum()


Unnamed: 0                          0
Mean Zero Crossing Rate             0
Mean RMS                            0
Standard Dev. RMS                   0
Skewness RMS                        0
Mean Spectral Centroid              0
Mean Spectral Bandwidth             0
Mean Spectral Contrast              0
mfcc length                         0
mean mel spectogram                 0
mel spectrogram deviation           0
CQT Mean                            0
CQT Std                             0
CQT Skew                            0
loudness_sma3_amean                 0
loudness_sma3_stddevNorm            0
loudness_sma3_percentile20.0        0
loudness_sma3_percentile50.0        0
loudness_sma3_percentile80.0        0
loudness_sma3_pctlrange0-2          0
loudness_sma3_meanRisingSlope       0
loudness_sma3_stddevRisingSlope     0
loudness_sma3_meanFallingSlope      0
loudness_sma3_stddevFallingSlope    0
spectralFlux_sma3_amean             0
spectralFlux_sma3_stddevNorm        0
mfcc1_sma3_a

In [15]:
print(f'The number of rows a in this dataframe are {df.shape[0]} and number of columns is {df.shape[1]}.')

The number of rows a in this dataframe are 861 and number of columns is 54.


In [16]:
#labels
y = df[['AS', 'AR',
       'MR', 'MS', 'N']]
#features
X = df.drop(['AS','MR',"AR","MS","N"],axis = 'columns')

In [17]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=1234,test_size = 0.3)



In [18]:
rf_model = RandomForestClassifier(n_estimators = 100,
                                criterion = 'entropy',
                                 max_depth = 32 )

rf_model.fit(X_train,y_train)

0,1,2
,n_estimators,100
,criterion,'entropy'
,max_depth,32
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [31]:
df.to_csv('output.csv')
X_train.to_csv('X_train.csv')
X_test.to_csv('X_test.csv')
y_train.to_csv('y_train.csv')
y_test.to_csv('y_test.csv')

In [None]:
y_rf_pred = rf_model.predict(X_test)
rf_rmse = mean_squared_error(y_test,y_rf_pred )

# YOUR CODE HERE
rf_r2 = r2_score(y_test,y_rf_pred)

print('[RF] Root Mean Squared Error: {0}'.format(rf_rmse))
print('[RF] R2: {0}'.format(rf_r2)) 


rf_accuracy = accuracy_score(y_test,y_rf_pred)

print(f'Accuracy of our Random Forest model : {rf_accuracy}')

f1 = f1_score(y_test, y_rf_pred,average='weighted')
print(f"F1 Score: {f1}")



[RF] Root Mean Squared Error: 0.2
[RF] R2: 0.1295373402488784
Accuracy of our linear regression model : 0.444015444015444
F1 Score: 0.6517123863831307


In [20]:
#hyperparameter tuning FOR RF

print('Running Grid Search...')


param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'max_features': ['sqrt', 'log2']
}
grid = GridSearchCV(rf_model, param_grid, cv=5)

grid_search = grid.fit(X_train,y_train)
print('Done')

Running Grid Search...
Done


In [21]:
rf_model_best = grid_search.best_estimator_

rf_model_best

0,1,2
,n_estimators,200
,criterion,'entropy'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [23]:
filename = 'best_rf_model.pkl'

with open(filename, 'wb') as file:
    pickle.dump(rf_model_best, file)