In [1]:
#importing libraries
import pandas as pd
import numpy as np
from collections import Counter
from collections import defaultdict

from sklearn.feature_selection import VarianceThreshold, SelectKBest, SelectFromModel, RFE
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from sklearn.feature_selection import f_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# data visualization
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
plt.style.use("ggplot")
rcParams['figure.figsize'] = (12, 6)
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 300)

In [2]:
df = pd.read_csv('data/data_full_upto_varThr.csv') #quello con anche il test set
df_train = df.loc[:1827,:].copy() #solo train set, si lavora su questo per via del bias che si andrebbe a creare
df_train.head()

Unnamed: 0,vocal_channel,emotion,emotional_intensity,statement,repetition,sex,frame_count,sum,skew,lag1_kur,zc_sum,mfcc_sum,mfcc_mean,mfcc_max,mfcc_q05,mfcc_q25,mfcc_q50,mfcc_q75,mfcc_q95,mfcc_q99,sc_sum,sc_mean,sc_std,sc_min,sc_max,sc_q01,sc_q05,sc_q25,sc_q50,sc_q75,sc_q95,sc_q99,sc_kur,stft_sum,sum_w1,kur_w1,skew_w1,lag1_kur_w1,lag1_skew_w1,zc_sum_w1,zc_q95_w1,mfcc_sum_w1,mfcc_mean_w1,mfcc_std_w1,mfcc_max_w1,mfcc_q05_w1,mfcc_q25_w1,mfcc_q50_w1,mfcc_q75_w1,mfcc_q95_w1,mfcc_q99_w1,sc_sum_w1,sc_std_w1,sc_min_w1,sc_q25_w1,sc_q75_w1,sc_q95_w1,sc_kur_w1,sc_skew_w1,stft_sum_w1,stft_kur_w1,sum_w2,kur_w2,zc_sum_w2,mfcc_min_w2,mfcc_max_w2,mfcc_q75_w2,mfcc_q95_w2,mfcc_kur_w2,sc_sum_w2,sc_mean_w2,sc_std_w2,sc_min_w2,sc_max_w2,sc_q05_w2,sc_q25_w2,sc_q50_w2,sc_kur_w2,stft_sum_w2,sum_w3,kur_w3,lag1_kur_w3,zc_sum_w3,mfcc_mean_w3,mfcc_q05_w3,mfcc_q95_w3,mfcc_q99_w3,mfcc_kur_w3,sc_sum_w3,sc_mean_w3,sc_std_w3,sc_min_w3,sc_max_w3,sc_q25_w3,sc_kur_w3,stft_sum_w3,sum_w4,kur_w4,skew_w4,lag1_kur_w4,lag1_skew_w4,zc_sum_w4,zc_q95_w4,mfcc_sum_w4,mfcc_mean_w4,mfcc_std_w4,mfcc_max_w4,mfcc_q25_w4,mfcc_q50_w4,mfcc_q95_w4,mfcc_q99_w4,sc_sum_w4,sc_mean_w4,sc_std_w4,sc_min_w4,sc_max_w4,sc_q25_w4,sc_q75_w4,sc_kur_w4,sc_skew_w4,stft_sum_w4,stft_kur_w4,stft_skew_w4
0,speech,neutral,normal,Kids are talking by the door,1st,M,0.106792,0.506722,0.784505,0.260775,0.129522,0.591481,0.423199,0.717587,0.635723,0.896695,0.479183,0.261387,0.447466,0.752627,0.655516,0.954851,0.882301,0.0,0.705391,0.385894,0.451166,0.560393,0.927656,0.951307,0.950086,0.655447,0.014217,0.528794,0.552984,0.572236,0.61801,0.699286,1.0,0.062556,0.0,0.445355,0.13357,0.933467,0.265192,0.374045,0.679413,0.329009,0.167593,0.191928,0.483503,0.80799,0.09293,0.975799,0.96534,0.999875,0.935376,0.692039,0.780737,0.853646,0.503824,0.553195,0.691411,0.224991,0.299631,0.628919,0.652833,0.509086,0.438077,0.210342,0.405315,0.593581,0.393472,0.528451,0.404438,0.385595,0.378612,0.087534,0.46663,0.564012,0.59097,0.486622,0.224149,0.591696,0.801812,0.374269,0.833184,0.481993,0.30751,0.365604,0.769611,0.31864,0.578369,0.291006,0.533887,0.553313,0.482849,0.290004,0.73854,0.108887,0.593791,0.448805,1.0,0.597521,0.216581,0.866067,0.159312,0.653604,0.488662,0.281894,0.187912,0.738302,0.964313,0.268862,0.0,0.901624,0.936516,0.976689,0.958815,0.102331,0.514047,0.988635,0.048224
1,speech,neutral,normal,Kids are talking by the door,2nd,M,0.116504,0.503248,0.807974,0.225783,0.148238,0.592678,0.436033,0.598435,0.506675,0.862466,0.479183,0.278588,0.470939,0.729264,0.624961,0.901829,0.812829,0.401039,0.45051,0.467991,0.45076,0.561798,0.799551,0.926962,0.943297,0.633036,0.013852,0.533221,0.600963,0.435323,0.230835,0.467376,0.617102,0.149082,1.0,0.573211,0.331577,0.81913,0.135787,0.516668,0.679413,0.522225,0.469625,0.271325,0.489737,0.764826,0.161923,0.628552,0.909069,0.989784,0.930195,0.638155,0.389996,0.867947,0.192102,0.45168,0.699709,0.198031,0.290148,0.550263,0.603532,0.441019,0.457699,0.226723,0.42019,0.49544,0.525674,0.367284,0.493723,0.394864,0.414946,0.051492,0.519341,0.611536,0.543484,0.539834,0.235869,0.614794,0.852725,0.521718,0.839149,0.449225,0.255697,0.30349,0.658337,0.305612,0.450865,0.293301,0.552003,0.50709,0.510608,0.339563,0.407132,0.128112,0.592731,0.396224,0.0,0.555675,0.159624,0.898993,0.209908,0.632088,0.413982,0.253659,0.267796,0.74245,0.956018,0.268475,0.869244,0.90004,0.872086,0.990271,0.177813,0.526318,0.483277,0.095336,0.360443
2,speech,neutral,normal,Dogs are sitting by the door,1st,M,0.097081,0.507227,0.87244,0.269902,0.104711,0.602877,0.43213,0.667726,0.887036,0.896088,0.479183,0.23546,0.380469,0.801084,0.619848,0.921414,0.853955,0.0,0.459613,0.393383,0.43125,0.472504,0.911578,0.930051,0.947636,0.642306,0.013639,0.49587,0.599825,0.100512,0.41389,0.112541,0.527296,0.149221,1.0,0.602522,0.342404,0.821121,0.156563,0.832058,0.701664,0.558589,0.466631,0.278917,0.491881,0.754973,0.123756,0.799765,0.90529,0.993358,0.930973,0.494276,0.483352,0.846438,0.131463,0.564682,0.669414,0.130543,0.30145,0.670677,0.493035,0.394625,0.540126,0.250507,0.474156,0.781921,0.461177,0.536287,0.437559,0.2941,0.417184,0.060925,0.483079,0.487647,0.55563,0.432124,0.219039,0.612247,0.820192,0.53543,0.845064,0.451288,0.222941,0.28435,0.667947,0.303843,0.513897,0.302093,0.586855,0.497345,0.480531,0.2981,0.351252,0.13495,0.593133,0.36162,0.0,0.565339,0.144131,0.903513,0.192688,0.611607,0.382014,0.290503,0.23098,0.716463,0.949595,0.299826,0.0,0.904404,0.871181,0.988734,0.865314,0.24784,0.472904,0.764041,0.189619
3,speech,neutral,normal,Dogs are sitting by the door,2nd,M,0.06796,0.506144,1.0,0.264879,0.138251,0.635597,0.456018,0.584642,0.943993,0.892183,0.50633,0.28433,0.409318,0.731318,0.557552,0.877914,0.746318,0.471974,0.453229,0.459827,0.487427,0.537759,0.841966,0.855704,0.941764,0.63574,0.023,0.452635,0.544967,0.107666,0.298025,0.120222,0.526632,0.10142,0.0,0.602673,0.310534,0.800603,0.253886,0.80771,0.654612,0.432503,0.327505,0.238135,0.630466,0.670873,0.224682,0.706742,0.748957,0.987263,0.930475,0.05593,0.583187,0.79545,0.115819,0.576253,0.707298,0.154487,0.301954,0.582064,0.488696,0.385218,0.590455,0.291251,0.5569,0.874704,0.394148,0.554093,0.439597,0.309099,0.480095,0.052288,0.50578,0.467601,0.55477,0.448095,0.211274,0.62923,0.838101,0.533216,0.76032,0.441262,0.227556,0.303307,0.640368,0.375101,0.48004,0.332188,0.554301,0.557625,0.483335,0.292879,0.447403,0.101007,0.59342,0.492813,1.0,0.667524,0.295754,0.785652,0.247124,0.61481,0.450556,0.351858,0.281539,0.654814,0.894755,0.247404,0.871873,0.903784,0.807227,0.954028,0.1239,0.556567,0.445061,0.110588,0.316584
4,speech,calm,normal,Kids are talking by the door,1st,M,0.174752,0.505933,0.716012,0.234362,0.122601,0.523167,0.382682,0.676841,0.811804,0.852104,0.483519,0.278596,0.548329,0.747265,0.666717,0.875715,0.865877,0.0,0.598407,0.381077,0.449217,0.50594,0.733857,0.931529,0.949096,0.641768,0.008805,0.560357,0.600098,0.430345,0.242375,0.547145,0.546188,0.055005,0.0,0.42616,0.192911,0.903958,0.205817,0.777937,0.679413,0.373151,0.283648,0.253374,0.507731,0.809817,0.377889,0.0,0.940562,0.999831,0.932552,0.767877,0.249125,0.86377,0.615396,0.538091,0.710419,0.19608,0.216527,0.544756,0.67928,0.500819,0.419206,0.240252,0.404362,0.594653,0.349367,0.467232,0.424598,0.351926,0.447259,0.130634,0.510112,0.582873,0.54739,0.488616,0.257198,0.593002,0.86872,0.614799,0.772112,0.365163,0.207254,0.231431,0.524024,0.321571,0.347322,0.304247,0.544002,0.516669,0.523519,0.293666,0.682298,0.100182,0.593473,0.461552,1.0,0.570613,0.244579,0.840194,0.330613,0.636019,0.484027,0.29193,0.271583,0.757533,0.922996,0.299318,0.819687,0.904905,0.823335,0.976112,0.37105,0.530355,0.514943,0.542468,0.273444


In [3]:
# target variable moved to the end
targ = 'zc_sum' #choose the target variable
categoricals = ['vocal_channel','emotion','emotional_intensity','statement','repetition','sex']
non_targ = categoricals.copy()
try:
    non_targ.remove(targ)
except:
    pass
target_variable = df.pop(targ)
df[targ] = target_variable
print(non_targ)
df.head()

['vocal_channel', 'emotion', 'emotional_intensity', 'statement', 'repetition', 'sex']


  df[targ] = target_variable


Unnamed: 0,vocal_channel,emotion,emotional_intensity,statement,repetition,sex,frame_count,sum,skew,lag1_kur,mfcc_sum,mfcc_mean,mfcc_max,mfcc_q05,mfcc_q25,mfcc_q50,mfcc_q75,mfcc_q95,mfcc_q99,sc_sum,sc_mean,sc_std,sc_min,sc_max,sc_q01,sc_q05,sc_q25,sc_q50,sc_q75,sc_q95,sc_q99,sc_kur,stft_sum,sum_w1,kur_w1,skew_w1,lag1_kur_w1,lag1_skew_w1,zc_sum_w1,zc_q95_w1,mfcc_sum_w1,mfcc_mean_w1,mfcc_std_w1,mfcc_max_w1,mfcc_q05_w1,mfcc_q25_w1,mfcc_q50_w1,mfcc_q75_w1,mfcc_q95_w1,mfcc_q99_w1,sc_sum_w1,sc_std_w1,sc_min_w1,sc_q25_w1,sc_q75_w1,sc_q95_w1,sc_kur_w1,sc_skew_w1,stft_sum_w1,stft_kur_w1,sum_w2,kur_w2,zc_sum_w2,mfcc_min_w2,mfcc_max_w2,mfcc_q75_w2,mfcc_q95_w2,mfcc_kur_w2,sc_sum_w2,sc_mean_w2,sc_std_w2,sc_min_w2,sc_max_w2,sc_q05_w2,sc_q25_w2,sc_q50_w2,sc_kur_w2,stft_sum_w2,sum_w3,kur_w3,lag1_kur_w3,zc_sum_w3,mfcc_mean_w3,mfcc_q05_w3,mfcc_q95_w3,mfcc_q99_w3,mfcc_kur_w3,sc_sum_w3,sc_mean_w3,sc_std_w3,sc_min_w3,sc_max_w3,sc_q25_w3,sc_kur_w3,stft_sum_w3,sum_w4,kur_w4,skew_w4,lag1_kur_w4,lag1_skew_w4,zc_sum_w4,zc_q95_w4,mfcc_sum_w4,mfcc_mean_w4,mfcc_std_w4,mfcc_max_w4,mfcc_q25_w4,mfcc_q50_w4,mfcc_q95_w4,mfcc_q99_w4,sc_sum_w4,sc_mean_w4,sc_std_w4,sc_min_w4,sc_max_w4,sc_q25_w4,sc_q75_w4,sc_kur_w4,sc_skew_w4,stft_sum_w4,stft_kur_w4,stft_skew_w4,zc_sum
0,speech,neutral,normal,Kids are talking by the door,1st,M,0.106792,0.506722,0.784505,0.260775,0.591481,0.423199,0.717587,0.635723,0.896695,0.479183,0.261387,0.447466,0.752627,0.655516,0.954851,0.882301,0.0,0.705391,0.385894,0.451166,0.560393,0.927656,0.951307,0.950086,0.655447,0.014217,0.528794,0.552984,0.572236,0.61801,0.699286,1.0,0.062556,0.0,0.445355,0.13357,0.933467,0.265192,0.374045,0.679413,0.329009,0.167593,0.191928,0.483503,0.80799,0.09293,0.975799,0.96534,0.999875,0.935376,0.692039,0.780737,0.853646,0.503824,0.553195,0.691411,0.224991,0.299631,0.628919,0.652833,0.509086,0.438077,0.210342,0.405315,0.593581,0.393472,0.528451,0.404438,0.385595,0.378612,0.087534,0.46663,0.564012,0.59097,0.486622,0.224149,0.591696,0.801812,0.374269,0.833184,0.481993,0.30751,0.365604,0.769611,0.31864,0.578369,0.291006,0.533887,0.553313,0.482849,0.290004,0.73854,0.108887,0.593791,0.448805,1.0,0.597521,0.216581,0.866067,0.159312,0.653604,0.488662,0.281894,0.187912,0.738302,0.964313,0.268862,0.0,0.901624,0.936516,0.976689,0.958815,0.102331,0.514047,0.988635,0.048224,0.129522
1,speech,neutral,normal,Kids are talking by the door,2nd,M,0.116504,0.503248,0.807974,0.225783,0.592678,0.436033,0.598435,0.506675,0.862466,0.479183,0.278588,0.470939,0.729264,0.624961,0.901829,0.812829,0.401039,0.45051,0.467991,0.45076,0.561798,0.799551,0.926962,0.943297,0.633036,0.013852,0.533221,0.600963,0.435323,0.230835,0.467376,0.617102,0.149082,1.0,0.573211,0.331577,0.81913,0.135787,0.516668,0.679413,0.522225,0.469625,0.271325,0.489737,0.764826,0.161923,0.628552,0.909069,0.989784,0.930195,0.638155,0.389996,0.867947,0.192102,0.45168,0.699709,0.198031,0.290148,0.550263,0.603532,0.441019,0.457699,0.226723,0.42019,0.49544,0.525674,0.367284,0.493723,0.394864,0.414946,0.051492,0.519341,0.611536,0.543484,0.539834,0.235869,0.614794,0.852725,0.521718,0.839149,0.449225,0.255697,0.30349,0.658337,0.305612,0.450865,0.293301,0.552003,0.50709,0.510608,0.339563,0.407132,0.128112,0.592731,0.396224,0.0,0.555675,0.159624,0.898993,0.209908,0.632088,0.413982,0.253659,0.267796,0.74245,0.956018,0.268475,0.869244,0.90004,0.872086,0.990271,0.177813,0.526318,0.483277,0.095336,0.360443,0.148238
2,speech,neutral,normal,Dogs are sitting by the door,1st,M,0.097081,0.507227,0.87244,0.269902,0.602877,0.43213,0.667726,0.887036,0.896088,0.479183,0.23546,0.380469,0.801084,0.619848,0.921414,0.853955,0.0,0.459613,0.393383,0.43125,0.472504,0.911578,0.930051,0.947636,0.642306,0.013639,0.49587,0.599825,0.100512,0.41389,0.112541,0.527296,0.149221,1.0,0.602522,0.342404,0.821121,0.156563,0.832058,0.701664,0.558589,0.466631,0.278917,0.491881,0.754973,0.123756,0.799765,0.90529,0.993358,0.930973,0.494276,0.483352,0.846438,0.131463,0.564682,0.669414,0.130543,0.30145,0.670677,0.493035,0.394625,0.540126,0.250507,0.474156,0.781921,0.461177,0.536287,0.437559,0.2941,0.417184,0.060925,0.483079,0.487647,0.55563,0.432124,0.219039,0.612247,0.820192,0.53543,0.845064,0.451288,0.222941,0.28435,0.667947,0.303843,0.513897,0.302093,0.586855,0.497345,0.480531,0.2981,0.351252,0.13495,0.593133,0.36162,0.0,0.565339,0.144131,0.903513,0.192688,0.611607,0.382014,0.290503,0.23098,0.716463,0.949595,0.299826,0.0,0.904404,0.871181,0.988734,0.865314,0.24784,0.472904,0.764041,0.189619,0.104711
3,speech,neutral,normal,Dogs are sitting by the door,2nd,M,0.06796,0.506144,1.0,0.264879,0.635597,0.456018,0.584642,0.943993,0.892183,0.50633,0.28433,0.409318,0.731318,0.557552,0.877914,0.746318,0.471974,0.453229,0.459827,0.487427,0.537759,0.841966,0.855704,0.941764,0.63574,0.023,0.452635,0.544967,0.107666,0.298025,0.120222,0.526632,0.10142,0.0,0.602673,0.310534,0.800603,0.253886,0.80771,0.654612,0.432503,0.327505,0.238135,0.630466,0.670873,0.224682,0.706742,0.748957,0.987263,0.930475,0.05593,0.583187,0.79545,0.115819,0.576253,0.707298,0.154487,0.301954,0.582064,0.488696,0.385218,0.590455,0.291251,0.5569,0.874704,0.394148,0.554093,0.439597,0.309099,0.480095,0.052288,0.50578,0.467601,0.55477,0.448095,0.211274,0.62923,0.838101,0.533216,0.76032,0.441262,0.227556,0.303307,0.640368,0.375101,0.48004,0.332188,0.554301,0.557625,0.483335,0.292879,0.447403,0.101007,0.59342,0.492813,1.0,0.667524,0.295754,0.785652,0.247124,0.61481,0.450556,0.351858,0.281539,0.654814,0.894755,0.247404,0.871873,0.903784,0.807227,0.954028,0.1239,0.556567,0.445061,0.110588,0.316584,0.138251
4,speech,calm,normal,Kids are talking by the door,1st,M,0.174752,0.505933,0.716012,0.234362,0.523167,0.382682,0.676841,0.811804,0.852104,0.483519,0.278596,0.548329,0.747265,0.666717,0.875715,0.865877,0.0,0.598407,0.381077,0.449217,0.50594,0.733857,0.931529,0.949096,0.641768,0.008805,0.560357,0.600098,0.430345,0.242375,0.547145,0.546188,0.055005,0.0,0.42616,0.192911,0.903958,0.205817,0.777937,0.679413,0.373151,0.283648,0.253374,0.507731,0.809817,0.377889,0.0,0.940562,0.999831,0.932552,0.767877,0.249125,0.86377,0.615396,0.538091,0.710419,0.19608,0.216527,0.544756,0.67928,0.500819,0.419206,0.240252,0.404362,0.594653,0.349367,0.467232,0.424598,0.351926,0.447259,0.130634,0.510112,0.582873,0.54739,0.488616,0.257198,0.593002,0.86872,0.614799,0.772112,0.365163,0.207254,0.231431,0.524024,0.321571,0.347322,0.304247,0.544002,0.516669,0.523519,0.293666,0.682298,0.100182,0.593473,0.461552,1.0,0.570613,0.244579,0.840194,0.330613,0.636019,0.484027,0.29193,0.271583,0.757533,0.922996,0.299318,0.819687,0.904905,0.823335,0.976112,0.37105,0.530355,0.514943,0.542468,0.273444,0.122601


In [4]:
# One_Hot_Econder for categorical variables
df = pd.get_dummies(df, columns = non_targ)
count=0
# moving binarized vars to the beginning of the dataframe
for i in df.loc[:,'vocal_channel_song':].keys():
    temp=df.pop(i)
    df.insert(loc=count, column=i, value=temp)
    count=count+1

df.head()

Unnamed: 0,vocal_channel_song,vocal_channel_speech,emotion_angry,emotion_calm,emotion_disgust,emotion_fearful,emotion_happy,emotion_neutral,emotion_sad,emotion_surprised,emotional_intensity_normal,emotional_intensity_strong,statement_Dogs are sitting by the door,statement_Kids are talking by the door,repetition_1st,repetition_2nd,sex_F,sex_M,frame_count,sum,skew,lag1_kur,mfcc_sum,mfcc_mean,mfcc_max,mfcc_q05,mfcc_q25,mfcc_q50,mfcc_q75,mfcc_q95,mfcc_q99,sc_sum,sc_mean,sc_std,sc_min,sc_max,sc_q01,sc_q05,sc_q25,sc_q50,sc_q75,sc_q95,sc_q99,sc_kur,stft_sum,sum_w1,kur_w1,skew_w1,lag1_kur_w1,lag1_skew_w1,zc_sum_w1,zc_q95_w1,mfcc_sum_w1,mfcc_mean_w1,mfcc_std_w1,mfcc_max_w1,mfcc_q05_w1,mfcc_q25_w1,mfcc_q50_w1,mfcc_q75_w1,mfcc_q95_w1,mfcc_q99_w1,sc_sum_w1,sc_std_w1,sc_min_w1,sc_q25_w1,sc_q75_w1,sc_q95_w1,sc_kur_w1,sc_skew_w1,stft_sum_w1,stft_kur_w1,sum_w2,kur_w2,zc_sum_w2,mfcc_min_w2,mfcc_max_w2,mfcc_q75_w2,mfcc_q95_w2,mfcc_kur_w2,sc_sum_w2,sc_mean_w2,sc_std_w2,sc_min_w2,sc_max_w2,sc_q05_w2,sc_q25_w2,sc_q50_w2,sc_kur_w2,stft_sum_w2,sum_w3,kur_w3,lag1_kur_w3,zc_sum_w3,mfcc_mean_w3,mfcc_q05_w3,mfcc_q95_w3,mfcc_q99_w3,mfcc_kur_w3,sc_sum_w3,sc_mean_w3,sc_std_w3,sc_min_w3,sc_max_w3,sc_q25_w3,sc_kur_w3,stft_sum_w3,sum_w4,kur_w4,skew_w4,lag1_kur_w4,lag1_skew_w4,zc_sum_w4,zc_q95_w4,mfcc_sum_w4,mfcc_mean_w4,mfcc_std_w4,mfcc_max_w4,mfcc_q25_w4,mfcc_q50_w4,mfcc_q95_w4,mfcc_q99_w4,sc_sum_w4,sc_mean_w4,sc_std_w4,sc_min_w4,sc_max_w4,sc_q25_w4,sc_q75_w4,sc_kur_w4,sc_skew_w4,stft_sum_w4,stft_kur_w4,stft_skew_w4,zc_sum
0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,1,0,0,1,0.106792,0.506722,0.784505,0.260775,0.591481,0.423199,0.717587,0.635723,0.896695,0.479183,0.261387,0.447466,0.752627,0.655516,0.954851,0.882301,0.0,0.705391,0.385894,0.451166,0.560393,0.927656,0.951307,0.950086,0.655447,0.014217,0.528794,0.552984,0.572236,0.61801,0.699286,1.0,0.062556,0.0,0.445355,0.13357,0.933467,0.265192,0.374045,0.679413,0.329009,0.167593,0.191928,0.483503,0.80799,0.09293,0.975799,0.96534,0.999875,0.935376,0.692039,0.780737,0.853646,0.503824,0.553195,0.691411,0.224991,0.299631,0.628919,0.652833,0.509086,0.438077,0.210342,0.405315,0.593581,0.393472,0.528451,0.404438,0.385595,0.378612,0.087534,0.46663,0.564012,0.59097,0.486622,0.224149,0.591696,0.801812,0.374269,0.833184,0.481993,0.30751,0.365604,0.769611,0.31864,0.578369,0.291006,0.533887,0.553313,0.482849,0.290004,0.73854,0.108887,0.593791,0.448805,1.0,0.597521,0.216581,0.866067,0.159312,0.653604,0.488662,0.281894,0.187912,0.738302,0.964313,0.268862,0.0,0.901624,0.936516,0.976689,0.958815,0.102331,0.514047,0.988635,0.048224,0.129522
1,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,1,0,1,0.116504,0.503248,0.807974,0.225783,0.592678,0.436033,0.598435,0.506675,0.862466,0.479183,0.278588,0.470939,0.729264,0.624961,0.901829,0.812829,0.401039,0.45051,0.467991,0.45076,0.561798,0.799551,0.926962,0.943297,0.633036,0.013852,0.533221,0.600963,0.435323,0.230835,0.467376,0.617102,0.149082,1.0,0.573211,0.331577,0.81913,0.135787,0.516668,0.679413,0.522225,0.469625,0.271325,0.489737,0.764826,0.161923,0.628552,0.909069,0.989784,0.930195,0.638155,0.389996,0.867947,0.192102,0.45168,0.699709,0.198031,0.290148,0.550263,0.603532,0.441019,0.457699,0.226723,0.42019,0.49544,0.525674,0.367284,0.493723,0.394864,0.414946,0.051492,0.519341,0.611536,0.543484,0.539834,0.235869,0.614794,0.852725,0.521718,0.839149,0.449225,0.255697,0.30349,0.658337,0.305612,0.450865,0.293301,0.552003,0.50709,0.510608,0.339563,0.407132,0.128112,0.592731,0.396224,0.0,0.555675,0.159624,0.898993,0.209908,0.632088,0.413982,0.253659,0.267796,0.74245,0.956018,0.268475,0.869244,0.90004,0.872086,0.990271,0.177813,0.526318,0.483277,0.095336,0.360443,0.148238
2,0,1,0,0,0,0,0,1,0,0,1,0,1,0,1,0,0,1,0.097081,0.507227,0.87244,0.269902,0.602877,0.43213,0.667726,0.887036,0.896088,0.479183,0.23546,0.380469,0.801084,0.619848,0.921414,0.853955,0.0,0.459613,0.393383,0.43125,0.472504,0.911578,0.930051,0.947636,0.642306,0.013639,0.49587,0.599825,0.100512,0.41389,0.112541,0.527296,0.149221,1.0,0.602522,0.342404,0.821121,0.156563,0.832058,0.701664,0.558589,0.466631,0.278917,0.491881,0.754973,0.123756,0.799765,0.90529,0.993358,0.930973,0.494276,0.483352,0.846438,0.131463,0.564682,0.669414,0.130543,0.30145,0.670677,0.493035,0.394625,0.540126,0.250507,0.474156,0.781921,0.461177,0.536287,0.437559,0.2941,0.417184,0.060925,0.483079,0.487647,0.55563,0.432124,0.219039,0.612247,0.820192,0.53543,0.845064,0.451288,0.222941,0.28435,0.667947,0.303843,0.513897,0.302093,0.586855,0.497345,0.480531,0.2981,0.351252,0.13495,0.593133,0.36162,0.0,0.565339,0.144131,0.903513,0.192688,0.611607,0.382014,0.290503,0.23098,0.716463,0.949595,0.299826,0.0,0.904404,0.871181,0.988734,0.865314,0.24784,0.472904,0.764041,0.189619,0.104711
3,0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,1,0,1,0.06796,0.506144,1.0,0.264879,0.635597,0.456018,0.584642,0.943993,0.892183,0.50633,0.28433,0.409318,0.731318,0.557552,0.877914,0.746318,0.471974,0.453229,0.459827,0.487427,0.537759,0.841966,0.855704,0.941764,0.63574,0.023,0.452635,0.544967,0.107666,0.298025,0.120222,0.526632,0.10142,0.0,0.602673,0.310534,0.800603,0.253886,0.80771,0.654612,0.432503,0.327505,0.238135,0.630466,0.670873,0.224682,0.706742,0.748957,0.987263,0.930475,0.05593,0.583187,0.79545,0.115819,0.576253,0.707298,0.154487,0.301954,0.582064,0.488696,0.385218,0.590455,0.291251,0.5569,0.874704,0.394148,0.554093,0.439597,0.309099,0.480095,0.052288,0.50578,0.467601,0.55477,0.448095,0.211274,0.62923,0.838101,0.533216,0.76032,0.441262,0.227556,0.303307,0.640368,0.375101,0.48004,0.332188,0.554301,0.557625,0.483335,0.292879,0.447403,0.101007,0.59342,0.492813,1.0,0.667524,0.295754,0.785652,0.247124,0.61481,0.450556,0.351858,0.281539,0.654814,0.894755,0.247404,0.871873,0.903784,0.807227,0.954028,0.1239,0.556567,0.445061,0.110588,0.316584,0.138251
4,0,1,0,1,0,0,0,0,0,0,1,0,0,1,1,0,0,1,0.174752,0.505933,0.716012,0.234362,0.523167,0.382682,0.676841,0.811804,0.852104,0.483519,0.278596,0.548329,0.747265,0.666717,0.875715,0.865877,0.0,0.598407,0.381077,0.449217,0.50594,0.733857,0.931529,0.949096,0.641768,0.008805,0.560357,0.600098,0.430345,0.242375,0.547145,0.546188,0.055005,0.0,0.42616,0.192911,0.903958,0.205817,0.777937,0.679413,0.373151,0.283648,0.253374,0.507731,0.809817,0.377889,0.0,0.940562,0.999831,0.932552,0.767877,0.249125,0.86377,0.615396,0.538091,0.710419,0.19608,0.216527,0.544756,0.67928,0.500819,0.419206,0.240252,0.404362,0.594653,0.349367,0.467232,0.424598,0.351926,0.447259,0.130634,0.510112,0.582873,0.54739,0.488616,0.257198,0.593002,0.86872,0.614799,0.772112,0.365163,0.207254,0.231431,0.524024,0.321571,0.347322,0.304247,0.544002,0.516669,0.523519,0.293666,0.682298,0.100182,0.593473,0.461552,1.0,0.570613,0.244579,0.840194,0.330613,0.636019,0.484027,0.29193,0.271583,0.757533,0.922996,0.299318,0.819687,0.904905,0.823335,0.976112,0.37105,0.530355,0.514943,0.542468,0.273444,0.122601


In [5]:
# we still have to do this on all the datasets, since we then use them for performance evaluation
X_full = df.iloc[:,:-1].values
y_full = df.loc[:,targ].values
print(len(X_full))
print(len(y_full))
print("Matrix of features", X_full, sep='\n')
print("--------------------------------------------------")
print("Target Variable", y_full, sep='\n')

2452
2452
Matrix of features
[[0.         1.         0.         ... 0.51404655 0.98863451 0.04822394]
 [0.         1.         0.         ... 0.48327729 0.09533563 0.36044311]
 [0.         1.         0.         ... 0.472904   0.76404125 0.18961851]
 ...
 [1.         0.         0.         ... 0.40617632 0.0595795  0.40162675]
 [1.         0.         0.         ... 0.49072939 0.09946237 0.33475944]
 [1.         0.         0.         ... 0.53567876 0.11938145 0.27496883]]
--------------------------------------------------
Target Variable
[0.12952186 0.14823844 0.1047106  ... 0.4377949  0.48584461 0.54089336]


In [6]:
# Label Encode the target variable
if targ in categoricals:
    label_encoder = LabelEncoder()
    encoded_y = label_encoder.fit_transform(y_full)
    label_encoder_name_mapping = dict(zip(label_encoder.classes_,
                                            label_encoder.transform(label_encoder.classes_)))
    print("Mapping of Label Encoded Classes", label_encoder_name_mapping, sep="\n")
    print("Label Encoded Target Variable", encoded_y, sep="\n")
else:
    pass

In [7]:
# splitting df_train in training and testing sets (only the TRAIN dataset)
X = X_full[:1828].copy()
print(len(X))
y = y_full[:1828].copy()
print(len(y))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, 
                                                    random_state=43)
print(len(X_train))
print(len(X_test))
print(len(y_train))
print(len(y_test))

1828
1828
1279
549
1279
549


## Feature Selection (Dimensionality Reduction)

### Univariate Feature Selection

In [8]:
# Decide K ----> Number of best features to select
n = 15
if targ in categoricals:
    selector = SelectKBest(k=n)

    X_train_sel = selector.fit(X_train, y_train)
    cols_idxs = selector.get_support(indices=True)
    cols_idxs = np.append(cols_idxs,[-1])
    df_UFS = df.iloc[:,cols_idxs]
    X_train_sel = selector.transform(X_train)
    print(df_UFS.shape)
    df_UFS.head()
    X_test_sel = selector.transform(X_test)

    decision_tree = DecisionTreeClassifier()
    param_grid = {
        'min_samples_leaf': [x for x in range(1,30)],
        'max_depth': [x for x in range(1,30)],
        'criterion':['gini','entropy']
    }
    grid_cv = GridSearchCV(decision_tree, param_grid, scoring="accuracy", n_jobs=-1, cv=5)
    grid_cv.fit(X_train_sel, y_train)
    print("Best Params", grid_cv.best_params_)
    print("Best CV Score", grid_cv.best_score_)
    print(f'Accuracy on Model 1 = {round(accuracy_score(y_test, grid_cv.predict(X_test_sel)), 10)}')

    decision_tree = DecisionTreeClassifier(min_samples_leaf = grid_cv.best_params_['min_samples_leaf'], max_depth = grid_cv.best_params_['max_depth'], criterion=grid_cv.best_params_['criterion'])
    decision_tree.fit(X_train_sel, y_train)
    y_pred = decision_tree.predict(X_test_sel)

    label_encoder = LabelEncoder()
    encoded_y_test = label_encoder.fit_transform(y_test)
    label_encoder_name_mapping = dict(zip(label_encoder.classes_,
                                            label_encoder.transform(label_encoder.classes_)))
    y_score = decision_tree.predict_proba(X_test_sel)

    print(f'Accuracy: {accuracy_score(y_test, y_pred)}' )
    print(f'F1-score: {f1_score(y_test, y_pred, average=None)}')
    print(classification_report(y_test, y_pred, digits=3))
else:
    selector = SelectKBest(k=n, score_func=f_regression)

    X_train_sel = selector.fit(X_train, y_train)
    cols_idxs = selector.get_support(indices=True)
    cols_idxs = np.append(cols_idxs,[-1])
    df_UFS = df.iloc[:,cols_idxs]
    X_train_sel = selector.transform(X_train)
    print(df_UFS.shape)
    df_UFS.head()
    X_test_sel = selector.transform(X_test)

    reg = LinearRegression()

    reg.fit(X_train, y_train)
    # evaluate the model
    yhat = reg.predict(X_test)
    # evaluate predictions
    mae = mean_absolute_error(y_test, yhat)
    print('MAE: %.3f' % mae)

(2452, 16)
MAE: 0.009


In [9]:
df_UFS.head()

Unnamed: 0,mfcc_max,mfcc_q25,mfcc_q50,mfcc_q99,zc_sum_w1,mfcc_std_w1,mfcc_q25_w1,zc_sum_w2,sc_sum_w2,zc_sum_w3,mfcc_q99_w3,sc_sum_w3,zc_sum_w4,mfcc_std_w4,mfcc_q25_w4,zc_sum
0,0.717587,0.896695,0.479183,0.752627,0.062556,0.933467,0.679413,0.224991,0.210342,0.224149,0.833184,0.30751,0.448805,0.866067,0.653604,0.129522
1,0.598435,0.862466,0.479183,0.729264,0.149082,0.81913,0.679413,0.198031,0.226723,0.235869,0.839149,0.255697,0.396224,0.898993,0.632088,0.148238
2,0.667726,0.896088,0.479183,0.801084,0.149221,0.821121,0.701664,0.130543,0.250507,0.219039,0.845064,0.222941,0.36162,0.903513,0.611607,0.104711
3,0.584642,0.892183,0.50633,0.731318,0.10142,0.800603,0.654612,0.154487,0.291251,0.211274,0.76032,0.227556,0.492813,0.785652,0.61481,0.138251
4,0.676841,0.852104,0.483519,0.747265,0.055005,0.903958,0.679413,0.19608,0.240252,0.257198,0.772112,0.207254,0.461552,0.840194,0.636019,0.122601


### Recursive Feature Elimination

In [10]:
# decide number of features to select
#n_features = [n for n in range(10,16)]
#for i in n_features:
#    selector = RFE(DecisionTreeClassifier(), n_features_to_select = i)
#    X_train_sel = selector.fit(X_train, y_train)
#    cols_idxs = selector.get_support(indices=True)
#    cols_idxs = np.append(cols_idxs,[-1])
#    df_RFE = df.iloc[:,cols_idxs]
#    X_train_sel = selector.transform(X_train)
#    print(df_RFE.shape)
#
#    X_test_sel = selector.transform(X_test)
#
#    decision_tree = DecisionTreeClassifier(min_samples_leaf=6) #random_state=78)
#    decision_tree.fit(X_train_sel, y_train)
#
#    y_pred = decision_tree.predict(X_test_sel)
#
#    label_encoder = LabelEncoder()
#    encoded_y_test = label_encoder.fit_transform(y_test)
#    label_encoder_name_mapping = dict(zip(label_encoder.classes_,
#                                            label_encoder.transform(label_encoder.classes_)))
#    y_score = decision_tree.predict_proba(X_test_sel)
#
#    print(f'Accuracy: {accuracy_score(y_test, y_pred)}' )
#    print(f'F1-score: {f1_score(y_test, y_pred, average=None)}')
#    print(classification_report(y_test, y_pred, digits=3))

In [11]:
if targ in categoricals:   
    selector = RFE(DecisionTreeClassifier(), n_features_to_select = 15)
    X_train_sel = selector.fit(X_train, y_train)
    cols_idxs = selector.get_support(indices=True)
    cols_idxs = np.append(cols_idxs,[-1])
    df_RFE = df.iloc[:,cols_idxs]
    X_train_sel = selector.transform(X_train)
    print(df_RFE.shape)

    X_test_sel = selector.transform(X_test)

    decision_tree = DecisionTreeClassifier(min_samples_leaf=6) #random_state=78)
    decision_tree.fit(X_train_sel, y_train)

    y_pred = decision_tree.predict(X_test_sel)

    label_encoder = LabelEncoder()
    encoded_y_test = label_encoder.fit_transform(y_test)
    label_encoder_name_mapping = dict(zip(label_encoder.classes_,
                                            label_encoder.transform(label_encoder.classes_)))

    y_score = decision_tree.predict_proba(X_test_sel)

    print(f'Accuracy: {accuracy_score(y_test, y_pred)}' )
    print(f'F1-score: {f1_score(y_test, y_pred, average=None)}')
    print(classification_report(y_test, y_pred, digits=3))
else:
    pass

In [12]:
try:
    df_RFE.head()
except:
    pass

In [13]:
df_UFS.to_csv('data_UFS_advReg.csv', index=False)
#df_RFE.to_csv('data_RFE_advClass.csv', index=False)