Experimento 16 - Linear Interpolation Imputation, MAD outlier detection, Z-Score normalization, MLP classifier

In [26]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import seaborn.objects as so
from ucimlrepo import fetch_ucirepo

In [27]:
# Download UCI ML Repository dataset id 602 (feature table + target table).
beans = fetch_ucirepo(id=602)
# Take an independent copy of the feature table: a 'Class' column is added to
# `df` further down, and without the copy that assignment would write into the
# frame owned by `beans.data` (and can trigger pandas' SettingWithCopyWarning).
df = beans.data.features.copy()
targets = beans.data.targets

In [28]:
# Names of the 16 numeric feature columns used for scaling and training.
cols = [
    'Area',
    'Perimeter',
    'MajorAxisLength',
    'MinorAxisLength',
    'AspectRatio',
    'Eccentricity',
    'ConvexArea',
    'EquivDiameter',
    'Extent',
    'Solidity',
    'Roundness',
    'Compactness',
    'ShapeFactor1',
    'ShapeFactor2',
    'ShapeFactor3',
    'ShapeFactor4',
]

In [29]:
# Attach the target labels to the feature table.
df['Class'] = targets
# Snapshot taken while 'Class' still holds the original string labels.
df_pre_missing_values = df.copy()
print(df.groupby('Class').agg(['count']))

# Encode class names as integer codes. Any label that is not listed in the
# table falls back to 6 (for this dataset the only such label is 'SIRA').
class_codes = {'BARBUNYA': 0, 'BOMBAY': 1, 'CALI': 2, 'DERMASON': 3, 'HOROZ': 4, 'SEKER': 5}
df['Class'] = df['Class'].transform(lambda label: class_codes.get(label, 6))

          Area Perimeter MajorAxisLength MinorAxisLength AspectRatio  \
         count     count           count           count       count   
Class                                                                  
BARBUNYA  1322      1322            1322            1322        1322   
BOMBAY     522       522             522             522         522   
CALI      1630      1630            1630            1630        1630   
DERMASON  3546      3546            3546            3546        3546   
HOROZ     1928      1928            1928            1928        1928   
SEKER     2027      2027            2027            2027        2027   
SIRA      2636      2636            2636            2636        2636   

         Eccentricity ConvexArea EquivDiameter Extent Solidity Roundness  \
                count      count         count  count    count     count   
Class                                                                      
BARBUNYA         1322       1322          1322   13

In [30]:
#Outlier Removal with MAD
from scipy import stats

# Conventional consistency constant applied to the median absolute deviation,
# and the number of scaled MADs from the class median beyond which a value is
# treated as an outlier.
MAD_SCALE = 1.4826
MAD_CUTOFF = 3

print(f'Pre Outlier Shape: {df.shape}')

print(df['Class'].unique())

# Only the measurement columns are screened; 'Class' is the grouping key,
# not a feature, so it is excluded from the outlier test.
feature_cols = [name for name in df.columns if name != 'Class']
features = df[feature_cols]
labels = df['Class']

# Per-class median and scaled MAD of every feature, broadcast back so each
# row carries the statistics of its own class.
center = features.groupby(labels).transform('median')
mad = MAD_SCALE * (features - center).abs().groupby(labels).transform('median')
lower = center - MAD_CUTOFF * mad
upper = center + MAD_CUTOFF * mad

# A row is removed when at least one of its features lies outside the
# [lower, upper] interval of its class. Filtering with a boolean mask keeps
# the surviving rows in their original order and leaves `df` untouched.
is_outlier = ((features < lower) | (features > upper)).any(axis=1)
df_no_outliers = df[~is_outlier].copy()

print(f'Pos Outlier Shape: {df_no_outliers.shape}')

Pre Outlier Shape: (13611, 17)
[5 0 1 2 4 6 3]
Pos Outlier Shape: (12198, 17)


In [31]:
# Decode the integer class codes back to their names. An explicit table is
# used instead of a fall-through chain so that a value which is not a known
# code (for example labels already decoded by a previous run of this cell)
# is left unchanged rather than being silently relabelled 'SIRA'.
class_names = {0: 'BARBUNYA', 1: 'BOMBAY', 2: 'CALI', 3: 'DERMASON', 4: 'HOROZ', 5: 'SEKER', 6: 'SIRA'}
df_no_outliers['Class'] = df_no_outliers['Class'].map(lambda code: class_names.get(code, code))
# reset_index() renumbers the rows and keeps the old labels in an 'index' column.
df_no_outliers = df_no_outliers.reset_index()
print(df_no_outliers.groupby('Class').agg(['count']))

         index  Area Perimeter MajorAxisLength MinorAxisLength AspectRatio  \
         count count     count           count           count       count   
Class                                                                        
BARBUNYA  1193  1193      1193            1193            1193        1193   
BOMBAY     472   472       472             472             472         472   
CALI      1512  1512      1512            1512            1512        1512   
DERMASON  3215  3215      3215            3215            3215        3215   
HOROZ     1641  1641      1641            1641            1641        1641   
SEKER     1762  1762      1762            1762            1762        1762   
SIRA      2403  2403      2403            2403            2403        2403   

         Eccentricity ConvexArea EquivDiameter Extent Solidity Roundness  \
                count      count         count  count    count     count   
Class                                                              

In [32]:
# Discard the helper 'index' column, then display the resulting frame.
df_no_outliers = df_no_outliers.drop(columns=['index'])
df_no_outliers

Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRatio,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,Roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4,Class
0,28395,610.291,208.178117,173.888747,1.197191,0.549812,28715,190.141097,0.763923,0.988856,0.958027,0.913358,0.007332,0.003147,0.834222,0.998724,SEKER
1,29380,624.110,212.826130,175.931143,1.209713,0.562727,29690,193.410904,0.778113,0.989559,0.947849,0.908774,0.007244,0.003048,0.825871,0.999066,SEKER
2,30279,634.927,212.560556,181.510182,1.171067,0.520401,30600,196.347702,0.775688,0.989510,0.943852,0.923726,0.007020,0.003153,0.853270,0.999236,SEKER
3,30519,629.727,212.996755,182.737204,1.165591,0.513760,30847,197.124320,0.770682,0.989367,0.967109,0.925480,0.006979,0.003158,0.856514,0.998345,SEKER
4,30685,635.681,213.534145,183.157146,1.165852,0.514081,31044,197.659696,0.771561,0.988436,0.954240,0.925658,0.006959,0.003152,0.856844,0.998953,SEKER
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12193,42097,759.696,288.721612,185.944705,1.552728,0.765002,42508,231.515799,0.714574,0.990331,0.916603,0.801865,0.006858,0.001749,0.642988,0.998385,DERMASON
12194,42101,757.499,281.576392,190.713136,1.476439,0.735702,42494,231.526798,0.799943,0.990752,0.922015,0.822252,0.006688,0.001886,0.676099,0.998219,DERMASON
12195,42139,759.321,281.539928,191.187979,1.472582,0.734065,42569,231.631261,0.729932,0.989899,0.918424,0.822730,0.006681,0.001888,0.676884,0.996767,DERMASON
12196,42147,763.779,283.382636,190.275731,1.489326,0.741055,42667,231.653247,0.705389,0.987813,0.907906,0.817457,0.006724,0.001852,0.668237,0.995222,DERMASON


In [33]:
#Normalization with Z-Score
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# fit_transform already returns the standardized array, so a separate
# transform() pass over the same data is not needed.
scaled_values = scaler.fit_transform(df_no_outliers[cols])
# Reuse the source frame's index so the 'Class' assignment below lines up
# row-for-row even if that index is not a fresh 0..n-1 range.
df_scaled = pd.DataFrame(scaled_values, columns=cols, index=df_no_outliers.index)

df_scaled['Class'] = df_no_outliers['Class']
df_scaled

Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRatio,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,Roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4,Class
0,-0.849054,-1.145239,-1.315645,-0.638833,-1.576380,-2.251665,-0.849310,-1.072789,0.264907,0.304790,1.428298,1.863735,0.700289,2.430380,1.955690,0.954328,SEKER
1,-0.815319,-1.080532,-1.261260,-0.593146,-1.525318,-2.107666,-0.816390,-1.017380,0.556502,0.507774,1.249673,1.788499,0.621722,2.261789,1.870087,1.058474,SEKER
2,-0.784529,-1.029882,-1.264367,-0.468347,-1.682918,-2.579593,-0.785665,-0.967613,0.506677,0.493630,1.179511,2.033924,0.420945,2.439676,2.150917,1.110096,SEKER
3,-0.776309,-1.054231,-1.259263,-0.440900,-1.705251,-2.653638,-0.777325,-0.954453,0.403799,0.452347,1.587695,2.062722,0.384248,2.449000,2.184170,0.838899,SEKER
4,-0.770624,-1.026352,-1.252976,-0.431506,-1.704185,-2.650056,-0.770674,-0.945381,0.421875,0.183415,1.361828,2.065645,0.366092,2.437595,2.187549,1.024040,SEKER
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12193,-0.379778,-0.445659,-0.373231,-0.369150,-0.126473,0.147623,-0.383603,-0.371660,-0.749112,0.730884,0.701278,0.033645,0.276016,0.062648,-0.004392,0.851280,DERMASON
12194,-0.379641,-0.455946,-0.456835,-0.262484,-0.437585,-0.179061,-0.384076,-0.371474,1.005065,0.852309,0.796266,0.368287,0.123207,0.294208,0.334982,0.800586,DERMASON
12195,-0.378339,-0.447415,-0.457262,-0.251862,-0.453317,-0.197317,-0.381544,-0.369704,-0.433529,0.605970,0.733237,0.376125,0.117021,0.298333,0.343033,0.358930,DERMASON
12196,-0.378065,-0.426540,-0.435701,-0.272268,-0.385032,-0.119381,-0.378235,-0.369331,-0.937851,0.003423,0.548645,0.289584,0.155099,0.236954,0.254400,-0.111163,DERMASON


In [34]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_validate


# The first 16 columns of df_scaled are the inputs; column 16 is the label.
x_train = df_scaled.iloc[:, :16]
y_train = df_scaled.iloc[:, 16]

classifier = MLPClassifier(
    activation='logistic',
    solver='adam',
    alpha=1e-5,
    hidden_layer_sizes=(12, 3),
    random_state=1,
    verbose=True,
    learning_rate_init=0.3,
    tol=1e-3,
    max_iter=500,
)

# Metrics evaluated on every fold; the keys become the 'test_<key>' /
# 'train_<key>' entries of the cross_validate result.
scoring = {
    'acc': 'accuracy',
    'prec': 'precision_macro',
    'recall': 'recall_macro',
    'f1': 'f1_macro',
}

# Note: despite its name, y_pred holds the dict of per-fold scores returned
# by cross_validate, not predictions.
y_pred = cross_validate(classifier, x_train, y_train, cv=10, scoring=scoring, return_train_score=True)

# Report the mean of each test metric across the 10 folds, as a percentage.
print('Experimento 16')
for prefix, score_key in (
    ('Acurácia Média: ', 'test_acc'),
    ('Precisão Média: ', 'test_prec'),
    ('Revocação Média: ', 'test_recall'),
    ('F1-Score Médio: ', 'test_f1'),
):
    print(prefix + '%.2f' % (np.mean(y_pred[score_key]) * 100) + '%')


Iteration 1, loss = 1.22522409
Iteration 2, loss = 0.78290889
Iteration 3, loss = 0.52785082
Iteration 4, loss = 0.37035421
Iteration 5, loss = 0.37070454
Iteration 6, loss = 0.35178675
Iteration 7, loss = 0.35324611
Iteration 8, loss = 0.33819123
Iteration 9, loss = 0.31433031
Iteration 10, loss = 0.30797133
Iteration 11, loss = 0.30500809
Iteration 12, loss = 0.31009914
Iteration 13, loss = 0.30478057
Iteration 14, loss = 0.30242965
Iteration 15, loss = 0.28623141
Iteration 16, loss = 0.28607353
Iteration 17, loss = 0.30265468
Iteration 18, loss = 0.29052759
Iteration 19, loss = 0.27291652
Iteration 20, loss = 0.26171572
Iteration 21, loss = 0.25804438
Iteration 22, loss = 0.21654335
Iteration 23, loss = 0.21085686
Iteration 24, loss = 0.21759954
Iteration 25, loss = 0.19382927
Iteration 26, loss = 0.17694611
Iteration 27, loss = 0.19403728
Iteration 28, loss = 0.21884553
Iteration 29, loss = 0.18643494
Iteration 30, loss = 0.16950896
Iteration 31, loss = 0.16518462
Iteration 32, los