Experimento 12 - Linear Interpolation Imputer, 3sigma outlier detection, Z-Score normalization, MLP classifier

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import seaborn.objects as so
from ucimlrepo import fetch_ucirepo

In [2]:
beans = fetch_ucirepo(id=602)
df = beans.data.features
targets = beans.data.targets

In [3]:
cols = ['Area', 'Perimeter', 'MajorAxisLength', 'MinorAxisLength', 'AspectRatio', 'Eccentricity', 'ConvexArea', 'EquivDiameter', 'Extent', 'Solidity', 'Roundness', 'Compactness', 'ShapeFactor1', 'ShapeFactor2', 'ShapeFactor3', 'ShapeFactor4']

In [4]:
import random

#Introducing Missing values (5%)

for index, i in enumerate(df):
  for jndex, j in enumerate(df[i]):
    if random.randint(0,100) < 5:
      df.loc[jndex,i] = np.NaN

In [5]:
df.isna().sum()

Area               668
Perimeter          689
MajorAxisLength    686
MinorAxisLength    695
AspectRatio        652
Eccentricity       620
ConvexArea         678
EquivDiameter      681
Extent             689
Solidity           671
Roundness          713
Compactness        667
ShapeFactor1       676
ShapeFactor2       678
ShapeFactor3       732
ShapeFactor4       675
dtype: int64

In [6]:
df

Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRatio,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,Roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4
0,28395.0,610.291,208.178117,173.888747,1.197191,0.549812,28715.0,190.141097,0.763923,0.988856,0.958027,0.913358,0.007332,0.003147,,0.998724
1,28734.0,638.018,200.524796,182.734419,1.097356,0.411785,29172.0,191.272751,0.783968,0.984986,,0.953861,0.006979,0.003564,0.909851,0.998430
2,29380.0,624.110,212.826130,175.931143,,0.562727,29690.0,,0.778113,0.989559,0.947849,0.908774,,0.003048,0.825871,0.999066
3,30008.0,,210.557999,182.516516,1.153638,0.498616,30724.0,195.467062,0.782681,0.976696,0.903936,,0.007017,0.003215,0.861794,0.994199
4,30140.0,620.134,201.847882,190.279279,1.060798,0.333680,30417.0,,0.773098,0.990893,0.984877,0.970516,0.006697,0.003665,,0.999166
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13606,42097.0,759.696,288.721612,185.944705,1.552728,0.765002,42508.0,231.515799,0.714574,0.990331,0.916603,,0.006858,0.001749,0.642988,0.998385
13607,42101.0,757.499,281.576392,190.713136,1.476439,0.735702,42494.0,,0.799943,0.990752,0.922015,0.822252,0.006688,0.001886,0.676099,
13608,42139.0,759.321,,191.187979,1.472582,0.734065,42569.0,231.631261,0.729932,0.989899,,0.822730,0.006681,0.001888,0.676884,0.996767
13609,42147.0,763.779,283.382636,190.275731,1.489326,0.741055,42667.0,231.653247,0.705389,0.987813,0.907906,0.817457,0.006724,0.001852,0.668237,0.995222


In [7]:
#Imputing Missing Values with Linear Interpolation
df_imputed = df.interpolate(method="nearest", order=3, limit=None,
                            limit_direction='both').ffill().bfill()
df_imputed

Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRatio,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,Roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4
0,28395.0,610.291,208.178117,173.888747,1.197191,0.549812,28715.0,190.141097,0.763923,0.988856,0.958027,0.913358,0.007332,0.003147,0.909851,0.998724
1,28734.0,638.018,200.524796,182.734419,1.097356,0.411785,29172.0,191.272751,0.783968,0.984986,0.958027,0.953861,0.006979,0.003564,0.909851,0.998430
2,29380.0,624.110,212.826130,175.931143,1.097356,0.562727,29690.0,191.272751,0.778113,0.989559,0.947849,0.908774,0.006979,0.003048,0.825871,0.999066
3,30008.0,624.110,210.557999,182.516516,1.153638,0.498616,30724.0,195.467062,0.782681,0.976696,0.903936,0.908774,0.007017,0.003215,0.861794,0.994199
4,30140.0,620.134,201.847882,190.279279,1.060798,0.333680,30417.0,195.467062,0.773098,0.990893,0.984877,0.970516,0.006697,0.003665,0.861794,0.999166
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13606,42097.0,759.696,288.721612,185.944705,1.552728,0.765002,42508.0,231.515799,0.714574,0.990331,0.916603,0.836460,0.006858,0.001749,0.642988,0.998385
13607,42101.0,757.499,281.576392,190.713136,1.476439,0.735702,42494.0,231.515799,0.799943,0.990752,0.922015,0.822252,0.006688,0.001886,0.676099,0.998385
13608,42139.0,759.321,281.576392,191.187979,1.472582,0.734065,42569.0,231.631261,0.729932,0.989899,0.922015,0.822730,0.006681,0.001888,0.676884,0.996767
13609,42147.0,763.779,283.382636,190.275731,1.489326,0.741055,42667.0,231.653247,0.705389,0.987813,0.907906,0.817457,0.006724,0.001852,0.668237,0.995222


In [8]:
#adding the labels
df_imputed['Class'] = targets
df_pre_missing_values = df.copy()
print(df_imputed.groupby('Class').agg(['count']))
df_imputed['Class'] = df_imputed['Class'].transform(lambda x: 0 if x == 'BARBUNYA' else (1 if x == 'BOMBAY' else (2 if x == 'CALI' else (3 if x == 'DERMASON' else (4 if x == 'HOROZ' else (5 if x == 'SEKER' else 6))))))

          Area Perimeter MajorAxisLength MinorAxisLength AspectRatio  \
         count     count           count           count       count   
Class                                                                  
BARBUNYA  1322      1322            1322            1322        1322   
BOMBAY     522       522             522             522         522   
CALI      1630      1630            1630            1630        1630   
DERMASON  3546      3546            3546            3546        3546   
HOROZ     1928      1928            1928            1928        1928   
SEKER     2027      2027            2027            2027        2027   
SIRA      2636      2636            2636            2636        2636   

         Eccentricity ConvexArea EquivDiameter Extent Solidity Roundness  \
                count      count         count  count    count     count   
Class                                                                      
BARBUNYA         1322       1322          1322   13

In [9]:
#Outlier Removal with 3sigma
from scipy import stats


print(f'Pre Outlier Shape: {df_imputed.shape}')
df_no_outliers = df_imputed.copy()

print(df_no_outliers['Class'].unique())
for i in df_no_outliers['Class'].unique():
    class_unique = df_no_outliers[df_no_outliers['Class'] == i]
    for feature in class_unique:
      upper = class_unique[feature].mean() + (3 * class_unique[feature].std())
      lower = class_unique[feature].mean() - (3 * class_unique[feature].std())
      excluded = pd.Series(class_unique[class_unique[feature] < lower].index)
      #print(excluded.values)
      df_no_outliers.drop(excluded.values, inplace = True, errors='ignore')

print(df_no_outliers.groupby('Class').count())
print(f'Pos Outlier Shape: {df_no_outliers.shape}')


Pre Outlier Shape: (13611, 17)
[5 0 1 2 4 6 3]
       Area  Perimeter  MajorAxisLength  MinorAxisLength  AspectRatio  \
Class                                                                   
0      1261       1261             1261             1261         1261   
1       497        497              497              497          497   
2      1578       1578             1578             1578         1578   
3      3415       3415             3415             3415         3415   
4      1807       1807             1807             1807         1807   
5      1919       1919             1919             1919         1919   
6      2531       2531             2531             2531         2531   

       Eccentricity  ConvexArea  EquivDiameter  Extent  Solidity  Roundness  \
Class                                                                         
0              1261        1261           1261    1261      1261       1261   
1               497         497            497     497    

In [10]:
df_no_outliers['Class'] = df_no_outliers['Class'].transform(lambda x: 'BARBUNYA' if x == 0 else ('BOMBAY' if x == 1 else ('CALI' if x == 2 else ('DERMASON' if x == 3 else ('HOROZ' if x == 4 else ('SEKER' if x == 5 else 'SIRA'))))))
df_no_outliers = df_no_outliers.reset_index()
print(df_no_outliers.groupby('Class').agg(['count']))

         index  Area Perimeter MajorAxisLength MinorAxisLength AspectRatio  \
         count count     count           count           count       count   
Class                                                                        
BARBUNYA  1261  1261      1261            1261            1261        1261   
BOMBAY     497   497       497             497             497         497   
CALI      1578  1578      1578            1578            1578        1578   
DERMASON  3415  3415      3415            3415            3415        3415   
HOROZ     1807  1807      1807            1807            1807        1807   
SEKER     1919  1919      1919            1919            1919        1919   
SIRA      2531  2531      2531            2531            2531        2531   

         Eccentricity ConvexArea EquivDiameter Extent Solidity Roundness  \
                count      count         count  count    count     count   
Class                                                              

In [11]:
df_no_outliers = df_no_outliers.drop(['index'], axis='columns')
df_no_outliers

Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRatio,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,Roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4,Class
0,28395.0,610.291,208.178117,173.888747,1.197191,0.549812,28715.0,190.141097,0.763923,0.988856,0.958027,0.913358,0.007332,0.003147,0.909851,0.998724,SEKER
1,28734.0,638.018,200.524796,182.734419,1.097356,0.411785,29172.0,191.272751,0.783968,0.984986,0.958027,0.953861,0.006979,0.003564,0.909851,0.998430,SEKER
2,29380.0,624.110,212.826130,175.931143,1.097356,0.562727,29690.0,191.272751,0.778113,0.989559,0.947849,0.908774,0.006979,0.003048,0.825871,0.999066,SEKER
3,30279.0,634.927,212.560556,181.510182,1.171067,0.520401,30600.0,196.347702,0.775688,0.989510,0.943852,0.923726,0.007020,0.003153,0.853270,0.999236,SEKER
4,30477.0,670.033,211.050155,184.039050,1.146768,0.489478,30970.0,196.988633,0.762402,0.984081,0.853080,0.933374,0.006925,0.003242,0.871186,0.999049,SEKER
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13003,42097.0,759.696,288.721612,185.944705,1.552728,0.765002,42508.0,231.515799,0.714574,0.990331,0.916603,0.836460,0.006858,0.001749,0.642988,0.998385,DERMASON
13004,42101.0,757.499,281.576392,190.713136,1.476439,0.735702,42494.0,231.515799,0.799943,0.990752,0.922015,0.822252,0.006688,0.001886,0.676099,0.998385,DERMASON
13005,42139.0,759.321,281.576392,191.187979,1.472582,0.734065,42569.0,231.631261,0.729932,0.989899,0.922015,0.822730,0.006681,0.001888,0.676884,0.996767,DERMASON
13006,42147.0,763.779,283.382636,190.275731,1.489326,0.741055,42667.0,231.653247,0.705389,0.987813,0.907906,0.817457,0.006724,0.001852,0.668237,0.995222,DERMASON


In [12]:
#Normalization with Z-Score
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit_transform(df_no_outliers[cols])
df_scaled = pd.DataFrame(scaler.transform(df_no_outliers[cols]), columns = cols)

df_scaled['Class'] = df_no_outliers['Class']
df_scaled

Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRatio,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,Roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4,Class
0,-0.843317,-1.141615,-1.308972,-0.631972,-1.568610,-2.223043,-0.843951,-1.065581,0.278140,0.351222,1.442120,1.855382,0.684811,2.411027,2.711268,0.920668,SEKER
1,-0.831725,-1.011927,-1.398323,-0.434550,-1.973414,-3.744892,-0.828548,-1.046429,0.688392,-0.671623,1.442120,2.516391,0.370591,3.111578,2.711268,0.840464,SEKER
2,-0.809635,-1.076980,-1.254707,-0.586389,-1.973414,-2.080644,-0.811090,-1.046429,0.568566,0.536948,1.264534,1.780579,0.370591,2.243508,1.857504,1.014176,SEKER
3,-0.778893,-1.026385,-1.257808,-0.461873,-1.674538,-2.547327,-0.780419,-0.960542,0.518941,0.524006,1.194780,2.024591,0.407464,2.420265,2.136051,1.060525,SEKER
4,-0.772123,-0.862183,-1.275441,-0.405432,-1.773062,-2.888273,-0.767948,-0.949695,0.247011,-0.910589,-0.389064,2.182039,0.322716,2.570421,2.318196,1.009421,SEKER
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13003,-0.374775,-0.442800,-0.368640,-0.362901,-0.127002,0.149587,-0.379071,-0.365363,-0.731818,0.741088,0.719329,0.600416,0.263571,0.058339,-0.001735,0.828146,DERMASON
13004,-0.374639,-0.453076,-0.452060,-0.256477,-0.436333,-0.173467,-0.379543,-0.365363,1.015332,0.852190,0.813765,0.368542,0.111853,0.288427,0.334880,0.828146,DERMASON
13005,-0.373339,-0.444554,-0.452060,-0.245879,-0.451975,-0.191521,-0.377015,-0.363409,-0.417499,0.626795,0.813765,0.376336,0.105712,0.292526,0.342866,0.386089,DERMASON
13006,-0.373066,-0.423703,-0.430972,-0.266239,-0.384081,-0.114451,-0.373712,-0.363037,-0.919801,0.075478,0.567584,0.290293,0.143518,0.231537,0.254953,-0.035986,DERMASON


In [13]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_validate


x_train = df_scaled.iloc[:, 0:16]
y_train = df_scaled.iloc[:, 16]
classifier = MLPClassifier(activation='logistic', solver='adam', alpha=1e-5, hidden_layer_sizes=(12, 3), random_state=1, verbose=True, learning_rate_init=0.3, tol=1e-3, max_iter=500)
scoring = {'acc' : 'accuracy',
           'prec' : 'precision_macro',
           'recall' : 'recall_macro',
           'f1' : 'f1_macro'}


y_pred = cross_validate(classifier, x_train, y_train, cv=10, scoring=scoring, return_train_score=True)
print('Experimento 12')
print('Acurácia Média: ' + '%.2f' % (np.mean(y_pred['test_acc'])*100) + '%')
print('Precisão Média: ' + '%.2f' % (np.mean(y_pred['test_prec'])*100) + '%')
print('Revocação Média: ' + '%.2f' % (np.mean(y_pred['test_recall'])*100) + '%')
print('F1-Score Médio: ' + '%.2f' % (np.mean(y_pred['test_f1'])*100) + '%')


Iteration 1, loss = 1.19385607
Iteration 2, loss = 0.85826310
Iteration 3, loss = 0.56864499
Iteration 4, loss = 0.50971298
Iteration 5, loss = 0.33920637
Iteration 6, loss = 0.22458447
Iteration 7, loss = 0.19126820
Iteration 8, loss = 0.19883450
Iteration 9, loss = 0.18849355
Iteration 10, loss = 0.18589094
Iteration 11, loss = 0.18930835
Iteration 12, loss = 0.16973104
Iteration 13, loss = 0.16631811
Iteration 14, loss = 0.17018963
Iteration 15, loss = 0.16689053
Iteration 16, loss = 0.17376167
Iteration 17, loss = 0.16673802
Iteration 18, loss = 0.19233249
Iteration 19, loss = 0.18215390
Iteration 20, loss = 0.16405820
Iteration 21, loss = 0.19418281
Iteration 22, loss = 0.16354400
Iteration 23, loss = 0.16839186
Iteration 24, loss = 0.20378246
Iteration 25, loss = 0.17844230
Iteration 26, loss = 0.17467153
Iteration 27, loss = 0.17343411
Iteration 28, loss = 0.17845131
Iteration 29, loss = 0.16844675
Iteration 30, loss = 0.15405575
Iteration 31, loss = 0.16250191
Iteration 32, los