Experimento 12 - Linear Interpolation Imputer, 3sigma outlier detection, Z-Score normalization, MLP classifier

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import seaborn.objects as so
from ucimlrepo import fetch_ucirepo

In [2]:
beans = fetch_ucirepo(id=602)
df = beans.data.features
targets = beans.data.targets

In [3]:
cols = ['Area', 'Perimeter', 'MajorAxisLength', 'MinorAxisLength', 'AspectRatio', 'Eccentricity', 'ConvexArea', 'EquivDiameter', 'Extent', 'Solidity', 'Roundness', 'Compactness', 'ShapeFactor1', 'ShapeFactor2', 'ShapeFactor3', 'ShapeFactor4']

In [4]:
import random

#Introducing Missing values (5%)

for index, i in enumerate(df):
  for jndex, j in enumerate(df[i]):
    if random.randint(0,100) < 5:
      df.loc[jndex,i] = np.NaN

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x[i][jndex] = np.NaN


In [5]:
df.isna().sum()

Area               672
Perimeter          672
MajorAxisLength    677
MinorAxisLength    684
AspectRatio        682
Eccentricity       652
ConvexArea         702
EquivDiameter      682
Extent             695
Solidity           675
Roundness          647
Compactness        700
ShapeFactor1       717
ShapeFactor2       662
ShapeFactor3       685
ShapeFactor4       662
dtype: int64

In [6]:
df

Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRatio,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,Roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4
0,28395.0,610.291,208.178117,173.888747,1.197191,0.549812,28715.0,190.141097,0.763923,0.988856,0.958027,0.913358,0.007332,0.003147,0.834222,0.998724
1,28734.0,638.018,200.524796,182.734419,1.097356,0.411785,29172.0,191.272751,0.783968,0.984986,0.887034,0.953861,0.006979,0.003564,0.909851,0.998430
2,29380.0,624.110,212.826130,175.931143,1.209713,0.562727,29690.0,193.410904,0.778113,0.989559,0.947849,0.908774,,0.003048,0.825871,0.999066
3,30008.0,645.884,210.557999,182.516516,1.153638,0.498616,30724.0,195.467062,0.782681,0.976696,0.903936,0.928329,0.007017,0.003215,0.861794,0.994199
4,30140.0,620.134,201.847882,190.279279,1.060798,0.333680,30417.0,195.896503,0.773098,0.990893,0.984877,0.970516,0.006697,0.003665,0.941900,0.999166
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13606,42097.0,759.696,288.721612,185.944705,1.552728,0.765002,42508.0,231.515799,0.714574,0.990331,0.916603,0.801865,0.006858,0.001749,0.642988,
13607,42101.0,757.499,281.576392,190.713136,1.476439,0.735702,42494.0,231.526798,0.799943,0.990752,0.922015,0.822252,0.006688,,,0.998219
13608,42139.0,759.321,281.539928,191.187979,1.472582,0.734065,42569.0,,0.729932,0.989899,0.918424,,0.006681,0.001888,0.676884,0.996767
13609,42147.0,763.779,283.382636,,1.489326,0.741055,42667.0,231.653247,0.705389,0.987813,0.907906,0.817457,0.006724,0.001852,0.668237,0.995222


In [7]:
#Imputing Missing Values with Linear Interpolation
df_imputed = df.interpolate(method="nearest", order=3, limit=None,
                            limit_direction='both').ffill().bfill()
df_imputed

Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRatio,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,Roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4
0,28395.0,610.291,208.178117,173.888747,1.197191,0.549812,28715.0,190.141097,0.763923,0.988856,0.958027,0.913358,0.007332,0.003147,0.834222,0.998724
1,28734.0,638.018,200.524796,182.734419,1.097356,0.411785,29172.0,191.272751,0.783968,0.984986,0.887034,0.953861,0.006979,0.003564,0.909851,0.998430
2,29380.0,624.110,212.826130,175.931143,1.209713,0.562727,29690.0,193.410904,0.778113,0.989559,0.947849,0.908774,0.006979,0.003048,0.825871,0.999066
3,30008.0,645.884,210.557999,182.516516,1.153638,0.498616,30724.0,195.467062,0.782681,0.976696,0.903936,0.928329,0.007017,0.003215,0.861794,0.994199
4,30140.0,620.134,201.847882,190.279279,1.060798,0.333680,30417.0,195.896503,0.773098,0.990893,0.984877,0.970516,0.006697,0.003665,0.941900,0.999166
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13606,42097.0,759.696,288.721612,185.944705,1.552728,0.765002,42508.0,231.515799,0.714574,0.990331,0.916603,0.801865,0.006858,0.001749,0.642988,0.998176
13607,42101.0,757.499,281.576392,190.713136,1.476439,0.735702,42494.0,231.526798,0.799943,0.990752,0.922015,0.822252,0.006688,0.001749,0.642988,0.998219
13608,42139.0,759.321,281.539928,191.187979,1.472582,0.734065,42569.0,231.526798,0.729932,0.989899,0.918424,0.822252,0.006681,0.001888,0.676884,0.996767
13609,42147.0,763.779,283.382636,191.187979,1.489326,0.741055,42667.0,231.653247,0.705389,0.987813,0.907906,0.817457,0.006724,0.001852,0.668237,0.995222


In [8]:
#adding the labels
df_imputed['Class'] = targets
df_pre_missing_values = df.copy()
print(df_imputed.groupby('Class').agg(['count']))
df_imputed['Class'] = df_imputed['Class'].transform(lambda x: 0 if x == 'BARBUNYA' else (1 if x == 'BOMBAY' else (2 if x == 'CALI' else (3 if x == 'DERMASON' else (4 if x == 'HOROZ' else (5 if x == 'SEKER' else 6))))))

          Area Perimeter MajorAxisLength MinorAxisLength AspectRatio  \
         count     count           count           count       count   
Class                                                                  
BARBUNYA  1322      1322            1322            1322        1322   
BOMBAY     522       522             522             522         522   
CALI      1630      1630            1630            1630        1630   
DERMASON  3546      3546            3546            3546        3546   
HOROZ     1928      1928            1928            1928        1928   
SEKER     2027      2027            2027            2027        2027   
SIRA      2636      2636            2636            2636        2636   

         Eccentricity ConvexArea EquivDiameter Extent Solidity Roundness  \
                count      count         count  count    count     count   
Class                                                                      
BARBUNYA         1322       1322          1322   13

In [9]:
#Outlier Removal with 3sigma
from scipy import stats


print(f'Pre Outlier Shape: {df_imputed.shape}')
df_no_outliers = df_imputed.copy()

print(df_no_outliers['Class'].unique())
for i in df_no_outliers['Class'].unique():
    class_unique = df_no_outliers[df_no_outliers['Class'] == i]
    for feature in class_unique:
      upper = class_unique[feature].mean() + (3 * class_unique[feature].std())
      lower = class_unique[feature].mean() - (3 * class_unique[feature].std())
      excluded = pd.Series(class_unique[class_unique[feature] < lower].index)
      #print(excluded.values)
      df_no_outliers.drop(excluded.values, inplace = True, errors='ignore')

print(df_no_outliers.groupby('Class').count())
print(f'Pos Outlier Shape: {df_no_outliers.shape}')


Pre Outlier Shape: (13611, 17)
[5 0 1 2 4 6 3]
       Area  Perimeter  MajorAxisLength  MinorAxisLength  AspectRatio  \
Class                                                                   
0      1261       1261             1261             1261         1261   
1       493        493              493              493          493   
2      1579       1579             1579             1579         1579   
3      3410       3410             3410             3410         3410   
4      1803       1803             1803             1803         1803   
5      1916       1916             1916             1916         1916   
6      2538       2538             2538             2538         2538   

       Eccentricity  ConvexArea  EquivDiameter  Extent  Solidity  Roundness  \
Class                                                                         
0              1261        1261           1261    1261      1261       1261   
1               493         493            493     493    

In [10]:
df_no_outliers['Class'] = df_no_outliers['Class'].transform(lambda x: 'BARBUNYA' if x == 0 else ('BOMBAY' if x == 1 else ('CALI' if x == 2 else ('DERMASON' if x == 3 else ('HOROZ' if x == 4 else ('SEKER' if x == 5 else 'SIRA'))))))
df_no_outliers = df_no_outliers.reset_index()
print(df_no_outliers.groupby('Class').agg(['count']))

         index  Area Perimeter MajorAxisLength MinorAxisLength AspectRatio  \
         count count     count           count           count       count   
Class                                                                        
BARBUNYA  1261  1261      1261            1261            1261        1261   
BOMBAY     493   493       493             493             493         493   
CALI      1579  1579      1579            1579            1579        1579   
DERMASON  3410  3410      3410            3410            3410        3410   
HOROZ     1803  1803      1803            1803            1803        1803   
SEKER     1916  1916      1916            1916            1916        1916   
SIRA      2538  2538      2538            2538            2538        2538   

         Eccentricity ConvexArea EquivDiameter Extent Solidity Roundness  \
                count      count         count  count    count     count   
Class                                                              

In [11]:
df_no_outliers = df_no_outliers.drop(['index'], axis='columns')
df_no_outliers

Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRatio,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,Roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4,Class
0,28395.0,610.291,208.178117,173.888747,1.197191,0.549812,28715.0,190.141097,0.763923,0.988856,0.958027,0.913358,0.007332,0.003147,0.834222,0.998724,SEKER
1,28734.0,638.018,200.524796,182.734419,1.097356,0.411785,29172.0,191.272751,0.783968,0.984986,0.887034,0.953861,0.006979,0.003564,0.909851,0.998430,SEKER
2,29380.0,624.110,212.826130,175.931143,1.209713,0.562727,29690.0,193.410904,0.778113,0.989559,0.947849,0.908774,0.006979,0.003048,0.825871,0.999066,SEKER
3,30279.0,620.134,212.560556,181.510182,1.171067,0.520401,30600.0,196.347702,0.775688,0.989510,0.943852,0.923726,0.007020,0.003153,0.853270,0.999236,SEKER
4,30477.0,670.033,211.050155,184.039050,1.146768,0.520401,30970.0,196.988633,0.762402,0.984081,0.853080,0.923726,0.006925,0.003242,0.871186,0.999236,SEKER
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12995,42097.0,759.696,288.721612,185.944705,1.552728,0.765002,42508.0,231.515799,0.714574,0.990331,0.916603,0.801865,0.006858,0.001749,0.642988,0.998176,DERMASON
12996,42101.0,757.499,281.576392,190.713136,1.476439,0.735702,42494.0,231.526798,0.799943,0.990752,0.922015,0.822252,0.006688,0.001749,0.642988,0.998219,DERMASON
12997,42139.0,759.321,281.539928,191.187979,1.472582,0.734065,42569.0,231.526798,0.729932,0.989899,0.918424,0.822252,0.006681,0.001888,0.676884,0.996767,DERMASON
12998,42147.0,763.779,283.382636,191.187979,1.489326,0.741055,42667.0,231.653247,0.705389,0.987813,0.907906,0.817457,0.006724,0.001852,0.668237,0.995222,DERMASON


In [12]:
#Normalization with Z-Score
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit_transform(df_no_outliers[cols])
df_scaled = pd.DataFrame(scaler.transform(df_no_outliers[cols]), columns = cols)

df_scaled['Class'] = df_no_outliers['Class']
df_scaled

Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRatio,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,Roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4,Class
0,-0.845446,-1.143608,-1.310847,-0.633366,-1.569864,-2.236292,-0.845966,-1.067826,0.278764,0.352359,1.443240,1.850821,0.685703,2.417571,1.945001,0.920161,SEKER
1,-0.833813,-1.013681,-1.400370,-0.435304,-1.975283,-3.764837,-0.830511,-1.048626,0.688445,-0.670286,0.205364,2.511190,0.371373,3.119785,2.715213,0.840314,SEKER
2,-0.811645,-1.078853,-1.256477,-0.587635,-1.519017,-2.093266,-0.812994,-1.012350,0.568786,0.538048,1.265778,1.776090,0.371373,2.249654,1.859945,1.013252,SEKER
3,-0.780796,-1.097484,-1.259584,-0.462716,-1.675953,-2.562003,-0.782221,-0.962525,0.519230,0.525109,1.196072,2.019865,0.408259,2.426830,2.138982,1.059395,SEKER
4,-0.774002,-0.863661,-1.277252,-0.406092,-1.774627,-2.562003,-0.769708,-0.951651,0.247678,-0.909205,-0.386670,2.019865,0.323482,2.577343,2.321448,1.059395,SEKER
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12995,-0.375263,-0.443507,-0.368704,-0.363423,-0.126066,0.146778,-0.379528,-0.365864,-0.729788,0.742149,0.720951,0.033026,0.264317,0.059297,-0.002571,0.771164,DERMASON
12996,-0.375126,-0.453802,-0.452284,-0.256654,-0.435867,-0.177698,-0.380001,-0.365678,1.014931,0.853229,0.815322,0.365419,0.112546,0.059297,-0.002571,0.782738,DERMASON
12997,-0.373822,-0.445264,-0.452710,-0.246022,-0.451533,-0.195831,-0.377465,-0.365678,-0.415906,0.627878,0.752703,0.365419,0.106403,0.294040,0.342637,0.387961,DERMASON
12998,-0.373547,-0.424374,-0.431156,-0.246022,-0.383535,-0.118422,-0.374151,-0.363532,-0.917509,0.076669,0.569312,0.287246,0.144222,0.232906,0.254569,-0.032234,DERMASON


In [13]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_validate


x_train = df_scaled.iloc[:, 0:16]
y_train = df_scaled.iloc[:, 16]
classifier = MLPClassifier(activation='logistic', solver='adam', alpha=1e-5, hidden_layer_sizes=(12, 3), random_state=1, verbose=True, learning_rate_init=0.3, tol=1e-3, max_iter=500)
scoring = {'acc' : 'accuracy',
           'prec' : 'precision_macro',
           'recall' : 'recall_macro',
           'f1' : 'f1_macro'}


y_pred = cross_validate(classifier, x_train, y_train, cv=10, scoring=scoring, return_train_score=True)
print('Experimento 12')
print('Acurácia Média: ' + '%.2f' % (np.mean(y_pred['test_acc'])*100) + '%')
print('Precisão Média: ' + '%.2f' % (np.mean(y_pred['test_prec'])*100) + '%')
print('Revocação Média: ' + '%.2f' % (np.mean(y_pred['test_recall'])*100) + '%')
print('F1-Score Médio: ' + '%.2f' % (np.mean(y_pred['test_f1'])*100) + '%')


Iteration 1, loss = 1.16136886
Iteration 2, loss = 0.64292893
Iteration 3, loss = 0.49733745
Iteration 4, loss = 0.47080276
Iteration 5, loss = 0.45592648
Iteration 6, loss = 0.43828026
Iteration 7, loss = 0.43485763
Iteration 8, loss = 0.42719378
Iteration 9, loss = 0.41970146
Iteration 10, loss = 0.34039292
Iteration 11, loss = 0.28279107
Iteration 12, loss = 0.27252825
Iteration 13, loss = 0.26747249
Iteration 14, loss = 0.23289053
Iteration 15, loss = 0.21622002
Iteration 16, loss = 0.20022900
Iteration 17, loss = 0.20597074
Iteration 18, loss = 0.20187298
Iteration 19, loss = 0.17921407
Iteration 20, loss = 0.18611421
Iteration 21, loss = 0.17536257
Iteration 22, loss = 0.19396260
Iteration 23, loss = 0.18249806
Iteration 24, loss = 0.18139748
Iteration 25, loss = 0.17987554
Iteration 26, loss = 0.18751937
Iteration 27, loss = 0.18049249
Iteration 28, loss = 0.18399520
Iteration 29, loss = 0.17570437
Iteration 30, loss = 0.17582911
Iteration 31, loss = 0.16098210
Iteration 32, los