In [1]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import (NearMiss, OneSidedSelection,
                                     RandomUnderSampler)
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, precision_score, recall_score)
from sklearn.model_selection import (GridSearchCV, RandomizedSearchCV,
                                     train_test_split)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold

# Modelos preditivo para o CBIS-DDSM

## Importação e análise dos conjuntos de dados

### CBIS-DDSM

Iniciamos importando o conjunto de dados e realizamos a visualização das 5 primeiras linhas para verificar a estrutura dos dados.

In [2]:
breast_cancer_CBIS = pd.read_csv("../../outputs/mamografia/matriz_features_glcm_CBIS-DDSM.csv")
breast_cancer_CBIS.head()

Unnamed: 0,dissimilarity_ang_0_dist_1,dissimilarity_ang_45_dist_1,dissimilarity_ang_90_dist_1,dissimilarity_ang_135_dist_1,dissimilarity_ang_0_dist_5,dissimilarity_ang_45_dist_5,dissimilarity_ang_90_dist_5,dissimilarity_ang_135_dist_5,correlation_ang_0_dist_1,correlation_ang_45_dist_1,...,ASM_ang_135_dist_5,energy_ang_0_dist_1,energy_ang_45_dist_1,energy_ang_90_dist_1,energy_ang_135_dist_1,energy_ang_0_dist_5,energy_ang_45_dist_5,energy_ang_90_dist_5,energy_ang_135_dist_5,pathology
0,0.796989,0.936951,0.853886,0.960574,1.221642,1.273973,1.215032,1.254506,0.99869,0.998135,...,0.474284,0.689123,0.689054,0.689091,0.689045,0.688969,0.688673,0.688494,0.688683,BENIGN
1,1.231098,1.446281,1.325998,1.504659,1.892971,1.935957,1.928993,2.03393,0.998911,0.998513,...,0.265534,0.516184,0.515934,0.516062,0.515914,0.515431,0.515333,0.515394,0.515299,BENIGN
2,0.786669,0.931541,0.833982,0.958347,1.226459,1.345679,1.3413,1.38896,0.998686,0.998076,...,0.481677,0.694554,0.694418,0.694455,0.694378,0.694268,0.69415,0.693838,0.69403,BENIGN_WITHOUT_CALLBACK
3,0.786669,0.931541,0.833982,0.958347,1.226459,1.345679,1.3413,1.38896,0.998686,0.998076,...,0.481677,0.694554,0.694418,0.694455,0.694378,0.694268,0.69415,0.693838,0.69403,BENIGN_WITHOUT_CALLBACK
4,1.147424,1.354507,1.186222,1.370834,1.769945,1.864975,1.730289,1.803568,0.998917,0.998541,...,0.30133,0.549273,0.549286,0.549392,0.549245,0.548992,0.548983,0.548912,0.548935,BENIGN_WITHOUT_CALLBACK


Visualizamos alguns parâmetros

In [3]:
breast_cancer_CBIS.describe()

Unnamed: 0,dissimilarity_ang_0_dist_1,dissimilarity_ang_45_dist_1,dissimilarity_ang_90_dist_1,dissimilarity_ang_135_dist_1,dissimilarity_ang_0_dist_5,dissimilarity_ang_45_dist_5,dissimilarity_ang_90_dist_5,dissimilarity_ang_135_dist_5,correlation_ang_0_dist_1,correlation_ang_45_dist_1,...,ASM_ang_90_dist_5,ASM_ang_135_dist_5,energy_ang_0_dist_1,energy_ang_45_dist_1,energy_ang_90_dist_1,energy_ang_135_dist_1,energy_ang_0_dist_5,energy_ang_45_dist_5,energy_ang_90_dist_5,energy_ang_135_dist_5
count,3568.0,3568.0,3568.0,3568.0,3568.0,3568.0,3568.0,3568.0,3568.0,3568.0,...,3568.0,3568.0,3568.0,3568.0,3568.0,3568.0,3568.0,3568.0,3568.0,3568.0
mean,0.85026,1.125499,0.970825,1.140524,1.678363,1.849828,1.726688,1.852181,0.999377,0.999049,...,0.285119,0.285255,0.523289,0.522212,0.522294,0.522197,0.522173,0.52164,0.521552,0.521638
std,0.299587,0.308796,0.272569,0.314063,0.39199,0.430324,0.431596,0.431823,0.000729,0.000828,...,0.120071,0.12029,0.113531,0.114375,0.114271,0.114375,0.114413,0.114682,0.114483,0.114682
min,0.146612,0.234696,0.202975,0.241642,0.44519,0.462666,0.415108,0.464166,0.988412,0.986565,...,0.001227,0.001163,0.05294,0.036735,0.037679,0.036739,0.035773,0.034094,0.035029,0.034109
25%,0.599825,0.917735,0.7844,0.922212,1.432621,1.562181,1.440706,1.565234,0.999252,0.9989,...,0.198883,0.198842,0.447699,0.446827,0.446798,0.446762,0.446622,0.445996,0.445963,0.445917
50%,0.819691,1.084508,0.939918,1.099446,1.656573,1.840881,1.712126,1.83481,0.999497,0.999203,...,0.271772,0.271853,0.523004,0.521996,0.522062,0.521966,0.521763,0.52139,0.521318,0.521395
75%,1.063334,1.301125,1.128357,1.325149,1.913821,2.09792,1.989356,2.109243,0.999714,0.999449,...,0.357349,0.357555,0.599002,0.598386,0.598368,0.598387,0.598552,0.598013,0.597787,0.597959
max,2.731951,2.99277,2.80844,2.985727,3.682224,4.281462,3.778196,3.886962,0.999965,0.999915,...,0.779089,0.77972,0.883105,0.883087,0.882996,0.883076,0.883131,0.883037,0.88266,0.883017


Como último passo, verificamos a distribuição das classes. Após análizar, foi visto que as classes BENIGN e MALIGNANT possuem 40% dos dados cada e os 20% restantes são da classe BENIGN WITHOUT CALLBACK. 

In [4]:
breast_cancer_CBIS['pathology'].value_counts(normalize=True)

MALIGNANT                  0.408352
BENIGN                     0.400504
BENIGN_WITHOUT_CALLBACK    0.191143
Name: pathology, dtype: float64

### CMMD

Importamos e visualizamos as 5 primeiras linhas do CMMD

In [5]:
breast_cancer_CMMD = pd.read_csv("../../outputs/mamografia/matriz_features_glcm_CMMD.csv")
breast_cancer_CMMD.head()

Unnamed: 0,dissimilarity_ang_0_dist_1,dissimilarity_ang_45_dist_1,dissimilarity_ang_90_dist_1,dissimilarity_ang_135_dist_1,dissimilarity_ang_0_dist_5,dissimilarity_ang_45_dist_5,dissimilarity_ang_90_dist_5,dissimilarity_ang_135_dist_5,correlation_ang_0_dist_1,correlation_ang_45_dist_1,...,ASM_ang_135_dist_5,energy_ang_0_dist_1,energy_ang_45_dist_1,energy_ang_90_dist_1,energy_ang_135_dist_1,energy_ang_0_dist_5,energy_ang_45_dist_5,energy_ang_90_dist_5,energy_ang_135_dist_5,pathology
0,0.625476,0.707031,0.627003,0.705654,0.971535,1.002511,0.962851,1.005881,0.991534,0.989223,...,0.840798,0.917298,0.917229,0.917193,0.917241,0.917121,0.916889,0.916598,0.91695,Benign
1,1.184211,1.307271,1.150881,1.317487,1.731502,1.718066,1.622076,1.770468,0.989185,0.986792,...,0.660684,0.813474,0.813478,0.813445,0.813389,0.813061,0.813188,0.812932,0.812825,Benign
2,1.14027,1.30222,1.141987,1.289631,1.799599,1.92728,1.832726,1.854555,0.991724,0.989122,...,0.770689,0.878428,0.878354,0.878307,0.87834,0.878173,0.877946,0.877569,0.877889,Benign
3,1.682375,1.903251,1.64869,1.878657,2.547731,2.645756,2.406968,2.534075,0.987955,0.984806,...,0.596503,0.772685,0.772592,0.772693,0.772697,0.772142,0.771913,0.772185,0.772336,Benign
4,1.65904,1.898933,1.665683,1.884636,2.448483,2.618151,2.551372,2.555354,0.98604,0.981888,...,0.608706,0.781069,0.780941,0.78092,0.780936,0.780605,0.780213,0.779858,0.780196,Benign


Visualizamos alguns parâmetros

In [6]:
breast_cancer_CMMD.describe()

Unnamed: 0,dissimilarity_ang_0_dist_1,dissimilarity_ang_45_dist_1,dissimilarity_ang_90_dist_1,dissimilarity_ang_135_dist_1,dissimilarity_ang_0_dist_5,dissimilarity_ang_45_dist_5,dissimilarity_ang_90_dist_5,dissimilarity_ang_135_dist_5,correlation_ang_0_dist_1,correlation_ang_45_dist_1,...,ASM_ang_90_dist_5,ASM_ang_135_dist_5,energy_ang_0_dist_1,energy_ang_45_dist_1,energy_ang_90_dist_1,energy_ang_135_dist_1,energy_ang_0_dist_5,energy_ang_45_dist_5,energy_ang_90_dist_5,energy_ang_135_dist_5
count,5590.0,5590.0,5590.0,5590.0,5590.0,5590.0,5590.0,5590.0,5590.0,5590.0,...,5590.0,5590.0,5590.0,5590.0,5590.0,5590.0,5590.0,5590.0,5590.0,5590.0
mean,1.773209,1.983014,1.763502,1.982649,2.615231,2.74023,2.626762,2.725498,0.989032,0.986365,...,0.545747,0.545938,0.733159,0.73304,0.733079,0.733041,0.732563,0.732263,0.732159,0.732264
std,0.695062,0.776748,0.699679,0.778693,0.958755,1.01464,0.989343,1.013678,0.002661,0.004246,...,0.140213,0.140489,0.098357,0.098387,0.098328,0.098388,0.098589,0.098633,0.098449,0.098634
min,0.237596,0.25687,0.227297,0.25594,0.433435,0.417333,0.374059,0.41026,0.966044,0.787665,...,0.135658,0.135062,0.369938,0.369619,0.370014,0.369585,0.36802,0.367646,0.368317,0.367508
25%,1.256355,1.408674,1.249474,1.407995,1.897695,1.985317,1.905,1.981045,0.987367,0.984338,...,0.44667,0.446522,0.669229,0.669136,0.669211,0.669104,0.668471,0.668458,0.668334,0.668223
50%,1.67193,1.877374,1.662888,1.875448,2.476428,2.596584,2.48294,2.577388,0.98875,0.986109,...,0.548036,0.548208,0.741258,0.741153,0.741176,0.741122,0.740679,0.740381,0.740295,0.740411
75%,2.168485,2.421923,2.156655,2.42283,3.154367,3.312407,3.164098,3.287387,0.990376,0.98815,...,0.651789,0.652361,0.808386,0.808281,0.808246,0.808276,0.80797,0.807674,0.807335,0.807688
max,5.352412,8.845956,8.812825,8.835944,7.366577,7.916071,10.117438,7.881039,0.999791,0.99976,...,0.913343,0.913801,0.956095,0.956065,0.956033,0.95607,0.956003,0.955906,0.95569,0.95593


Verificamos a distribuição das classes. Após analisar, foi observado que o conjunto de dados possui um desbalanceamento de classes, com aproximadamente 75% dos dados pertencentes à classe MALIGNANT e os outros 25% à classe BENIGNA. 

In [7]:
breast_cancer_CMMD['pathology'].value_counts(normalize=True)

Malignant    0.746691
Benign       0.253309
Name: pathology, dtype: float64

### INBREAST

Importamos e visualizamos as 5 primeiras linhas do conjunto de dados

In [8]:
breast_cancer_INBREAST = pd.read_csv("../../outputs/mamografia/matriz_features_glcm_INBREAST.csv")
breast_cancer_INBREAST.head()

Unnamed: 0,dissimilarity_ang_0_dist_1,dissimilarity_ang_45_dist_1,dissimilarity_ang_90_dist_1,dissimilarity_ang_135_dist_1,dissimilarity_ang_0_dist_5,dissimilarity_ang_45_dist_5,dissimilarity_ang_90_dist_5,dissimilarity_ang_135_dist_5,correlation_ang_0_dist_1,correlation_ang_45_dist_1,...,ASM_ang_135_dist_5,energy_ang_0_dist_1,energy_ang_45_dist_1,energy_ang_90_dist_1,energy_ang_135_dist_1,energy_ang_0_dist_5,energy_ang_45_dist_5,energy_ang_90_dist_5,energy_ang_135_dist_5,pathology
0,0.016037,0.017468,0.015719,0.017442,0.0288,0.029273,0.027298,0.029408,0.99737,0.997112,...,0.780883,0.884444,0.884331,0.884372,0.884333,0.883883,0.883673,0.883513,0.883676,NORMAL
1,0.017885,0.019595,0.017781,0.019532,0.031353,0.032461,0.03054,0.032441,0.997383,0.997089,...,0.743717,0.863376,0.863225,0.863291,0.863231,0.862655,0.86238,0.862244,0.86239,BENIGN
2,0.027353,0.029184,0.026601,0.029546,0.045597,0.044503,0.040502,0.045867,0.996965,0.99679,...,0.656304,0.811537,0.811385,0.811521,0.811337,0.810315,0.810302,0.8103,0.810126,NORMAL
3,0.026346,0.028427,0.025535,0.028125,0.044639,0.044735,0.039885,0.043967,0.996876,0.996582,...,0.663569,0.815822,0.81563,0.815802,0.815675,0.814642,0.81445,0.814561,0.814597,BENIGN
4,0.083185,0.09112,0.083896,0.089706,0.123119,0.134849,0.129895,0.129309,0.994243,0.993665,...,0.298135,0.555955,0.55403,0.555682,0.554339,0.547608,0.545026,0.545581,0.546017,MALIGNANT


Visualizamos alguns parâmetros

In [9]:
breast_cancer_INBREAST.describe()

Unnamed: 0,dissimilarity_ang_0_dist_1,dissimilarity_ang_45_dist_1,dissimilarity_ang_90_dist_1,dissimilarity_ang_135_dist_1,dissimilarity_ang_0_dist_5,dissimilarity_ang_45_dist_5,dissimilarity_ang_90_dist_5,dissimilarity_ang_135_dist_5,correlation_ang_0_dist_1,correlation_ang_45_dist_1,...,ASM_ang_90_dist_5,ASM_ang_135_dist_5,energy_ang_0_dist_1,energy_ang_45_dist_1,energy_ang_90_dist_1,energy_ang_135_dist_1,energy_ang_0_dist_5,energy_ang_45_dist_5,energy_ang_90_dist_5,energy_ang_135_dist_5
count,410.0,410.0,410.0,410.0,410.0,410.0,410.0,410.0,410.0,410.0,...,410.0,410.0,410.0,410.0,410.0,410.0,410.0,410.0,410.0,410.0
mean,0.043193,0.047588,0.043223,0.047593,0.069278,0.073229,0.070425,0.073181,0.996339,0.995909,...,0.485857,0.485654,0.696396,0.695743,0.696306,0.695743,0.69319,0.692452,0.692614,0.692456
std,0.017056,0.018096,0.017268,0.018078,0.02228,0.023896,0.023826,0.02386,0.000974,0.001034,...,0.109828,0.109933,0.076649,0.076975,0.076672,0.076976,0.078232,0.078574,0.078477,0.07857
min,0.007549,0.009356,0.007691,0.009553,0.019388,0.020682,0.01997,0.021142,0.992977,0.992312,...,0.247958,0.246541,0.506457,0.505075,0.506576,0.504923,0.497711,0.497059,0.497954,0.496529
25%,0.030933,0.034055,0.030617,0.034185,0.052997,0.055882,0.052813,0.055589,0.995666,0.995232,...,0.403328,0.403074,0.639723,0.638973,0.639572,0.639092,0.636148,0.63457,0.635081,0.634881
50%,0.041357,0.045935,0.041207,0.04593,0.06835,0.071516,0.068556,0.071516,0.996489,0.996025,...,0.480162,0.480064,0.696523,0.69591,0.69634,0.695952,0.693722,0.692841,0.692937,0.692867
75%,0.054082,0.059064,0.054332,0.060109,0.084599,0.090279,0.087357,0.091268,0.997045,0.996687,...,0.558391,0.558716,0.749465,0.749086,0.74934,0.749142,0.747916,0.747241,0.747256,0.747473
max,0.097078,0.103722,0.098008,0.105129,0.129938,0.134865,0.135936,0.139875,0.999165,0.998935,...,0.821789,0.822115,0.907254,0.907179,0.90719,0.907176,0.906854,0.906717,0.906526,0.906706


Verificamos a distribuição das classes. Após analisar, foi observado que o conjunto de dados possui um desbalanceamento de classes, com aproximadamente 75% dos dados pertencentes à classe BENIGN e os outros 25% à classe MALIGNANT. 

In [10]:
breast_cancer_INBREAST['pathology'].value_counts(normalize=True)

BENIGN       0.592683
MALIGNANT    0.243902
NORMAL       0.163415
Name: pathology, dtype: float64

## Pré-Processamento

### CBIS-DDSM

Para ter o conjunto de dados com classes balanceadas, eliminamos os dados com classe BENIGN WITHOUT CALLBACK. 

In [11]:
breast_cancer_CBIS = breast_cancer_CBIS[breast_cancer_CBIS['pathology'] != 'BENIGN_WITHOUT_CALLBACK']
breast_cancer_CBIS['pathology'].value_counts(normalize=True)

MALIGNANT    0.504851
BENIGN       0.495149
Name: pathology, dtype: float64

Nesta Etapa, realizamos a separação dos dados em Features e labels

In [12]:
X_CBIS, y_CBIS = breast_cancer_CBIS.drop('pathology', axis=1), breast_cancer_CBIS['pathology']
X_CBIS.shape, y_CBIS.shape

((2886, 48), (2886,))

Para treinar e avaliar os modelos, realizamos a divisão dos dados em conjunto de validação e conjunto de teste. Utilizamos 20% dos dados para o conjunto de validação e os 80% restantes para o conjunto de teste, utilizando validação cruzada.

In [13]:
X_gs_CBIS, X_cv_CBIS, y_gs_CBIS, y_cv_CBIS = train_test_split(X_CBIS, y_CBIS, test_size=0.80)
X_gs_CBIS.shape, X_cv_CBIS.shape, y_gs_CBIS.shape, y_cv_CBIS.shape

((577, 48), (2309, 48), (577,), (2309,))

Realizamos a padronização dos dados, pois os modelos costumam apresentar um melhor desempenho quando aplicados a dados padronizados.

In [14]:
columns = X_CBIS.columns

scaler = StandardScaler()
X_gs_CBIS = pd.DataFrame(scaler.fit_transform(X_gs_CBIS), columns=columns)
X_cv_CBIS = pd.DataFrame(scaler.transform(X_cv_CBIS), columns=columns)

Na etapa final de pré-processamento, realizamos o tratamento das variáveis categóricas utilizando a abordagem Label Enconding. 

In [15]:
y_gs_CBIS = y_gs_CBIS.map({'BENIGN': 0, 'MALIGNANT': 1})
y_cv_CBIS = y_cv_CBIS.map({'BENIGN': 0, 'MALIGNANT': 1})

### CMMD

Nesta Etapa, realizamos a separação dos dados em Features e labels

In [16]:
X_CMMD, y_CMMD = breast_cancer_CMMD.drop('pathology', axis=1), breast_cancer_CMMD['pathology']
X_CMMD.shape, y_CMMD.shape

((5590, 48), (5590,))

Para balancear o conjunto de dados, utilizamos o método de Undersampling. A escolha desse método foi devido ao conjuntos de dados ter uma grande quantidade de dados.

In [17]:
nm = NearMiss(version=1)
X_CMMD, y_CMMD = nm.fit_resample(X_CMMD, y_CMMD)

y_CMMD.value_counts(normalize=True)

Benign       0.5
Malignant    0.5
Name: pathology, dtype: float64

Dividimos os dados em validação e teste. 

In [18]:
X_gs_CMMD, X_cv_CMMD, y_gs_CMMD, y_cv_CMMD = train_test_split(X_CMMD, y_CMMD, test_size=0.80)
X_gs_CMMD.shape, X_cv_CMMD.shape, y_gs_CMMD.shape, y_cv_CMMD.shape

y_gs_CMMD.value_counts(normalize=True), y_cv_CMMD.value_counts(normalize=True)

(Benign       0.503534
 Malignant    0.496466
 Name: pathology, dtype: float64,
 Malignant    0.500883
 Benign       0.499117
 Name: pathology, dtype: float64)

Realizamos a padronização dos dados

In [19]:
columns = X_CMMD.columns

scaler = StandardScaler()
X_gs_CMMD = pd.DataFrame(scaler.fit_transform(X_gs_CMMD), columns=columns)
X_cv_CMMD = pd.DataFrame(scaler.transform(X_cv_CMMD), columns=columns)

Transformação dos dados categóricos

In [20]:
y_gs_CMMD = y_gs_CMMD.map({'Benign': 0, 'Malignant': 1})
y_cv_CMMD = y_cv_CMMD.map({'Benign': 0, 'Malignant': 1})

### INBREAST

Para fins de padronização, excluímos os dados com classe NORMAL do conjunto de dados

In [21]:
breast_cancer_INBREAST = breast_cancer_INBREAST[breast_cancer_INBREAST['pathology'] != 'NORMAL']
breast_cancer_INBREAST['pathology'].value_counts(normalize=True)

BENIGN       0.708455
MALIGNANT    0.291545
Name: pathology, dtype: float64

Realizamos a divisão dos dados em features e label

In [22]:
X_INBREAST, y_INBREAST = breast_cancer_INBREAST.drop('pathology', axis=1), breast_cancer_INBREAST['pathology']
X_INBREAST.shape, y_INBREAST.shape

((343, 48), (343,))

Realizamos o balanceamento das classes. Pelo fato do conjunto de dados ser pequeno, utilizamos um método de Overshampling. Os métodos de Overshampling geram novas amostras da classe minoritária.

In [23]:
smote = SMOTE()
X_INBREAST, y_INBREAST = smote.fit_resample(X_INBREAST, y_INBREAST)

y_INBREAST.value_counts(normalize=True)

BENIGN       0.5
MALIGNANT    0.5
Name: pathology, dtype: float64

Dividimos os dados em validação e teste.

In [24]:
X_gs_INBREAST, X_cv_INBREAST, y_gs_INBREAST, y_cv_INBREAST = train_test_split(X_INBREAST, y_INBREAST, test_size=0.80)
X_gs_INBREAST.shape, X_cv_INBREAST.shape, y_gs_INBREAST.shape, y_cv_INBREAST.shape

y_gs_INBREAST.value_counts(normalize=True), y_cv_INBREAST.value_counts(normalize=True)

(BENIGN       0.505155
 MALIGNANT    0.494845
 Name: pathology, dtype: float64,
 MALIGNANT    0.501285
 BENIGN       0.498715
 Name: pathology, dtype: float64)

Realizamos a padronização dos dados

In [25]:
columns = X_INBREAST.columns

scaler = StandardScaler()
X_gs_INBREAST = pd.DataFrame(scaler.fit_transform(X_gs_INBREAST), columns=columns)
X_cv_INBREAST = pd.DataFrame(scaler.transform(X_cv_INBREAST), columns=columns)

Tratamos os dados categóricos

In [26]:
y_gs_INBREAST = y_gs_INBREAST.map({'BENIGN': 0, 'MALIGNANT': 1})
y_cv_INBREAST = y_cv_INBREAST.map({'BENIGN': 0, 'MALIGNANT': 1})

## Treinamento dos modelos

Função para fazer validação cruzada

In [27]:
def cross_validate(model, data_cv, target_cv, n_splits):
    kf = StratifiedKFold(n_splits = n_splits)
    
    acc = []
    predicts = []
    for train_index, test_index in kf.split(data_cv, target_cv): 
        
        model.fit(data_cv.iloc[train_index], target_cv.iloc[train_index])
        y_pred = model.predict(data_cv.iloc[test_index])
        acc.append(accuracy_score(y_pred, target_cv.iloc[test_index]))
        predicts.append(y_pred)

    return (np.array(acc)).mean() * 100, predicts

def cross_validate_deep(model, data_cv, target_cv, n_splits, n_epochs, batch_size):
    kf = StratifiedKFold(n_splits = n_splits)
    
    acc = []
    predicts = []
    for train_index, test_index in kf.split(data_cv, target_cv): 
        
        model.fit(data_cv.iloc[train_index], target_cv.iloc[train_index], epochs=n_epochs, batch_size=batch_size)
        y_pred = model.predict(data_cv.iloc[test_index])
        
        # Definir o limiar
        threshold = 0.5

        # Transformar as saídas em rótulos
        y_pred = (y_pred > threshold).astype(int)
        
        acc.append(accuracy_score(y_pred, target_cv.iloc[test_index]))
        predicts.append(y_pred)

    return (np.array(acc)).mean() * 100, predicts

In [28]:
columns = [column for column in X_cv_CBIS.columns if f'correlation' not in column]
X_cv_CBIS = X_cv_CBIS.loc[::, columns]

### KNeighborsClassifier

Seleção dos melhores parâmetros

In [29]:
""" parameters = {'n_neighbors': [1 ,2 ,3 ,4 , 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], 
              'weights' : ['uniform', 'distance'],
              'metric': ['euclidean', 'manhattan'],
              'algorithm': ['auto', 'brute', 'ball_tree', 'kd_tree']} # quais parâmetros e quais valores serão testados

clf = GridSearchCV(KNeighborsClassifier(), parameters, cv=5) # clf vai armazenar qual foi a melhor configuração
clf.fit(X_gs_CBIS, y_gs_CBIS)

print(clf.best_params_) """

# {'algorithm': 'auto', 'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}

" parameters = {'n_neighbors': [1 ,2 ,3 ,4 , 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], \n              'weights' : ['uniform', 'distance'],\n              'metric': ['euclidean', 'manhattan'],\n              'algorithm': ['auto', 'brute', 'ball_tree', 'kd_tree']} # quais parâmetros e quais valores serão testados\n\nclf = GridSearchCV(KNeighborsClassifier(), parameters, cv=5) # clf vai armazenar qual foi a melhor configuração\nclf.fit(X_gs_CBIS, y_gs_CBIS)\n\nprint(clf.best_params_) "

Treinamento

In [30]:
angles = ['0', '45', '90', '135']
distances = ['1', '5']

# Treina com todos os angulos e distancias
cKNN = KNeighborsClassifier(n_neighbors=5, weights='distance', metric='euclidean', algorithm='auto')
acurracy, predict = cross_validate(cKNN, X_cv_CBIS, y_cv_CBIS, 5)

predicts = {'KNeighborsClassifier': { 'all': predict } }
scores = {'KNeighborsClassifier': { 'all': { 'accuracy_score': acurracy } }}

for distance in distances:
    for angle in angles:
        columns = [column for column in X_cv_CBIS.columns if f'ang_135_dist_1' in column]
        X_cv_CBIS_filtered = X_cv_CBIS.loc[::, columns]
        
        cKNN = KNeighborsClassifier(n_neighbors=5, weights='distance', metric='euclidean', algorithm='auto')
        acurracy, predict = cross_validate(cKNN, X_cv_CBIS_filtered, y_cv_CBIS, 5)
        
        predicts['KNeighborsClassifier'][f'ang_{angle}_dist_{distance}'] = predict
        scores['KNeighborsClassifier'][f'ang_{angle}_dist_{distance}'] = { 'accuracy_score': acurracy }

### Random Forest Classifier

Seleção dos melhores parâmetros

In [31]:
""" parameters = {'n_estimators': [x for x in range(50, 550, 50)], 
              'max_depth' : [x for x in range(5, 11, 1)],
              'criterion': ['gini', 'entropy'],
              'min_samples_split': [2, 5],
              'min_samples_leaf': [x for x in range(1, 6)]} # quais parâmetros e quais valores serão testados

clf = GridSearchCV(RandomForestClassifier(), parameters, cv=5) # clf vai armazenar qual foi a melhor configuração
clf.fit(X_gs_CBIS, y_gs_CBIS)

print(clf.best_params_) """

# {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 200}

" parameters = {'n_estimators': [x for x in range(50, 550, 50)], \n              'max_depth' : [x for x in range(5, 11, 1)],\n              'criterion': ['gini', 'entropy'],\n              'min_samples_split': [2, 5],\n              'min_samples_leaf': [x for x in range(1, 6)]} # quais parâmetros e quais valores serão testados\n\nclf = GridSearchCV(RandomForestClassifier(), parameters, cv=5) # clf vai armazenar qual foi a melhor configuração\nclf.fit(X_gs_CBIS, y_gs_CBIS)\n\nprint(clf.best_params_) "

Treinamento

In [32]:
angles = ['0', '45', '90', '135']
distances = ['1', '5']

# Treina com todos os angulos e distancias
mlp = RandomForestClassifier(n_estimators=200, max_depth=10, criterion='gini', min_samples_leaf=3, min_samples_split=2)
acurracy, predict = cross_validate(mlp, X_cv_CBIS, y_cv_CBIS, 5)

predicts['RandomForestClassifier'] = { 'all': predict }
scores['RandomForestClassifier'] = { 'all': { 'accuracy_score': acurracy } }

for distance in distances:
    for angle in angles:
        columns = [column for column in X_cv_CBIS.columns if f'ang_135_dist_1' in column]
        X_cv_CBIS_filtered = X_cv_CBIS.loc[::, columns]
        
        cKNN = RandomForestClassifier(n_estimators=200, max_depth=10, criterion='gini', 
                                      min_samples_leaf=3, min_samples_split=2)
        acurracy, predict = cross_validate(cKNN, X_cv_CBIS_filtered, y_cv_CBIS, 5)
        
        predicts['RandomForestClassifier'][f'ang_{angle}_dist_{distance}'] = predict
        scores['RandomForestClassifier'][f'ang_{angle}_dist_{distance}'] = { 'accuracy_score': acurracy }

### Multilayer Perceptron

Seleção dos melhores parâmetros

In [33]:
""" def create_model(n_neurons, dropout):
    model = Sequential()
    model.add(Dense(50, input_dim=X_gs_CBIS.shape[1], activation='relu'))
    model.add(Dropout(dropout))
    model.add(Dense(n_neurons, activation='relu'))
    model.add(Dropout(dropout))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    return model

model = KerasClassifier(build_fn=create_model)
#optimizers = ['rmsprop', 'adam']
epochs = np.array([100, 150, 200, 250, 300])
batches = np.array([5, 10, 20])
n_neurons = np.array([20, 50])
dropouts = np.array([0.0, 0.2, 0.5])

param_grid = dict(nb_epoch=epochs, batch_size=batches, n_neurons=n_neurons, dropout=dropouts)
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='accuracy')
grid_result = grid.fit(X_gs_CBIS, y_gs_CBIS)

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
 """
# Best: 0.629075 using {'batch_size': 5, 'dropout': 0.2, 'n_neurons': 50, 'nb_epoch': 100}

' def create_model(n_neurons, dropout):\n    model = Sequential()\n    model.add(Dense(50, input_dim=X_gs_CBIS.shape[1], activation=\'relu\'))\n    model.add(Dropout(dropout))\n    model.add(Dense(n_neurons, activation=\'relu\'))\n    model.add(Dropout(dropout))\n    model.add(Dense(1, activation=\'sigmoid\'))\n    model.compile(loss=\'binary_crossentropy\', optimizer=\'rmsprop\', metrics=[\'accuracy\'])\n    return model\n\nmodel = KerasClassifier(build_fn=create_model)\n#optimizers = [\'rmsprop\', \'adam\']\nepochs = np.array([100, 150, 200, 250, 300])\nbatches = np.array([5, 10, 20])\nn_neurons = np.array([20, 50])\ndropouts = np.array([0.0, 0.2, 0.5])\n\nparam_grid = dict(nb_epoch=epochs, batch_size=batches, n_neurons=n_neurons, dropout=dropouts)\ngrid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring=\'accuracy\')\ngrid_result = grid.fit(X_gs_CBIS, y_gs_CBIS)\n\nprint("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))\n '

Treinamento

In [34]:
# Função para criar o modelo do Keras
def create_model(shape):
    model = Sequential()
    model.add(Dense(50, input_dim=shape, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(50, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    return model

angles = ['0', '45', '90', '135']
distances = ['1', '5']

columns = [column for column in X_cv_CBIS.columns if f'ang_135_dist_1' in column]
X_cv_CBIS_filtered = X_cv_CBIS.loc[::, columns]

# Treina com todos os angulos e distancias
mlp = create_model(X_cv_CBIS.shape[1])
acurracy, predict = cross_validate_deep(mlp, X_cv_CBIS, y_cv_CBIS, 5, 100, 5)

predicts['MultilayerPerceptron'] = { 'ang_135_dist_1': predict }
scores['MultilayerPerceptron'] = { 'ang_135_dist_1': { 'accuracy_score': acurracy } }

# Treina com todos os angulos e distancias
""" mlp = create_model(X_cv_CBIS.shape[1])
acurracy, predict = cross_validate_deep(mlp, X_cv_CBIS, y_cv_CBIS, 5, 100, 5)

predicts['MultilayerPerceptron'] = { 'all': predict }
scores['MultilayerPerceptron'] = { 'all': { 'accuracy_score': acurracy } } """

""" for distance in distances:
    for angle in angles:
        columns =  [column for column in X_cv_CBIS.columns if f'ang_{angle}_dist_{distance}' in column]
        X_cv_CBIS_filtered = X_cv_CBIS.loc[::, columns]
        
        mlp = create_model(X_cv_CBIS_filtered.shape[1])
        acurracy, predict = cross_validate_deep(mlp, X_cv_CBIS_filtered, y_cv_CBIS, 5, 100, 5)
        
        predicts['MultilayerPerceptron'][f'ang_{angle}_dist_{distance}'] = predict
        scores['MultilayerPerceptron'][f'ang_{angle}_dist_{distance}'] = { 'accuracy_score': acurracy } """

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

" for distance in distances:\n    for angle in angles:\n        columns =  [column for column in X_cv_CBIS.columns if f'ang_{angle}_dist_{distance}' in column]\n        X_cv_CBIS_filtered = X_cv_CBIS.loc[::, columns]\n        \n        mlp = create_model(X_cv_CBIS_filtered.shape[1])\n        acurracy, predict = cross_validate_deep(mlp, X_cv_CBIS_filtered, y_cv_CBIS, 5, 100, 5)\n        \n        predicts['MultilayerPerceptron'][f'ang_{angle}_dist_{distance}'] = predict\n        scores['MultilayerPerceptron'][f'ang_{angle}_dist_{distance}'] = { 'accuracy_score': acurracy } "

## Avaliação dos modelos

Avaliamos o modelo Random Forest Classifier

In [35]:
scores_cfr = pd.DataFrame(scores['RandomForestClassifier'])
scores_cfr

Unnamed: 0,all,ang_0_dist_1,ang_45_dist_1,ang_90_dist_1,ang_135_dist_1,ang_0_dist_5,ang_45_dist_5,ang_90_dist_5,ang_135_dist_5
accuracy_score,62.710464,60.804763,60.54521,60.501639,61.151647,60.934445,60.631697,60.848147,60.371956


Avaliamos o modelo Kneighbor Classifier

In [36]:
scores_cKNN = pd.DataFrame(scores['KNeighborsClassifier'])
scores_cKNN

Unnamed: 0,all,ang_0_dist_1,ang_45_dist_1,ang_90_dist_1,ang_135_dist_1,ang_0_dist_5,ang_45_dist_5,ang_90_dist_5,ang_135_dist_5
accuracy_score,60.373459,59.635744,59.635744,59.635744,59.635744,59.635744,59.635744,59.635744,59.635744


In [37]:
scores_mlp = pd.DataFrame(scores['MultilayerPerceptron'])
scores_mlp

Unnamed: 0,ang_135_dist_1
accuracy_score,65.094139
