In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, precision_score, recall_score)
from sklearn.model_selection import RandomizedSearchCV, train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier

# Modelos preditivo para o CBIS-DDSM

## Importação

Iniciamos com a importação do conjunto de dados e realizamos a visualização das 5 primeiras linhas para verificar a estrutura dos dados.

In [2]:
breast_cancer = pd.read_csv("../../outputs/mamografia/matriz_features_glcm.csv")
breast_cancer.head()

Unnamed: 0,dissimilarity_ang_0_dist_1,dissimilarity_ang_45_dist_1,dissimilarity_ang_90_dist_1,dissimilarity_ang_135_dist_1,dissimilarity_ang_0_dist_5,dissimilarity_ang_45_dist_5,dissimilarity_ang_90_dist_5,dissimilarity_ang_135_dist_5,correlation_ang_0_dist_1,correlation_ang_45_dist_1,...,ASM_ang_135_dist_5,energy_ang_0_dist_1,energy_ang_45_dist_1,energy_ang_90_dist_1,energy_ang_135_dist_1,energy_ang_0_dist_5,energy_ang_45_dist_5,energy_ang_90_dist_5,energy_ang_135_dist_5,pathology
0,0.796989,0.936951,0.853886,0.960574,1.221642,1.273973,1.215032,1.254506,0.99869,0.998135,...,0.474284,0.689123,0.689054,0.689091,0.689045,0.688969,0.688673,0.688494,0.688683,BENIGN
1,1.231098,1.446281,1.325998,1.504659,1.892971,1.935957,1.928993,2.03393,0.998911,0.998513,...,0.265534,0.516184,0.515934,0.516062,0.515914,0.515431,0.515333,0.515394,0.515299,BENIGN
2,0.786669,0.931541,0.833982,0.958347,1.226459,1.345679,1.3413,1.38896,0.998686,0.998076,...,0.481677,0.694554,0.694418,0.694455,0.694378,0.694268,0.69415,0.693838,0.69403,BENIGN_WITHOUT_CALLBACK
3,0.786669,0.931541,0.833982,0.958347,1.226459,1.345679,1.3413,1.38896,0.998686,0.998076,...,0.481677,0.694554,0.694418,0.694455,0.694378,0.694268,0.69415,0.693838,0.69403,BENIGN_WITHOUT_CALLBACK
4,1.147424,1.354507,1.186222,1.370834,1.769945,1.864975,1.730289,1.803568,0.998917,0.998541,...,0.30133,0.549273,0.549286,0.549392,0.549245,0.548992,0.548983,0.548912,0.548935,BENIGN_WITHOUT_CALLBACK


Como próximo passo, verificamos a distribuição das classes. Após análizar, foi visto que cerca de 60% dos dados são da classe BENIGN e o restante 40% são da classe MALIGNANT. 

In [3]:
breast_cancer['pathology'] = breast_cancer['pathology'].replace("BENIGN_WITHOUT_CALLBACK", "BENIGN")
breast_cancer['pathology'].value_counts(normalize=True)

BENIGN       0.591648
MALIGNANT    0.408352
Name: pathology, dtype: float64

Nesta Etapa, realizamos a separação dos dados em Features e labels

In [4]:
X, y = breast_cancer.drop('pathology', axis=1), breast_cancer['pathology']
X.shape, y.shape

((3568, 48), (3568,))

Como etapa final do estágio de importação, realizamos a divisão dos dados em treino e teste.

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2676, 48), (892, 48), (2676,), (892,))

## Pré-Processamento

Na etapa de pré-processamento, realizamos o tratamento das variáveis categóricas utilizando a abordagem Label Enconding. 

In [6]:
y_train = y_train.map({'BENIGN': 0, 'MALIGNANT': 1})
y_test = y_test.map({'BENIGN': 0, 'MALIGNANT': 1})

## Treinamento e Previsão dos modelos

Random Forest Classifier

In [7]:
angles = ['0', '45', '90', '135']
distances = ['1', '5']

# Treina com todos os angulos e distancias
cfr = RandomForestClassifier(n_estimators=150, max_depth=11)
cfr.fit(X_train, y_train)
predict = cfr.predict(X_test)

predicts = {'RandomForestClassifier': { 'all': predict }}
scores = {'RandomForestClassifier': {'all': {'accuracy_score': accuracy_score(y_test, predict),
                                             'recall_score': recall_score(y_test, predict),
                                             'precision_score': precision_score(y_test, predict)}}}

for distance in distances:
    for angle in angles:
        columns =  [column for column in X_train.columns if f'ang_{angle}_dist_{distance}' in column]
        X_train_filtered = X_train.loc[::, columns]
        X_test_filtered = X_test.loc[::, columns]
        
        cfr = RandomForestClassifier(n_estimators=150, max_depth=11)
        cfr.fit(X_train_filtered, y_train)
        predict = cfr.predict(X_test_filtered)
        
        predicts['RandomForestClassifier'][f'ang_{angle}_dist_{distance}'] = predict
        scores['RandomForestClassifier'][f'ang_{angle}_dist_{distance}'] = {
                                                'accuracy_score': accuracy_score(y_test, predict),
                                                'recall_score': recall_score(y_test, predict),
                                                'precision_score': precision_score(y_test, predict)
                                            }

KNeighborsClassifier

In [8]:
angles = ['0', '45', '90', '135']
distances = ['1', '5']

# Treina com todos os angulos e distancias
cKNN = KNeighborsClassifier(n_neighbors=4)
cKNN.fit(X_train, y_train)
predict = cKNN.predict(X_test)

predicts['KNeighborsClassifier'] = { 'all': predict }
scores['KNeighborsClassifier'] = {'all': {'accuracy_score': accuracy_score(y_test, predict),
                                             'recall_score': recall_score(y_test, predict),
                                             'precision_score': precision_score(y_test, predict)}}

for distance in distances:
    for angle in angles:
        columns =  [column for column in X_train.columns if f'ang_{angle}_dist_{distance}' in column]
        X_train_filtered = X_train.loc[::, columns]
        X_test_filtered = X_test.loc[::, columns]
        
        cKNN = KNeighborsClassifier(n_neighbors=4)
        cKNN.fit(X_train_filtered, y_train)
        predict = cKNN.predict(X_test_filtered)
        
        predicts['KNeighborsClassifier'][f'ang_{angle}_dist_{distance}'] = predict
        scores['KNeighborsClassifier'][f'ang_{angle}_dist_{distance}'] = {
                                                'accuracy_score': accuracy_score(y_test, predict),
                                                'recall_score': recall_score(y_test, predict),
                                                'precision_score': precision_score(y_test, predict)
                                        }

Multilayer Perceptron

In [9]:
# Função para criar o modelo do Keras
def create_model(shape):
    model = Sequential()
    model.add(Dense(100, input_dim=shape, activation='relu'))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

angles = ['0', '45', '90', '135']
distances = ['1', '5']

# Treina com todos os angulos e distancias
model = create_model(X_train.shape[1])
model.fit(X_train, y_train, epochs=150, batch_size=32, verbose=0)
predict = model.predict(X_test)
predict = np.vectorize(lambda value: 1 if value > 0.5 else 0)(predict)

predicts['MultilayerPerceptron'] = { 'all': predict }
scores['MultilayerPerceptron'] = {'all': {'accuracy_score': accuracy_score(y_test, predict),
                                             'recall_score': recall_score(y_test, predict),
                                             'precision_score': precision_score(y_test, predict)}}

for distance in distances:
    for angle in angles:
        columns =  [column for column in X_train.columns if f'ang_{angle}_dist_{distance}' in column]
        X_train_filtered = X_train.loc[::, columns]
        X_test_filtered = X_test.loc[::, columns]
        
        model = create_model(X_train_filtered.shape[1])
        model.fit(X_train_filtered, y_train, epochs=150, batch_size=32, verbose=0)
        predict = model.predict(X_test_filtered)
        predict = np.vectorize(lambda value: 1 if value > 0.5 else 0)(predict)
        
        predicts['MultilayerPerceptron'][f'ang_{angle}_dist_{distance}'] = predict
        scores['MultilayerPerceptron'][f'ang_{angle}_dist_{distance}'] = {
                                                'accuracy_score': accuracy_score(y_test, predict),
                                                'recall_score': recall_score(y_test, predict),
                                                'precision_score': precision_score(y_test, predict)
                                        }



## Avaliação dos modelos

Avaliamos o modelo Random Forest Classifier

In [10]:
scores_cfr = pd.DataFrame(scores['RandomForestClassifier'])
scores_cfr

Unnamed: 0,all,ang_0_dist_1,ang_45_dist_1,ang_90_dist_1,ang_135_dist_1,ang_0_dist_5,ang_45_dist_5,ang_90_dist_5,ang_135_dist_5
accuracy_score,0.688341,0.668161,0.654709,0.670404,0.659193,0.658072,0.653587,0.653587,0.668161
recall_score,0.448468,0.373259,0.401114,0.428969,0.406685,0.367688,0.359331,0.376045,0.35376
precision_score,0.66805,0.653659,0.607595,0.633745,0.616034,0.628571,0.620192,0.613636,0.664921


Avaliamos o modelo Kneighbor Classifier

In [11]:
scores_cKNN = pd.DataFrame(scores['KNeighborsClassifier'])
scores_cKNN

Unnamed: 0,all,ang_0_dist_1,ang_45_dist_1,ang_90_dist_1,ang_135_dist_1,ang_0_dist_5,ang_45_dist_5,ang_90_dist_5,ang_135_dist_5
accuracy_score,0.634529,0.619955,0.626682,0.624439,0.610987,0.595291,0.600897,0.595291,0.610987
recall_score,0.331476,0.29805,0.289694,0.328691,0.286908,0.259053,0.264624,0.300836,0.295265
precision_score,0.580488,0.551546,0.571429,0.556604,0.530928,0.494681,0.508021,0.495413,0.53


In [12]:
scores_mlp = pd.DataFrame(scores['MultilayerPerceptron'])
scores_mlp

Unnamed: 0,all,ang_0_dist_1,ang_45_dist_1,ang_90_dist_1,ang_135_dist_1,ang_0_dist_5,ang_45_dist_5,ang_90_dist_5,ang_135_dist_5
accuracy_score,0.665919,0.623318,0.625561,0.642377,0.624439,0.588565,0.616592,0.637892,0.610987
recall_score,0.481894,0.116992,0.164345,0.345404,0.167131,0.091922,0.108635,0.192201,0.125348
precision_score,0.607018,0.688525,0.634409,0.596154,0.625,0.445946,0.639344,0.676471,0.576923
