In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, precision_score, recall_score)
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier

' from keras.models import Sequential\nfrom keras.layers import Dense '

# Modelos preditivo para o CBIS-DDSM

## Importação

Iniciamos com a importação do conjunto de dados e realizamos a visualização das 5 primeiras linhas para verificar a estrutura dos dados.

In [2]:
breast_cancer = pd.read_csv("../../outputs/mamografia/matriz_features_glcm.csv")
breast_cancer.head()

Unnamed: 0,dissimilarity_ang_0_dist_1,dissimilarity_ang_45_dist_1,dissimilarity_ang_90_dist_1,dissimilarity_ang_135_dist_1,dissimilarity_ang_0_dist_5,dissimilarity_ang_45_dist_5,dissimilarity_ang_90_dist_5,dissimilarity_ang_135_dist_5,correlation_ang_0_dist_1,correlation_ang_45_dist_1,...,ASM_ang_135_dist_5,energy_ang_0_dist_1,energy_ang_45_dist_1,energy_ang_90_dist_1,energy_ang_135_dist_1,energy_ang_0_dist_5,energy_ang_45_dist_5,energy_ang_90_dist_5,energy_ang_135_dist_5,pathology
0,0.867058,0.989288,0.869534,0.958089,1.558669,1.609807,1.330008,1.438225,0.998038,0.997318,...,0.374557,0.613519,0.612996,0.612982,0.613068,0.612237,0.611475,0.611724,0.61201,BENIGN
1,1.59157,1.756165,1.622403,1.851661,2.675529,2.538154,2.502785,2.851484,0.998133,0.997718,...,0.065602,0.259952,0.259348,0.260183,0.259283,0.255886,0.256496,0.258937,0.256129,BENIGN
2,0.880291,0.962439,0.879317,1.023709,1.631688,1.470731,1.372944,1.684637,0.997972,0.997525,...,0.356365,0.600135,0.599574,0.599586,0.59944,0.597867,0.597475,0.597672,0.596963,BENIGN_WITHOUT_CALLBACK
3,0.880291,0.962439,0.879317,1.023709,1.631688,1.470731,1.372944,1.684637,0.997972,0.997525,...,0.356365,0.600135,0.599574,0.599586,0.59944,0.597867,0.597475,0.597672,0.596963,BENIGN_WITHOUT_CALLBACK
4,1.496904,1.725108,1.533977,1.677406,2.473641,2.634147,2.381861,2.410005,0.998009,0.997348,...,0.090927,0.304731,0.304179,0.304973,0.304221,0.300906,0.301545,0.303951,0.301541,BENIGN_WITHOUT_CALLBACK


Como próximo passo, verificamos a distribuição das classes. Após análizar, foi visto que cerca de 60% dos dados são da classe BENIGN e o restante 40% são da classe MALIGNANT. 

In [3]:
breast_cancer['pathology'] = breast_cancer['pathology'].replace("BENIGN_WITHOUT_CALLBACK", "BENIGN")
breast_cancer['pathology'].value_counts(normalize=True)

BENIGN       0.591648
MALIGNANT    0.408352
Name: pathology, dtype: float64

Nesta Etapa, realizamos a separação dos dados em Features e labels

In [4]:
X, y = breast_cancer.drop('pathology', axis=1), breast_cancer['pathology']
X.shape, y.shape

((3568, 48), (3568,))

Como etapa final do estágio de importação, realizamos a divisão dos dados em treino e teste.

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2676, 48), (892, 48), (2676,), (892,))

## Pré-Processamento

Na etapa de pré-processamento, realizamos o tratamento das variáveis categóricas utilizando a abordagem Label Enconding. 

In [6]:
y_train = y_train.map({'BENIGN': 0, 'MALIGNANT': 1})
y_test = y_test.map({'BENIGN': 0, 'MALIGNANT': 1})

## Treinamento e Previsão dos modelos

Random Forest Classifier

In [7]:
angles = ['0', '45', '90', '135']
distances = ['1', '5']

# Treina com todos os angulos e distancias
cfr = RandomForestClassifier(n_estimators=150, max_depth=11)
cfr.fit(X_train, y_train)
predict = cfr.predict(X_test)

predicts = {'RandomForestClassifier': { 'all': predict }}
scores = {'RandomForestClassifier': {'all': {'accuracy_score': accuracy_score(y_test, predict),
                                             'recall_score': recall_score(y_test, predict),
                                             'precision_score': precision_score(y_test, predict)}}}

for distance in distances:
    for angle in angles:
        columns =  [column for column in X_train.columns if f'ang_{angle}_dist_{distance}' in column]
        X_train_filtered = X_train.loc[::, columns]
        X_test_filtered = X_test.loc[::, columns]
        
        cfr = RandomForestClassifier(n_estimators=150, max_depth=11)
        cfr.fit(X_train_filtered, y_train)
        predict = cfr.predict(X_test_filtered)
        
        predicts['RandomForestClassifier'][f'ang_{angle}_dist_{distance}'] = predict
        scores['RandomForestClassifier'][f'ang_{angle}_dist_{distance}'] = {
                                                'accuracy_score': accuracy_score(y_test, predict),
                                                'recall_score': recall_score(y_test, predict),
                                                'precision_score': precision_score(y_test, predict)
                                            }

KNeighborsClassifier

In [8]:
angles = ['0', '45', '90', '135']
distances = ['1', '5']

# Treina com todos os angulos e distancias
cKNN = KNeighborsClassifier(n_neighbors=4)
cKNN.fit(X_train, y_train)
predict = cKNN.predict(X_test)

predicts['KNeighborsClassifier'] = { 'all': predict }
scores['KNeighborsClassifier'] = {'all': {'accuracy_score': accuracy_score(y_test, predict),
                                             'recall_score': recall_score(y_test, predict),
                                             'precision_score': precision_score(y_test, predict)}}

for distance in distances:
    for angle in angles:
        columns =  [column for column in X_train.columns if f'ang_{angle}_dist_{distance}' in column]
        X_train_filtered = X_train.loc[::, columns]
        X_test_filtered = X_test.loc[::, columns]
        
        cKNN = KNeighborsClassifier(n_neighbors=4)
        cKNN.fit(X_train_filtered, y_train)
        predict = cKNN.predict(X_test_filtered)
        
        predicts['KNeighborsClassifier'][f'ang_{angle}_dist_{distance}'] = predict
        scores['KNeighborsClassifier'][f'ang_{angle}_dist_{distance}'] = {
                                                'accuracy_score': accuracy_score(y_test, predict),
                                                'recall_score': recall_score(y_test, predict),
                                                'precision_score': precision_score(y_test, predict)
                                        }

## Avaliação dos modelos

Avaliamos o modelo Random Forest Classifier

In [9]:
scores_cfr = pd.DataFrame(scores['RandomForestClassifier'])
scores_cfr

Unnamed: 0,all,ang_0_dist_1,ang_45_dist_1,ang_90_dist_1,ang_135_dist_1,ang_0_dist_5,ang_45_dist_5,ang_90_dist_5,ang_135_dist_5
accuracy_score,0.642377,0.636771,0.625561,0.640135,0.633408,0.616592,0.616592,0.615471,0.605381
recall_score,0.32,0.330667,0.306667,0.322667,0.330667,0.301333,0.317333,0.293333,0.285333
precision_score,0.652174,0.629442,0.608466,0.643617,0.62,0.585492,0.580488,0.585106,0.560209


Avaliamos o modelo Kneighbor Classifier

In [10]:
scores_cKNN = pd.DataFrame(scores['KNeighborsClassifier'])
scores_cKNN

Unnamed: 0,all,ang_0_dist_1,ang_45_dist_1,ang_90_dist_1,ang_135_dist_1,ang_0_dist_5,ang_45_dist_5,ang_90_dist_5,ang_135_dist_5
accuracy_score,0.588565,0.589686,0.58296,0.595291,0.579596,0.602018,0.567265,0.57287,0.576233
recall_score,0.237333,0.290667,0.256,0.258667,0.234667,0.264,0.221333,0.224,0.242667
precision_score,0.523529,0.521531,0.507937,0.538889,0.5,0.55618,0.468927,0.482759,0.491892
