# Detecção de câncer de mama do banco de imagens Winsconsin

Objetivo - Prever se o câncer é benigno ou maligno.

Conjunto de dados: https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data

## Bibliotecas

In [123]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold, cross_val_score, train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.under_sampling import RandomUnderSampler

## Carregamento dos dados

In [124]:
df = pd.read_csv('../data/data_cancer2.csv', index_col=0).reset_index(drop=True)
df.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


### Analisar os dados

In [125]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   diagnosis                569 non-null    object 
 1   radius_mean              569 non-null    float64
 2   texture_mean             569 non-null    float64
 3   perimeter_mean           569 non-null    float64
 4   area_mean                569 non-null    float64
 5   smoothness_mean          569 non-null    float64
 6   compactness_mean         569 non-null    float64
 7   concavity_mean           569 non-null    float64
 8   concave points_mean      569 non-null    float64
 9   symmetry_mean            569 non-null    float64
 10  fractal_dimension_mean   569 non-null    float64
 11  radius_se                569 non-null    float64
 12  texture_se               569 non-null    float64
 13  perimeter_se             569 non-null    float64
 14  area_se                  5

In [126]:
df.describe()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,0.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946,
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061,
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504,
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146,
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004,
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208,
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075,


## Pré processamento

### Trata valores faltantes

In [127]:
df.isna().sum()

diagnosis                    0
radius_mean                  0
texture_mean                 0
perimeter_mean               0
area_mean                    0
smoothness_mean              0
compactness_mean             0
concavity_mean               0
concave points_mean          0
symmetry_mean                0
fractal_dimension_mean       0
radius_se                    0
texture_se                   0
perimeter_se                 0
area_se                      0
smoothness_se                0
compactness_se               0
concavity_se                 0
concave points_se            0
symmetry_se                  0
fractal_dimension_se         0
radius_worst                 0
texture_worst                0
perimeter_worst              0
area_worst                   0
smoothness_worst             0
compactness_worst            0
concavity_worst              0
concave points_worst         0
symmetry_worst               0
fractal_dimension_worst      0
Unnamed: 32                569
dtype: i

In [128]:
df.drop(columns=['Unnamed: 32'], inplace=True)

Eliminamos a coluna Unnamed: 32 devido a todos os dados dela serem nulos

### Separar em Features e Labels

In [129]:
X, y = df.drop(columns=['diagnosis']), df['diagnosis']

### Trata dados categóricos

In [130]:
y = y.map({'M': 1, 'B': 0})

Transformamos os dados Malignos para 1 e os Benignos para 0.

### Dividir dados em treino e teste

In [131]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train.shape, X_test.shape

((398, 30), (171, 30))

Utilizamos 70% dos dados para treino e 30% para teste.

### Balancear os dados

In [132]:
y_train.value_counts(normalize=True)

diagnosis
0    0.625628
1    0.374372
Name: proportion, dtype: float64

In [133]:
rus = RandomUnderSampler(random_state=42)
X_train_res, y_train_res = rus.fit_resample(X_train, y_train)

rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(X, y)

print("Shape X_train before:", X_train.shape)
print("Shape X_train_res after:", X_train_res.shape)

y_train_res.value_counts(normalize=True)

Shape X_train before: (398, 30)
Shape X_train_res after: (298, 30)


diagnosis
0    0.5
1    0.5
Name: proportion, dtype: float64

### Tratar escala dos dados

In [134]:
standart_scaler = StandardScaler()
X_train_padr = standart_scaler.fit_transform(X_train_res)
X_test_padr = standart_scaler.transform(X_test)

standart_scaler = StandardScaler()
X_padr = standart_scaler.fit_transform(X_res)

### Redução de dimensionalidade

In [135]:
def print_pca_variance(pca, title="PCA"):
    print(title)
    print("Explained variance ratio:", pca.explained_variance_ratio_[:5])
    print("Sum of explained variance ratio:", sum(pca.explained_variance_ratio_), "\n")

#### Dados completo

In [136]:
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_res)

pca_padr = PCA(n_components=14)
X_padr_pca = pca_padr.fit_transform(X_padr)

print_pca_variance(pca)
print_pca_variance(pca_padr, "PCA Padr")

PCA
Explained variance ratio: [0.98019972 0.01788607]
Sum of explained variance ratio: 0.9980857994757741 

PCA Padr
Explained variance ratio: [0.43977449 0.19343507 0.09718649 0.06666227 0.04713911]
Sum of explained variance ratio: 0.9838459577428579 



#### Dados de treino/teste

In [137]:
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_res)
X_test_pca = pca.transform(X_test)

pca_padr = PCA(n_components=14)
X_train_padr_pca = pca_padr.fit_transform(X_train_padr)
X_test_padr_pca = pca_padr.transform(X_test_padr)

print_pca_variance(pca)
print_pca_variance(pca_padr, "PCA Padr")

PCA
Explained variance ratio: [0.97962995 0.01809356]
Sum of explained variance ratio: 0.9977235163518947 

PCA Padr
Explained variance ratio: [0.43139626 0.20100885 0.10289088 0.06510546 0.04685948]
Sum of explained variance ratio: 0.9844901344047925 



### Resumo das variáveis

1. Dados de Treino
- X_train: dados originais
- X_train_res: dados balanceados
- X_train_padr: dados balanceados e padronizados
- X_train_pca: dados balanceados com redução de dimensionalidade
- X_train_padr_pca: dados balanceados, padronizados com redução de dimensionalidade

2. Dados de teste
- X_test: dados originais
- X_test_padr: dados balanceados e padronizados
- X_test_pca: dados balanceados com redução de dimensionalidade
- X_test_padr_pca: dados balanceados, padronizados com redução de dimensionalidade

3. Dados completos
- X: dados originais
- X_res: dados balanceados
- X_padr: dados balanceados e padronizados
- X_pca: dados balanceados com redução de dimensionalidade
- X_padr_pca: dados balanceados, padronizados com redução de dimensionalidade

## Treinamento dos modelos

### Naive Bayers

#### Selecionar melhor processamento

In [138]:
X_train_2, X_valid, y_train_2, y_valid = train_test_split(X_train_padr, y_train_res,
                                                          test_size=0.2, random_state=42)

naive_bayes = GaussianNB()
naive_bayes.fit(X_train_2, y_train_2)

train_predict = naive_bayes.predict(X_train_2)
valid_predict = naive_bayes.predict(X_valid)

print("Naive Bayes - Train/Valid")
print("Train Accuracy: ", accuracy_score(y_train_2, train_predict) * 100)
print("Test Accuracy: ", accuracy_score(y_valid, valid_predict) * 100, "\n")

naive_bayes = GaussianNB()
k_fold = KFold(n_splits=10, shuffle=True, random_state=42)
cv_result = cross_val_score(naive_bayes, X_train_padr, y_train_res, cv=k_fold, scoring='accuracy')
print("Naive Bayes - Cross Validation (Train)")
print("Mean: ", cv_result.mean() * 100)

Naive Bayes - Train/Valid
Train Accuracy:  92.43697478991596
Test Accuracy:  91.66666666666666 

Naive Bayes - Cross Validation (Train)
Mean:  91.95402298850574


#### Avaliação Treino/Teste

In [139]:
naive_bayes = GaussianNB()
naive_bayes.fit(X_train_padr, y_train_res)

train_predict = naive_bayes.predict(X_train_padr)
test_predict = naive_bayes.predict(X_test_padr)

print("Naive Bayes - Train/Test")
print("Train Accuracy: ", accuracy_score(y_train_res, train_predict) * 100)
print("Test Accuracy: ", accuracy_score(y_test, test_predict) * 100, "\n")

print(classification_report(y_test, test_predict))

Naive Bayes - Train/Test
Train Accuracy:  92.61744966442953
Test Accuracy:  92.98245614035088 

              precision    recall  f1-score   support

           0       0.94      0.95      0.94       108
           1       0.92      0.89      0.90        63

    accuracy                           0.93       171
   macro avg       0.93      0.92      0.92       171
weighted avg       0.93      0.93      0.93       171



In [140]:
confusion_matrix(y_test, test_predict)

array([[103,   5],
       [  7,  56]])

#### Avaliação Cross Validation

In [141]:
naive_bayes = GaussianNB()
k_fold = KFold(n_splits=10, shuffle=True, random_state=42)
cv_result = cross_val_score(naive_bayes, X_padr, y_res, cv=k_fold, scoring='accuracy')
print("Naive Bayes - Cross Validation (Full)")
print("Mean: ", cv_result.mean() * 100)

Naive Bayes - Cross Validation (Full)
Mean:  91.74418604651164


### SVM

#### Selecionar melhor processamento

In [142]:
X_datas = [X_train_res, X_train_padr, X_train_pca, X_train_padr_pca]
names = ["Original", "Padr", "PCA", "PCA Padr"]

for name, X_data in zip(names, X_datas):
    X_train_2, X_valid, y_train_2, y_valid = train_test_split(X_data, y_train_res,
                                                          test_size=0.2, random_state=42)
    print(f"Data: {name}")
    naive_bayes = GaussianNB()
    naive_bayes.fit(X_train_2, y_train_2)

    train_predict = naive_bayes.predict(X_train_2)
    valid_predict = naive_bayes.predict(X_valid)

    print("Train Accuracy: ", accuracy_score(y_train_2, train_predict) * 100)
    print("valid Accuracy: ", accuracy_score(y_valid, valid_predict) * 100, "\n")

Data: Original
Train Accuracy:  92.85714285714286
valid Accuracy:  91.66666666666666 

Data: Padr
Train Accuracy:  92.43697478991596
valid Accuracy:  91.66666666666666 

Data: PCA
Train Accuracy:  86.1344537815126
valid Accuracy:  88.33333333333333 

Data: PCA Padr
Train Accuracy:  88.65546218487394
valid Accuracy:  91.66666666666666 



#### Avaliar dados treino/teste

In [143]:
svm = SVC(C=100, kernel='rbf')
svm.fit(X_train_res, y_train_res)

train_predict = svm.predict(X_train_res)
test_predict = svm.predict(X_test)

print("SVM - Train/Test")
print("Train Accuracy: ", accuracy_score(y_train_res, train_predict) * 100)
print("Test Accuracy: ", accuracy_score(y_test, test_predict) * 100, "\n")

print(classification_report(y_test, test_predict))

SVM - Train/Test
Train Accuracy:  90.60402684563759
Test Accuracy:  98.24561403508771 

              precision    recall  f1-score   support

           0       0.98      0.99      0.99       108
           1       0.98      0.97      0.98        63

    accuracy                           0.98       171
   macro avg       0.98      0.98      0.98       171
weighted avg       0.98      0.98      0.98       171



In [144]:
confusion_matrix(y_test, test_predict)

array([[107,   1],
       [  2,  61]])

#### Avaliação Cross Validation

In [145]:
svm = SVC(C=100, kernel='rbf')
k_fold = KFold(n_splits=10, shuffle=True, random_state=42)
cv_result = cross_val_score(svm, X_res, y_res, cv=k_fold, scoring='accuracy')
print("SVM - Cross Validation (Full)")
print("Mean: ", cv_result.mean() * 100)

SVM - Cross Validation (Full)
Mean:  92.44186046511628


### Logistic Regression

#### Selecionar melhor processamento

In [146]:
X_datas = [X_train_res, X_train_padr, X_train_pca, X_train_padr_pca]
names = ["Original", "Padr", "PCA", "PCA Padr"]

for name, X_data in zip(names, X_datas):
    X_train_2, X_valid, y_train_2, y_valid = train_test_split(X_data, y_train_res,
                                                          test_size=0.2, random_state=42)
    print(f"Data: {name}")
    logistic_regression = LogisticRegression(max_iter=10000)
    logistic_regression.fit(X_train_2, y_train_2)

    train_predict = logistic_regression.predict(X_train_2)
    valid_predict = logistic_regression.predict(X_valid)

    print("Train Accuracy: ", accuracy_score(y_train_2, train_predict) * 100)
    print("valid Accuracy: ", accuracy_score(y_valid, valid_predict) * 100, "\n")

Data: Original
Train Accuracy:  95.7983193277311
valid Accuracy:  90.0 

Data: Padr
Train Accuracy:  98.73949579831933
valid Accuracy:  93.33333333333333 

Data: PCA
Train Accuracy:  90.33613445378151
valid Accuracy:  90.0 

Data: PCA Padr
Train Accuracy:  98.31932773109243
valid Accuracy:  95.0 



#### Selecionar melhores Hyper parâmetros

In [147]:
lr_params = {'C': [0.1, 1, 10, 100, 1000],
              'penalty': ['l2'],
              'solver': ['newton-cg', 'lbfgs', 'liblinear', 'saga'],
              'max_iter': [10000]}

grid_search = GridSearchCV(LogisticRegression(), lr_params, cv=5, scoring='accuracy')
grid_search.fit(X_train_padr_pca, y_train_res)

print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_ * 100)

Best parameters:  {'C': 1, 'max_iter': 10000, 'penalty': 'l2', 'solver': 'liblinear'}
Best score:  97.30508474576271


#### Avaliação Treino/Teste

In [148]:
logistic_regression = LogisticRegression(**grid_search.best_params_)
logistic_regression.fit(X_train_padr_pca, y_train_res)

train_predict = logistic_regression.predict(X_train_padr_pca)
test_predict = logistic_regression.predict(X_test_padr_pca)

print("Logistic Regression - Train/Test")
print("Train Accuracy: ", accuracy_score(y_train_res, train_predict) * 100)
print("Test Accuracy: ", accuracy_score(y_test, test_predict) * 100, "\n")

print(classification_report(y_test, test_predict))

Logistic Regression - Train/Test
Train Accuracy:  97.98657718120806
Test Accuracy:  98.83040935672514 

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       108
           1       0.98      0.98      0.98        63

    accuracy                           0.99       171
   macro avg       0.99      0.99      0.99       171
weighted avg       0.99      0.99      0.99       171



In [149]:
confusion_matrix(y_test, test_predict)

array([[107,   1],
       [  1,  62]])

#### Avaliação Cross validation

In [150]:
logistic_regression = LogisticRegression(**grid_search.best_params_)
k_fold = KFold(n_splits=10, shuffle=True, random_state=42)
cv_result = cross_val_score(logistic_regression, X_padr_pca, y_res, cv=k_fold, scoring='accuracy')
print("Logistic Regression - Cross Validation (Full)")
print("Mean: ", cv_result.mean() * 100)

Logistic Regression - Cross Validation (Full)
Mean:  96.22923588039868


### KNN

#### Selecionar melhor processamento

In [151]:
X_datas = [X_train_res, X_train_padr, X_train_pca, X_train_padr_pca]
names = ["Original", "Padr", "PCA", "PCA Padr"]

for name, X_data in zip(names, X_datas):
    X_train_2, X_valid, y_train_2, y_valid = train_test_split(X_data, y_train_res,
                                                          test_size=0.2, random_state=42)
    print(f"Data: {name}")
    knn = KNeighborsClassifier()
    knn.fit(X_train_2, y_train_2)

    train_predict = knn.predict(X_train_2)
    valid_predict = knn.predict(X_valid)

    print("Train Accuracy: ", accuracy_score(y_train_2, train_predict) * 100)
    print("valid Accuracy: ", accuracy_score(y_valid, valid_predict) * 100, "\n")

Data: Original
Train Accuracy:  91.59663865546219
valid Accuracy:  88.33333333333333 

Data: Padr
Train Accuracy:  97.89915966386555
valid Accuracy:  95.0 

Data: PCA
Train Accuracy:  91.59663865546219
valid Accuracy:  91.66666666666666 

Data: PCA Padr
Train Accuracy:  97.89915966386555
valid Accuracy:  93.33333333333333 



#### Selecionar melhores hyper parâmetros

In [152]:
knn_params = {
    'n_neighbors': [3, 5, 7, 9, 11, 15, 19],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

grid_search = GridSearchCV(KNeighborsClassifier(), knn_params, cv=5, scoring='accuracy')
grid_search.fit(X_train_padr, y_train_res)

print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_ * 100)

Best parameters:  {'metric': 'euclidean', 'n_neighbors': 15, 'weights': 'distance'}
Best score:  94.954802259887


#### Avaliação Treino/Teste

In [153]:
knn = KNeighborsClassifier(**grid_search.best_params_)
knn.fit(X_train_padr, y_train_res)

train_predict = knn.predict(X_train_padr)
test_predict = knn.predict(X_test_padr)

print("KNN - Train/Test")
print("Train Accuracy: ", accuracy_score(y_train_res, train_predict) * 100)
print("Test Accuracy: ", accuracy_score(y_test, test_predict) * 100, "\n")

print(classification_report(y_test, test_predict))

KNN - Train/Test
Train Accuracy:  100.0
Test Accuracy:  95.90643274853801 

              precision    recall  f1-score   support

           0       0.96      0.97      0.97       108
           1       0.95      0.94      0.94        63

    accuracy                           0.96       171
   macro avg       0.96      0.95      0.96       171
weighted avg       0.96      0.96      0.96       171



In [154]:
confusion_matrix(y_test, test_predict)

array([[105,   3],
       [  4,  59]])

#### Avaliação Cross Validation

In [155]:
knn = KNeighborsClassifier(**grid_search.best_params_)
k_fold = KFold(n_splits=10, shuffle=True, random_state=42)
cv_result = cross_val_score(knn, X_padr_pca, y_res, cv=k_fold, scoring='accuracy')
print("KNN - Cross Validation (Full)")
print("Mean: ", cv_result.mean() * 100)

KNN - Cross Validation (Full)
Mean:  95.04429678848282


### Random Forest

#### Selecionar melhor processamento

In [None]:
X_datas = [X_train_res, X_train_padr, X_train_pca, X_train_padr_pca]
names = ["Original", "Padr", "PCA", "PCA Padr"]

for name, X_data in zip(names, X_datas):
    X_train_2, X_valid, y_train_2, y_valid = train_test_split(X_data, y_train_res,
                                                          test_size=0.2, random_state=42)
    print(f"Data: {name}")
    knn = KNeighborsClassifier()
    knn.fit(X_train_2, y_train_2)

    train_predict = knn.predict(X_train_2)
    valid_predict = knn.predict(X_valid)

    print("Train Accuracy: ", accuracy_score(y_train_2, train_predict) * 100)
    print("valid Accuracy: ", accuracy_score(y_valid, valid_predict) * 100, "\n")