## Pré-processamento de dados - Breast Cancer dataset

* https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the dataset; the relative path is resolved against the kernel's
# current working directory.
df = pd.read_csv(filepath_or_buffer='breast_cancer_missing.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'breast_cancer_missing.csv'

In [None]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
df.info()

## Limpeza dos Dados

### Dados Faltantes

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Number of rows whose diagnosis label is missing.
df['diagnosis'].isna().sum()

In [None]:
# Discard, in place, every row whose diagnosis label is missing.
df.dropna(axis='index', subset=['diagnosis'], inplace=True)

In [None]:
# Re-count missing diagnosis labels after the rows were dropped.
df['diagnosis'].isna().sum()

In [None]:
# Build a boolean mask (one entry per column) flagging columns that still
# contain at least one NaN, then keep the flagged column names as a list.
has_nan = df.isna().any(axis=0)
nan_columns = has_nan[has_nan].index.tolist()

In [None]:
# Replace every NaN in each affected column with that column's mean.
# The filled column is assigned back to the frame instead of calling
# fillna(inplace=True) on the df[col] selection: the in-place form operates on
# an intermediate object, is deprecated in pandas 2.x and leaves df unchanged
# when Copy-on-Write is enabled.
for col in nan_columns:
  media = df[col].mean()
  print(media)
  df[col] = df[col].fillna(media)

In [None]:
# Missing values per column after the mean imputation.
df.isna().sum()

In [None]:
# Remove the sample_id column from the frame (in place).
df.drop(columns=['sample_id'], inplace=True)

In [None]:
# Encode the categorical diagnosis labels as integers.
from sklearn.preprocessing import LabelEncoder

labelencoder = LabelEncoder()
diagnosis_codes = labelencoder.fit_transform(df['diagnosis'])
df['diagnosis'] = diagnosis_codes

In [None]:
# Count how many samples fall into each encoded diagnosis class.
df['diagnosis'].value_counts()

## Normalização de atributos numéricos

In [None]:
# Columns to standardise: every column except the target.
# Selecting by name instead of the positional slice [0:30] keeps the encoded
# `diagnosis` label out of the scaler and avoids silently skipping a feature
# when the column order of the CSV differs from what the slice assumed.
scaler_cols = df.columns.drop('diagnosis')
print(scaler_cols)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the selected columns (zero mean, unit variance).
# NOTE(review): the scaler is fitted on the whole frame, before the train/test
# split performed in a later cell, so test rows influence the fitted statistics.
scaler = StandardScaler()
ajuste = scaler.fit(df[scaler_cols])  # fit() returns the scaler itself
df[scaler_cols] = scaler.transform(df[scaler_cols])

In [None]:
# Preview the first five rows after scaling.
df.head(n=5)

In [None]:
# Re-inspect non-null counts and dtypes after the cleaning, encoding and scaling cells.
df.info()

#### Heatmap com as correlações entre os atributos

In [None]:
import seaborn as sns

# Heatmap of the pairwise correlations between all columns, drawn on an
# explicitly created Axes instead of the implicit pyplot state.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(df.corr(), cmap='Blues', ax=ax)
ax.set_title("Correlações entre os atributos", size=20)
plt.show()

## Pipeline Sklearn
* Permite a criação de diferentes combinações de técnicas
* https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

### Separação entre treino e teste

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectKBest

In [None]:
 !pip install feature-engine

In [None]:
from feature_engine.selection import DropCorrelatedFeatures, SmartCorrelatedSelection

In [None]:
# Feature matrix (DataFrame) and target vector (Series).
X = df.drop(columns=['diagnosis'])
y = df.loc[:, 'diagnosis']

In [None]:
# 70/30 hold-out split.
# random_state makes the split (and every metric computed below) repeatable;
# stratify=y keeps the class proportions of the diagnosis equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

### Configuração das técnicas que serão utilizadas

In [None]:
# Baseline model trained on the complete feature set.
# NOTE: the variable keeps the name `forest` because later cells reference it,
# but the estimator is a single DecisionTreeClassifier, not a random forest.
from sklearn.tree import DecisionTreeClassifier

# A fixed seed makes the tree (and therefore its predictions) repeatable
# across kernel restarts.
forest = DecisionTreeClassifier(random_state=42)
forest.fit(X_train, y_train)
forest_pred = forest.predict(X_test)

In [None]:
# Configuration of the feature-selection techniques used by the pipelines.
from functools import partial

# Univariate filters that keep the 15 best-scoring features.
# mutual_info_classif is randomised, so its random_state is bound here to make
# the selected feature set repeatable between runs.
mi = SelectKBest(partial(mutual_info_classif, random_state=42), k=15)

anova = SelectKBest(f_classif, k=15)

# Correlation-based filters: both use Pearson correlation with a 0.8
# threshold; smartC chooses which feature of a correlated group to keep by
# variance.
dropC = DropCorrelatedFeatures(
    threshold=0.8,
    method='pearson'
)

smartC = SmartCorrelatedSelection(
    method="pearson",
    threshold=0.8,
    selection_method="variance",
    estimator=None
)

In [None]:
# Build, fit and evaluate one pipeline per feature-selection technique.
from sklearn.base import clone
from sklearn.neighbors import KNeighborsClassifier

# The KNN pipeline below needs an estimator; none is defined by the earlier
# cells, so it is created here.
knn = KNeighborsClassifier()

# Every pipeline receives its own clone of the selector and of the classifier.
# Pipeline.fit fits its steps in place, so sharing one instance between
# pipelines would let a later fit overwrite the fitted state of an earlier one
# (and make the selected-feature names reported afterwards unreliable).

# >>> Tree-based pipelines (classifier configured in `forest`)
# Mutual Information
forest1 = Pipeline([('mi', clone(mi)),
                    ('rf', clone(forest))])
forest1.fit(X_train, y_train)
forest1_pred = forest1.predict(X_test)

# Drop Correlated
forest2 = Pipeline([('dropC', clone(dropC)),
                    ('rf', clone(forest))])
forest2.fit(X_train, y_train)
forest2_pred = forest2.predict(X_test)

# Smart Correlated
forest3 = Pipeline([('smartC', clone(smartC)),
                    ('rf', clone(forest))])
forest3.fit(X_train, y_train)
forest3_pred = forest3.predict(X_test)

# Anova
forest4 = Pipeline([('Anova', clone(anova)),
                    ('rf', clone(forest))])
forest4.fit(X_train, y_train)
forest4_pred = forest4.predict(X_test)

# >>> KNN pipeline
# Mutual Information
knn1 = Pipeline([('mi', clone(mi)),
                 ('knn', clone(knn))])
knn1.fit(X_train, y_train)
knn1_pred = knn1.predict(X_test)


### Métricas de avaliação

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import (recall_score,
                             accuracy_score,
                             precision_score,
                             f1_score)

In [None]:
# Confusion matrix of the baseline model trained on every feature.
# The title identifies the plot; the trailing ';' hides the object repr.
cm = confusion_matrix(y_test, forest_pred)
ConfusionMatrixDisplay(cm).plot().ax_.set_title("Matriz de confusão - Completo");

In [None]:
# Confusion matrix of the Mutual Information pipeline.
# The title identifies the plot; the trailing ';' hides the object repr.
cm = confusion_matrix(y_test, forest1_pred)
ConfusionMatrixDisplay(cm).plot().ax_.set_title("Matriz de confusão - Mutual Information");

In [None]:
# Confusion matrix of the DropCorrelatedFeatures pipeline.
# The title identifies the plot; the trailing ';' hides the object repr.
cm = confusion_matrix(y_test, forest2_pred)
ConfusionMatrixDisplay(cm).plot().ax_.set_title("Matriz de confusão - Drop Correlated");

In [None]:
# Confusion matrix of the SmartCorrelatedSelection pipeline.
# The title identifies the plot; the trailing ';' hides the object repr.
cm = confusion_matrix(y_test, forest3_pred)
ConfusionMatrixDisplay(cm).plot().ax_.set_title("Matriz de confusão - Smart Correlated");

In [None]:
# Confusion matrix of the ANOVA pipeline.
# The title identifies the plot; the trailing ';' hides the object repr.
cm = confusion_matrix(y_test, forest4_pred)
ConfusionMatrixDisplay(cm).plot().ax_.set_title("Matriz de confusão - Anova");

In [None]:
# Print accuracy, recall, precision and F1 for the baseline model and for each
# feature-selection pipeline, grouped by metric.

# Label shown in the report -> predictions on the test set.
predictions = {
    "Completo": forest_pred,
    "MI": forest1_pred,
    "dropC": forest2_pred,
    "smartC": forest3_pred,
    "Anova": forest4_pred,
}

# (report name, scoring function, number of decimals printed)
metrics = [
    ("ACC", accuracy_score, 3),
    ("Recall", recall_score, 2),
    ("Precision", precision_score, 2),
    ("F1-score", f1_score, 2),
]

# The header names the estimator that is actually configured in `forest`
# instead of a hard-coded model name.
print(f"Pipeline com {type(forest).__name__}")
for metric_name, metric_fn, digits in metrics:
    for label, pred in predictions.items():
        score = metric_fn(y_test, pred)
        print(f"{metric_name} ({label}): {score:.{digits}f}")


#### Similaridade das Features

In [None]:
# Names of the features kept by the selection step of each pipeline
# (the pipeline without its final classifier step).
(featuresMutualInfromation,
 featuresBruteForce,
 featuresSmartCorr,
 featuresAnova) = (
    pipe[:-1].get_feature_names_out()
    for pipe in (forest1, forest2, forest3, forest4)
)

In [None]:
# Print the selected features in the order:
# Mutual Information, Brute Force, Smart Correlation, Anova.
for selected in (featuresMutualInfromation,
                 featuresBruteForce,
                 featuresSmartCorr,
                 featuresAnova):
    print(selected)

#### Diagrama de Venn (Até 4 conjuntos)
* Biblioteca venny4py
* https://pypi.org/project/venny4py/

In [None]:
# !pip install milkviz
!pip install venny4py

In [None]:
# Import only the plotting function this notebook calls; a wildcard import
# would add an unknown set of names to the notebook namespace.
from venny4py.venny4py import venny4py

In [None]:
# Dict of sets: diagram label -> set of feature names chosen by that technique.
labels = ('Mutual_Information', 'Brute_Force', 'featuresSmartCorr', 'featuresAnova')
selections = (featuresMutualInfromation, featuresBruteForce, featuresSmartCorr, featuresAnova)
sets = {label: set(names) for label, names in zip(labels, selections)}

venny4py(sets=sets)

#### Features similares entre as 4 abordagens

In [None]:
# Features selected by all four techniques.
set(featuresMutualInfromation) & set(featuresBruteForce) & set(featuresSmartCorr) & set(featuresAnova)