## Pré-processamento de dados  - Breast Cancer dataset

* https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data

In [67]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

In [68]:
# Load the breast-cancer CSV variant that contains missing values.
df = pd.read_csv('breast_cancer_missing.csv')

In [69]:
# Show each column's dtype and non-null count to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   sample_id                569 non-null    int64  
 1   mean_radius              527 non-null    float64
 2   mean_texture             521 non-null    float64
 3   mean_perimeter           508 non-null    float64
 4   mean_area                502 non-null    float64
 5   mean_smoothness          523 non-null    float64
 6   mean_compactness         510 non-null    float64
 7   mean_concavity           502 non-null    float64
 8   mean_concave_points      514 non-null    float64
 9   mean_symmetry            519 non-null    float64
 10  mean_fractal_dimension   513 non-null    float64
 11  radius_error             511 non-null    float64
 12  texture_error            513 non-null    float64
 13  perimeter_error          505 non-null    float64
 14  area_error               5

## 1) Agrupamento e Amostragem

In [70]:
# Simple random sampling: draw 4 random rows from the whole frame.
df.sample(n=4)


Unnamed: 0,sample_id,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,...,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension,diagnosis
71,859711,8.888,14.64,58.79,244.0,0.09783,0.1531,0.08606,0.02872,0.1902,...,15.67,62.56,,0.1207,0.2436,0.1434,0.04786,0.2254,,benign
361,901041,13.3,21.57,85.24,546.1,0.08582,0.06373,,,0.1815,...,,92.94,621.2,0.114,0.1667,0.1212,0.05614,0.2637,0.06658,benign
148,86973702,14.44,15.18,93.97,640.1,,,0.08487,0.05532,0.1724,...,19.85,108.6,766.9,0.1316,0.2735,0.3103,0.1599,0.2691,0.07683,benign
449,911157302,21.1,20.52,,1384.0,0.09684,0.1175,,0.1155,0.1554,...,,168.2,2022.0,0.1368,0.3101,0.4399,0.228,0.2268,0.07425,malignant


In [71]:
# Balanced sample: draw 4 random rows from each diagnosis class.
# GroupBy.sample is used here in place of the older
# groupby(...).apply(lambda x: x.sample(4)) formulation.
df.groupby(by='diagnosis', group_keys=False).sample(n=4)

Unnamed: 0,sample_id,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,...,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension,diagnosis
325,89511502,12.67,17.3,,489.9,0.1028,0.07664,0.03193,0.02107,0.1707,...,21.1,88.7,574.4,0.1384,0.1212,0.102,0.05602,0.2688,0.06888,benign
359,901034301,9.436,18.32,59.82,278.6,0.1009,0.05956,0.0271,0.01406,0.1506,...,25.02,75.79,439.6,0.1333,0.1049,0.1144,,0.2454,,benign
401,904647,11.93,10.91,76.14,442.7,0.08872,0.05242,0.02606,0.01796,0.1601,...,20.14,87.64,589.5,0.1374,0.1575,0.1514,0.06876,0.246,0.07262,benign
473,9113846,,29.97,77.42,465.4,0.07699,0.03398,0.0,0.0,0.1701,...,38.05,85.08,558.9,0.09422,0.05213,0.0,0.0,0.2409,0.06743,benign
135,868202,12.77,22.47,81.72,506.3,0.09055,0.05761,0.04711,0.02704,0.1585,...,33.37,92.04,653.6,0.1419,0.1523,,0.09331,0.2829,0.08067,malignant
164,8712289,23.27,22.04,152.1,1686.0,0.08439,0.1145,0.1324,0.09702,0.1801,...,28.22,184.2,2403.0,0.1228,0.3583,0.3948,0.2346,0.3589,0.09187,malignant
134,867739,18.45,21.91,120.2,1075.0,0.0943,0.09709,0.1153,0.06847,0.1692,...,31.39,145.6,1590.0,0.1465,0.2275,0.3965,0.1379,0.3109,0.0761,malignant
213,881094802,,25.56,114.5,948.0,0.1006,0.1146,,,0.1308,...,28.07,120.4,1021.0,0.1243,0.1793,0.2803,0.1099,0.1603,0.06818,malignant


In [72]:
# Number of rows per diagnosis class (rows with a missing label are not counted).
print(df.loc[:, 'diagnosis'].value_counts())

benign       316
malignant    189
Name: diagnosis, dtype: int64


In [73]:
# Share of each diagnosis class as a percentage of ALL rows in df.
# Rows with a missing diagnosis are in the denominator but have no line of
# their own, so the printed percentages need not add up to 100.
print(df['diagnosis'].value_counts().div(len(df)).mul(100))

benign       55.536028
malignant    33.216169
Name: diagnosis, dtype: float64


In [74]:
# Proportional (stratified) sampling: draw 60% of the rows of each diagnosis
# class, then count how many sampled rows fall in each class.
(
    df.groupby(by='diagnosis', group_keys=False)['diagnosis']
    .sample(frac=0.6)
    .value_counts()
)

benign       190
malignant    113
Name: diagnosis, dtype: int64

In [75]:
# Mean and standard deviation of the two area features for each diagnosis class.
(
    df.groupby(by='diagnosis', as_index=False)[['mean_area', 'worst_area']]
    .agg(['mean', 'std'])
)

Unnamed: 0_level_0,mean_area,mean_area,worst_area,worst_area
Unnamed: 0_level_1,mean,std,mean,std
diagnosis,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
benign,464.033684,134.70506,557.475524,162.800566
malignant,990.943558,367.001705,1440.750303,595.849372


## 2) Limpeza dos Dados

### Dados Faltantes

Missing Data ou Missing Values são valores ausentes, representados como NaN, que significa "Not a Number".

#### A sintaxe Python usada:

- Para selecionar NaN: pd.isnull()
- Para substituir valores NaN: df.fillna()
- Para substituir NaN pela média: df.fillna(df.mean(), inplace=True)
- Para substituir valores presentes por outros valores: df.replace("valor", "novo_valor")
- Para remover os registros NaN: df.dropna(inplace=True)

In [76]:
# Count the missing values in every column.
df.isna().sum()

sample_id                   0
mean_radius                42
mean_texture               48
mean_perimeter             61
mean_area                  67
mean_smoothness            46
mean_compactness           59
mean_concavity             67
mean_concave_points        55
mean_symmetry              50
mean_fractal_dimension     56
radius_error               58
texture_error              56
perimeter_error            64
area_error                 60
smoothness_error           54
compactness_error          60
concavity_error            59
concave_points_error       45
symmetry_error             58
fractal_dimension_error    52
worst_radius               47
worst_texture              58
worst_perimeter            66
worst_area                 60
worst_smoothness           63
worst_compactness          53
worst_concavity            50
worst_concave_points       52
worst_symmetry             59
worst_fractal_dimension    75
diagnosis                  64
dtype: int64

In [77]:
# Count the rows whose diagnosis label is missing.
df['diagnosis'].isna().sum()

64

In [78]:
# Drop, in place, the rows whose diagnosis label is missing.
df.dropna(axis=0, subset=['diagnosis'], inplace=True)

In [79]:
# Check how many rows still have a missing diagnosis label.
df['diagnosis'].isna().sum()

0

In [80]:
# Collect, as a plain Python list, the names of the columns that
# contain at least one NaN.
nan_columns = df.loc[:, df.isna().any()].columns.tolist()

In [81]:
# Walk over every column that has missing values, print its mean and
# replace its NaNs with that mean.
# The filled column is assigned back to df[col]: calling
# df[col].fillna(..., inplace=True) acts on a column selection (chained
# assignment), which pandas warns about and which leaves df untouched
# when Copy-on-Write is enabled.
for col in nan_columns:
  media = df[col].mean()
  print(media)
  df[col] = df[col].fillna(media)

14.199702355460387
19.27627155172414
92.82754966887418
655.7441964285715
0.09656095032397409
0.10429558758314857
0.09061060135135138
0.05012960352422907
0.18145119305856833
0.06304136563876651
0.4074896929824561
1.205546799116998
2.9253336283185836
40.40882926829268
0.007074840611353712
0.026135179600886918
0.032873957743362825
0.012027144708423324
0.020362975770925107
0.003854444857768052
16.408450216450216
25.57175824175824
107.61477876106196
880.6248337028826
0.13211443458980043
0.25396089324618737
0.2738400715835141
0.11536521710526315
0.2903896247240618
0.08413929061784897


In [82]:
# Re-count the missing values per column after the mean imputation.
df.isna().sum()

sample_id                  0
mean_radius                0
mean_texture               0
mean_perimeter             0
mean_area                  0
mean_smoothness            0
mean_compactness           0
mean_concavity             0
mean_concave_points        0
mean_symmetry              0
mean_fractal_dimension     0
radius_error               0
texture_error              0
perimeter_error            0
area_error                 0
smoothness_error           0
compactness_error          0
concavity_error            0
concave_points_error       0
symmetry_error             0
fractal_dimension_error    0
worst_radius               0
worst_texture              0
worst_perimeter            0
worst_area                 0
worst_smoothness           0
worst_compactness          0
worst_concavity            0
worst_concave_points       0
worst_symmetry             0
worst_fractal_dimension    0
diagnosis                  0
dtype: int64

In [83]:
# Display the columns that were imputed.
df.loc[:, nan_columns]

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,mean_fractal_dimension,...,worst_radius,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension
0,17.990000,10.38,122.80,655.744196,0.1184,0.277600,0.300100,0.14710,0.181451,0.078710,...,25.38,17.33,184.60,2019.000000,0.162200,0.6656,0.27384,0.265400,0.46010,0.118900
2,19.690000,21.25,130.00,1203.000000,0.1096,0.104296,0.090611,0.12790,0.206900,0.059990,...,23.57,25.53,152.50,1709.000000,0.144400,0.4245,0.45040,0.243000,0.29039,0.087580
3,11.420000,20.38,77.58,386.100000,0.1425,0.283900,0.090611,0.10520,0.259700,0.097440,...,14.91,26.50,98.87,567.700000,0.209800,0.8663,0.68690,0.257500,0.66380,0.173000
4,20.290000,14.34,135.10,1297.000000,0.1003,0.132800,0.090611,0.10430,0.180900,0.058830,...,22.54,16.67,152.20,1575.000000,0.137400,0.2050,0.40000,0.162500,0.23640,0.076780
5,12.450000,15.70,82.57,477.100000,0.1278,0.170000,0.157800,0.08089,0.208700,0.076130,...,15.47,23.75,103.40,741.600000,0.179100,0.5249,0.53550,0.174100,0.39850,0.124400
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
562,15.220000,30.62,103.40,655.744196,0.1048,0.208700,0.255000,0.09429,0.212800,0.063041,...,17.52,42.79,128.70,915.000000,0.141700,0.7917,1.17000,0.235600,0.40890,0.140900
563,20.920000,25.09,143.00,1347.000000,0.1099,0.223600,0.317400,0.14740,0.214900,0.068790,...,24.29,29.41,179.10,880.624834,0.132114,0.4186,0.65990,0.254200,0.29290,0.098730
564,21.560000,22.39,142.00,1479.000000,0.1110,0.115900,0.243900,0.13890,0.172600,0.056230,...,25.45,26.40,166.10,880.624834,0.141000,0.2113,0.41070,0.221600,0.20600,0.084139
565,20.130000,28.25,131.20,1261.000000,0.0978,0.103400,0.144000,0.09791,0.175200,0.055330,...,23.69,38.25,155.00,1731.000000,0.116600,0.1922,0.32150,0.115365,0.25720,0.066370


In [84]:
# Vectorized form: fill every NaN with the mean of its own column in one call.
# fillna() takes the replacement values themselves, so the per-column means
# are passed as a Series (aligned on column name); passing the string 'mean'
# would write that literal text into the missing cells.
df[nan_columns] = df[nan_columns].fillna(df[nan_columns].mean())

In [85]:
# Re-count the missing values per column.
df.isna().sum()

sample_id                  0
mean_radius                0
mean_texture               0
mean_perimeter             0
mean_area                  0
mean_smoothness            0
mean_compactness           0
mean_concavity             0
mean_concave_points        0
mean_symmetry              0
mean_fractal_dimension     0
radius_error               0
texture_error              0
perimeter_error            0
area_error                 0
smoothness_error           0
compactness_error          0
concavity_error            0
concave_points_error       0
symmetry_error             0
fractal_dimension_error    0
worst_radius               0
worst_texture              0
worst_perimeter            0
worst_area                 0
worst_smoothness           0
worst_compactness          0
worst_concavity            0
worst_concave_points       0
worst_symmetry             0
worst_fractal_dimension    0
diagnosis                  0
dtype: int64

### Dados Faltantes com SimpleImputer
* https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html
* https://medium.com/data-hackers/tratamento-e-transforma%C3%A7%C3%A3o-de-dados-nan-uma-vis%C3%A3o-geral-e-pr%C3%A1tica-54efa9fc7a98

In [86]:
# Reload the CSV so the SimpleImputer example starts again from the
# data that still contains missing values.
df = pd.read_csv('breast_cancer_missing.csv')

In [87]:
# Drop, in place, the rows whose diagnosis label is missing.
df.dropna(axis=0, subset=['diagnosis'], inplace=True)

In [88]:
# Names of the columns that contain at least one NaN, as a plain list.
nan_columns = df.loc[:, df.isna().any()].columns.tolist()

In [89]:
# Replace every NaN in the affected columns with that column's median.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0
# and raises AttributeError there.
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
df[nan_columns] = imputer.fit_transform(df[nan_columns])

In [90]:
# Re-count the missing values per column after the median imputation.
df.isna().sum()

sample_id                  0
mean_radius                0
mean_texture               0
mean_perimeter             0
mean_area                  0
mean_smoothness            0
mean_compactness           0
mean_concavity             0
mean_concave_points        0
mean_symmetry              0
mean_fractal_dimension     0
radius_error               0
texture_error              0
perimeter_error            0
area_error                 0
smoothness_error           0
compactness_error          0
concavity_error            0
concave_points_error       0
symmetry_error             0
fractal_dimension_error    0
worst_radius               0
worst_texture              0
worst_perimeter            0
worst_area                 0
worst_smoothness           0
worst_compactness          0
worst_concavity            0
worst_concave_points       0
worst_symmetry             0
worst_fractal_dimension    0
diagnosis                  0
dtype: int64

## 3) Normalização de atributos numéricos

A normalização por reescala define, através de um valor mínimo e um valor máximo, um novo intervalo onde os valores de um atributo estarão contidos. Tipicamente, tal intervalo é definido como [0, 1]. Portanto, para este caso, a normalização por reescala de um atributo $j$ de um objeto $x_i$ pode ser calculada como:

$$ x_{ij} = \frac{x_{ij} - min_j}{max_j - min_j} $$

sendo $min_j$ e $max_j$, nessa ordem, os valores mínimo e máximo do atributo $j$ para o conjunto de dados considerado.

Na normalização por padronização, os diferentes atributos contínuos poderão abranger diferentes intervalos, mas deverão possuir os mesmos valores para alguma medida de posição e de espalhamento/variação.
Essas medidas irão consistir na média e no desvio-padrão. Neste caso, o valor normalizado de um atributo $j$
em um objeto $i$ é dado por:

$$ x_{ij} = \frac{x_{ij} - \bar{x}_j}{\sigma_j} $$

In [91]:
# Columns to scale: every column except the identifier and the class label.
# Excluding them by name (instead of the positional slice [1:31]) keeps the
# selection correct even if the column order or count changes.
scaler_cols = df.columns.drop(['sample_id', 'diagnosis'])
print(scaler_cols)

Index(['mean_radius', 'mean_texture', 'mean_perimeter', 'mean_area',
       'mean_smoothness', 'mean_compactness', 'mean_concavity',
       'mean_concave_points', 'mean_symmetry', 'mean_fractal_dimension',
       'radius_error', 'texture_error', 'perimeter_error', 'area_error',
       'smoothness_error', 'compactness_error', 'concavity_error',
       'concave_points_error', 'symmetry_error', 'fractal_dimension_error',
       'worst_radius', 'worst_texture', 'worst_perimeter', 'worst_area',
       'worst_smoothness', 'worst_compactness', 'worst_concavity',
       'worst_concave_points', 'worst_symmetry', 'worst_fractal_dimension'],
      dtype='object')


In [92]:
# Preview the first five rows before scaling.
df.head(n=5)

Unnamed: 0,sample_id,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,...,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension,diagnosis
0,842302,17.99,10.38,122.8,546.2,0.1184,0.2776,0.3001,0.1471,0.1794,...,17.33,184.6,2019.0,0.1622,0.6656,0.2299,0.2654,0.4601,0.1189,malignant
2,84300903,19.69,21.25,130.0,1203.0,0.1096,0.09462,0.064905,0.1279,0.2069,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.2812,0.08758,malignant
3,84348301,11.42,20.38,77.58,386.1,0.1425,0.2839,0.064905,0.1052,0.2597,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,malignant
4,84358402,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.064905,0.1043,0.1809,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,malignant
5,843786,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,0.2087,...,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244,malignant


In [93]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Standardize the selected columns.
# fit() learns the per-column statistics and returns the fitted scaler, which
# is then used to transform the same columns and write them back into df.
scaler = StandardScaler()
ajuste = scaler.fit(df[scaler_cols])
df[scaler_cols] = ajuste.transform(df[scaler_cols])

In [94]:
# Preview the first five rows after scaling.
df.head(n=5)

Unnamed: 0,sample_id,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,...,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension,diagnosis
0,842302,1.101047,-2.161918,1.305045,-0.290872,1.626284,3.522546,2.760625,2.584663,-0.071097,...,-1.482352,2.378362,2.132438,1.465195,2.776026,-0.200932,2.391386,2.872607,2.149832,malignant
2,84300903,1.587163,0.491054,1.612411,1.675016,0.972572,-0.174592,-0.293478,2.081046,0.973062,...,-0.005705,1.399688,1.562242,0.599901,1.165081,0.903624,2.037562,-0.138758,0.241983,malignant
3,84348301,-0.777651,0.278718,-0.625385,-0.770072,3.416563,3.649838,-0.293478,1.485624,2.977846,...,0.168971,-0.235401,-0.537002,3.779128,4.117032,2.08833,2.2666,6.301422,5.445317,malignant
4,84358402,1.758734,-1.195426,1.830129,1.956371,0.281717,0.59684,-0.293478,1.462017,-0.014143,...,-1.601204,1.390541,1.315769,0.259616,-0.30154,0.651154,0.766008,-0.892862,-0.415895,malignant
5,843786,-0.483122,-0.8635,-0.412364,-0.497697,2.324567,1.348472,0.912802,0.847971,1.041407,...,-0.326246,-0.097289,-0.21714,2.286738,1.835918,1.329918,0.949238,1.835714,2.484863,malignant


In [95]:
# Show each column's dtype and non-null count for the processed frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 505 entries, 0 to 567
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   sample_id                505 non-null    int64  
 1   mean_radius              505 non-null    float64
 2   mean_texture             505 non-null    float64
 3   mean_perimeter           505 non-null    float64
 4   mean_area                505 non-null    float64
 5   mean_smoothness          505 non-null    float64
 6   mean_compactness         505 non-null    float64
 7   mean_concavity           505 non-null    float64
 8   mean_concave_points      505 non-null    float64
 9   mean_symmetry            505 non-null    float64
 10  mean_fractal_dimension   505 non-null    float64
 11  radius_error             505 non-null    float64
 12  texture_error            505 non-null    float64
 13  perimeter_error          505 non-null    float64
 14  area_error               5