# Preparación de los datos

## Importando las librerías

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Resumen del análisis de los datos
La columna **'BI-RADS'** se eliminará dado que no contribuye a la clasificación. Mostraremos los 10 primeros registros.

In [3]:
# Column labels for the raw file; they are passed to read_csv via `names`.
column_names = ['BI-RADS', 'Age', 'Shape', 'Margin', 'Density', 'Severity']
# na_values='?' makes read_csv parse a '?' placeholder as NaN, so the columns
# stay numeric and the later dropna()/astype(int) steps work as intended.
# NOTE(review): assumes the raw file marks missing entries with '?' — confirm.
mammographic_masses = pd.read_csv(
    'mammographic_masses.data.txt',
    names=column_names,
    na_values='?',
)
# Remove the BI-RADS column; reassigning avoids mutating the frame in place.
mammographic_masses = mammographic_masses.drop(columns='BI-RADS')
mammographic_masses.head(10)

Unnamed: 0,Age,Shape,Margin,Density,Severity
0,67.0,3.0,5.0,3.0,1
1,43.0,1.0,1.0,,1
2,58.0,4.0,5.0,3.0,1
3,28.0,1.0,1.0,3.0,0
4,74.0,1.0,5.0,,1
5,65.0,1.0,,3.0,0
6,70.0,,,3.0,0
7,42.0,1.0,,3.0,0
8,57.0,1.0,5.0,3.0,1
9,60.0,,5.0,1.0,1


## Eliminación
En el punto anterior pudimos notar que los datos faltantes muestran un patrón aleatorio en su distribución, por lo cual eliminaremos los registros con datos faltantes usando la función `dropna`.

In [4]:
# Keep only rows without missing values, renumber them from 0 and cast every
# column to int.
mammographic_masses = (
    mammographic_masses
    .dropna()
    .reset_index(drop=True)
    .astype(int)
)
mammographic_masses.head(10)

Unnamed: 0,Age,Shape,Margin,Density,Severity
0,67,3,5,3,1
1,58,4,5,3,1
2,28,1,1,3,0
3,57,1,5,3,1
4,76,1,4,3,1
5,42,2,1,3,1
6,36,3,1,2,0
7,60,2,1,2,0
8,54,1,1,3,0
9,52,3,4,3,0


## Estandarización de las variables
En este paso se procede a convertir los DataFrames de Pandas en matrices NumPy que puede usar scikit-learn.

### Importar la librería y realizar el ajuste

In [6]:
from sklearn.preprocessing import StandardScaler

In [7]:
# Standardization scaler; the arguments spell out the default settings.
standard_scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

In [8]:
# Fit the scaler on the feature columns only; the 'Severity' target column is
# dropped first so it does not take part in the scaling.
standard_scaler.fit(mammographic_masses.drop(columns='Severity'))

StandardScaler(copy=True, with_mean=True, with_std=True)

### Construyendo las matrices
m_caracteristicas es la matriz con los datos de características (edad, forma, margen y densidad) y también se define la matriz_severidad.

In [9]:
# Apply the fitted standardization to the feature columns ('Severity' excluded).
scaled_features = standard_scaler.transform(
    mammographic_masses.drop(columns='Severity')
)

In [10]:
# Wrap the standardized values in a DataFrame, labelling the columns with every
# original column name except the last one ('Severity').
m_caracteristicas = pd.DataFrame(
    data=scaled_features,
    columns=mammographic_masses.columns[:-1],
)
m_caracteristicas.head(10)

Unnamed: 0,Age,Shape,Margin,Density
0,0.765804,0.17446,1.395631,0.240313
1,0.151666,0.979883,1.395631,0.240313
2,-1.895458,-1.436386,-1.158927,0.240313
3,0.083429,-1.436386,1.395631,0.240313
4,1.379941,-1.436386,0.756992,0.240313
5,-0.940133,-0.630963,-1.158927,0.240313
6,-1.349558,0.17446,-1.158927,-2.612545
7,0.288141,-0.630963,-1.158927,-2.612545
8,-0.121284,-1.436386,-1.158927,0.240313
9,-0.257759,0.17446,0.756992,0.240313


In [14]:
# Target values: the 'Severity' column of the cleaned data.
matriz_severidad = mammographic_masses.loc[:, 'Severity']
matriz_severidad.head(10)

0    1
1    1
2    0
3    1
4    1
5    1
6    0
7    0
8    0
9    0
Name: Severity, dtype: int64

In [11]:
from sklearn.preprocessing import MinMaxScaler

# Fit a min-max scaler on the feature columns ('Severity' excluded) and
# transform them in the same call; the fitted scaler stays available as
# `min_max_scaler`.
min_max_scaler = MinMaxScaler()
mm_scaled_features = min_max_scaler.fit_transform(
    mammographic_masses.drop(columns='Severity')
)

In [12]:
# Wrap the min-max scaled values in a DataFrame, labelling the columns with
# every original column name except the last one ('Severity').
mm_features = pd.DataFrame(
    data=mm_scaled_features,
    columns=mammographic_masses.columns[:-1],
)
mm_features.head(10)

Unnamed: 0,Age,Shape,Margin,Density
0,0.628205,0.666667,1.0,0.666667
1,0.512821,1.0,1.0,0.666667
2,0.128205,0.0,0.0,0.666667
3,0.5,0.0,1.0,0.666667
4,0.74359,0.0,0.75,0.666667
5,0.307692,0.333333,0.0,0.666667
6,0.230769,0.666667,0.0,0.333333
7,0.538462,0.333333,0.0,0.333333
8,0.461538,0.0,0.0,0.666667
9,0.435897,0.666667,0.75,0.666667


## Creando los conjuntos de datos procesados
Vamos a exportar los datos limpiados y estandarizados en tres archivos:
* **Limpios:** 'mammographic_masses_clean.csv'
* **Escalamiento Estandarizado:** 'mammographic_masses_standard.csv'
* **Escalamiento Min Max:** 'mammographic_masses_min_max.csv'

In [17]:
# Place the standardized features and the target side by side (column-wise).
mammographic_masses_standard = pd.concat(
    objs=[m_caracteristicas, matriz_severidad],
    axis='columns',
)
mammographic_masses_standard.head(10)

Unnamed: 0,Age,Shape,Margin,Density,Severity
0,0.765804,0.17446,1.395631,0.240313,1
1,0.151666,0.979883,1.395631,0.240313,1
2,-1.895458,-1.436386,-1.158927,0.240313,0
3,0.083429,-1.436386,1.395631,0.240313,1
4,1.379941,-1.436386,0.756992,0.240313,1
5,-0.940133,-0.630963,-1.158927,0.240313,1
6,-1.349558,0.17446,-1.158927,-2.612545,0
7,0.288141,-0.630963,-1.158927,-2.612545,0
8,-0.121284,-1.436386,-1.158927,0.240313,0
9,-0.257759,0.17446,0.756992,0.240313,0


In [18]:
# Place the min-max scaled features and the target side by side (column-wise).
mammographic_masses_min_max = pd.concat(
    objs=[mm_features, matriz_severidad],
    axis='columns',
)
mammographic_masses_min_max.head(10)

Unnamed: 0,Age,Shape,Margin,Density,Severity
0,0.628205,0.666667,1.0,0.666667,1
1,0.512821,1.0,1.0,0.666667,1
2,0.128205,0.0,0.0,0.666667,0
3,0.5,0.0,1.0,0.666667,1
4,0.74359,0.0,0.75,0.666667,1
5,0.307692,0.333333,0.0,0.666667,1
6,0.230769,0.666667,0.0,0.333333,0
7,0.538462,0.333333,0.0,0.333333,0
8,0.461538,0.0,0.0,0.666667,0
9,0.435897,0.666667,0.75,0.666667,0


In [19]:
from pathlib import Path

# to_csv does not create missing directories, so make sure the output folder
# exists before writing; exist_ok=True keeps the cell safe to re-run.
processed_dir = Path('./processed')
processed_dir.mkdir(parents=True, exist_ok=True)

# Export the cleaned, standardized and min-max scaled datasets; index=False
# leaves the row index out of the files.
mammographic_masses.to_csv(processed_dir / 'mammographic_masses_clean.csv', index=False)
mammographic_masses_standard.to_csv(processed_dir / 'mammographic_masses_standard.csv', index=False)
mammographic_masses_min_max.to_csv(processed_dir / 'mammographic_masses_min_max.csv', index=False)