# Preparación de los datos

## Importando las librerías

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Resumen del análisis de los datos
Pero la columna **'BI-RADS'** se eliminiará dado que no contribuye a la clasificación. Mostraremos los 10 primeros registros

In [29]:
column_names = ['BI-RADS', 'Age', 'Shape', 'Margin', 'Density', 'Severity']
mammographic_masses = pd.read_csv('mammographic_masses.data.txt', names=column_names)
mammographic_masses.drop('BI-RADS',axis=1,inplace=True)
mammographic_masses.head(10)

Unnamed: 0,Age,Shape,Margin,Density,Severity
0,67.0,3.0,5.0,3.0,1
1,43.0,1.0,1.0,,1
2,58.0,4.0,5.0,3.0,1
3,28.0,1.0,1.0,3.0,0
4,74.0,1.0,5.0,,1
5,65.0,1.0,,3.0,0
6,70.0,,,3.0,0
7,42.0,1.0,,3.0,0
8,57.0,1.0,5.0,3.0,1
9,60.0,,5.0,1.0,1


## Eliminación
En el punto anterior pudimos notar que los datos faltantes muestran un patrón aleatorio en su distribución, por lo cual eliminaremos los datos faltantes usando la función *dropna*

In [31]:
mammographic_masses = mammographic_masses.dropna().reset_index(drop=True)
mammographic_masses.head(10)

Unnamed: 0,Age,Shape,Margin,Density,Severity
0,67.0,3.0,5.0,3.0,1
1,58.0,4.0,5.0,3.0,1
2,28.0,1.0,1.0,3.0,0
3,57.0,1.0,5.0,3.0,1
4,76.0,1.0,4.0,3.0,1
5,42.0,2.0,1.0,3.0,1
6,36.0,3.0,1.0,2.0,0
7,60.0,2.0,1.0,2.0,0
8,54.0,1.0,1.0,3.0,0
9,52.0,3.0,4.0,3.0,0


## Estandarización de las variables
En este paso se procede a convertir los frames de datos de Pandas en matrices Numpy que puede usar scikit_learn.

### Importar la libreria y realizar el ajuste

In [32]:
from sklearn.preprocessing import StandardScaler

In [33]:
scaler = StandardScaler()

In [34]:
scaler.fit(mammographic_masses.drop('Severity',axis=1))

StandardScaler(copy=True, with_mean=True, with_std=True)

### Construyendo las matrices
m_caracteristicas es la matriz con los datos de características (edad, forma, margen y densidad) y también se define la matriz_severidad.

In [35]:
scaled_features = scaler.transform(mammographic_masses.drop('Severity',axis=1))

In [36]:
m_caracteristicas = pd.DataFrame(scaled_features,columns=mammographic_masses.columns[:-1])
m_caracteristicas.head(10)

Unnamed: 0,Age,Shape,Margin,Density
0,0.765804,0.17446,1.395631,0.240313
1,0.151666,0.979883,1.395631,0.240313
2,-1.895458,-1.436386,-1.158927,0.240313
3,0.083429,-1.436386,1.395631,0.240313
4,1.379941,-1.436386,0.756992,0.240313
5,-0.940133,-0.630963,-1.158927,0.240313
6,-1.349558,0.17446,-1.158927,-2.612545
7,0.288141,-0.630963,-1.158927,-2.612545
8,-0.121284,-1.436386,-1.158927,0.240313
9,-0.257759,0.17446,0.756992,0.240313


In [37]:
matriz_severidad = mammographic_masses['Severity']
matriz_severidad.head(10)

0    1
1    1
2    0
3    1
4    1
5    1
6    0
7    0
8    0
9    0
Name: Severity, dtype: int64

In [38]:
mammographic_masses_standard = pd.concat([m_caracteristicas, matriz_severidad],axis=1)
mammographic_masses_standard.head(10)

Unnamed: 0,Age,Shape,Margin,Density,Severity
0,0.765804,0.17446,1.395631,0.240313,1
1,0.151666,0.979883,1.395631,0.240313,1
2,-1.895458,-1.436386,-1.158927,0.240313,0
3,0.083429,-1.436386,1.395631,0.240313,1
4,1.379941,-1.436386,0.756992,0.240313,1
5,-0.940133,-0.630963,-1.158927,0.240313,1
6,-1.349558,0.17446,-1.158927,-2.612545,0
7,0.288141,-0.630963,-1.158927,-2.612545,0
8,-0.121284,-1.436386,-1.158927,0.240313,0
9,-0.257759,0.17446,0.756992,0.240313,0


## Creando los conjuntos de datos procesados
Vamos a exportar los datos limpiados y estandarizados en dos archivos:
* **Limpios:** 'mammographic_masses_clean.csv'
* **Estandarizados:** 'mammographic_masses_standard.csv'

In [40]:
mammographic_masses.to_csv('./processed/mammographic_masses_clean.csv', index=False)
mammographic_masses_standard.to_csv('./processed/mammographic_masses_standard.csv', index=False)

FileNotFoundError: [Errno 2] No such file or directory: './processed/mammographic_masses_clean.csv'