## Valores numéricos no disponibles (*missing values*)
### Modelos de predicción
Existen métodos más avanzados, como el uso de **modelos de predicción** (tratando la columna con valores nulos como la variable objetivo y el resto de columnas como *features*). Por ejemplo, podría utilizarse el algoritmo **K-Nearest Neighbors (KNN)** para predecir los valores nulos de 'total_bedrooms' basándonos en los registros en los que ese valor sí está disponible.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load the housing dataset from the local CSV file into a DataFrame.
housing = pd.read_csv("./data/housing.csv")
housing  # last expression of the cell, so the notebook renders the DataFrame
# It has more than 20000 rows


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [2]:

# Build the training and test sets with stratified sampling on median income:
# the income is bucketed into five categories that are used as the strata.
income_bins = [0., 1.5, 3.0, 4.5, 6., np.inf]
income_strata = pd.cut(housing["median_income"], bins=income_bins, labels=[1, 2, 3, 4, 5])
train_set, test_set = train_test_split(housing, test_size=0.2, stratify=income_strata)

# Split the training set into the target (labels) and the predictors.
housing_labels = train_set["median_house_value"].copy()  # dependent variable
housing = train_set.drop(columns="median_house_value")  # predictors only

# Transposed preview of the first rows (one column per sample).
housing.head().T

Unnamed: 0,5042,1491,13699,1557,11672
longitude,-118.32,-122.02,-117.2,-121.97,-118.02
latitude,33.98,37.95,34.12,37.8,33.84
housing_median_age,49.0,22.0,24.0,17.0,35.0
total_rooms,1412.0,3526.0,3532.0,3279.0,3473.0
total_bedrooms,333.0,510.0,618.0,418.0,563.0
population,901.0,1660.0,1681.0,1222.0,2091.0
households,328.0,508.0,590.0,381.0,580.0
median_income,1.7067,5.6642,3.5,7.9168,4.4821
ocean_proximity,<1H OCEAN,NEAR BAY,INLAND,<1H OCEAN,<1H OCEAN


In [3]:
housing  # display the DataFrame as the cell output
# Now it has a little over 16000 rows

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
5042,-118.32,33.98,49.0,1412.0,333.0,901.0,328.0,1.7067,<1H OCEAN
1491,-122.02,37.95,22.0,3526.0,510.0,1660.0,508.0,5.6642,NEAR BAY
13699,-117.20,34.12,24.0,3532.0,618.0,1681.0,590.0,3.5000,INLAND
1557,-121.97,37.80,17.0,3279.0,418.0,1222.0,381.0,7.9168,<1H OCEAN
11672,-118.02,33.84,35.0,3473.0,563.0,2091.0,580.0,4.4821,<1H OCEAN
...,...,...,...,...,...,...,...,...,...
1496,-122.01,37.94,18.0,2077.0,298.0,937.0,292.0,6.3809,NEAR BAY
11483,-118.00,33.73,26.0,2236.0,280.0,809.0,282.0,6.7395,<1H OCEAN
16132,-122.48,37.78,44.0,3371.0,794.0,1738.0,753.0,3.1653,NEAR BAY
20256,-119.17,34.21,33.0,1039.0,256.0,1432.0,272.0,3.1103,NEAR OCEAN


In [34]:


from sklearn.impute import KNNImputer

# KNNImputer works on numeric data only (a string cannot be converted to
# float), so the categorical column is set aside and re-attached afterwards.
# drop() already returns a new DataFrame, so `housing` itself is not modified.
housing_copia = housing.drop(columns="ocean_proximity")

# With default settings the imputer fills the missing values of every numeric
# column it is given (not just 'total_bedrooms') from the nearest rows.
imputer = KNNImputer()
housing_tr = imputer.fit_transform(housing_copia)  # result must be kept in a new variable

# Wrap the imputed values back into a DataFrame, reusing the original index
# and column names so the rows stay aligned with `housing`.
housing_copia = pd.DataFrame(housing_tr, index=housing_copia.index, columns=housing_copia.columns)

# Restore the categorical column (aligned on the shared index).
housing_copia["ocean_proximity"] = housing["ocean_proximity"]

# Final check, shown as the cell output: no column should contain NaN.
housing_copia.isna().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
ocean_proximity       0
dtype: int64