In [1]:
import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split 
from sklearn.base import BaseEstimator, TransformerMixin 
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler 
from sklearn.preprocessing import OneHotEncoder 
from sklearn.pipeline import Pipeline 
from sklearn.compose import ColumnTransformer

# Construcción del modelo predictivo 

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error  
from sklearn.tree import DecisionTreeRegressor 
from sklearn.model_selection import cross_val_score 
from sklearn.model_selection import GridSearchCV 
from sklearn.ensemble import RandomForestRegressor 

In [2]:
v = pd.read_csv('vivienda.csv') 

In [3]:
v_train, v_test = train_test_split(v, test_size = 0.2, random_state = 42)

In [4]:
v = v_train.drop('precio', axis = 1)
v_labels = v_train[['precio']]   # para que v_labels sea 2D

In [5]:
class AdAtribComb(BaseEstimator, TransformerMixin):
    def __init__(self, ad_dph = True, ad_hph = True, ad_pph = True): # no *args or **kargs
        self.ad_dph = ad_dph
        self.ad_hph = ad_hph
        self.ad_pph = ad_pph
        
    def fit(self, X, y=None):
        return self # nothing else to do
    
    def transform(self, X, y=None):
        habitaciones, dormitorios, población, hogares = 3, 4, 5, 6 
        if self.ad_dph:
            dormitorios_por_habitación=X[:,dormitorios]/X[:,habitaciones]
            X=np.c_[X, dormitorios_por_habitación]
        if self.ad_hph:
            habitaciones_por_hogar=X[:,habitaciones]/X[:,hogares]
            X=np.c_[X, habitaciones_por_hogar]
        if self.ad_pph: 
            población_por_hogar = X[:, población]/X[:, hogares]
            X=np.c_[X, población_por_hogar] 
        return X
       

In [6]:
pipeline_num = Pipeline([
    ('imputar', SimpleImputer(strategy = 'median')),
    ('adicionar_atributos', AdAtribComb(ad_dph = True, ad_hph = True, ad_pph = True)),
    ('estandarizar', StandardScaler())
])

In [7]:
v_num = v.drop('proximidad', axis = 1)
lista_atributos_num = list(v_num.columns) 
lista_atributos_cat = ['proximidad']
pipeline_total = ColumnTransformer([
                                  ("num", pipeline_num, lista_atributos_num),
                                  ("cat", OneHotEncoder(), lista_atributos_cat),
                                  ])
x_prep = pipeline_total.fit_transform(v)

# Busquemos el mejor modelo predictor de precio de vivienda

In [8]:
regresor_lineal = LinearRegression() 

In [9]:
regresor_lineal.fit(x_prep, v_labels.values.ravel()) 

LinearRegression()

In [10]:
regresor_lineal.coef_ 

array([-56276.05885186, -56638.23115891,  14122.30475198,   5881.10498027,
         5307.66299678, -46320.28547056,  40233.72484825,  78904.43939684,
        16809.10455136,   7699.44848838,    750.3679429 , -18666.47855331,
       -53617.84037612, 112060.2444519 , -24109.18513167, -15666.74039081])

In [11]:
regresor_lineal.intercept_

237125.77897391893

In [13]:
algunos_datos = v.iloc[:5]

In [15]:
algunos_datos_prep = pipeline_total.transform(algunos_datos)

In [16]:
algunos_datos_prep.shape 

(5, 16)

In [17]:
algunas_predicciones = regresor_lineal.predict(algunos_datos_prep)
algunas_predicciones

array([181746.54359616, 290558.74973505, 244957.50017771, 146498.51061398,
       163230.42393939])

In [18]:
v_labels[:5]

Unnamed: 0,precio
14196,103000.0
8267,382100.0
17445,172600.0
14265,93400.0
2271,96500.0


# Veamos el desempeño en el conjunto de entrenamiento

In [19]:
predicciones_train_total = regresor_lineal.predict(x_prep) 

In [20]:
np.sqrt(mean_squared_error(predicciones_train_total, v_labels))

67593.20745775253

# Construcción de otro modelo para comparar

In [21]:
regresor_tree = DecisionTreeRegressor() 

In [22]:
regresor_tree.fit(x_prep, v_labels) 

DecisionTreeRegressor()

## Veamos el desempeño de este este segundo regresor  en el conjunto de entrenamiento

In [23]:
predicciones_tree = regresor_tree.predict(x_prep) 

In [24]:
np.sqrt(mean_squared_error(predicciones_tree, v_labels)) 

0.0

## Evaluemos el desempeño de estos modelos con la metodología de validación cruzada

In [30]:
puntajes_tree = cross_val_score(regresor_tree, x_prep, v_labels,\
                          scoring = 'neg_mean_squared_error', cv = 10)

In [28]:
np.sqrt(-puntajes) 

array([65685.65501858, 70650.80046409, 68918.88667265, 72846.80432153,
       73312.10849022, 65628.29564683, 66681.71526809, 67533.35655724,
       65619.24561123, 69928.06384812])

In [29]:
def mostrar_puntajes(puntajes):
    print('Puntajes: ',np.sqrt(-puntajes) )
    print('Primedio de los puntajes: ', np.sqrt(-puntajes).mean() )
    print('Desviación estandar: ', np.sqrt(-puntajes).std() )

In [31]:
mostrar_puntajes(puntajes_tree)

Puntajes:  [64321.39925196 71446.91716289 70037.24551339 70008.97891562
 72773.07388903 66678.64638601 66408.59004051 68340.40626132
 67371.23657963 70517.70451163]
Primedio de los puntajes:  68790.41985119844
Desviación estandar:  2475.1348003055773


In [32]:
puntajes_lineal = cross_val_score(regresor_lineal, x_prep, v_labels,\
                          scoring = 'neg_mean_squared_error', cv = 10)

In [33]:
mostrar_puntajes(puntajes_lineal)

Puntajes:  [65000.67382615 70960.56056304 67122.63935124 66089.63153865
 68402.54686442 65266.34735288 65218.78174481 68525.46981754
 72739.87555996 68957.34111906]
Primedio de los puntajes:  67828.38677377408
Desviación estandar:  2468.091395065228


# Un tercer modelo para comparar desempeño

In [34]:
regresor_forest = RandomForestRegressor() 

In [36]:
regresor_forest.fit(x_prep, v_labels.values.ravel()) 

RandomForestRegressor()

In [37]:
predicciones_forest = regresor_forest.predict(x_prep)

In [38]:
np.sqrt(mean_squared_error(predicciones_forest, v_labels))

18503.725782159352

In [39]:
puntajes_forest = cross_val_score(regresor_forest, x_prep, v_labels.values.ravel(),\
                          scoring = 'neg_mean_squared_error', cv = 10)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


In [40]:
mostrar_puntajes(puntajes_forest) 

Puntajes:  [47007.48062271 51552.7795371  49708.20031694 51903.44798785
 52240.2839639  47176.06878833 47361.64798482 50443.79787821
 49143.81538792 49972.99492433]
Primedio de los puntajes:  49651.05173921147
Desviación estandar:  1863.9208118544893


# Ya elegido el modelo que mejor se desempeña, afinemoslo

In [54]:
param_grid = [
    {'n_estimators':[3, 10, 30,40], 'max_features': [2, 4, 6, 8, 10]},
    {'bootstrap':[False],'n_estimators':[3, 10], 'max_features': [2, 3, 4] }
]

In [55]:
cuadricula = GridSearchCV(regresor_forest, param_grid, return_train_score=True, scoring = 'neg_mean_squared_error', cv = 5) 

In [56]:
cuadricula.fit(x_prep, v_labels.values.ravel()) 

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [2, 4, 6, 8, 10],
                          'n_estimators': [3, 10, 30, 40]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [57]:
cuadricula.best_params_

{'max_features': 8, 'n_estimators': 40}

In [58]:
mejor_regresor_forest = RandomForestRegressor(max_features = 8, n_estimators = 40)

In [59]:
mejor_regresor_forest.fit(x_prep, v_labels.values.ravel())

RandomForestRegressor(max_features=8, n_estimators=40)

In [60]:
predicciones_finales = mejor_regresor_forest.predict(x_prep)

In [61]:
np.sqrt(mean_squared_error(predicciones_finales, v_labels))

18741.233031207186

In [62]:
puntajes_forest_mejor = cross_val_score(mejor_regresor_forest,\
                                        x_prep, v_labels.values.ravel(), \
                                        scoring = 'neg_mean_squared_error', cv = 10)

In [63]:
mostrar_puntajes(puntajes_forest_mejor) 

Puntajes:  [47456.10569609 51321.55648378 48730.10613152 50652.7430317
 51523.03829551 46839.24135759 47193.32546764 50424.74201249
 49429.23750662 50270.59007942]
Primedio de los puntajes:  49384.068606236266
Desviación estandar:  1650.1441216299015
