## Primero de todo los imports

In [72]:
import sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import seaborn as sns

np.random.seed(100)


# Creamos un directiorio para la salida
import os
if not os.path.exists('output'):
    os.makedirs('output')



# Para hacer test y train
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split

# Para ajustar los hiper-parámetros
from sklearn.model_selection import GridSearchCV

# Preproceso
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score

# Función de error
from sklearn.metrics import mean_squared_error, r2_score

# Funciones

In [60]:
#Function to display score results from CV
def display_scores(scores,model_name = None):
    if(model_name):
        print("----",model_name,"----")
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())
    
def apply_gridsearch(model,params,X_train_gs,y_train_gs):
    grid_search = GridSearchCV(model, param_grid=params, cv=5,
                               scoring='neg_root_mean_squared_error',
                               return_train_score=True,n_jobs=-1)
    grid_search.fit(X_train_gs, y_train_gs)

    print("Best trained model:")
    print(grid_search.best_estimator_)
    print("Best parameters:")
    print(grid_search.best_params_)
    print("Best Score")
    print(np.sqrt(-grid_search.best_score_),-grid_search.best_score_)
    
def create_att_dic(list_att):
    dic_att={}
    for key in initial_atr.keys():
        dic_att[key]=[]
    for (k,v) in list_att:
        dic_att[k].append(v)
    return dic_att

def feature_selection(model, dic_):
    num_log_att = ['Landsize','BuildingArea']
    num_att = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom', 'Car','YearBuilt', 'Lattitude', 'Longtitude', 'Propertycount']
    num_poli_att = ['Rooms','BuildingArea','Bathroom']
    #list(housing_num)
    cat_att = ['Type',"CouncilArea",'Suburb','Regionname','Postcode']
    cat2mean_att = ['Type',"CouncilArea",'Suburb','Regionname','Postcode']
    
    
    initial_atr = {
        "num_log_att":num_log_att.copy() ,
        "num_att":num_att.copy() ,
        "num_poli_att":num_poli_att.copy() ,
        "cat_att":cat_att.copy() ,
        "cat2mean_att":cat2mean_att.copy() ,
    }

    initial_atr = {'num_log_att': ['Landsize', 'BuildingArea'],
                   'num_att': ['Rooms', 'Distance', 'Bathroom', 'Car', 'YearBuilt', 'Lattitude'],
                   'num_poli_att': ['Rooms', 'BuildingArea'], 
                   'cat_att': ['Type', 'Regionname', 'Postcode'], 
                   'cat2mean_att': ['Type', 'CouncilArea', 'Suburb', 'Regionname', 'Postcode']
                  }

    initial_full_pipe = ColumnTransformer([
                ("num0", rf_pipe_num0, initial_atr["num_log_att"]),
                ("num1", rf_pipe_num1, initial_atr["num_att"]),
                ("poli", rf_pipe_poli, initial_atr["num_poli_att"]),
                ("cat",  rf_cat_pipe, initial_atr["cat_att"]),
                ("cat_2_mean",rf_cat2mean_pipe, initial_atr["cat2mean_att"])
    ])
    
    start = time.time()

    X_train_initial = initial_full_pipe.fit_transform(X_train,y_train)
    previous_score_winner = -cross_val_score(model, X_train_initial, housing_labels,scoring="neg_root_mean_squared_error", cv=5).mean()

    actual_list_atts = []    
    for k in initial_atr.keys():
        for v in initial_atr[k]:
            actual_list_atts.append((k,v))
    
    buscando = True
    cont=0
    while buscando and len(actual_list_atts)!=0:

        end = time.time()
        tiempo=end - start
        start = end

        dic_=create_att_dic(actual_list_atts)
        print(cont,". Score: ",previous_score_winner)
        print("Tiempo (s) :" ,tiempo )
        print(len(dic_)," Att. ",previous_score_winner, dic_)

        scores_ronda=[]
        for at in actual_list_atts:
            actual_list_without_at = actual_list_atts.copy()
            actual_list_without_at.remove(at)
            #Aqui hemos quitado la feature
            dict_att=create_att_dic(actual_list_without_at)
            #Creamos la nueva pipe sin dicho atributo
            actual_full_pipe = ColumnTransformer([
                ("num0", rf_pipe_num0, dict_att["num_log_att"]),
                ("num1", rf_pipe_num1, dict_att["num_att"]),
                ("poli", rf_pipe_poli, dict_att["num_poli_att"]),
                ("cat",  rf_cat_pipe, dict_att["cat_att"]),
                ("cat_2_mean",rf_cat2mean_pipe, dict_att["cat2mean_att"])
            ])
            #Procesamos los datos con la nueva pipe
            X_train_actual = actual_full_pipe.fit_transform(X_train,y_train)
            #Calculamos con el modelo y guardamos como de bueno es
            actual_score = - cross_val_score(model, X_train_actual, housing_labels,scoring="neg_root_mean_squared_error", cv=5, n_jobs=-1).mean()
            scores_ronda.append((actual_score,at))

        winner = min(scores_ronda, key= lambda x : x[0])
        #Aquí tenemos el mejor score y el atributo que hay que quitar para tenerlo
        score_winner, att_winner = winner

        #Si este resultado es mejor que el mejor de la ronda anterior
        if score_winner < previous_score_winner:
            actual_list_atts.remove(att_winner)
            previous_score_winner = score_winner

        #Si no es mejor hemos acabado
        else:
            buscando=False

    final_att = create_att_dic(actual_list_atts)
    print("ACABADO: ")
    print(final_att)

# Los datos

In [61]:
# Cargamos el dataset
housing = pd.read_csv('dataset/housing-snapshot/train_set.csv',index_col=0) 
print("Número de casas:",housing.shape[0]," Número de características:", housing.shape[1])
housing_num = housing.select_dtypes(exclude=[np.object]).columns
housing_cat = housing.select_dtypes(include=[np.object]).columns
print("\n Características numéricas: \n ", list(housing_num))

print("Características categóricas: \n", list(housing_cat))

Número de casas: 5432  Número de características: 21

 Características numéricas: 
  ['Rooms', 'Price', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea', 'YearBuilt', 'Lattitude', 'Longtitude', 'Propertycount']
Características categóricas: 
 ['Suburb', 'Address', 'Type', 'Method', 'SellerG', 'Date', 'CouncilArea', 'Regionname']


In [62]:

# Ponemos "Postcode" a categorical, dado que que no es numérica.
housing['Postcode'] = pd.Categorical(housing.Postcode)

# Dividimos haciendo uso de la estratificación para tener una buena proporción.
housing["price_aux"] = pd.cut(housing["Price"],
                               bins=[0., 500000, 1000000, 1500000, 2000000., np.inf],
                               labels=[1, 2, 3, 4, 5])

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["price_aux"]):
    train_set = housing.loc[train_index]
    test_set = housing.loc[test_index]

# Quitamos la etiqueta price_cat
train_set.drop("price_aux", axis=1, inplace=True)
test_set.drop("price_aux", axis=1, inplace=True)
housing.drop("price_aux", axis=1, inplace=True)

# El con el train probaremos los modelos.
X_train = train_set.drop("Price", axis=1).copy()
y_train = train_set["Price"].copy()
# El test no lo tenemos que tocar hasta el final de todo.
X_test = test_set.drop("Price", axis=1).copy()
y_test = test_set["Price"].copy()


housing_num = X_train.select_dtypes(exclude=[np.object]).columns
housing_cat = X_train.select_dtypes(include=[np.object]).columns

## Las funciones de las pipelines

In [73]:
## create a function to replace 0 by NaN
def replace_0_2_NaN(data):
    data[data == 0] = np.nan
    return data


# column index
Rooms_ix, Bedroom2_ix, Bathroom_ix, BuildingArea_ix = 0, 2, 3, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
        
    def fit(self, X, y=None):
        return self  # nothing else to do
    
    def transform(self, X):
        rooms_per_building_area = X[:, Rooms_ix] / (1.0 +X[:, BuildingArea_ix])# add 1 to avoid 0 division
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, Bedroom2_ix] / (1.0 + X[:, Bathroom_ix]) # add 1 to avoid 0 division
            return np.c_[X, rooms_per_building_area, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_building_area]


class DividedAtributes(BaseEstimator, TransformerMixin):
    def __init__(self): # no *args or **kargs
        pass
        #self.add_bedrooms_per_room = add_bedrooms_per_room
        
    def fit(self, X, y=None):
        return self  # nothing else to do
    
    def transform(self, X):
        proportion1 = X[:, 0] / (1.0 +X[:, 1])# add 1 to avoid 0 division
        proportion2 = np.ones(proportion1.shape, dtype=np.float)/proportion1
        return np.c_[proportion1,proportion1]

    
class invert_variable(BaseEstimator, TransformerMixin):
    def __init__(self): # no *args or **kargs
        pass
        #self.add_bedrooms_per_room = add_bedrooms_per_room
        
    def fit(self, X, y=None):
        return self  # nothing else to do
    
    def transform(self, X):
        data=np.zeros(X.shape,dtype=np.float)
        for i in range(X.shape[1]):
            data[:, 0]=1/X[:, 0]
        return np.c_[X,data]
class Categorical_2_mean(BaseEstimator, TransformerMixin):
    def __init__(self): # no *args or **kargs
        self.medias_por_columnas=[]
        self.media_y=0
        #self.add_bedrooms_per_room = add_bedrooms_per_room
        
    def fit(self, X, y=None):
        self.media_y = y.mean()
        self.medias_por_columnas=[]
        #Para cada columna
        for i in range(X.shape[1]):
            medias = {}
            columna = X[:,i]
            unicos = np.unique(columna)
            #Para cada valor guardo la media
            for u in unicos:
                medias[u]=y[columna==u].mean()
                
            self.medias_por_columnas.append(medias)
            
        return self  # nothing else to do
    
    def transform(self, X):
        data = np.ones((X.shape[0]))
        
        for i in range(X.shape[1]):
            columna = X[:,i]
            media = self.medias_por_columnas[i]
            nueva_columna = np.zeros(X.shape[0])+self.media_y
            
            #Para cada valor distinto dentro de la categoría
            for k in media.keys():
                nueva_columna[columna == k] = media[k]
            
            #Si es la primera vez
            if i == 0:
                data = nueva_columna
            else:
                data = np.c_[data,nueva_columna]
        return data

class Clean_Outlayers_Quantile(BaseEstimator, TransformerMixin):
    def __init__(self,q=0.01): # no *args or **kargs
        self.q=q
        self.low_q_col=[]
        self.high_q_col=[]
        #self.add_bedrooms_per_room = add_bedrooms_per_room
        
    def fit(self, X, y=None):
        #Para cada columna
        for i in range(X.shape[1]):
            columna = X[:,i]
            self.high_q_col.append(np.quantile(a=columna , q=1-self.q))
            self.low_q_col.append(np.quantile(a=columna ,q=self.q))
        return self  # nothing else to do
    
    def transform(self, X):
        for i in range(X.shape[1]):
            q_high=self.high_q_col[i]
            q_low =self.low_q_col[i]
            columna = X[:,i]
            columna[columna>q_high]=q_high
            columna[columna<q_low]=q_low
        return X    

# Ejemplo de PipeLines
"""
1.Para las columnas con muchos ceros sin sentido. 
Además se les aplicará la función logaritmo.
Las columnas que tienen sentido en esta pipe son "BuildingArea" y "Landsize" 
"""
num0_pipeline = Pipeline([
        ('zeros2NaN',FunctionTransformer(func = replace_0_2_NaN,validate=False)),
        ('imputer', SimpleImputer(strategy="median")),
        ('log',FunctionTransformer(np.log1p, validate=True)),
        ('std_scaler', StandardScaler()),
])

# 2.Para las otras columnas numéricas
num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
])



"""
Para las variables categóricas. Principalmente utilizaremos el OneHotEncoder
Esta pipe convierte columnas categóricas en numéricas creando una columna de 
1's y 0's por cada valor único en la columna original.

Ejemplo:
Original "Type":
Type
h
t
h
u
h

Dummies from "Type":
h t u
1 0 0
0 1 0
1 0 0
0 0 1
1 0 0

Esto sube muchísimo la dimensionalidad si la columna original
tiene muchos valores distintos, así que hay que tratarlo con cuidado.

"""
cat_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="constant",fill_value='Unknown')),
        ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore')),
])
num_attribs0 = ['Landsize','BuildingArea']
num_attribs1 = list(housing_num)
cat_attribs = ["CouncilArea",'Type','Suburb','Postcode']
# Esta es una especie de Pipeline madre que coge otras pipelines y procesa
# todo el data set. Cada pipeline se aplica a las columnas de la lista
# num0_pipeline con las columnas en la lista num_attribs0
full_pipeline_ejemplo = ColumnTransformer([
        ("num0", num0_pipeline, num_attribs0),
        ("num1", num_pipeline, num_attribs1),
        ("cat", cat_pipeline, cat_attribs),
])


# Los modelos

Vamos a tratar unos modelos de machine learning y en concreto haremos uso de la famosa librería Sklearn. En cada uno de los distintos modelos vamos a mostrar primero una implementación naíf y luego vamos a hacer un preproceso de datos personalizado, una feature selection y cuando sea posible una optimización de los hiper-parámetros para conseguir los mejores resultados posibles.

In [64]:
# KNN
from sklearn import neighbors    
# Linear Regression
from sklearn.linear_model import LinearRegression
# Random Forest
from sklearn.ensemble import RandomForestRegressor

# Implementación naíf

In [39]:


#Para las otras columnas numéricas
num_naif_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])
#Para las variables categóricas. Principalmente utilizaremos el OneHotEncoder
cat_naif_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="constant",fill_value='Unknown')),
        ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore')),
    ])
"""
Columnas numéricas:  ['Rooms', 'Price', 'Distance', 'Bedroom2', 'Bathroom', 'Car', 
'Landsize', 'BuildingArea', 'YearBuilt', 'Lattitude', 'Longtitude', 'Propertycount']

Columnas categóricas:  ['Suburb', 'Address', 'Type', 'Method', 'SellerG', 'Date', 'CouncilArea', 'Regionname']
"""

#Estas son las columnas seleccionadas en el proyecto base para cada pipeline.
num_attribs = list(housing_num)
cat_attribs = ["CouncilArea",'Type','Suburb','Postcode']

#Creamos la "full_pipeline", es decir la pipeline que engloba a todas las otras.
full_naif_pipeline = ColumnTransformer([
        ("num", num_naif_pipeline, num_attribs),
        ("cat", cat_naif_pipeline, cat_attribs),
])
#Preprocesamos los datos utilizando la "full_pipeline"
housing_prepared = full_naif_pipeline.fit_transform(X_train,y_train)

forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(housing_prepared, y_train)

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, y_train)


n_neighbors = 3
knn_reg = neighbors.KNeighborsRegressor(n_neighbors)
knn_reg.fit(housing_prepared, y_train)

models = [(lin_reg,"lin_reg"),
          (knn_reg,"KNN-Regressor"),
          (forest_reg,'Random Forest')
]


for model in models:
    scores = cross_val_score(model[0], housing_prepared, y_train, scoring="neg_root_mean_squared_error", cv=5,n_jobs=-1)
    display_scores(-scores, model[1])

---- lin_reg ----
Mean: 384025.51472573285
Standard deviation: 48252.21371717167
---- KNN-Regressor ----
Mean: 375914.6862240796
Standard deviation: 31220.851831198084
---- Random Forest ----
Mean: 310410.0826121716
Standard deviation: 36235.270271245245


Tarda mucho en ejecutarse (Si quieres ejecutarlo simplemente descomenta el for que está justo al final de la celda anterior.), así que aquí pongo los resultados:

    -lin_reg:
        Mean: 384025.51472573285
        Standard deviation: 48252.21371717167
    -KNN-Regressor:
        Mean: 375914.6862240796
        Standard deviation: 31220.851831198084
    -Random Forest ----
        Mean: 310410.0826121716
        Standard deviation: 36235.270271245245
        
Estos serán nuestros resultados base, nuestro punto de referencia.

# 1. KNN

In [53]:
X_train_knn = X_train.copy()
y_train_knn = y_train.copy()



num_pipe_log  = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ("clean_outlayer",Clean_Outlayers_Quantile()),
        ('log',FunctionTransformer(np.log1p, validate=True)),
        ('std_scaler', StandardScaler()),
])

num_pipe      = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ("clean_outlayer",Clean_Outlayers_Quantile()),
        ('std_scaler', StandardScaler()),
])

cat_pipe = Pipeline([
        ('imputer', SimpleImputer(strategy="constant",fill_value='Unknown')),
        ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Características seleccionadas:
num_features = ['Rooms',"Bathroom","Lattitude","Longtitude"]
log_features = ['Landsize']
cat_features = ['Type']

full_pipe_knn = ColumnTransformer([
        ("num_log", num_pipe_log, log_features),
        ("num", num_pipe, num_features),
        ("cat", cat_pipe, cat_features),
])

X_train_knn_prepared=full_pipe_knn.fit_transform(X_train_knn,y_train_knn)
print("Dataset original:",X_train_knn.shape)
print("Dataset procesado:",X_train_knn_prepared.shape)

Dataset original: (4345, 20)
Dataset procesado: (4345, 8)


In [54]:


scores = cross_val_score(knn_reg, X_train_knn_prepared, y_train_knn, scoring="neg_root_mean_squared_error", cv=5,n_jobs=-1)
display_scores(-scores, knn_reg)

---- KNeighborsRegressor(n_neighbors=3) ----
Mean: 358312.70383498963
Standard deviation: 23930.486517285564


    KNeighborsRegressor(n_neighbors=3)
        -Mean: 358312.70383498963
        -Standard deviation: 23930.486517285564

Creemos que estos resultados se pueden mejorar un poco más haciendo uso de la función GridSearch CV.

In [56]:
knn_param_grid={ "n_neighbors": range(1,20,1)
    
}

apply_gridsearch(model=knn_reg,
                 params=knn_param_grid,
                 X_train_gs=X_train_knn_prepared,
                 y_train_gs=y_train_knn)

Best trained model:
KNeighborsRegressor()
Best parameters:
{'n_neighbors': 5}
Best Score
595.4468825205093 354556.9899033933


# 2. Regresión Lineal

In [74]:
X_train_lr = X_train.copy()
y_train_lr = y_train.copy()
# X_train_lr,y_train_lr

In [78]:
# Primero el preprocesamiento
rf_pipe_num0 = Pipeline([
    ('zeros2NaN',FunctionTransformer(func = replace_0_2_NaN,validate=False)),
    ("fill_nan",SimpleImputer(strategy="mean")),
    ("clean_outlayer",Clean_Outlayers_Quantile()),
    ('log',FunctionTransformer(np.log1p, validate=True)),
    ("std",StandardScaler()),
])

rf_pipe_num1 = Pipeline([
    ("fill_nan",SimpleImputer(strategy="mean")),
    ("clean_outlayer",Clean_Outlayers_Quantile()),
    ("std",StandardScaler()),
])

rf_pipe_poli = Pipeline([
    ('zeros2NaN',FunctionTransformer(func = replace_0_2_NaN,validate=False)),
    ("fill_nan",SimpleImputer(strategy="mean")),
    ('zeros2NaN_2',FunctionTransformer(func = replace_0_2_NaN,validate=False)),# por si acaso hay algun 0
    #("divided",DividedAtributes()),
    ("invert_1/var",invert_variable()),
    ("poly_interact_2",PolynomialFeatures(interaction_only=True,degree=2)),
    ("std",StandardScaler()),
])

rf_cat_pipe = Pipeline([
        ('imputer', SimpleImputer(strategy="constant",fill_value='Unknown')),
        ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore')),
])

#cat2mean = Categorical_2_mean(minim_instances=10)
rf_cat2mean_pipe = Pipeline([
        ('imputer', SimpleImputer(strategy="constant",fill_value='Unknown')),
        ('cat2mean', Categorical_2_mean()),
        ('zeros2NaN',FunctionTransformer(func = replace_0_2_NaN,validate=False)),
        ("std",StandardScaler()),
])

In [82]:
"""
NUMERICAL:    ['Rooms', 'Distance', 'Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea',
'YearBuilt', 'Lattitude', 'Longtitude', 'Propertycount']
CATEGORICAL:  ['Suburb', 'Address', 'Type', 'Method', 'SellerG', 'Date', 'CouncilArea', 'Regionname']
"""
num_log_att = ['Landsize','BuildingArea']
num_att = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom', 'Car',
'YearBuilt', 'Lattitude', 'Longtitude', 'Propertycount']
num_poli_att = ['Rooms','BuildingArea','Bathroom']

cat_att = ['Type',"CouncilArea",'Suburb','Regionname','Postcode']
cat2mean_att = ['Type',"CouncilArea",'Suburb','Regionname','Postcode']

initial_atr = {
    "num_log_att":num_log_att.copy() ,
    "num_att":num_att.copy() ,
    "num_poli_att":num_poli_att.copy() ,
    "cat_att":cat_att.copy() ,
    "cat2mean_att":cat2mean_att.copy() ,
}
"""
# Descomentar esto si no se quiere esperar a que se ejecute toda la feature selection.
initial_atr = {
                'num_log_att': ['Landsize', 'BuildingArea'], 
                'num_att': ['Rooms', 'Distance', 'Bathroom', 'Car', 'YearBuilt', 'Lattitude'], 
                'num_poli_att': ['Rooms', 'BuildingArea'], 'cat_att': ['Type', 'Regionname', 'Postcode'], 
                'cat2mean_att': ['Type', 'CouncilArea', 'Suburb', 'Regionname', 'Postcode']
}

"""
initial_full_pipe = ColumnTransformer([
            ("num0", rf_pipe_num0, initial_atr["num_log_att"]),
            ("num1", rf_pipe_num1, initial_atr["num_att"]),
            ("poli", rf_pipe_poli, initial_atr["num_poli_att"]),
            ("cat",  rf_cat_pipe, initial_atr["cat_att"]),
            ("cat_2_mean",rf_cat2mean_pipe, initial_atr["cat2mean_att"])
])

model = LinearRegression()

start = time.time()

X_train_initial = initial_full_pipe.fit_transform(X_train_lr,y_train_lr)
previous_score_winner = -cross_val_score(model, X_train_initial, y_train_lr,scoring="neg_root_mean_squared_error", cv=5,n_jobs=-1).mean()

actual_list_atts = []    
for k in initial_atr.keys():
    for v in initial_atr[k]:
        actual_list_atts.append((k,v))


def create_att_dic(list_att):
    dic_att={}
    for key in initial_atr.keys():
        dic_att[key]=[]
    for (k,v) in list_att:
        dic_att[k].append(v)
    return dic_att
        

buscando = True
cont=0
while buscando and len(actual_list_atts)!=0:
    
    end = time.time()
    tiempo=end - start
    start = end
    
    dic_=create_att_dic(actual_list_atts)
    print(cont,". Score: ",previous_score_winner)
    print("Tiempo (s) :" ,tiempo )
    print(len(dic_)," Att. ",previous_score_winner, dic_)

    scores_ronda=[]
    for at in actual_list_atts:
        actual_list_without_at = actual_list_atts.copy()
        actual_list_without_at.remove(at)
        #Aqui hemos quitado la feature
        dict_att=create_att_dic(actual_list_without_at)
        #Creamos la nueva pipe sin dicho atributo
        actual_full_pipe = ColumnTransformer([
            ("num0", rf_pipe_num0, dict_att["num_log_att"]),
            ("num1", rf_pipe_num1, dict_att["num_att"]),
            ("poli", rf_pipe_poli, dict_att["num_poli_att"]),
            ("cat",  rf_cat_pipe, dict_att["cat_att"]),
            ("cat_2_mean",rf_cat2mean_pipe, dict_att["cat2mean_att"])
        ])
        #Procesamos los datos con la nueva pipe
        X_train_actual = actual_full_pipe.fit_transform(X_train_lr,y_train_lr)
        #Calculamos con el modelo y guardamos como de bueno es
        actual_score = - cross_val_score(model, X_train_actual, y_train_lr,scoring="neg_root_mean_squared_error", cv=5, n_jobs=-1).mean()
        scores_ronda.append((actual_score,at))
    
    winner = min(scores_ronda, key= lambda x : x[0])
    #Aquí tenemos el mejor score y el atributo que hay que quitar para tenerlo
    score_winner, att_winner = winner
    
    #Si este resultado es mejor que el mejor de la ronda anterior
    if score_winner < previous_score_winner:
        actual_list_atts.remove(att_winner)
        previous_score_winner = score_winner
        
    #Si no es mejor hemos acabado
    else:
        buscando=False
    
final_att = create_att_dic(actual_list_atts)
print("ACABADO: ")
print(final_att)

0 . Score:  365278.50828061753
Tiempo (s) : 0.35442090034484863
5  Att.  365278.50828061753 {'num_log_att': ['Landsize', 'BuildingArea'], 'num_att': ['Rooms', 'Distance', 'Bathroom', 'Car', 'YearBuilt', 'Lattitude'], 'num_poli_att': ['Rooms', 'BuildingArea'], 'cat_att': ['Type', 'Regionname', 'Postcode'], 'cat2mean_att': ['Type', 'CouncilArea', 'Suburb', 'Regionname', 'Postcode']}
ACABADO: 
{'num_log_att': ['Landsize', 'BuildingArea'], 'num_att': ['Rooms', 'Distance', 'Bathroom', 'Car', 'YearBuilt', 'Lattitude'], 'num_poli_att': ['Rooms', 'BuildingArea'], 'cat_att': ['Type', 'Regionname', 'Postcode'], 'cat2mean_att': ['Type', 'CouncilArea', 'Suburb', 'Regionname', 'Postcode']}


In [86]:
atr_final_lr = {
                'num_log_att': ['Landsize', 'BuildingArea'], 
                'num_att': ['Rooms', 'Distance', 'Bathroom', 'Car', 'YearBuilt', 'Lattitude'], 
                'num_poli_att': ['Rooms', 'BuildingArea'], 'cat_att': ['Type', 'Regionname', 'Postcode'], 
                'cat2mean_att': ['Type', 'CouncilArea', 'Suburb', 'Regionname', 'Postcode']
}

final_full_pipe_lr = ColumnTransformer([
            ("num0", rf_pipe_num0, atr_final_lr["num_log_att"]),
            ("num1", rf_pipe_num1, atr_final_lr["num_att"]),
            ("poli", rf_pipe_poli, atr_final_lr["num_poli_att"]),
            ("cat",  rf_cat_pipe, atr_final_lr["cat_att"]),
            ("cat_2_mean",rf_cat2mean_pipe, atr_final_lr["cat2mean_att"])
])

In [88]:

linear_regressor = LinearRegression()
# Preparamos los datos con la pipe final
X_train_prepared = actual_full_pipe.fit_transform(X_train_lr,y_train_lr)
# Ejecutamos el modelo
final_score = cross_val_score(linear_regressor, X_train_prepared, y_train_lr,scoring="neg_root_mean_squared_error", cv=5, n_jobs=-1)
display_scores(-final_score, "Linear Regresor")

---- Linear Regresor ----
Mean: 365329.37419981335
Standard deviation: 37261.00636141085


# 3. Random Forest

In [43]:
X_train_rf = X_train.copy()
y_train_rf = y_train.copy()
model = RandomForestRegressor(random_state=100)

In [44]:
# Esta será la pipeline para el modelo Random Forest

#Para las columnas con muchos ceros sin sentido. A las que además se les aplicará la función logaritmo
num0_pipeline = Pipeline([
        ('zeros2NaN',FunctionTransformer(func = replace_0_2_NaN,validate=False)),
        ('imputer', SimpleImputer(strategy="median")),
        ('log',FunctionTransformer(np.log1p, validate=True)),
        ('std_scaler', StandardScaler()),
    ])
#Para las otras columnas numéricas
num1_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])
#Para las variables categóricas. Principalmente utilizaremos el OneHotEncoder
cat_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="constant",fill_value='Unknown')),
        ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore')),
    ])

"""
Columnas numéricas:  ['Rooms', 'Price', 'Distance', 'Bedroom2', 'Bathroom', 'Car', 
'Landsize', 'BuildingArea', 'YearBuilt', 'Lattitude', 'Longtitude', 'Propertycount']

Columnas categóricas:  ['Suburb', 'Address', 'Type', 'Method', 'SellerG', 'Date', 'CouncilArea', 'Regionname']
"""

#Estas son las columnas seleccionadas por mi para cada pipeline.
num_attribs0 = ['Landsize','BuildingArea']
num_attribs1 = ['Distance', 'Bedroom2', 'Bathroom', 'Car', 
'Landsize', 'BuildingArea', 'YearBuilt', 'Lattitude', 'Longtitude', 'Propertycount']
cat_attribs = ["CouncilArea",'Type','Suburb','Postcode']

#Creamos la "full_pipeline", es decir la pipeline que engloba a todas las otras.
full_pipeline_rf = ColumnTransformer([
        ("num0", num0_pipeline, num_attribs0),
        ("num1", num1_pipeline, num_attribs1),
        ("cat", cat_pipeline, cat_attribs),
])

#Preprocesamos los datos utilizando la "full_pipeline"
X_train_prepared = full_pipeline_rf.fit_transform(X_train_rf,y_train_rf)
print("Prepared: ",X_train_prepared.shape)

Prepared:  (4345, 516)


In [47]:
rf_param_grid = {
                'max_features': range(10,120,50),
                'n_estimators': [82],
                    'max_depth': [None],
                   #'bootstrap': [False],
                #"min_samples_split":[2,3],
                #"min_samples_leaf":[1,2],
}
apply_gridsearch(model=RandomForestRegressor(random_state=100),
                 params=rf_param_grid,
                 X_train_gs=X_train_prepared,
                 y_train_gs=y_train_rf)

Best trained model:
RandomForestRegressor(max_features=110, n_estimators=82, random_state=100)
Best parameters:
{'max_depth': None, 'max_features': 110, 'n_estimators': 82}
Best Score
550.1148792382141 302626.380359275


In [48]:
model = RandomForestRegressor(random_state=100)
rf_param_grid = {
                'max_features': range(10,120,50),
                'n_estimators': [82],
                    'max_depth': [None],
                   #'bootstrap': [False],
                #"min_samples_split":[2,3],
                #"min_samples_leaf":[1,2],
}
rf_grid_search = GridSearchCV(model, param_grid=rf_param_grid, cv=5,
                           scoring='neg_root_mean_squared_error',
                           return_train_score=True,n_jobs=-1)
rf_grid_search.fit(X_train_prepared, y_train)

print("Best trained model:")
print(rf_grid_search.best_estimator_)
print("Best parameters:")
print(rf_grid_search.best_params_)
print("Best Score")
print(np.sqrt(-rf_grid_search.best_score_),-rf_grid_search.best_score_)

Best trained model:
RandomForestRegressor(max_features=110, n_estimators=82, random_state=100)
Best parameters:
{'max_depth': None, 'max_features': 110, 'n_estimators': 82}
Best Score
550.1148792382141 302626.380359275
