In [105]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet, QuantileRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
import joblib  # Para guardar los modelos
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [106]:
df = pd.read_csv(r'D:\Tesis\bases_datos\db_real_estate_cleaned_v3.csv')
vars = ["dias_desde_fecha","superficie_total","attribute_cocheras","attribute_cantidad_de_pisos","attribute_baños","attribute_ambientes","attribute_ambientes"]


scoring = 'neg_mean_squared_error' ## QUE VERGA ES ESTO

In [107]:

def calcula_metricas(y_true, y_pred):
    """
    Calcula diversas métricas de evaluación para un modelo de regresión.
    
    Parámetros:
    - y_true: Valores reales.
    - y_pred: Valores predichos por el modelo.
    
    Retorna:
    - metrics: Diccionario con las métricas calculadas.
    """
    # Calcular métricas estándar
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    
    # Calcular el Error Relativo Promedio
    with np.errstate(divide='ignore', invalid='ignore'):
        error_relativo = (y_pred - y_true) / y_true
        error_relativo = np.where(y_true == 0, np.nan, error_relativo)  # Asignar NaN donde Precio Real es 0
    
    # Calcular el promedio del Error Relativo, ignorando NaN
    error_relativo_promedio = np.nanmean(error_relativo)
    
    # Agregar todas las métricas a un diccionario
    metrics = {
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R2': r2,
        'Error Relativo Promedio': error_relativo_promedio
    }
    
    return metrics

In [108]:
def guardar_resultados(model_name, params, metrics, iteration, output_dir):
    # Crear directorio si no existe
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    # Guardar parámetros y métricas en un archivo CSV
    df_resultados = pd.DataFrame([{'Modelo': model_name, 'Iteración': iteration, **params, **metrics}])
    resultados_path = os.path.join(output_dir, f'resultados_{model_name}.csv')
    if not os.path.exists(resultados_path):
        df_resultados.to_csv(resultados_path, index=False)
    else:
        df_resultados.to_csv(resultados_path, mode='a', header=False, index=False)
    
    # Guardar el modelo
    model_path = os.path.join(output_dir, f'{model_name}_iter_{iteration}.joblib')
    joblib.dump(params['model_instance'], model_path)


In [109]:
def obtener_conjunto_1(df, target):
    conjunto_1 = df.columns.tolist()
    conjunto_1.remove(target)  # Asumiendo que 'price' es la variable objetivo
    return conjunto_1


## QUE MIERDA ES ESTO ?

In [110]:
def obtener_conjuntos_nulos(df, target, thresholds=[0.05, 0.10]):
    conjuntos_nulos = {}
    for thresh in thresholds:
        missing = df.isnull().mean()
        vars_incluidas = missing[missing < thresh].index.tolist()
        if target in vars_incluidas:
            vars_incluidas.remove(target)
        conjuntos_nulos[f'conjunto_nulos_{int(thresh*100)}'] = vars_incluidas
    return conjuntos_nulos


In [111]:
def obtener_conjunto_3():
    # Reemplaza esta lista con las variables relevantes según la literatura
    conjunto_3 = ['superficie_total', 'attribute_baños', 'attribute_ambientes', 'location_state_name']
    return conjunto_3


In [112]:
def obtener_conjunto_4(df, target):
    corr_matrix = df.corr()
    top_vars = corr_matrix[target].abs().sort_values(ascending=False).index[1:6].tolist()
    return top_vars


In [113]:
def preprocesar_datos(X):
    # Identificar variables numéricas y categóricas
    num_vars = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    cat_vars = X.select_dtypes(include=['object', 'category']).columns.tolist()
    # Definir transformaciones
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, num_vars),
            ('cat', categorical_transformer, cat_vars)
        ])
    # Aplicar transformaciones
    X_processed = preprocessor.fit_transform(X)
    return X_processed, preprocessor

In [114]:
def modelo_lasso(X_train, y_train, X_test, y_test, params, iteration, output_dir):
    model = Lasso(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    metrics = calcula_metricas(y_test, y_pred)
    # Guardar resultados
    guardar_resultados('Lasso', {'alpha': params.get('alpha'), 'model_instance': model}, metrics, iteration, output_dir)
    return model, metrics

def correr_lasso(df, vars, target, output_dir):
    X = df[vars]
    y = df[target]
    # Manejo de nulos
    X = X.dropna()
    y = y.loc[X.index]
    # División de datos
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    # Grilla de hiperparámetros
    param_distributions = {'alpha': np.logspace(-4, 0, 50)}
    lasso = Lasso()
    random_search = RandomizedSearchCV(lasso, param_distributions, n_iter=10, scoring='neg_mean_squared_error', cv=3, random_state=42)
    random_search.fit(X_train, y_train)
    # Iterar sobre los resultados y guardar
    for i, params in enumerate(random_search.cv_results_['params']):
        model_params = {'alpha': params['alpha']}
        model, metrics = modelo_lasso(X_train, y_train, X_test, y_test, model_params, i, output_dir)
    # Guardar el mejor modelo
    best_model = random_search.best_estimator_
    best_params = random_search.best_params_
    metrics = calcula_metricas(y_test, best_model.predict(X_test))
    guardar_resultados('Lasso_Best', {'alpha': best_params['alpha'], 'model_instance': best_model}, metrics, 'best', output_dir)
    return best_model, metrics


In [115]:
def modelo_random_forest(X_train, y_train, X_test, y_test, params, iteration, output_dir):
    model = RandomForestRegressor(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    metrics = calcula_metricas(y_test, y_pred)
    # Guardar resultados
    guardar_resultados('RandomForest', {**params, 'model_instance': model}, metrics, iteration, output_dir)
    return model, metrics

def correr_random_forest(df, vars, target, output_dir):
    X = df[vars]
    y = df[target]
    X = X.dropna()
    y = y.loc[X.index]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    param_distributions = {
        'n_estimators': [int(x) for x in np.linspace(100, 1000, 10)],
        'max_depth': [int(x) for x in np.linspace(10, 100, 10)],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
    rf = RandomForestRegressor()
    random_search = RandomizedSearchCV(rf, param_distributions, n_iter=10, scoring='neg_mean_squared_error', cv=3, random_state=42)
    random_search.fit(X_train, y_train)
    for i, params in enumerate(random_search.cv_results_['params']):
        model, metrics = modelo_random_forest(X_train, y_train, X_test, y_test, params, i, output_dir)
    best_model = random_search.best_estimator_
    best_params = random_search.best_params_
    metrics = calcula_metricas(y_test, best_model.predict(X_test))
    guardar_resultados('RandomForest_Best', {**best_params, 'model_instance': best_model}, metrics, 'best', output_dir)
    return best_model, metrics


In [116]:
def modelo_regresion_lineal(X_train, y_train, X_test, y_test, iteration, output_dir):
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    metrics = calcula_metricas(y_test, y_pred)
    guardar_resultados('LinearRegression', {'model_instance': model}, metrics, iteration, output_dir)
    return model, metrics

def correr_regresion_lineal(df, vars, target, output_dir):
    X = df[vars]
    y = df[target]
    X = X.dropna()
    y = y.loc[X.index]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    X_train_processed, preprocessor = preprocesar_datos(X_train)
    X_test_processed = preprocessor.transform(X_test)
    model, metrics = modelo_regresion_lineal(X_train_processed, y_train, X_test_processed, y_test, 0, output_dir)
    return model, metrics




In [117]:
def modelo_decision_tree(X_train, y_train, X_test, y_test, params, iteration, output_dir):
    model = DecisionTreeRegressor(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    metrics = calcula_metricas(y_test, y_pred)
    # Guardar resultados
    guardar_resultados('DecisionTree', {**params, 'model_instance': model}, metrics, iteration, output_dir)
    return model, metrics

def correr_decision_tree(df, vars, target, output_dir):
    X = df[vars]
    y = df[target]
    X = X.dropna()
    y = y.loc[X.index]
    # Preprocesamiento
    X_processed, preprocessor = preprocesar_datos(X)
    # División de datos
    X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2)
    # Grilla de hiperparámetros
    param_distributions = {
        'max_depth': [int(x) for x in np.linspace(5, 50, 10)],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['auto', 'sqrt', 'log2', None]
    }
    dt = DecisionTreeRegressor()
    random_search = RandomizedSearchCV(dt, param_distributions, n_iter=10, scoring='neg_mean_squared_error', cv=3, random_state=42)
    random_search.fit(X_train, y_train)
    # Iterar sobre los resultados y guardar
    for i, params in enumerate(random_search.cv_results_['params']):
        model, metrics = modelo_decision_tree(X_train, y_train, X_test, y_test, params, i, output_dir)
    # Guardar el mejor modelo
    best_model = random_search.best_estimator_
    best_params = random_search.best_params_
    metrics = calcula_metricas(y_test, best_model.predict(X_test))
    guardar_resultados('DecisionTree_Best', {**best_params, 'model_instance': best_model}, metrics, 'best', output_dir)
    return best_model, metrics




In [118]:
def modelo_bagging(X_train, y_train, X_test, y_test, params, iteration, output_dir):
    model = BaggingRegressor(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    metrics = calcula_metricas(y_test, y_pred)
    # Guardar resultados
    guardar_resultados('Bagging', {**params, 'model_instance': model}, metrics, iteration, output_dir)
    return model, metrics

def correr_bagging(df, vars, target, output_dir):
    X = df[vars]
    y = df[target]
    X = X.dropna()
    y = y.loc[X.index]
    # Preprocesamiento
    X_processed, preprocessor = preprocesar_datos(X)
    # División de datos
    X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2)
    # Grilla de hiperparámetros
    param_distributions = {
        'n_estimators': [10, 50, 100, 200],
        'max_samples': [0.5, 0.7, 1.0],
        'max_features': [0.5, 0.7, 1.0],
        'bootstrap': [True, False],
        'bootstrap_features': [True, False]
    }
    bagging = BaggingRegressor()
    random_search = RandomizedSearchCV(bagging, param_distributions, n_iter=10, scoring='neg_mean_squared_error', cv=3, random_state=42)
    random_search.fit(X_train, y_train)
    # Iterar sobre los resultados y guardar
    for i, params in enumerate(random_search.cv_results_['params']):
        model, metrics = modelo_bagging(X_train, y_train, X_test, y_test, params, i, output_dir)
    # Guardar el mejor modelo
    best_model = random_search.best_estimator_
    best_params = random_search.best_params_
    metrics = calcula_metricas(y_test, best_model.predict(X_test))
    guardar_resultados('Bagging_Best', {**best_params, 'model_instance': best_model}, metrics, 'best', output_dir)
    return best_model, metrics



In [119]:
def modelo_xgboost(X_train, y_train, X_test, y_test, params, iteration, output_dir):
    model = XGBRegressor(**params, objective='reg:squarederror')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    metrics = calcula_metricas(y_test, y_pred)
    # Guardar resultados
    guardar_resultados('XGBoost', {**params, 'model_instance': model}, metrics, iteration, output_dir)
    return model, metrics

def correr_xgboost(df, vars, target, output_dir):
    X = df[vars]
    y = df[target]
    X = X.dropna()
    y = y.loc[X.index]
    # Preprocesamiento
    X_processed, preprocessor = preprocesar_datos(X)
    # División de datos
    X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2)
    # Grilla de hiperparámetros
    param_distributions = {
        'n_estimators': [100, 200, 500],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.05, 0.1],
        'subsample': [0.7, 0.8, 1.0],
        'colsample_bytree': [0.7, 0.8, 1.0]
    }
    xgb = XGBRegressor(objective='reg:squarederror')
    random_search = RandomizedSearchCV(xgb, param_distributions, n_iter=10, scoring='neg_mean_squared_error', cv=3, random_state=42)
    random_search.fit(X_train, y_train)
    # Iterar sobre los resultados y guardar
    for i, params in enumerate(random_search.cv_results_['params']):
        model, metrics = modelo_xgboost(X_train, y_train, X_test, y_test, params, i, output_dir)
    # Guardar el mejor modelo
    best_model = random_search.best_estimator_
    best_params = random_search.best_params_
    metrics = calcula_metricas(y_test, best_model.predict(X_test))
    guardar_resultados('XGBoost_Best', {**best_params, 'model_instance': best_model}, metrics, 'best', output_dir)
    return best_model, metrics



In [120]:
def modelo_lightgbm(X_train, y_train, X_test, y_test, params, iteration, output_dir):
    model = LGBMRegressor(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    metrics = calcula_metricas(y_test, y_pred)
    # Guardar resultados
    guardar_resultados('LightGBM', {**params, 'model_instance': model}, metrics, iteration, output_dir)
    return model, metrics

def correr_lightgbm(df, vars, target, output_dir):
    X = df[vars]
    y = df[target]
    X = X.dropna()
    y = y.loc[X.index]
    # Preprocesamiento
    X_processed, preprocessor = preprocesar_datos(X)
    # División de datos
    X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2)
    # Grilla de hiperparámetros
    param_distributions = {
        'n_estimators': [100, 200, 500],
        'max_depth': [-1, 5, 10],
        'learning_rate': [0.01, 0.05, 0.1],
        'num_leaves': [31, 50, 100],
        'subsample': [0.7, 0.8, 1.0]
    }
    lgbm = LGBMRegressor()
    random_search = RandomizedSearchCV(lgbm, param_distributions, n_iter=10, scoring='neg_mean_squared_error', cv=3, random_state=42)
    random_search.fit(X_train, y_train)
    # Iterar sobre los resultados y guardar
    for i, params in enumerate(random_search.cv_results_['params']):
        model, metrics = modelo_lightgbm(X_train, y_train, X_test, y_test, params, i, output_dir)
    # Guardar el mejor modelo
    best_model = random_search.best_estimator_
    best_params = random_search.best_params_
    metrics = calcula_metricas(y_test, best_model.predict(X_test))
    guardar_resultados('LightGBM_Best', {**best_params, 'model_instance': best_model}, metrics, 'best', output_dir)
    return best_model, metrics




In [121]:
def modelo_mlp(X_train, y_train, X_test, y_test, params, iteration, output_dir):
    model = MLPRegressor(**params, max_iter=1000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    metrics = calcula_metricas(y_test, y_pred)
    # Guardar resultados
    guardar_resultados('MLPRegressor', {**params, 'model_instance': model}, metrics, iteration, output_dir)
    return model, metrics
def correr_mlp(df, vars, target, output_dir):
    X = df[vars]
    y = df[target]
    X = X.dropna()
    y = y.loc[X.index]
    # Preprocesamiento
    X_processed, preprocessor = preprocesar_datos(X)
    # División de datos
    X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2)
    # Grilla de hiperparámetros
    param_distributions = {
        'hidden_layer_sizes': [(50,), (100,), (50,50)],
        'activation': ['relu', 'tanh'],
        'solver': ['adam', 'sgd'],
        'alpha': [0.0001, 0.001, 0.01],
        'learning_rate': ['constant', 'adaptive']
    }
    mlp = MLPRegressor(max_iter=1000)
    random_search = RandomizedSearchCV(mlp, param_distributions, n_iter=10, scoring='neg_mean_squared_error', cv=3, random_state=42)
    random_search.fit(X_train, y_train)
    # Iterar sobre los resultados y guardar
    for i, params in enumerate(random_search.cv_results_['params']):
        model, metrics = modelo_mlp(X_train, y_train, X_test, y_test, params, i, output_dir)
    # Guardar el mejor modelo
    best_model = random_search.best_estimator_
    best_params = random_search.best_params_
    metrics = calcula_metricas(y_test, best_model.predict(X_test))
    guardar_resultados('MLPRegressor_Best', {**best_params, 'model_instance': best_model}, metrics, 'best', output_dir)
    return best_model, metrics



In [122]:
def modelo_knn(X_train, y_train, X_test, y_test, params, iteration, output_dir):
    model = KNeighborsRegressor(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    metrics = calcula_metricas(y_test, y_pred)
    # Guardar resultados
    guardar_resultados('KNN', {**params, 'model_instance': model}, metrics, iteration, output_dir)
    return model, metrics

def correr_knn(df, vars, target, output_dir):
    X = df[vars]
    y = df[target]
    X = X.dropna()
    y = y.loc[X.index]
    # Preprocesamiento
    X_processed, preprocessor = preprocesar_datos(X)
    # División de datos
    X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2)
    # Grilla de hiperparámetros
    param_distributions = {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'p': [1, 2]  # Distancia Manhattan (p=1) y Euclídea (p=2)
    }
    knn = KNeighborsRegressor()
    random_search = RandomizedSearchCV(knn, param_distributions, n_iter=10, scoring='neg_mean_squared_error', cv=3, random_state=42)
    random_search.fit(X_train, y_train)
    # Iterar sobre los resultados y guardar
    for i, params in enumerate(random_search.cv_results_['params']):
        model, metrics = modelo_knn(X_train, y_train, X_test, y_test, params, i, output_dir)
    # Guardar el mejor modelo
    best_model = random_search.best_estimator_
    best_params = random_search.best_params_
    metrics = calcula_metricas(y_test, best_model.predict(X_test))
    guardar_resultados('KNN_Best', {**best_params, 'model_instance': best_model}, metrics, 'best', output_dir)
    return best_model, metrics



In [123]:
def modelo_xgboost_quantile(X_train, y_train, X_test, y_test, params, iteration, output_dir, quantile):
    # Definir la función de pérdida cuantil
    def quantile_loss(y_true, y_pred):
        errors = y_true - y_pred
        return np.mean(np.maximum(quantile * errors, (quantile - 1) * errors))
    
    model = XGBRegressor(**params, objective='reg:squarederror')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    metrics = {'QuantileLoss': quantile_loss(y_test, y_pred)}
    # Guardar resultados
    guardar_resultados('XGBoostQuantile', {**params, 'model_instance': model, 'quantile': quantile}, metrics, iteration, output_dir)
    return model, metrics
def correr_xgboost_quantile(df, vars, target, output_dir, quantile=0.5):
    X = df[vars]
    y = df[target]
    X = X.dropna()
    y = y.loc[X.index]
    # Preprocesamiento
    X_processed, preprocessor = preprocesar_datos(X)
    # División de datos
    X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2)
    # Grilla de hiperparámetros
    param_distributions = {
        'n_estimators': [100, 200],
        'max_depth': [3, 5],
        'learning_rate': [0.01, 0.1],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }
    xgb = XGBRegressor(objective='reg:squarederror')
    random_search = RandomizedSearchCV(xgb, param_distributions, n_iter=5, scoring='neg_mean_squared_error', cv=3, random_state=42)
    random_search.fit(X_train, y_train)
    # Iterar sobre los resultados y guardar
    for i, params in enumerate(random_search.cv_results_['params']):
        model, metrics = modelo_xgboost_quantile(X_train, y_train, X_test, y_test, params, i, output_dir, quantile)
    # Guardar el mejor modelo
    best_model = random_search.best_estimator_
    metrics = {'QuantileLoss': None}  # Puedes calcular la pérdida cuantil para el mejor modelo si lo deseas
    guardar_resultados('XGBoostQuantile_Best', {**best_model.get_params(), 'model_instance': best_model, 'quantile': quantile}, metrics, 'best', output_dir)
    return best_model, metrics






In [124]:
def modelo_lasso_quantile(X_train, y_train, X_test, y_test, params, iteration, output_dir, quantile):
    model = QuantileRegressor(quantile=quantile, **params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    metrics = calcula_metricas(y_test, y_pred)
    # Guardar resultados
    guardar_resultados('LassoQuantile', {**params, 'model_instance': model, 'quantile': quantile}, metrics, iteration, output_dir)
    return model, metrics


In [125]:
def correr_lasso_quantile(df, vars, target, output_dir, quantile=0.5):
    X = df[vars]
    y = df[target]
    X = X.dropna()
    y = y.loc[X.index]
    # Preprocesamiento
    X_processed, preprocessor = preprocesar_datos(X)
    # División de datos
    X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2)
    # Grilla de hiperparámetros
    param_distributions = {
        'alpha': np.logspace(-4, 0, 10),
        'solver': ['highs']
    }
    quantile_reg = QuantileRegressor(quantile=quantile)
    random_search = RandomizedSearchCV(quantile_reg, param_distributions, n_iter=5, scoring='neg_mean_squared_error', cv=3, random_state=42)
    random_search.fit(X_train, y_train)
    # Iterar sobre los resultados y guardar
    for i, params in enumerate(random_search.cv_results_['params']):
        model, metrics = modelo_lasso_quantile(X_train, y_train, X_test, y_test, params, i, output_dir, quantile)
    # Guardar el mejor modelo
    best_model = random_search.best_estimator_
    metrics = calcula_metricas(y_test, best_model.predict(X_test))
    guardar_resultados('LassoQuantile_Best', {**best_model.get_params(), 'model_instance': best_model, 'quantile': quantile}, metrics, 'best', output_dir)
    return best_model, metrics




In [126]:
def modelo_lasso(X_train, y_train, X_test, y_test, params, iteration, output_dir):
    model = Lasso(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    metrics = calcula_metricas(y_test, y_pred)
    # Guardar resultados
    guardar_resultados('Lasso', {'alpha': params.get('alpha'), 'model_instance': model}, metrics, iteration, output_dir)
    return model, metrics


In [127]:
def correr_random_forest(df, vars, target, output_dir):
    X = df[vars]
    y = df[target]
    X = X.dropna()
    y = y.loc[X.index]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    param_distributions = {
        'n_estimators': [int(x) for x in np.linspace(100, 1000, 10)],
        'max_depth': [int(x) for x in np.linspace(10, 100, 10)],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
    rf = RandomForestRegressor()
    random_search = RandomizedSearchCV(rf, param_distributions, n_iter=10, scoring='neg_mean_squared_error', cv=3, random_state=42)
    random_search.fit(X_train, y_train)
    for i, params in enumerate(random_search.cv_results_['params']):
        model, metrics = modelo_random_forest(X_train, y_train, X_test, y_test, params, i, output_dir)
    best_model = random_search.best_estimator_
    best_params = random_search.best_params_
    metrics = calcula_metricas(y_test, best_model.predict(X_test))
    guardar_resultados('RandomForest_Best', {**best_params, 'model_instance': best_model}, metrics, 'best', output_dir)
    return best_model, metrics




In [128]:
## FALTA LOCATION NORTE
vars = ["dias_desde_fecha","superficie_total","attribute_baños","attribute_ambientes",
'location_state_name_Capital Federal',
'location_state_name_Bs.As. G.B.A. Oeste',
 'location_state_name_Bs.As. G.B.A. Sur',
 'attribute_tipo_de_casa_Casa',
 'attribute_property_type_Ph',
 'attribute_property_type_Departamento']
 
target = "ln_precio_USD"

In [129]:
correr_lasso(df, vars, target, output_dir)

(Lasso(alpha=0.0011513953993264468),
 {'MSE': 0.2412468168649638,
  'RMSE': 0.4911688272528742,
  'MAE': 0.3557679381164288,
  'R2': 0.46900223967567123,
  'Error Relativo Promedio': 0.0019623335852682463})

In [130]:
correr_decision_tree(df, vars, target, output_dir)


(DecisionTreeRegressor(max_depth=10, max_features='auto', min_samples_leaf=4),
 {'MSE': 0.22424724725523532,
  'RMSE': 0.4735475131971821,
  'MAE': 0.34545473918003533,
  'R2': 0.5070754008699825,
  'Error Relativo Promedio': 0.0018114465220828127})

In [131]:
correr_bagging(df, vars, target, output_dir)

(BaggingRegressor(bootstrap=False, bootstrap_features=True, max_samples=0.5,
                  n_estimators=50),
 {'MSE': 0.23200578123250964,
  'RMSE': 0.4816697844296543,
  'MAE': 0.3484911015531434,
  'R2': 0.5052175117284357,
  'Error Relativo Promedio': 0.0018796790288516411})

In [132]:
correr_xgboost(df, vars, target, output_dir)

(XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.8, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=5, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=500, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...),
 {'MSE': 0.20968961519951182,
  'RMSE': 0.4579187866854906,
  'MAE': 0.3368795185609722,
  'R2': 0.5357927179997601,
  'Error Relativo Promedio': 0.00137756438255987})

In [133]:
correr_random_forest(df, vars, target, output_dir)

(RandomForestRegressor(max_depth=10, min_samples_leaf=4, min_samples_split=5),
 {'MSE': 0.1688420476728368,
  'RMSE': 0.41090393971442624,
  'MAE': 0.2883517057128061,
  'R2': 0.6212007752950978,
  'Error Relativo Promedio': 0.0015793306579063777})

In [134]:
correr_regresion_lineal(df, vars, target, output_dir)

(LinearRegression(),
 {'MSE': 0.2767449110478565,
  'RMSE': 0.5260655007200686,
  'MAE': 0.3931994659707707,
  'R2': 0.3894320143356691,
  'Error Relativo Promedio': 0.002118090961577511})

In [135]:
correr_lightgbm(df, vars, target, output_dir)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001492 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 520
[LightGBM] [Info] Number of data points in the train set: 40170, number of used features: 4
[LightGBM] [Info] Start training from score 11.781989
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000145 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 520
[LightGBM] [Info] Number of data points in the train set: 40171, number of used features: 4
[LightGBM] [Info] Start training from score 11.786142
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002008 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins

(LGBMRegressor(learning_rate=0.05, max_depth=10, n_estimators=200,
               num_leaves=100),
 {'MSE': 0.2000551587167106,
  'RMSE': 0.44727526056860173,
  'MAE': 0.3328257551543416,
  'R2': 0.5414767989689134,
  'Error Relativo Promedio': 0.0010673049116302715})

In [136]:

correr_mlp(df, vars, target, output_dir)

(MLPRegressor(alpha=0.001, learning_rate='adaptive', max_iter=1000),
 {'MSE': 0.2772297368426119,
  'RMSE': 0.526526102717246,
  'MAE': 0.39026582920220165,
  'R2': 0.3971283369894928,
  'Error Relativo Promedio': 0.005312432114886809})

In [137]:

correr_knn(df, vars, target, output_dir)


(KNeighborsRegressor(n_neighbors=9),
 {'MSE': 0.25328899613016304,
  'RMSE': 0.5032782492122653,
  'MAE': 0.35808809897489974,
  'R2': 0.45930787219948654,
  'Error Relativo Promedio': 0.001348921023848349})

In [138]:
correr_xgboost_quantile(df, vars, target, output_dir, quantile=0.5)

(XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.8, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=5, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=200, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...),
 {'QuantileLoss': None})

In [139]:

correr_lasso_quantile(df, vars, target, output_dir, quantile=0.5)

15 fits failed out of a total of 15.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\lucki\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\lucki\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_quantile.py", line 237, in fit
    A_eq = np.concatenate(
  File "<__array_function__ internals>", line 200, in concatenate
numpy.core._exceptions._ArrayMemoryError: Unable to allocate 24.0 GiB for an array with shape (40170, 80350) and data type float64

----------------------------

MemoryError: Unable to allocate 27.1 GiB for an array with shape (60256, 60256) and data type float64

In [73]:

correr_random_forest(df, vars, target, output_dir)

KeyboardInterrupt: 