## MODELADO PIPELINE

Comparación de __LinearRegression__, __RandomForest__ y __XGBoost__

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, root_mean_squared_error

In [None]:


def _make_pipeline(regressor):
    """Return a two-step pipeline: shared preprocessing, then ``regressor``.

    The step names ('Premodelado', 'ModeloRegressor') are the prefixes used
    by the hyper-parameter grids, so they must stay exactly as written.
    """
    return Pipeline(steps=[
        ('Premodelado', preprocessing),
        ('ModeloRegressor', regressor),
    ])


# One pipeline per candidate model; the tree-based models use a fixed seed.
lr_pipeline = _make_pipeline(LinearRegression())
rf_pipeline = _make_pipeline(RandomForestRegressor(random_state=42))
xgb_pipeline = _make_pipeline(xgb.XGBRegressor(random_state=42))


# Baseline comparison: 5-fold cross-validation of every untuned pipeline.
# The scorer is the negated RMSE, so values closer to zero are better.
# NOTE(review): `train` is not defined in this section - presumably the
# training feature matrix that matches `y_train`; confirm.
baseline_pipelines = {
    "lr_pipeline": lr_pipeline,
    "rf_pipeline": rf_pipeline,
    "xgb_pipeline": xgb_pipeline,
}
for name, pipe in baseline_pipelines.items():
    fold_scores = cross_val_score(
        pipe, train, y_train, cv=5, scoring="neg_root_mean_squared_error"
    )
    # Mean score first, then the individual per-fold scores.
    print(f"{name}: {fold_scores.mean():.4f}")
    print(fold_scores)


    

In [None]:
# Number of cross-validation folds shared by every grid search in this cell.
cv = 5

# Linear Regression search space.
# `normalize` is intentionally not searched: it is not a valid
# LinearRegression parameter in scikit-learn >= 1.2, so GridSearchCV would
# reject it at fit time. Any scaling belongs in the 'Premodelado' step.
lr_params = {
    'ModeloRegressor__fit_intercept': [True, False],  # with / without intercept
}

# The scorer name must be 'neg_root_mean_squared_error' ("squared");
# scikit-learn raises on unknown scorer names.
lr_grid = GridSearchCV(
    lr_pipeline,
    lr_params,
    cv=cv,
    scoring='neg_root_mean_squared_error',
    verbose=1,
    n_jobs=-1,
)

# Random Forest search space (3 * 4 * 2 = 24 candidates).
rf_params = {
    'ModeloRegressor__n_estimators': [50, 100, 200],
    'ModeloRegressor__max_depth': [None, 10, 20, 50],
    'ModeloRegressor__min_samples_split': [2, 5],
    # 'ModeloRegressor__bootstrap': [True, False]  # optional: also try without bootstrap
}

# The scorer name must be 'neg_root_mean_squared_error' ("squared");
# scikit-learn raises on unknown scorer names.
rf_grid = GridSearchCV(
    rf_pipeline,
    rf_params,
    cv=cv,
    scoring='neg_root_mean_squared_error',
    verbose=1,
    n_jobs=-1,
)



# XGBoost search space (3 * 2 * 3 * 2 = 36 candidates).
xgb_params = {
    'ModeloRegressor__n_estimators': [50, 100, 200],
    'ModeloRegressor__learning_rate': [0.05, 0.1],
    'ModeloRegressor__max_depth': [3, 5, 10],
    'ModeloRegressor__subsample': [0.8, 1.0],
}

# The scorer name must be 'neg_root_mean_squared_error' ("squared");
# scikit-learn raises on unknown scorer names.
xgb_grid = GridSearchCV(
    xgb_pipeline,
    xgb_params,
    cv=cv,
    scoring='neg_root_mean_squared_error',
    verbose=1,
    n_jobs=-1,
)


# Registry of the grid searches, keyed by the label used when they are
# compared and selected later on.
pipe_grids = dict(
    lr=lr_grid,
    gs_rand_forest=rf_grid,
    gs_xgb=xgb_grid,
)


In [None]:
# Hold out 20% of the samples as a test set; the seed is fixed so the split
# is reproducible.
holdout_split = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = holdout_split


In [None]:
#entrenamos modelo
%%time

for nombre, grid_search in pipe_grids.items():
    grid_search.fit(train, y_train)

In [None]:
# Model-comparison table: one row per grid search with its best
# cross-validated score, sorted so the highest score comes first.
score_table = pd.DataFrame(
    {
        "Grid": list(pipe_grids),
        "Best score": [search.best_score_ for search in pipe_grids.values()],
    }
)
best_grids = score_table.sort_values(by="Best score", ascending=False)
best_grids

In [None]:
# Best model: look up the grid search named in the first row of the sorted
# comparison table.
best_name = best_grids["Grid"].iloc[0]
best_model = pipe_grids[best_name]
best_model

In [None]:
# Evaluate on the test set: predict targets for the held-out features
# using the selected grid search.
y_pred = best_model.predict(X_test)
    

In [None]:
# Compute test-set metrics for the selected model.
# root_mean_squared_error requires scikit-learn >= 1.4; on older versions use
# np.sqrt(mean_squared_error(y_test, y_pred)) instead.
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report the values: bare assignments display nothing in a notebook cell.
print(f"RMSE: {rmse:.4f}")
print(f"R2: {r2:.4f}")

In [None]:
# Save the model
# joblib is imported here because it is not among the imports at the top of
# this section; without it the dump call fails with a NameError.
import joblib

# Persist the selected grid search (preprocessing + tuned regressor) to disk.
joblib.dump(best_model, 'modelo_pipeline.joblib')