## Configuración del Notebook

In [None]:
# Uncomment and run once if the xgboost package is not available in the kernel's environment
#!pip install xgboost

In [9]:
# Importar librerias necesarias
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score

## Lectura de datos y separación de datasets de entrenamiento y validación

In [7]:
# Load the auto-mpg dataset; '?' entries in the CSV are parsed as missing values (NaN)
df = pd.read_csv("https://raw.githubusercontent.com/mlondono-oc/LEA2_20232/main/Modulo2/data/auto-mpg.csv", na_values = '?')

# Work on a real copy so the raw frame `df` stays untouched and this cell can be re-run safely
# (a plain `df1 = df` only binds a second name to the same object)
df1 = df.copy()
df1['country_code'] = df1.origin.replace({1: 'USA', 2: 'Europe', 3: 'Japan'})
df1 = df1.rename(columns = {'mpg' : 'target', "model year": "model_year"})
df1 = df1.drop(columns = ["car name", "origin"])

# cylinders and model_year are handled as categorical (string) features; weight as float
df1 = df1.astype({"cylinders": str, "model_year": str, "weight": float})

# Split features and target (X, y)
y = df1.target
X = df1.drop(columns = ["target"])

# Hold out 10% of the samples for validation, keep 90% for training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

print("Tamaño del conjunto de entrenamiento:", X_train.shape)
print("Tamaño del conjunto de validación:",  X_test.shape)

# Column lists consumed by the ColumnTransformer in the modelling cells.
# Selecting "number" / "not number" guarantees every feature lands in exactly one list,
# whatever concrete dtype pandas assigns (float64, int64, object, string, ...).
numeric_columns = list(X.select_dtypes(include='number').columns)
categorical_columns = list(X.select_dtypes(exclude='number').columns)

Tamaño del conjunto de entrenamiento: (358, 7)
Tamaño del conjunto de validación: (40, 7)


## Modelos Base

### Linear Regression

In [None]:
# Numeric features: impute missing values with the median, then standardize
numeric_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="median")),
                                      ("scaler", StandardScaler())])

# Preprocessing: numeric pipeline + one-hot encoding of the categorical features.
# handle_unknown="ignore" encodes a category that was not present when fitting as all zeros
# instead of raising, which would otherwise break a CV fold whose validation part holds a
# rare category absent from its training part.
transformer = ColumnTransformer([('num', numeric_transformer, numeric_columns),
                                 ('cat', OneHotEncoder(handle_unknown="ignore"), categorical_columns)])

# Training pipeline: preprocessing followed by an ordinary least-squares linear regression
pipeline = Pipeline([('transformer', transformer),
                     ('Linear Regression Model', LinearRegression())])

# NOTE: the variable keeps its historical name `scores_ridge`, but the model is
# LinearRegression (no regularization), not Ridge.
# 5-fold cross-validated negative MSE on the training set (closer to 0 is better)
scores_ridge = cross_val_score(pipeline, X_train, y_train, cv = 5, scoring='neg_mean_squared_error')
print('neg_mean_squared_error: %.3f' % (scores_ridge.mean()))

In [None]:
# 5-fold cross-validated R² of the current `pipeline` on the training set
scores_ridge = cross_val_score(estimator=pipeline,
                               X=X_train,
                               y=y_train,
                               scoring='r2',
                               cv=5)
mean_r2 = scores_ridge.mean()
print('Mean r2: %.3f' % mean_r2)

### Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Numeric features: median imputation only (tree ensembles do not need scaling)
numeric_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])

# Preprocessing: numeric pipeline + one-hot encoding of the categorical features.
# handle_unknown="ignore" keeps a CV fold from failing when its validation part
# contains a category that its training part never saw.
transformer = ColumnTransformer([('num', numeric_transformer, numeric_columns),
                                 ('cat', OneHotEncoder(handle_unknown="ignore"), categorical_columns)])


# Model definition
ranfor = RandomForestRegressor(n_estimators = 200,
                               criterion    = 'squared_error',
                               n_jobs       = -1,
                               random_state = 42)

# Training pipeline: preprocessing followed by the random forest
pipeline = Pipeline([('transformer', transformer),
                     ('Random Forest Model', ranfor)])

# 5-fold cross-validated R² on the training set
scores_ranfor = cross_val_score(pipeline, X_train, y_train, cv = 5, scoring='r2')
print('Mean R2: %.3f' % (scores_ranfor.mean()))

In [None]:
# Display the hyperparameters the `ranfor` estimator is currently configured with
ranfor.get_params()

### XGB Regressor

In [None]:
from xgboost import XGBRegressor

# Numeric features: median imputation only (no scaling for the boosted trees)
numeric_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])

# Preprocessing: numeric pipeline + one-hot encoding of the categorical features.
# handle_unknown="ignore" keeps a CV fold from failing when its validation part
# contains a category that its training part never saw.
transformer = ColumnTransformer([('num', numeric_transformer, numeric_columns),
                                 ('cat', OneHotEncoder(handle_unknown="ignore"), categorical_columns)])

# Model definition
XGBmodel = XGBRegressor(n_estimators=500,
                        eta=0.1,
                        colsample_bytree=0.9,
                        random_state=42
                        )

# Training pipeline: preprocessing followed by the XGBoost regressor
pipeline = Pipeline([('transformer', transformer),
                     ('XGBoost', XGBmodel)])

# 5-fold cross-validated R² on the training set
scores_xgb = cross_val_score(pipeline, X_train, y_train, cv = 5, scoring='r2')
print('Mean R2: %.3f' % (scores_xgb.mean()))

## Optimización de Hiperparámetros: Random Forest Regressor

In [17]:
# Hyperparameter grid for RandomForestRegressor.
# max_features values are fractions of the available features; the last entry must be the
# float 1.0 (all features) because scikit-learn interprets the integer 1 as
# "consider exactly one feature per split".
parameters = {'max_depth': [3,5,7,9,12],
              'max_features': [0.8,0.9,1.0],
              'max_leaf_nodes': [3,6,9,10],
              'min_samples_leaf': [5,7,10,12],
              'n_estimators': [200,500,800]}

### Búsqueda en cuadrícula

In [18]:
from sklearn.model_selection import GridSearchCV

# Numeric features: median imputation only
numeric_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])

# Preprocessing: numeric pipeline + one-hot encoding of the categorical features.
# handle_unknown="ignore" lets transform() encode a category that was absent from the
# training split as all zeros instead of raising on the held-out set.
transformer = ColumnTransformer([('num',numeric_transformer, numeric_columns),
                                 ('cat', OneHotEncoder(handle_unknown="ignore"), categorical_columns)])

# Model definition
ranfor = RandomForestRegressor(random_state = 42)

# Fit the preprocessing on the training split only, then apply it unchanged to the held-out split
# (the *_std names are kept for the later cells; no standardization is applied here)
X_train_std = transformer.fit_transform(X_train)
X_test_std = transformer.transform(X_test)

# Exhaustive search over `parameters` with 5-fold CV, scored by R²
grid_search = GridSearchCV(ranfor, parameters, cv=5, scoring='r2', n_jobs=-1)
grid_result = grid_search.fit(X_train_std, y_train)

print('Best Params: ', grid_result.best_params_)
print('Best Score: ', grid_result.best_score_)

### Búsqueda aleatoria

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Numeric features: median imputation only
numeric_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])

# Preprocessing: numeric pipeline + one-hot encoding of the categorical features.
# handle_unknown="ignore" lets transform() encode a category that was absent from the
# training split as all zeros instead of raising on the held-out set.
transformer = ColumnTransformer([('num',numeric_transformer, numeric_columns),
                                 ('cat', OneHotEncoder(handle_unknown="ignore"), categorical_columns)])

# Model definition
ranfor = RandomForestRegressor(random_state = 42)

# Fit the preprocessing on the training split only; the held-out split is only transformed
X_train_std = transformer.fit_transform(X_train)
X_test_std = transformer.transform(X_test)

# Randomized search: 10 candidates sampled from `parameters`, 5-fold CV, scored by R².
# random_state fixes which candidates are sampled so the reported result is reproducible.
rand_s = RandomizedSearchCV(estimator=ranfor, param_distributions=parameters, n_iter=10,
                            scoring='r2', cv=5, verbose=False, random_state=42)
rand_s.fit(X_train_std, y_train)

print('Best Params: ', rand_s.best_params_)
print('Best Score: ', rand_s.best_score_)

In [None]:
# Shapes of the transformed matrices: training first, then validation
for transformed_matrix in (X_train_std, X_test_std):
    print(transformed_matrix.shape)

In [None]:
# Predictions of the best estimator found by `rand_s` on the transformed held-out matrix
rand_s.best_estimator_.predict(X_test_std)

In [None]:
# R² of the best estimator found by `rand_s`, evaluated on the held-out split
holdout_r2 = rand_s.best_estimator_.score(X_test_std, y_test)
print("R2:{:.2f}".format(holdout_r2))

## Optimización de Hiperparámetros: XGBoost

In [None]:
# Hyperparameter grid for the XGBoost regressor searches
parameters = dict(
    eta=[0.01, 0.1, 0.3],
    colsample_bytree=[0.8, 0.9, 1],
    max_depth=[4, 6, 8, 10, 12],
    n_estimators=[200, 500, 800],
)

### Búsqueda en cuadrícula

In [None]:
from sklearn.model_selection import GridSearchCV

# Numeric features: median imputation only
numeric_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])

# Preprocessing: numeric pipeline + one-hot encoding of the categorical features.
# handle_unknown="ignore" lets transform() encode a category that was absent from the
# training split as all zeros instead of raising on the held-out set.
transformer = ColumnTransformer([('num',numeric_transformer, numeric_columns),
                                 ('cat', OneHotEncoder(handle_unknown="ignore"), categorical_columns)])

# Model definition
XGBmodel = XGBRegressor(random_state=42)

# Fit the preprocessing on the training split ONLY and reuse it for the held-out split.
# Re-fitting on X_test would leak test statistics into the imputer and could produce a
# different one-hot column layout than the one the model is trained on.
X_train_std = transformer.fit_transform(X_train)
X_test_std = transformer.transform(X_test)

# Exhaustive search over `parameters` with 5-fold CV, scored by R²
grid_search = GridSearchCV(XGBmodel, parameters, cv=5, scoring='r2', n_jobs=-1)
grid_result = grid_search.fit(X_train_std, y_train)

print('Best Params: ', grid_result.best_params_)
print('Best Score: ', grid_result.best_score_)

### Búsqueda aleatoria

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Numeric features: median imputation only
numeric_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])

# Preprocessing: numeric pipeline + one-hot encoding of the categorical features.
# handle_unknown="ignore" lets transform() encode a category that was absent from the
# training split as all zeros instead of raising on the held-out set.
transformer = ColumnTransformer([('num',numeric_transformer, numeric_columns),
                                 ('cat', OneHotEncoder(handle_unknown="ignore"), categorical_columns)])

# Model definition
XGBmodel = XGBRegressor(random_state=42)

# Fit the preprocessing on the training split ONLY and reuse it for the held-out split.
# Re-fitting on X_test would leak test statistics into the imputer and could produce a
# different one-hot column layout than the one the model is trained on.
X_train_std = transformer.fit_transform(X_train)
X_test_std = transformer.transform(X_test)

# Randomized search: 10 candidates sampled from `parameters`, 5-fold CV, scored by R².
# random_state fixes which candidates are sampled so the reported result is reproducible.
rand_s = RandomizedSearchCV(estimator=XGBmodel, param_distributions=parameters, n_iter=10,
                            scoring='r2', cv=5, verbose=True, random_state=42)

rand_s.fit(X_train_std, y_train)

print('Best Params: ', rand_s.best_params_)
print('Best Score: ', rand_s.best_score_)

In [None]:
# Keep the best estimator found by `rand_s` and report its R² on the held-out split
bestModel_xgb = rand_s.best_estimator_
xgb_holdout_r2 = bestModel_xgb.score(X_test_std, y_test)
print("R2:{:.2f}".format(xgb_holdout_r2))