## Configuración del Notebook

In [None]:
# Optional: uncomment the next line to install xgboost if it is missing from the environment.
#!pip install xgboost

In [1]:
# Importar librerias necesarias
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score

## Lectura de datos y separación de datasets de entrenamiento y validación

In [2]:
# Load the Auto MPG dataset; the raw file encodes missing values as '?'.
df = pd.read_csv("https://raw.githubusercontent.com/mlondono-oc/LEA2_20232/main/Modulo2/data/auto-mpg.csv", na_values='?')

# Work on an independent copy: a plain `df1 = df` is only an alias, so adding
# `country_code` below would silently mutate the raw frame `df` as well.
df1 = df.copy()
df1['country_code'] = df1.origin.replace([1, 2, 3], ['USA', 'Europe', 'Japan'])
df1 = df1.rename(columns={'mpg': 'target', "model year": "model_year"})
df1 = df1.drop(columns=["car name", "origin"])

# Cast cylinders and model_year to str so they are selected as categorical
# columns below; weight is cast to float so it is selected as numeric.
df1 = df1.astype({"cylinders": str, "model_year": str, "weight": float})

# Split features and target (X, y).
y = df1.target
X = df1.drop(columns=["target"])

# Hold out 10% of the samples for validation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

print("Tamaño del conjunto de entrenamiento:", X_train.shape)
print("Tamaño del conjunto de validación:",  X_test.shape)

# Column groups consumed by the preprocessing ColumnTransformer in later cells.
numeric_columns = list(X.select_dtypes('float64').columns)
categorical_columns = list(X.select_dtypes('object').columns)

Tamaño del conjunto de entrenamiento: (358, 7)
Tamaño del conjunto de validación: (40, 7)


In [3]:
# Preview the first three rows of the feature matrix.
X.head(3)

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,country_code
0,8,307.0,130.0,3504.0,12.0,70,USA
1,8,350.0,165.0,3693.0,11.5,70,USA
2,8,318.0,150.0,3436.0,11.0,70,USA


In [4]:
# Show which columns go through the numeric and the categorical branch, one list per line.
print(numeric_columns, categorical_columns, sep="\n")

['displacement', 'horsepower', 'weight', 'acceleration']
['cylinders', 'model_year', 'country_code']


## Modelos Base

### Linear Regression

In [5]:
# Numeric branch: fill missing values with the column median, then standardize.
numeric_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="median")),
                                      ("scaler", StandardScaler())])

# Preprocessing: numeric branch for the numeric columns, one-hot encoding for
# the categorical ones. handle_unknown="ignore" encodes a category that was
# absent when the encoder was fitted (e.g. a rare value missing from a CV
# training fold) as all zeros instead of raising during transform.
transformer = ColumnTransformer([('num', numeric_transformer, numeric_columns),
                                 ('cat', OneHotEncoder(handle_unknown="ignore"), categorical_columns)])

# Training pipeline: preprocessing followed by an ordinary LinearRegression
# (this is not a Ridge model).
pipeline = Pipeline([('transformer', transformer),
                     ('Linear Regression Model', LinearRegression())])

In [6]:
# 5-fold cross-validation of the linear pipeline on the training split,
# scored with negative mean squared error; the mean across folds is printed.
scores_linearRegression = cross_val_score(
    pipeline,
    X_train,
    y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)
print('neg_mean_squared_error: {:.3f}'.format(scores_linearRegression.mean()))

neg_mean_squared_error: -8.958


In [7]:
# Same 5-fold cross-validation, this time scored with R2 (overwrites the MSE scores).
scores_linearRegression = cross_val_score(
    pipeline,
    X_train,
    y_train,
    scoring='r2',
    cv=5,
)
print('Mean r2: {:.3f}'.format(scores_linearRegression.mean()))

Mean r2: 0.854


In [8]:
# Per-fold R2 scores from the cross-validation in the previous cell.
scores_linearRegression

array([0.88566076, 0.88778739, 0.81019133, 0.84144637, 0.84266479])

### Random Forest Regressor

In [9]:
from sklearn.ensemble import RandomForestRegressor

# Numeric branch: median imputation only (no scaler in this pipeline).
numeric_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])

# Preprocessing: imputed numeric columns plus one-hot encoded categoricals.
# handle_unknown="ignore" encodes a category that was absent when the encoder
# was fitted (e.g. a rare value missing from a CV training fold) as all zeros
# instead of raising during transform.
transformer = ColumnTransformer([('num', numeric_transformer, numeric_columns),
                                 ('cat', OneHotEncoder(handle_unknown="ignore"), categorical_columns)])


# Baseline random forest: 200 trees, squared-error criterion, all cores, fixed seed.
ranfor = RandomForestRegressor(n_estimators = 200,
                               criterion    = 'squared_error',
                               n_jobs       = -1,
                               random_state = 42)

# Training pipeline: preprocessing followed by the random forest.
pipeline_ranfor = Pipeline([('transformer', transformer),
                     ('Random Forest Model', ranfor)])

In [10]:
# 5-fold cross-validated R2 of the random forest pipeline on the training split.
scores_ranfor = cross_val_score(
    pipeline_ranfor,
    X_train,
    y_train,
    scoring='r2',
    cv=5,
)
print('Mean R2: {:.3f}'.format(scores_ranfor.mean()))

Mean R2: 0.820


In [11]:
# Inspect the full hyperparameter set of the baseline random forest (explicit values plus defaults).
ranfor.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 200,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

### XGB Regressor

In [12]:
from xgboost import XGBRegressor

# Numeric branch: median imputation only (no scaler in this pipeline).
numeric_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])

# Preprocessing: imputed numeric columns plus one-hot encoded categoricals.
# handle_unknown="ignore" encodes a category that was absent when the encoder
# was fitted (e.g. a rare value missing from a CV training fold) as all zeros
# instead of raising during transform.
transformer = ColumnTransformer([('num', numeric_transformer, numeric_columns),
                                 ('cat', OneHotEncoder(handle_unknown="ignore"), categorical_columns)])

# Baseline XGBoost regressor: 200 boosting rounds, fixed seed.
XGBmodel = XGBRegressor(n_estimators=200,
                        random_state=42
                        )

# Training pipeline: preprocessing followed by the XGBoost regressor
# (this is not a Ridge model).
pipeline_xgboost = Pipeline([('transformer', transformer),
                     ('XGBoost', XGBmodel)])

In [13]:
# 5-fold cross-validated R2 of the XGBoost pipeline on the training split.
scores_xgb = cross_val_score(
    pipeline_xgboost,
    X_train,
    y_train,
    scoring='r2',
    cv=5,
)
print('Mean R2: {:.3f}'.format(scores_xgb.mean()))

Mean R2: 0.828


## Optimización de Hiperparámetros: Random Forest Regressor

In [14]:
# Hyperparameter grid for the random forest searches.
# max_features entries must all be floats: scikit-learn reads a float as a
# fraction of the features, but an int as an absolute count, so the integer 1
# would mean "one feature per split" rather than "all features" (1.0).
parameters = {'max_depth': [5,7,9,12],
              'max_features': [0.7,0.8,0.9,1.0],
              'max_leaf_nodes': [9,11,13],
              'min_samples_leaf': [5,7,10,12],
              'n_estimators': [300,500,800]}

### Búsqueda en cuadrícula

In [15]:
from sklearn.model_selection import GridSearchCV

# Numeric branch: median imputation only.
numeric_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])

# Preprocessing: imputed numeric columns plus one-hot encoded categoricals.
# handle_unknown="ignore" keeps transform(X_test) from raising when the test
# split contains a category that never appeared in X_train; such values are
# encoded as all zeros.
transformer = ColumnTransformer([('num', numeric_transformer, numeric_columns),
                                 ('cat', OneHotEncoder(handle_unknown="ignore"), categorical_columns)])

# Base estimator whose hyperparameters are searched.
ranfor = RandomForestRegressor(random_state = 42)

# Fit the preprocessing on the training split only and reuse it for the test split.
# Despite the `_std` suffix, no scaling is applied here (imputation + one-hot only).
X_train_std = transformer.fit_transform(X_train)
X_test_std = transformer.transform(X_test)

# Exhaustive search over `parameters` with 5-fold CV, scored by R2, on all cores.
grid_search = GridSearchCV(ranfor, parameters, cv=5, scoring='r2', n_jobs=-1)

# Run the search.
grid_result = grid_search.fit(X_train_std, y_train)

In [16]:
# Report the winning hyperparameter combination and its mean cross-validated R2.
print(f"Best Params:  {grid_result.best_params_}")
print(f"Best Score:  {grid_result.best_score_}")

Best Params:  {'max_depth': 7, 'max_features': 0.7, 'max_leaf_nodes': 13, 'min_samples_leaf': 5, 'n_estimators': 800}
Best Score:  0.775305542538872


In [None]:
# Optimal random forest: built from the best hyperparameters found by the grid
# search (`grid_result.best_params_`) instead of repeating the baseline values,
# so the model actually reflects the search result.
ranfor_optimo = RandomForestRegressor(**grid_result.best_params_,
                               criterion    = 'squared_error',
                               n_jobs       = -1,
                               random_state = 42)

### Búsqueda aleatoria

In [17]:
from sklearn.model_selection import RandomizedSearchCV

# Numeric branch: median imputation only.
numeric_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])

# Preprocessing: imputed numeric columns plus one-hot encoded categoricals.
# handle_unknown="ignore" keeps transform(X_test) from raising when the test
# split contains a category that never appeared in X_train; such values are
# encoded as all zeros.
transformer = ColumnTransformer([('num', numeric_transformer, numeric_columns),
                                 ('cat', OneHotEncoder(handle_unknown="ignore"), categorical_columns)])

# Base estimator whose hyperparameters are searched.
ranfor = RandomForestRegressor(random_state = 42)

# Fit the preprocessing on the training split only; the test split is only
# transformed, never refitted.
X_train_std = transformer.fit_transform(X_train)
X_test_std = transformer.transform(X_test)

# Randomized search: 10 sampled combinations from `parameters`, 5-fold CV, R2.
# random_state fixes which combinations are sampled so the cell is reproducible.
rand_s = RandomizedSearchCV(estimator=ranfor, param_distributions=parameters, n_iter=10, scoring='r2', cv=5, verbose=False, random_state=42)
rand_s.fit(X_train_std, y_train)

In [19]:
# Report the best sampled hyperparameter combination and its mean cross-validated R2.
print(f"Best Params:  {rand_s.best_params_}")
print(f"Best Score:  {rand_s.best_score_}")

Best Params:  {'n_estimators': 500, 'min_samples_leaf': 5, 'max_leaf_nodes': 13, 'max_features': 0.9, 'max_depth': 5}
Best Score:  0.7733319450446305


In [20]:
# Predict the held-out split with the best estimator found by the randomized search.
y_pred = rand_s.best_estimator_.predict(X_test_std)

In [21]:
# Display the best estimator selected by the randomized search.
rand_s.best_estimator_

In [22]:
# Display the predictions for the held-out split.
y_pred

array([33.86334278, 28.20626873, 21.30005099, 14.82583073, 14.6051219 ,
       27.65554022, 26.04647295, 13.26094964, 18.3157332 , 20.90113631,
       14.51241353, 33.90088841, 32.36819649, 14.84444042, 26.6926556 ,
       13.44557715, 28.2350952 , 20.87287544, 14.37133128, 34.07258798,
       26.53882169, 21.17086064, 20.85981269, 28.64876803, 16.18771445,
       30.35281162, 24.55601127, 25.11385508, 18.43337562, 13.58637121,
       26.66536008, 30.47827458, 16.76729396, 25.0188573 , 38.08749159,
       14.12314693, 22.88222784, 15.79128976, 13.74132868, 26.80009736])

In [23]:
# R2 of the best random forest on the held-out split.
print(f"R2:{rand_s.best_estimator_.score(X_test_std, y_test):.2f}")

R2:0.81


## Optimización de Hiperparámetros: XGB Regressor

In [None]:
# Hyperparameter grid for the XGBoost searches (replaces the random forest grid
# stored under the same name).
parameters = dict(
    eta=[0.01, 0.1, 0.3],
    colsample_bytree=[0.8, 0.9, 1],
    max_depth=[4, 6, 8, 10, 12],
    n_estimators=[200, 500, 800],
)

### Búsqueda en cuadrícula

In [None]:
from sklearn.model_selection import GridSearchCV
# Numeric branch: median imputation only.
numeric_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])

# Preprocessing: imputed numeric columns plus one-hot encoded categoricals.
# handle_unknown="ignore" keeps transform(X_test) from raising when the test
# split contains a category that never appeared in X_train; such values are
# encoded as all zeros.
transformer = ColumnTransformer([('num', numeric_transformer, numeric_columns),
                                 ('cat', OneHotEncoder(handle_unknown="ignore"), categorical_columns)])

# Base estimator whose hyperparameters are searched.
XGBmodel = XGBRegressor(random_state=42)

# Fit the preprocessing on the training split only. The test split must only be
# transformed: refitting on it would learn different medians and a different
# one-hot column layout than the one the model is trained on.
X_train_std = transformer.fit_transform(X_train)
X_test_std = transformer.transform(X_test)

# Exhaustive search over `parameters` with 5-fold CV, scored by R2, on all cores.
grid_search = GridSearchCV(XGBmodel, parameters, cv=5, scoring='r2', n_jobs=-1)
grid_result = grid_search.fit(X_train_std, y_train)

print('Best Params: ', grid_result.best_params_)
print('Best Score: ', grid_result.best_score_)

### Búsqueda Aleatoria

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Numeric branch: median imputation only.
numeric_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])

# Preprocessing: imputed numeric columns plus one-hot encoded categoricals.
# handle_unknown="ignore" keeps transform(X_test) from raising when the test
# split contains a category that never appeared in X_train; such values are
# encoded as all zeros.
transformer = ColumnTransformer([('num', numeric_transformer, numeric_columns),
                                 ('cat', OneHotEncoder(handle_unknown="ignore"), categorical_columns)])

# Base estimator whose hyperparameters are searched.
XGBmodel = XGBRegressor(random_state=42)

# Fit the preprocessing on the training split only. The test split must only be
# transformed: refitting on it would learn different medians and a different
# one-hot column layout than the one the model is trained on.
X_train_std = transformer.fit_transform(X_train)
X_test_std = transformer.transform(X_test)

# Randomized search: 10 sampled combinations from `parameters`, 5-fold CV, R2.
# random_state fixes which combinations are sampled so the cell is reproducible.
rand_s = RandomizedSearchCV(estimator=XGBmodel, param_distributions=parameters, n_iter=10, scoring='r2', cv=5, verbose=True, random_state=42)

rand_s.fit(X_train_std, y_train)

print('Best Params: ', rand_s.best_params_)
print('Best Score: ', rand_s.best_score_)

In [None]:
# Keep the best XGBoost estimator from the randomized search and report its R2
# on the held-out split.
bestModel_xgb = rand_s.best_estimator_
print(f"R2:{bestModel_xgb.score(X_test_std, y_test):.2f}")