In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the data
X_full = pd.read_csv('./Datasets/train.csv', index_col='Id')

# Remove rows with missing target, separate target from predictors
X_full.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X_full.SalePrice
X_full.drop(['SalePrice'], axis=1, inplace=True)

# Break off validation set from training data
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X_full, y, 
                                                                train_size=0.8, test_size=0.2,
                                                                random_state=0)

# Select categorical columns with relatively low cardinality (convenient but arbitrary)
categorical_cols = [cname for cname in X_train_full.columns if
                    X_train_full[cname].nunique() < 10 and 
                    X_train_full[cname].dtype == "object"]

# Select numerical columns
numerical_cols = [cname for cname in X_train_full.columns if 
                X_train_full[cname].dtype in ['int64', 'float64']]

# Keep selected columns only
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_full = X_full[my_cols].copy()

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scalar', StandardScaler())
])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ])

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', XGBRegressor())
                     ])

In [10]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'model__n_estimators': [n for n in range(400, 1001, 50)],
    'model__learning_rate': [n/100 for n in range(2, 5, 1)]
}
search = GridSearchCV(pipeline, param_grid, n_jobs=-1,cv=7, verbose=2)
search.fit(X_full, y)

Fitting 7 folds for each of 39 candidates, totalling 273 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-1)]: Done  35 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 156 tasks      | elapsed: 10.5min
[Parallel(n_jobs=-1)]: Done 273 out of 273 | elapsed: 19.4min finished


GridSearchCV(cv=7, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                                    

In [11]:
search.best_params_

{'model__learning_rate': 0.04, 'model__n_estimators': 1000}

In [12]:
search.cv_results_

{'mean_fit_time': array([ 7.32351024,  7.98801715,  8.94199572,  9.25290799, 10.25909485,
        12.56644542, 14.58284811, 13.87732921, 14.61596789, 15.26475361,
        16.75569585, 16.56307847, 18.16627506,  7.5458888 ,  8.13234261,
         9.49530738,  9.52915444, 10.62510463, 11.13904684, 12.44796208,
        13.1042944 , 14.4811151 , 14.56034279, 15.43417832, 16.28448994,
        17.17246873,  6.8525548 ,  7.73424026,  8.59146738,  9.29314375,
        10.53104098, 14.30283557, 13.56407155, 13.85707549, 15.52004892,
        14.73702989, 16.22246987, 18.27219592, 17.27668609]),
 'std_fit_time': array([0.45575393, 0.1381368 , 0.15232555, 0.16533083, 0.1700926 ,
        0.57392758, 0.93938908, 0.92842439, 0.6791282 , 0.76141024,
        0.83836023, 1.02483364, 0.4915489 , 0.37008767, 0.44181637,
        0.88460791, 0.22183332, 0.36661388, 0.14010414, 0.36283167,
        0.29593861, 0.42687367, 0.41286219, 0.47283973, 0.44370605,
        0.27111554, 0.31258439, 0.29951511, 0.18217493

In [13]:
search.best_estimator_

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             

In [14]:
search.best_score_

0.8708470485634422