In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the training set; the 'Id' column becomes the row index
X_full = pd.read_csv('./Datasets/train.csv', index_col='Id')

# A row without a SalePrice cannot be used for supervised training, so discard
# such rows first, then peel the target column away from the predictors
X_full = X_full.dropna(axis=0, subset=['SalePrice'])
y = X_full.SalePrice
X_full = X_full.drop(['SalePrice'], axis=1)

# Hold out 20% of the rows for validation (fixed seed keeps the split repeatable)
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X_full, y, train_size=0.8, test_size=0.2, random_state=0
)

# Categorical features: text ("object") columns of the training split with
# fewer than 10 distinct values (the cut-off is convenient but arbitrary)
categorical_cols = [
    col for col in X_train_full.columns
    if X_train_full[col].dtype == "object" and X_train_full[col].nunique() < 10
]

# Numerical features: every int64 / float64 column of the training split
numerical_cols = [
    col for col in X_train_full.columns
    if X_train_full[col].dtype in ('int64', 'float64')
]

# Restrict all three frames to the selected columns, working on independent copies
my_cols = categorical_cols + numerical_cols
X_train, X_valid, X_full = [
    frame[my_cols].copy() for frame in (X_train_full, X_valid_full, X_full)
]

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler

# Numeric columns: fill missing values with the column median, then standardise
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scalar', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent category, then
# one-hot encode; categories unseen during fit are ignored instead of raising
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns through its own transformer
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Full model: preprocessing followed by a default-configured XGBoost regressor.
# The step names ('preprocessor', 'model') are the prefixes used when addressing
# nested parameters, e.g. 'model__max_depth'.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', XGBRegressor()),
])

In [3]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the 'model' step: 5 x 2 x 5 = 50 combinations
param_grid = {
    'model__n_estimators': list(range(600, 801, 50)),  # 600, 650, ..., 800
    'model__learning_rate': [0.02, 0.03],
    'model__max_depth': list(range(4, 9)),             # 4, 5, ..., 8
}

# Exhaustive search over the grid with 5-fold cross-validation, scored by
# negated mean absolute error (closer to zero is better), using all CPU cores
search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
    verbose=5,
)
search.fit(X_full, y)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:   41.9s
[Parallel(n_jobs=-1)]: Done  66 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 156 tasks      | elapsed: 12.7min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed: 21.4min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                                    

In [4]:
# Show the hyper-parameter combination selected by the grid search
search.best_params_

{'model__learning_rate': 0.02,
 'model__max_depth': 5,
 'model__n_estimators': 800}

In [5]:
# Show the raw cross-validation results: a dict of arrays with one entry per
# candidate (timings, scores, ...).
# NOTE(review): this displays the entire dict; wrapping it in pd.DataFrame(...)
# and showing only the top-ranked rows would be easier to read.
search.cv_results_

{'mean_fit_time': array([ 9.54600215,  9.09366999, 10.39464765, 10.59724531, 11.40576224,
        10.78263588, 12.28342552, 12.11527057, 12.38713498, 13.95015678,
        12.71710949, 13.38177662, 14.74651303, 15.6702858 , 16.60616803,
        14.84871101, 15.80689907, 16.73721037, 21.5831285 , 23.31616507,
        18.98895559, 17.77208614, 20.47185841, 21.88925614, 22.37494521,
         8.50333562,  9.30545616, 11.03825603, 12.56160212, 13.14153819,
        12.20135193, 12.76904497, 14.01502318, 12.76861763, 13.63423514,
        12.37428322, 13.75575719, 14.92381406, 18.19688764, 16.50463071,
        14.41460671, 15.65040712, 18.71672201, 18.91659069, 19.63396511,
        16.95911798, 18.87959676, 19.16576767, 21.17288642, 20.80561461]),
 'std_fit_time': array([0.52275293, 0.77985429, 0.26106664, 0.39370469, 0.15641172,
        0.42774395, 0.79508657, 0.31290776, 0.56840529, 0.44513457,
        0.41903842, 0.43672093, 0.36031343, 0.43473886, 0.46784913,
        0.38840285, 0.4239354 ,

In [6]:
# Show the pipeline configured with the best parameters found by the search
# (refit by GridSearchCV, since `refit` was left at its default)
search.best_estimator_

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             

In [7]:
# Show the mean cross-validated score of the best candidate. The search was
# scored with 'neg_mean_absolute_error', so the value is the MAE with its sign
# flipped (closer to zero is better).
search.best_score_

-16028.680535637843