In [94]:
# Imports

import pandas as pd

from sklearn.model_selection import train_test_split 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error

# imputation
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# pipelines
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# models
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [95]:
# Load the training and test tables, using the "Id" column as the row index
X = pd.read_csv("data/train.csv", index_col="Id")
X_test = pd.read_csv("data/test.csv", index_col="Id")

# Rows without a target value cannot be used for fitting: drop them, then
# separate the target column (SalePrice) from the predictors
X = X.dropna(axis=0, subset=["SalePrice"])
y = X.SalePrice
X = X.drop(["SalePrice"], axis=1)

# Hold out 20% of the rows for validation (fixed seed for repeatability)
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

# Categorical columns: object dtype with fewer than 10 distinct values, so
# one-hot encoding stays small.
# NOTE(review): cardinality is measured on the full frame X (training and
# validation rows together), not on X_train alone.
cat_cols = [
    name for name, dtype in X.dtypes.items()
    if dtype == "object" and X[name].nunique() < 10
]

# Numeric columns: everything stored as int64 or float64
num_cols = [
    name for name, dtype in X.dtypes.items()
    if dtype in ("int64", "float64")
]

# Restrict every split to the selected columns; copy so later edits do not
# touch the frames they were sliced from
my_cols = cat_cols + num_cols
X_train, X_valid, X_test = (
    frame[my_cols].copy() for frame in (X_train, X_valid, X_test)
)


In [96]:
# Pipelines

# Preprocessing for numerical data: replace missing values with the column mean
numerical_transformer = SimpleImputer(strategy="mean")

# Preprocessing for categorical data: fill missing values with a constant
# placeholder, then one-hot encode. Categories not seen during fit are
# ignored (encoded as all zeros) instead of raising at transform time.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Bundle preprocessing for numerical and categorical data.
# The categorical column list is `cat_cols` (built in the column-selection
# cell); the previously referenced name `low_cardinality` is never defined
# in this notebook and raised NameError on a fresh kernel.
preprocessor = ColumnTransformer(transformers=[
    ("num", numerical_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols)
])

# Define the model
model = XGBRegressor(n_estimators=500, learning_rate=0.05, n_jobs=4, random_state=0)


In [97]:
# Chain the column preprocessing and the regressor so that a single
# fit / predict call runs both stages in order
pipe = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])


In [99]:
# Fit the preprocessor up front (its transformed output is discarded here):
# the validation frame below must be transformed before pipe.fit() runs,
# because the eval_set is handed to the model step as-is.
preprocessor.fit_transform(X_train)

# Keyword arguments prefixed with "model__" are routed by the Pipeline to the
# fit() method of its "model" step.
model_fit_kwargs = {
    "model__early_stopping_rounds": 100,
    "model__eval_set": [(preprocessor.transform(X_valid), y_valid)],
    "model__verbose": False,
}

pipe.fit(X_train, y_train, **model_fit_kwargs)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='mean',
                                                                verbose=0),
                                                  ['MSSubClass', 'LotFrontage',
                                                   'LotArea', 'OverallQual',
                                                   'OverallCond',

In [100]:
# Predict SalePrice for the held-out validation rows with the fitted pipeline
preds = pipe.predict(X_valid)

In [101]:
# Mean absolute error of the validation predictions against the true prices
validation_mae = mean_absolute_error(y_true=y_valid, y_pred=preds)
print('MAE:', validation_mae)

MAE: 16196.576011344177


In [52]:
# Note: X_valid must be passed through the fitted preprocessor before it can be used in the model's eval_set

#preprocessor.fit_transform(X_train)
#preprocessor.transform(X_valid)

In [102]:
# Predict SalePrice for the test rows with the fitted pipeline
test_preds = pipe.predict(X_test)

In [103]:
# Build the submission table: one row per test Id with its predicted price
output = pd.DataFrame(
    data={"Id": X_test.index, "SalePrice": test_preds},
    columns=["Id", "SalePrice"],
)

# Write without the positional index; "Id" is already a regular column
output.to_csv("output/submission.csv", index=False)

## Cross-validation (CV)

In [60]:
# 5-fold cross-validation of the whole pipeline on the training split.
# The scorer reports *negative* MAE, so flip the sign to get plain MAE.
neg_mae_per_fold = cross_val_score(
    estimator=pipe,
    X=X_train,
    y=y_train,
    scoring='neg_mean_absolute_error',
    cv=5,
)
scores = -neg_mae_per_fold

print("MAE scores:\n", scores)

MAE scores:
 [14559.01650975 18258.70242388 16131.61747129 16383.23274879
 13260.11643307]


In [61]:
# Average the per-fold MAE values into a single cross-validation estimate
scores.mean()

15718.537117356294

## Grid Search

In [86]:
# Search space for the grid search: every combination of boosting rounds
# and learning rate listed here is evaluated
params = dict(
    n_estimators=[500, 750, 1000, 1500],
    learning_rate=[0.045, 0.0475, 0.05, 0.055],
)

In [87]:
# Exhaustive search over `params` for the bare XGBoost model (not the full
# pipeline), ranked by negative mean absolute error
grid = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring="neg_mean_absolute_error",
)

In [88]:
# Extra arguments forwarded to the estimator's fit() for every candidate.
# The grid searches the bare model, so both the training and the validation
# frames are transformed by the already-fitted preprocessor first.
xgb_fit_kwargs = {
    "early_stopping_rounds": 100,
    "eval_set": [(preprocessor.transform(X_valid), y_valid)],
    "verbose": False,
}

grid.fit(preprocessor.transform(X_train), y_train, **xgb_fit_kwargs)




GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bytree=1,
                                    gamma=0, learning_rate=0.05,
                                    max_delta_step=0, max_depth=3,
                                    min_child_weight=1, missing=None,
                                    n_estimators=1000, n_jobs=4, nthread=None,
                                    objective='reg:linear', random_state=0,
                                    reg_alpha=0, reg_lambda=1,
                                    scale_pos_weight=1, seed=None, silent=True,
                                    subsample=1),
             iid='warn', n_jobs=None,
             param_grid={'learning_rate': [0.045, 0.0475, 0.05, 0.055],
                         'n_estimators': [500, 750, 1000, 1500]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=F

In [89]:
# Summarize the grid search: the search configuration first, then the best
# mean cross-validated score (negative MAE, so closer to zero is better)
print(grid, grid.best_score_, sep="\n")
#print(grid.best_estimator_.alpha)

GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bytree=1,
                                    gamma=0, learning_rate=0.05,
                                    max_delta_step=0, max_depth=3,
                                    min_child_weight=1, missing=None,
                                    n_estimators=1000, n_jobs=4, nthread=None,
                                    objective='reg:linear', random_state=0,
                                    reg_alpha=0, reg_lambda=1,
                                    scale_pos_weight=1, seed=None, silent=True,
                                    subsample=1),
             iid='warn', n_jobs=None,
             param_grid={'learning_rate': [0.045, 0.0475, 0.05, 0.055],
                         'n_estimators': [500, 750, 1000, 1500]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=F

In [90]:
# Display the estimator configured with the best-scoring parameter combination
grid.best_estimator_

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bytree=1, gamma=0, learning_rate=0.05, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=500,
             n_jobs=4, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=True, subsample=1)