In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.model_selection import KFold, cross_val_score, RandomizedSearchCV, GridSearchCV



In [16]:
df= pd.read_csv('./data/train_preprocessed.csv')
scaler = StandardScaler()
X=scaler.fit_transform(df.drop('SalePrice', axis=1))
X_train, X_test, y_train, y_test = train_test_split(df.drop('SalePrice',axis =1), df['SalePrice'], test_size=0.2, random_state=0)

In [28]:
model = XGBRegressor(subsample= 0.6,n_estimators=500, min_child_weight= 1, max_depth= 4, learning_rate= 0.01, gamma=1.5, colsample_bytree= 0.8)


In [29]:
model.fit(X_train, y_train)

In [30]:
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

In [31]:
print(mean_squared_error(predictions, y_test))
print(r2_score(y_test, y_pred))

811826315.7773973
0.8824435472488403


In [32]:
cv = KFold(n_splits=5, random_state=1, shuffle=True)
scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='r2')

In [33]:
print(scores)
scores.mean()

[0.93299788 0.77586806 0.89375132 0.88200676 0.88745505]


np.float64(0.8744158148765564)

In [4]:
params = {
        'learning_rate': [0.01, 0.05, 0.1],
        'n_estimators': [100, 500, 1000],
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }

model_test = XGBRegressor(random_state=0)

skf = KFold(n_splits=5, shuffle = True, random_state = 0)

folds = 3
param_comb = 7
random_search = RandomizedSearchCV(model_test, param_distributions=params, n_iter=param_comb, scoring='r2', n_jobs=-1, cv=skf, verbose=3, random_state=0 )
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 7 candidates, totalling 35 fits
[CV 1/5] END colsample_bytree=1.0, gamma=2, learning_rate=0.05, max_depth=3, min_child_weight=10, n_estimators=1000, subsample=0.6;, score=0.930 total time=   1.2s[CV 4/5] END colsample_bytree=1.0, gamma=2, learning_rate=0.05, max_depth=3, min_child_weight=10, n_estimators=1000, subsample=0.6;, score=0.741 total time=   1.2s
[CV 5/5] END colsample_bytree=1.0, gamma=2, learning_rate=0.05, max_depth=3, min_child_weight=10, n_estimators=1000, subsample=0.6;, score=0.896 total time=   1.2s

[CV 2/5] END colsample_bytree=1.0, gamma=2, learning_rate=0.05, max_depth=3, min_child_weight=10, n_estimators=1000, subsample=0.6;, score=0.869 total time=   1.2s
[CV 3/5] END colsample_bytree=1.0, gamma=2, learning_rate=0.05, max_depth=3, min_child_weight=10, n_estimators=1000, subsample=0.6;, score=0.889 total time=   1.3s
[CV 3/5] END colsample_bytree=1.0, gamma=0.5, learning_rate=0.1, max_depth=3, min_child_weight=5, n_estimators=1000, sub

In [6]:
print('\n Best hyperparameters:')
print(random_search.best_params_)


 Best hyperparameters:
{'subsample': 0.6, 'n_estimators': 500, 'min_child_weight': 1, 'max_depth': 4, 'learning_rate': 0.01, 'gamma': 1.5, 'colsample_bytree': 0.8}


In [12]:
params = {
        'learning_rate': [0.01, 0.05, 0.1],
        'n_estimators': [100, 500, 800],
        'min_child_weight': [1, 5, 10],
        'gamma': [1, 1.5, 2],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }

model_test = XGBRegressor(random_state=0)

skf = KFold(n_splits=5, shuffle = True, random_state = 0)
grid = GridSearchCV(model_test, param_grid=params, scoring='r2', n_jobs=-1, cv=skf, verbose=4)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 2187 candidates, totalling 10935 fits
[CV 3/5] END colsample_bytree=0.6, gamma=1, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.6;, score=0.678 total time=   0.3s[CV 5/5] END colsample_bytree=0.6, gamma=1, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=1.0;, score=0.671 total time=   0.3s

[CV 5/5] END colsample_bytree=0.6, gamma=1, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.8;, score=0.675 total time=   0.3s
[CV 4/5] END colsample_bytree=0.6, gamma=1, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.6;, score=0.630 total time=   0.3s
[CV 2/5] END colsample_bytree=0.6, gamma=1, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=1.0;, score=0.607 total time=   0.3s
[CV 3/5] END colsample_bytree=0.6, gamma=1, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample

  _data = np.array(data, dtype=dtype, copy=copy,


In [14]:
print('\n Best estimator:')
print(grid.best_estimator_)
print('\n Best score:')
print(grid.best_score_)
print('\n Best parameters:')
print(grid.best_params_)


 Best estimator:
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.8, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=1, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.05, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=5, max_leaves=None,
             min_child_weight=10, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=100, n_jobs=None,
             num_parallel_tree=None, random_state=0, ...)

 Best score:
0.885656189918518

 Best parameters:
{'colsample_bytree': 0.8, 'gamma': 1, 'learning_rate': 0.05, 'max_depth': 5, 'min_child_weight': 10, 'n_estimators': 100, 'subsample': 0.6}


In [15]:
print(random_search.best_params_)

{'subsample': 0.6, 'n_estimators': 500, 'min_child_weight': 1, 'max_depth': 4, 'learning_rate': 0.01, 'gamma': 1.5, 'colsample_bytree': 0.8}
