In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.metrics import make_scorer, mean_squared_error



In [2]:
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [3]:
def load_data():
    train = pd.read_csv('../input/train.csv')
    test = pd.read_csv('../input/test.csv')
    combined = train.append(test, ignore_index=True).drop(['Id','SalePrice'], axis=1)
    
    #drop columns
    combined.drop(['Utilities'], axis=1, inplace=True)
    
    #categorical columns
    all_cat = combined.dtypes[combined.dtypes == 'object'].index
    
    #ordered categorical columns
    ordered_cat = {
    "GarageQual": ["Po", "Fa", "TA", "Gd", "Ex"],
    "BsmtQual": ["Fa", "TA", "Gd", "Ex"],
    "GarageCond": ["Po", "Fa", "TA", "Gd", "Ex"],
    "BsmtCond": ["Po", "Fa", "TA", "Gd"],
    "HeatingQC": ["Po", "Fa", "TA", "Gd", "Ex"],
    "FireplaceQu": ["Po", "Fa", "TA", "Gd", "Ex"],
    "KitchenQual": ["Fa", "TA", "Gd", "Ex"],
    "ExterQual": ["Fa", "TA", "Gd", "Ex"],
    "ExterCond": ["Po", "Fa", "TA", "Gd", "Ex"],
    "PoolQC": ["Fa", "Gd", "Ex"],
    "CentralAir": ["N", "Y"],
    #"Utilities": ["NoSeWa", "AllPub"],
    "GarageFinish": ["Unf", "RFn", "Fin"],
    "PavedDrive": ["N", "P", "Y"],
    "Electrical": ["Mix", "FuseP", "FuseF", "FuseA", "SBrkr"],
    "Fence": ["MnWw", "GdWo", "MnPrv", "GdPrv"],
    "BsmtFinType1": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "BsmtFinType2": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "BsmtExposure": ["No", "Mn", "Av", "Gd"],
    "LotShape": ["IR3", "IR2", "IR1", "Reg"],
    "Alley": ["Grvl", "Pave"],
    "Street": ["Grvl", "Pave"]}
    
    #unordered categorical columns
    unordered_cat = list(set(all_cat) - set(ordered_cat))
    
    #convert ordered categorical to integers
    for c in ordered_cat.keys():
        combined[c] = combined[c].astype('category', categories=ordered_cat[c], ordered=True).cat.codes

    #convert remaining unordered categorical to dummy columns        
    combined = pd.get_dummies(combined)
    
    #combined = combined.fillna(combined.mean())
    
    y = np.log(train['SalePrice'].values)
    X = combined.iloc[:train.shape[0],:]
    X_submission = combined.iloc[train.shape[0]:,:]
    ids_submission = test['Id'].values
    return y, X, X_submission, ids_submission

In [4]:
# load data
y, X, X_submission, ids_submission = load_data()

In [5]:
parameters = {'max_depth': [5,7,9,11,13,15],
              'learning_rate': [0.5, 0.1, 0.2, 0.3],
              'n_estimators': [100],
              'min_child_weight': [1,2,4,8],
              'subsample': [0.4, 0.6, 0.8],
              'colsample_bytree': [0.4, 0.6, 0.8]}

model = RandomizedSearchCV(XGBRegressor(silent = True, objective='reg:linear', nthread=4, seed = 1337),
                           parameters, 
                           n_iter=60, 
                           random_state=1773,
                           scoring=make_scorer(rmse, greater_is_better=False), 
                           cv=KFold(10, random_state=1773))

In [6]:
%%time
model.fit(X,y)

CPU times: user 13min 32s, sys: 7.8 s, total: 13min 40s
Wall time: 3min 30s


RandomizedSearchCV(cv=KFold(n_splits=10, random_state=1773, shuffle=False),
          error_score='raise',
          estimator=XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=4,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=1337, silent=True, subsample=1),
          fit_params={}, iid=True, n_iter=60, n_jobs=1,
          param_distributions={'colsample_bytree': [0.4, 0.6, 0.8], 'learning_rate': [0.5, 0.1, 0.2, 0.3], 'min_child_weight': [1, 2, 4, 8], 'n_estimators': [100], 'subsample': [0.4, 0.6, 0.8], 'max_depth': [5, 7, 9, 11, 13, 15]},
          pre_dispatch='2*n_jobs', random_state=1773, refit=True,
          return_train_score=True,
          scoring=make_scorer(rmse, greater_is_better=False), verbose=0)

In [None]:
#-0.120212531158

In [7]:
print 'best score:', model.best_score_
print 'best parameters:', model.best_params_

best score: -0.120212531158
best parameters: {'colsample_bytree': 0.6, 'learning_rate': 0.1, 'min_child_weight': 2, 'n_estimators': 100, 'subsample': 0.6, 'max_depth': 5}


In [None]:
# create submission predictions
preds_submission = model.predict(X_submission)

# save submission
pd.DataFrame({'Id': ids_submission, 'SalePrice': np.exp(preds_submission)})\
  .to_csv('../output/07_randomsearch_XGB.csv', index=False)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

plt.plot(np.sort(model.cv_results_['mean_test_score']));