In [59]:
%matplotlib inline
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import KFold, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.pipeline import make_pipeline, BaseEstimator, TransformerMixin
from sklearn.preprocessing import Imputer

In [41]:
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values.

    Returns
    -------
    The square root of ``mean_squared_error(y_true, y_pred)``.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error)

In [42]:
class ProcessDataFrame(BaseEstimator, TransformerMixin):
    """Stateless transformer that turns the raw DataFrame into an all-numeric one.

    ``transform`` performs, in order:

    1. drop the ``Id`` column;
    2. replace every column listed in ``ORDERED_LEVELS`` with integer codes
       (the position of the value in its level list; values that are not in
       the list, including NaN, get the pandas categorical code ``-1``);
    3. cast every column listed in ``UNORDERED_LEVELS`` to a categorical
       with the fixed level list given there;
    4. one-hot encode the remaining non-numeric columns with
       ``pd.get_dummies``.

    The level lists are fixed up front rather than inferred from the data,
    so the encoding does not depend on which levels happen to be present in
    the frame being transformed.

    Nothing is learned in ``fit``; the transformer carries no state.
    """

    # Ordinal features: the list order defines the integer code (0, 1, 2, ...).
    ORDERED_LEVELS = {
        "Alley": ["Grvl", "Pave"],
        "BsmtCond": ["Po", "Fa", "TA", "Gd"],
        "BsmtExposure": ["No", "Mn", "Av", "Gd"],
        "BsmtFinType1": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
        "BsmtFinType2": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
        "BsmtQual": ["Fa", "TA", "Gd", "Ex"],
        "CentralAir": ["N", "Y"],
        "Electrical": ["Mix", "FuseP", "FuseF", "FuseA", "SBrkr"],
        "ExterCond": ["Po", "Fa", "TA", "Gd", "Ex"],
        "ExterQual": ["Fa", "TA", "Gd", "Ex"],
        "Fence": ["MnWw", "GdWo", "MnPrv", "GdPrv"],
        "FireplaceQu": ["Po", "Fa", "TA", "Gd", "Ex"],
        "GarageCond": ["Po", "Fa", "TA", "Gd", "Ex"],
        "GarageFinish": ["Unf", "RFn", "Fin"],
        "GarageQual": ["Po", "Fa", "TA", "Gd", "Ex"],
        "HeatingQC": ["Po", "Fa", "TA", "Gd", "Ex"],
        "KitchenQual": ["Fa", "TA", "Gd", "Ex"],
        "LotShape": ["IR3", "IR2", "IR1", "Reg"],
        "PavedDrive": ["N", "P", "Y"],
        "PoolQC": ["Fa", "Gd", "Ex"],
        "Street": ["Grvl", "Pave"],
        "Utilities": ["NoSeWa", "AllPub"]}

    # Nominal features: the list only fixes the set (and column order) of
    # dummy variables; no ordering between levels is implied.
    UNORDERED_LEVELS = {
        'BldgType': ['1Fam', '2fmCon', 'Duplex', 'TwnhsE', 'Twnhs'],
        'Condition1': ['Norm', 'Feedr', 'PosN', 'Artery', 'RRAe', 'RRNn', 'RRAn', 'PosA', 'RRNe'],
        'Condition2': ['Norm', 'Artery', 'RRNn', 'Feedr', 'PosN', 'PosA', 'RRAn', 'RRAe'],
        'Exterior1st': ['VinylSd', 'MetalSd', 'Wd Sdng', 'HdBoard', 'BrkFace', 'WdShing', 'CemntBd', 'Plywood',
                        'AsbShng', 'Stucco', 'BrkComm', 'AsphShn', 'Stone', 'ImStucc', 'CBlock'],
        'Exterior2nd': ['VinylSd', 'MetalSd', 'Wd Shng', 'HdBoard', 'Plywood', 'Wd Sdng', 'CmentBd', 'BrkFace',
                        'Stucco', 'AsbShng', 'Brk Cmn', 'ImStucc', 'AsphShn', 'Stone', 'Other', 'CBlock'],
        'Foundation': ['PConc', 'CBlock', 'BrkTil', 'Wood', 'Slab', 'Stone'],
        'Functional': ['Typ', 'Min1', 'Maj1', 'Min2', 'Mod', 'Maj2', 'Sev'],
        'GarageType': ['Attchd', 'Detchd', 'BuiltIn', 'CarPort', 'Basment', '2Types'],
        'Heating': ['GasA', 'GasW', 'Grav', 'Wall', 'OthW', 'Floor'],
        'HouseStyle': ['2Story', '1Story', '1.5Fin', '1.5Unf', 'SFoyer', 'SLvl', '2.5Unf', '2.5Fin'],
        'LandContour': ['Lvl', 'Bnk', 'Low', 'HLS'],
        'LandSlope': ['Gtl', 'Mod', 'Sev'],
        'LotConfig': ['Inside', 'FR2', 'Corner', 'CulDSac', 'FR3'],
        'MSZoning': ['RL', 'RM', 'C (all)', 'FV', 'RH'],
        'MasVnrType': ['BrkFace', 'None', 'Stone', 'BrkCmn'],
        'MiscFeature': ['Shed', 'Gar2', 'Othr', 'TenC'],
        'Neighborhood': ['CollgCr', 'Veenker', 'Crawfor', 'NoRidge', 'Mitchel', 'Somerst', 'NWAmes',
                         'OldTown', 'BrkSide', 'Sawyer', 'NridgHt', 'NAmes', 'SawyerW', 'IDOTRR',
                         'MeadowV', 'Edwards', 'Timber', 'Gilbert', 'StoneBr', 'ClearCr', 'NPkVill',
                         'Blmngtn', 'BrDale', 'SWISU', 'Blueste'],
        'RoofMatl': ['CompShg', 'WdShngl', 'Metal', 'WdShake', 'Membran', 'Tar&Grv', 'Roll', 'ClyTile'],
        'RoofStyle': ['Gable', 'Hip', 'Gambrel', 'Mansard', 'Flat', 'Shed'],
        'SaleCondition': ['Normal', 'Abnorml', 'Partial', 'AdjLand', 'Alloca', 'Family'],
        'SaleType': ['WD', 'New', 'COD', 'ConLD', 'ConLI', 'CWD', 'ConLw', 'Con', 'Oth']}

    def fit(self, X, y=None):
        """No-op: there is nothing to learn. Returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return an encoded copy of ``X``; the input frame is not modified.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain an ``Id`` column (it is dropped). Columns named in
            ``ORDERED_LEVELS`` / ``UNORDERED_LEVELS`` are encoded when
            present; absent ones are skipped.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            ``X`` without ``Id``, with ordinal columns as integer codes and
            the remaining categorical columns expanded to dummy columns.
        """
        X_result = X.copy().drop(['Id'], axis=1)

        # pd.Categorical replaces the removed
        # ``Series.astype('category', categories=..., ordered=...)`` form
        # and is accepted by both old and current pandas.

        # Convert ordered categoricals to their integer codes.
        for column, levels in self.ORDERED_LEVELS.items():
            if column in X_result.columns:
                X_result[column] = pd.Categorical(
                    X_result[column], categories=levels, ordered=True).codes

        # Pin the level set of unordered categoricals before dummy-encoding.
        for column, levels in self.UNORDERED_LEVELS.items():
            if column in X_result.columns:
                X_result[column] = pd.Categorical(
                    X_result[column], categories=levels)

        # Convert unordered categoricals to dummy columns.
        return pd.get_dummies(X_result)

    def fit_transform(self, X, y=None):
        """Equivalent to ``transform(X)`` because fitting is a no-op."""
        return self.transform(X)

In [43]:
def load_data(train_path='../input/train.csv', test_path='../input/test.csv'):
    """Read the training and test CSV files and split them into model inputs.

    Parameters
    ----------
    train_path : str, optional
        CSV file holding the training rows; must contain a ``SalePrice``
        column. Defaults to the original hard-coded location.
    test_path : str, optional
        CSV file holding the rows to predict; must contain an ``Id``
        column. Defaults to the original hard-coded location.

    Returns
    -------
    y : numpy.ndarray
        Natural log of the training ``SalePrice`` values.
    X : pandas.DataFrame
        Training frame with the ``SalePrice`` column removed.
    X_submission : pandas.DataFrame
        The test frame exactly as read from ``test_path``.
    ids_submission : numpy.ndarray
        Values of the ``Id`` column of the test frame.
    """
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    # The target is modelled on the log scale.
    y = np.log(train['SalePrice'].values)
    X = train.drop(['SalePrice'], axis=1)
    ids_submission = test['Id'].values
    return y, X, test, ids_submission

In [44]:
# Load the data: y is the log of the training SalePrice, X the training
# features, X_submission the test frame and ids_submission its Id column.
y, X, X_submission, ids_submission = load_data()

In [45]:
# Hyper-parameter grid for the XGBRegressor step of the pipeline
# (keys follow the ``<step name>__<parameter>`` convention).
# Every list holds a single value, so the search below fits exactly one
# configuration and reports its cross-validated score.
parameters = {'xgbregressor__max_depth': [5],
              'xgbregressor__learning_rate': [0.1],
              'xgbregressor__n_estimators': [200],
              'xgbregressor__min_child_weight': [2],
              'xgbregressor__subsample': [0.65],
              'xgbregressor__colsample_bytree': [0.65]}

# Encode the frame -> fill remaining NaNs with the per-column mode -> boost.
# NOTE(review): Imputer and the silent/nthread/seed/'reg:linear' arguments are
# the spellings of older scikit-learn / xgboost releases - confirm against the
# installed versions before upgrading.
pipeline = make_pipeline(ProcessDataFrame(),
                         Imputer(strategy='most_frequent'),
                         XGBRegressor(silent=True, objective='reg:linear', nthread=4, seed=1773))

# Ten contiguous, unshuffled folds. random_state is intentionally not passed:
# it has no effect without shuffle=True, and recent scikit-learn raises a
# ValueError for that combination. The folds are identical to before.
folds = KFold(n_splits=10)

# greater_is_better=False makes the scorer return the negated RMSE, so the
# reported best_score_ is negative and closer to zero is better.
model = GridSearchCV(pipeline,
                     parameters,
                     scoring=make_scorer(rmse, greater_is_better=False),
                     cv=folds)

In [47]:
%%time
model.fit(X,y);
print 'best score:', model.best_score_
print 'best parameters:', model.best_params_

best score: -0.117796324311
best parameters: {'xgbregressor__learning_rate': 0.1, 'xgbregressor__subsample': 0.65, 'xgbregressor__n_estimators': 200, 'xgbregressor__colsample_bytree': 0.65, 'xgbregressor__max_depth': 5, 'xgbregressor__min_child_weight': 2}
CPU times: user 27.1 s, sys: 386 ms, total: 27.5 s
Wall time: 7.14 s
