In [421]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFECV
from xgboost import XGBRegressor
from sklearn.linear_model import ElasticNet
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import KFold, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.pipeline import Pipeline, make_pipeline, BaseEstimator, TransformerMixin
from sklearn.preprocessing import Imputer
from scipy.stats import skew, boxcox

In [422]:
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [425]:
class ProcessDataFrame(BaseEstimator, TransformerMixin):
    def __init__(self):
        return self
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X_result = X.copy()
        ordered_levels = {
            "Alley": ["Grvl", "Pave"],
            "BsmtCond": ["Po", "Fa", "TA", "Gd"],
            "BsmtExposure": ["No", "Mn", "Av", "Gd"],
            "BsmtFinType1": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
            "BsmtFinType2": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
            "BsmtQual": ["Fa", "TA", "Gd", "Ex"],
            "CentralAir": ["N", "Y"],
            "Electrical": ["Mix", "FuseP", "FuseF", "FuseA", "SBrkr"],
            "ExterCond": ["Po", "Fa", "TA", "Gd", "Ex"],
            "ExterQual": ["Fa", "TA", "Gd", "Ex"],
            "Fence": ["MnWw", "GdWo", "MnPrv", "GdPrv"],
            "FireplaceQu": ["Po", "Fa", "TA", "Gd", "Ex"],
            "GarageCond": ["Po", "Fa", "TA", "Gd", "Ex"],
            "GarageFinish": ["Unf", "RFn", "Fin"],
            "GarageQual": ["Po", "Fa", "TA", "Gd", "Ex"],
            "HeatingQC": ["Po", "Fa", "TA", "Gd", "Ex"],
            "KitchenQual": ["Fa", "TA", "Gd", "Ex"],
            "LotShape": ["IR3", "IR2", "IR1", "Reg"],
            "PavedDrive": ["N", "P", "Y"],
            "PoolQC": ["Fa", "Gd", "Ex"],
            "Street": ["Grvl", "Pave"],   
            "Utilities": ["NoSeWa", "AllPub"]}

        unordered_levels = {
            'BldgType': ['1Fam', '2fmCon', 'Duplex', 'TwnhsE', 'Twnhs'],
            'Condition1': ['Norm', 'Feedr', 'PosN', 'Artery', 'RRAe', 'RRNn', 'RRAn', 'PosA', 'RRNe'],
            'Condition2': ['Norm', 'Artery', 'RRNn', 'Feedr', 'PosN', 'PosA', 'RRAn', 'RRAe'],
            'Exterior1st': ['VinylSd', 'MetalSd', 'Wd Sdng', 'HdBoard', 'BrkFace', 'WdShing', 'CemntBd', 'Plywood', 
                            'AsbShng','Stucco', 'BrkComm', 'AsphShn', 'Stone', 'ImStucc', 'CBlock'],
            'Exterior2nd': ['VinylSd', 'MetalSd', 'Wd Shng', 'HdBoard', 'Plywood', 'Wd Sdng', 'CmentBd', 'BrkFace', 
                            'Stucco', 'AsbShng', 'Brk Cmn', 'ImStucc', 'AsphShn', 'Stone', 'Other', 'CBlock'],
            'Foundation': ['PConc', 'CBlock', 'BrkTil', 'Wood', 'Slab', 'Stone'],
            'Functional': ['Typ', 'Min1', 'Maj1', 'Min2', 'Mod', 'Maj2', 'Sev'],
            'GarageType': ['Attchd', 'Detchd', 'BuiltIn', 'CarPort', 'Basment', '2Types'],
            'Heating': ['GasA', 'GasW', 'Grav', 'Wall', 'OthW', 'Floor'],
            'HouseStyle': ['2Story', '1Story', '1.5Fin', '1.5Unf', 'SFoyer', 'SLvl', '2.5Unf', '2.5Fin'],
            'LandContour': ['Lvl', 'Bnk', 'Low', 'HLS'],
            'LandSlope': ['Gtl', 'Mod', 'Sev'],
            'LotConfig': ['Inside', 'FR2', 'Corner', 'CulDSac', 'FR3'],
            'MSZoning': ['RL', 'RM', 'C (all)', 'FV', 'RH'],
            'MasVnrType': ['BrkFace', 'None', 'Stone', 'BrkCmn'],
            'MiscFeature': ['Shed', 'Gar2', 'Othr', 'TenC'],
            'Neighborhood': ['CollgCr', 'Veenker', 'Crawfor', 'NoRidge', 'Mitchel', 'Somerst', 'NWAmes', 
                             'OldTown', 'BrkSide', 'Sawyer', 'NridgHt', 'NAmes', 'SawyerW', 'IDOTRR', 
                             'MeadowV', 'Edwards', 'Timber', 'Gilbert', 'StoneBr', 'ClearCr', 'NPkVill', 
                             'Blmngtn', 'BrDale', 'SWISU', 'Blueste'],
            'RoofMatl': ['CompShg', 'WdShngl', 'Metal', 'WdShake', 'Membran', 'Tar&Grv', 'Roll', 'ClyTile'],
            'RoofStyle': ['Gable', 'Hip', 'Gambrel', 'Mansard', 'Flat', 'Shed'],
            'SaleCondition': ['Normal', 'Abnorml', 'Partial', 'AdjLand', 'Alloca', 'Family'],
            'SaleType': ['WD', 'New', 'COD', 'ConLD', 'ConLI', 'CWD', 'ConLw', 'Con', 'Oth']}
        
        #convert ordered categorical to integers
        for c in ordered_levels.keys():
            X_result[c] = X_result[c].astype('category', categories=ordered_levels[c], ordered=True).cat.codes
        
        #convert remaining unordered categorical to dummy columns        
        X_result = pd.get_dummies(X_result)
        return X_result

In [410]:
def load_data():
    train = pd.read_csv('../input/train.csv')
    test = pd.read_csv('../input/test.csv')
    combined = train.append(test, ignore_index=True).drop(['Id','SalePrice'], axis=1)
        
    #categorical columns
    all_cat = combined.dtypes[combined.dtypes == 'object'].index
        
    #ordered categorical columns
    ordered_cat = {
    "GarageQual": ["Po", "Fa", "TA", "Gd", "Ex"],
    "BsmtQual": ["Fa", "TA", "Gd", "Ex"],
    "GarageCond": ["Po", "Fa", "TA", "Gd", "Ex"],
    "BsmtCond": ["Po", "Fa", "TA", "Gd"],
    "HeatingQC": ["Po", "Fa", "TA", "Gd", "Ex"],
    "FireplaceQu": ["Po", "Fa", "TA", "Gd", "Ex"],
    "KitchenQual": ["Fa", "TA", "Gd", "Ex"],
    "ExterQual": ["Fa", "TA", "Gd", "Ex"],
    "ExterCond": ["Po", "Fa", "TA", "Gd", "Ex"],
    "PoolQC": ["Fa", "Gd", "Ex"],
    "CentralAir": ["N", "Y"],
    "Utilities": ["NoSeWa", "AllPub"],
    "GarageFinish": ["Unf", "RFn", "Fin"],
    "PavedDrive": ["N", "P", "Y"],
    "Electrical": ["Mix", "FuseP", "FuseF", "FuseA", "SBrkr"],
    "Fence": ["MnWw", "GdWo", "MnPrv", "GdPrv"],
    "BsmtFinType1": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "BsmtFinType2": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "BsmtExposure": ["No", "Mn", "Av", "Gd"],
    "LotShape": ["IR3", "IR2", "IR1", "Reg"],
    "Alley": ["Grvl", "Pave"],
    "Street": ["Grvl", "Pave"]
    }
    
    #unordered categorical columns
    unordered_cat = list(set(all_cat) - set(ordered_cat))
    
    #convert ordered categorical to integers
    for c in ordered_cat.keys():
        combined[c] = combined[c].astype('category', categories=ordered_cat[c], ordered=True).cat.codes
        
    #convert remaining unordered categorical to dummy columns        
    combined = pd.get_dummies(combined)
            
    y = np.log(train['SalePrice'].values)
    X = combined.iloc[:train.shape[0],:]
    X_submission = combined.iloc[train.shape[0]:,:]
    ids_submission = test['Id'].values
    return y, X, X_submission, ids_submission

In [411]:
# load data
y, X, X_submission, ids_submission = load_data()

## Missing Values

In [412]:
(1 - X.count() / X.shape[0]).sort_values(ascending=False).head()

LotFrontage            0.177397
GarageYrBlt            0.055479
MasVnrArea             0.005479
Exterior1st_BrkFace    0.000000
Exterior1st_BrkComm    0.000000
dtype: float64

In [413]:
parameters = {'xgbregressor__max_depth': [6],
              'xgbregressor__learning_rate': [0.1],
              'xgbregressor__n_estimators': [100],
              'xgbregressor__min_child_weight': [2],
              'xgbregressor__subsample': [0.65],
              'xgbregressor__colsample_bytree': [0.65]}

pipeline = make_pipeline(Imputer(strategy='most_frequent'),
                         XGBRegressor(silent = True, objective='reg:linear', nthread=4, seed = 1773))

model = GridSearchCV(pipeline,
                     parameters, 
                     #n_iter=60, 
                     #random_state=1773,
                     scoring=make_scorer(rmse, greater_is_better=False), 
                     cv=KFold(10, random_state=42))

In [414]:
#0.121633129956

In [415]:
%%time
model.fit(X,y);
print 'best score:', model.best_score_
print 'best parameters:', model.best_params_

best score: -0.122200874318
best parameters: {'xgbregressor__learning_rate': 0.1, 'xgbregressor__subsample': 0.65, 'xgbregressor__n_estimators': 100, 'xgbregressor__colsample_bytree': 0.65, 'xgbregressor__max_depth': 6, 'xgbregressor__min_child_weight': 2}
CPU times: user 13.1 s, sys: 113 ms, total: 13.2 s
Wall time: 3.45 s


In [332]:
base_case = X.median()

In [322]:
#BldgType
base_case[X.columns[X.columns.map(lambda x: x.split('_')[0] == 'BldgType')]]

BldgType_1Fam      1.0
BldgType_2fmCon    0.0
BldgType_Duplex    0.0
BldgType_Twnhs     0.0
BldgType_TwnhsE    0.0
dtype: float64

In [341]:
BldgType_1Fam = base_case
BldgType_2fmCon = base_case.set_value('BldgType_1Fam', 0).set_value('BldgType_2fmCon', 1)
BldgType_Duplex = base_case.set_value('BldgType_1Fam', 0).set_value('BldgType_Duplex', 1)
BldgType_Twnhs = base_case.set_value('BldgType_1Fam', 0).set_value('BldgType_Twnhs', 1)
BldgType_TwnhsE = base_case.set_value('BldgType_1Fam', 0).set_value('BldgType_TwnhsE', 1)

In [342]:
print model.predict(BldgType_1Fam.reshape(1,-1))
print model.predict(BldgType_2fmCon.reshape(1,-1))
print model.predict(BldgType_Duplex.reshape(1,-1))
print model.predict(BldgType_Twnhs.reshape(1,-1))
print model.predict(BldgType_TwnhsE.reshape(1,-1))

[ 11.99930954]
[ 11.99930954]
[ 11.99930954]
[ 11.99930954]
[ 11.99930954]


In [326]:
base_case.set_value('BldgType_1Fam', 0).set_value('BldgType_BldgType_2fmCon', 1)

1stFlrSF                    1087.0
2ndFlrSF                       0.0
3SsnPorch                      0.0
Alley                          0.0
BedroomAbvGr                   3.0
BsmtCond                       2.0
BsmtExposure                   0.0
BsmtFinSF1                   383.5
BsmtFinSF2                     0.0
BsmtFinType1                   3.0
BsmtFinType2                   0.0
BsmtFullBath                   0.0
BsmtHalfBath                   0.0
BsmtQual                       2.0
BsmtUnfSF                    477.5
CentralAir                     1.0
Electrical                     4.0
EnclosedPorch                  0.0
ExterCond                      2.0
ExterQual                      1.0
Fence                         -1.0
FireplaceQu                    1.0
Fireplaces                     1.0
FullBath                       2.0
GarageArea                   480.0
GarageCars                     2.0
GarageCond                     2.0
GarageFinish                   1.0
GarageQual          

In [220]:
# # create submission predictions
# preds_submission = model.predict(X_submission)

# # save submission
# pd.DataFrame({'Id': ids_submission, 'SalePrice': np.exp(preds_submission)})\
#   .to_csv('../output/07_randomsearch_XGB.csv', index=False)

In [23]:
# %matplotlib inline
# import matplotlib.pyplot as plt

# plt.plot(np.sort(model.cv_results_['mean_test_score']));

In [24]:
%matplotlib inline
from xgboost import plot_importance
from matplotlib import pyplot



In [25]:
xgb_model = model.best_estimator_.steps[1][1]

In [26]:
import xgboost as xgb

In [27]:
import matplotlib.pyplot as plt

In [28]:
def get_xgb_imp(xgb, feat_names):
    from numpy import array
    imp_vals = xgb.booster().get_fscore()
    imp_dict = {feat_names[i]:float(imp_vals.get('f'+str(i),0.)) for i in range(len(feat_names))}
    total = array(imp_dict.values()).sum()
    return {k:v/total for k,v in imp_dict.items()}

In [29]:
var_imp = get_xgb_imp(xgb_model, X.columns)
var_imp = pd.DataFrame({'variable': var_imp.keys(), 'imp': var_imp.values()})

In [30]:
var_imp['main_variable'] = var_imp.variable.map(lambda x: x.split('_')[0])

In [31]:
var_imp.groupby('main_variable')['imp'].sum().sort_values(ascending=False)

main_variable
GrLivArea        0.060052
1stFlrSF         0.054830
LotArea          0.047650
Neighborhood     0.045039
BsmtFinSF1       0.044386
GarageArea       0.039164
TotalBsmtSF      0.037859
OverallQual      0.036554
BsmtUnfSF        0.035901
YearBuilt        0.032637
OverallCond      0.030679
2ndFlrSF         0.029373
SaleCondition    0.026762
MoSold           0.023499
OpenPorchSF      0.022846
GarageYrBlt      0.022846
LotFrontage      0.021540
YearRemodAdd     0.020888
YrSold           0.019582
BsmtExposure     0.019582
Exterior1st      0.015666
EnclosedPorch    0.014360
MasVnrArea       0.014360
MSZoning         0.013708
WoodDeckSF       0.013708
Condition1       0.013055
BsmtFinType1     0.012402
TotRmsAbvGrd     0.011097
Functional       0.010444
LandContour      0.009791
                   ...   
GarageCars       0.005222
Fireplaces       0.004569
FireplaceQu      0.004569
ExterQual        0.004569
KitchenAbvGr     0.004569
Heating          0.004569
CentralAir       0.00456

In [None]:
plot_importance(xgb_model);