In [46]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFECV
from xgboost import XGBRegressor
from sklearn.linear_model import ElasticNet
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import KFold, RandomizedSearchCV, GridSearchCV, cross_val_score
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import Imputer

In [47]:
def rmse(y_true, y_pred):
    """Root-mean-squared error between ``y_true`` and ``y_pred``.

    Computed as the square root of ``mean_squared_error(y_true, y_pred)``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [48]:
def load_data(train_path='../input/train.csv', test_path='../input/test.csv'):
    """Read the train/test CSVs and build encoded feature matrices.

    Train and test rows are stacked so both receive the same encoding: the
    columns listed in ``ordered_cat`` are replaced by integer codes that follow
    the listed level order, and ``pd.get_dummies`` one-hot encodes the
    remaining categorical columns. Missing values in other columns are not
    filled here.

    Parameters
    ----------
    train_path : str
        CSV with the training rows; must contain ``Id`` and ``SalePrice``.
    test_path : str
        CSV with the rows to predict; must contain ``Id``.
        Both defaults are the locations that were previously hard-coded.

    Returns
    -------
    y : numpy.ndarray
        Natural log of the training ``SalePrice`` values.
    X : pandas.DataFrame
        Encoded features of the training rows.
    X_submission : pandas.DataFrame
        Encoded features of the test rows (same columns as ``X``).
    ids_submission : numpy.ndarray
        ``Id`` values of the test rows.
    """
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    # pd.concat is the supported replacement for DataFrame.append, which is
    # no longer available in current pandas releases.
    combined = pd.concat([train, test], ignore_index=True)
    combined = combined.drop(['Id', 'SalePrice'], axis=1)

    # Ordered categorical columns; levels are listed from lowest to highest.
    ordered_cat = {
        "GarageQual": ["Po", "Fa", "TA", "Gd", "Ex"],
        "BsmtQual": ["Fa", "TA", "Gd", "Ex"],
        "GarageCond": ["Po", "Fa", "TA", "Gd", "Ex"],
        "BsmtCond": ["Po", "Fa", "TA", "Gd"],
        "HeatingQC": ["Po", "Fa", "TA", "Gd", "Ex"],
        "FireplaceQu": ["Po", "Fa", "TA", "Gd", "Ex"],
        "KitchenQual": ["Fa", "TA", "Gd", "Ex"],
        "ExterQual": ["Fa", "TA", "Gd", "Ex"],
        "ExterCond": ["Po", "Fa", "TA", "Gd", "Ex"],
        "PoolQC": ["Fa", "Gd", "Ex"],
        "CentralAir": ["N", "Y"],
        "Utilities": ["NoSeWa", "AllPub"],
        "GarageFinish": ["Unf", "RFn", "Fin"],
        "PavedDrive": ["N", "P", "Y"],
        "Electrical": ["Mix", "FuseP", "FuseF", "FuseA", "SBrkr"],
        "Fence": ["MnWw", "GdWo", "MnPrv", "GdPrv"],
        "BsmtFinType1": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
        "BsmtFinType2": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
        "BsmtExposure": ["No", "Mn", "Av", "Gd"],
        "LotShape": ["IR3", "IR2", "IR1", "Reg"],
        "Alley": ["Grvl", "Pave"],
        "Street": ["Grvl", "Pave"]
    }

    # Convert ordered categoricals to integer codes. A value that is missing
    # or not among the listed levels gets the code -1.
    for col, levels in ordered_cat.items():
        combined[col] = pd.Categorical(combined[col], categories=levels, ordered=True).codes

    # Convert the remaining (unordered) categorical columns to dummy columns.
    combined = pd.get_dummies(combined)

    # The first n_train rows of the stacked frame are the training rows.
    n_train = train.shape[0]
    y = np.log(train['SalePrice'].values)
    X = combined.iloc[:n_train, :]
    X_submission = combined.iloc[n_train:, :]
    ids_submission = test['Id'].values
    return y, X, X_submission, ids_submission

In [49]:
# load data: log-transformed target `y`, encoded training features `X`,
# encoded test features `X_submission` and the test rows' `Id` values
y, X, X_submission, ids_submission = load_data()

## Missing Values

In [50]:
# Share of missing entries per feature, largest first.
n_rows = X.shape[0]
missing_share = 1 - X.count() / n_rows
missing_share.sort_values(ascending=False).head()

LotFrontage            0.177397
GarageYrBlt            0.055479
MasVnrArea             0.005479
Exterior1st_AsphShn    0.000000
Exterior1st_AsbShng    0.000000
dtype: float64

In [51]:
# Hyper-parameters for the pipeline, keyed as `<step name>__<parameter>`;
# `xgbregressor` is the step name make_pipeline derives from the class name.
best_parameters = {'xgbregressor__max_depth': 5,
                   'xgbregressor__learning_rate': 0.1,
                   'xgbregressor__n_estimators': 100,
                   'xgbregressor__min_child_weight': 2,
                   'xgbregressor__subsample': 0.6,
                   'xgbregressor__colsample_bytree': 0.6}


# Candidate values the randomized search samples from.
parameters = {'xgbregressor__max_depth': [4,5,6],
              'xgbregressor__learning_rate': [0.1],
              'xgbregressor__n_estimators': [100],
              'xgbregressor__min_child_weight': [1,2,3],
              'xgbregressor__subsample': [0.5, 0.6, 0.7],
              'xgbregressor__colsample_bytree': [0.5, 0.6, 0.7]}

# Fill missing values with each column's most frequent value, then fit XGBoost.
pipeline = make_pipeline(Imputer(strategy='most_frequent'),
                         XGBRegressor(silent = True, objective='reg:linear', nthread=4, seed = 1773))

# The folds are not shuffled, so KFold's `random_state` would have no effect:
# older scikit-learn ignores it and newer releases raise a ValueError when it
# is passed with shuffle=False. It is therefore omitted; the folds are unchanged.
# The scorer negates RMSE (greater_is_better=False) so that larger is better.
model = RandomizedSearchCV(pipeline,
                           parameters,
                           n_iter=60,
                           random_state=1773,
                           scoring=make_scorer(rmse, greater_is_better=False),
                           cv=KFold(10))

In [52]:
# Apply the hyper-parameters in `best_parameters` to the pipeline. set_params
# returns the estimator itself, so `pipeline` keeps referring to the same object.
pipeline = pipeline.set_params(**best_parameters)#.fit(X,y)

In [53]:
# 10-fold cross-validation of the pipeline. The scorer is built with
# greater_is_better=False, so each score is a negated RMSE (closer to zero is
# better).
# The folds are not shuffled, so KFold's `random_state` would have no effect
# (newer scikit-learn raises a ValueError when it is passed with
# shuffle=False); it is omitted and the folds are unchanged.
cv_scores = cross_val_score(pipeline, X, y,
                            scoring=make_scorer(rmse, greater_is_better=False),
                            cv=KFold(10))

# Single-argument print is valid as a Python 2 statement and a Python 3 call
# and emits the same text in both.
print('score: %s' % cv_scores.mean())
print('std:   %s' % cv_scores.std())

score: -0.12456243611
std:   0.0163575772297


In [33]:
# %%time
# model.fit(X,y)

In [14]:
#0.119373653704
#0.121324499584 - Utility
#0.122664838897 - 

In [38]:
# print 'best score:', model.best_score_
# print 'best parameters:', model.best_params_

In [None]:
# # create submission predictions
# preds_submission = model.predict(X_submission)

# # save submission
# pd.DataFrame({'Id': ids_submission, 'SalePrice': np.exp(preds_submission)})\
#   .to_csv('../output/07_randomsearch_XGB.csv', index=False)

In [None]:
# %matplotlib inline
# from xgboost import plot_importance
# from matplotlib import pyplot

In [16]:
# `best_estimator_` only exists once the randomized search has been fitted.
# When it has not been (the search can be skipped), fall back to fitting the
# pipeline itself so this cell also works on a fresh kernel.
if hasattr(model, 'best_estimator_'):
    fitted_pipeline = model.best_estimator_
else:
    fitted_pipeline = pipeline.fit(X, y)

# make_pipeline names each step after its lower-cased class name.
xgb_model = fitted_pipeline.named_steps['xgbregressor']

In [17]:
import xgboost as xgb

In [18]:
import matplotlib.pyplot as plt



In [19]:
def get_xgb_imp(xgb, feat_names):
    """Return each feature's share of the booster's f-scores.

    Parameters
    ----------
    xgb : object
        Fitted XGBoost scikit-learn wrapper. Its booster is obtained through
        ``get_booster()`` when available, otherwise through the older
        ``booster()`` method.
    feat_names : sequence of str
        Feature names in column order; the name at position ``i`` is looked
        up under the key ``'f<i>'`` in the booster's ``get_fscore()`` result.
        Features absent from that result count as 0.

    Returns
    -------
    dict
        Maps each feature name to its f-score divided by the total f-score.
        When the total is 0 (no feature was scored) every share is 0.0.
    """
    # Prefer the current accessor; in newer XGBoost `booster` is a plain
    # parameter attribute rather than a method.
    get_booster = getattr(xgb, 'get_booster', None)
    if get_booster is None:
        get_booster = xgb.booster
    fscores = get_booster().get_fscore()

    raw = {name: float(fscores.get('f%d' % i, 0.)) for i, name in enumerate(feat_names)}

    # Built-in sum works on dict views (Python 3) as well as lists (Python 2).
    total = sum(raw.values())
    if total == 0:
        return {name: 0. for name in raw}
    return {name: score / total for name, score in raw.items()}

In [20]:
# Per-feature importance shares as a two-column frame.
importance_by_feature = get_xgb_imp(xgb_model, X.columns)
var_imp = pd.DataFrame({
    'variable': list(importance_by_feature.keys()),
    'imp': list(importance_by_feature.values()),
})

In [21]:
# Group label for each column: the text before the first underscore in its name.
var_imp['main_variable'] = [name.split('_')[0] for name in var_imp['variable']]

In [22]:
# Total importance per `main_variable`, most important first.
imp_by_main_variable = var_imp.groupby('main_variable')['imp'].sum()
imp_by_main_variable.sort_values(ascending=False)

main_variable
GrLivArea        0.060052
1stFlrSF         0.054830
LotArea          0.047650
Neighborhood     0.045039
BsmtFinSF1       0.044386
GarageArea       0.039164
TotalBsmtSF      0.037859
OverallQual      0.036554
BsmtUnfSF        0.035901
YearBuilt        0.032637
OverallCond      0.030679
2ndFlrSF         0.029373
SaleCondition    0.026762
MoSold           0.023499
OpenPorchSF      0.022846
GarageYrBlt      0.022846
LotFrontage      0.021540
YearRemodAdd     0.020888
YrSold           0.019582
BsmtExposure     0.019582
Exterior1st      0.015666
EnclosedPorch    0.014360
MasVnrArea       0.014360
MSZoning         0.013708
WoodDeckSF       0.013708
Condition1       0.013055
BsmtFinType1     0.012402
TotRmsAbvGrd     0.011097
Functional       0.010444
LandContour      0.009791
                   ...   
GarageCars       0.005222
Fireplaces       0.004569
FireplaceQu      0.004569
ExterQual        0.004569
KitchenAbvGr     0.004569
Heating          0.004569
CentralAir       0.00456

In [None]:
# The bare name `plot_importance` is never imported by an active cell, so call
# it through the `xgb` module alias; the trailing `;` hides the Axes repr.
xgb.plot_importance(xgb_model);