In [65]:
%matplotlib inline
import pandas as pd
import numpy as np
import xgboost as xgb
from random import Random
from xgboost import XGBRegressor
from sklearn.linear_model import ElasticNet, Lasso
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor
from sklearn.model_selection import KFold, RandomizedSearchCV, GridSearchCV, cross_val_predict
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.pipeline import make_pipeline, BaseEstimator, TransformerMixin
from sklearn.preprocessing import Imputer, KernelCenterer, StandardScaler
from scipy.stats import hmean, skew, boxcox
from scipy.optimize import fmin_cobyla

In [3]:
def rmse(y_true, y_pred):
    """Root-mean-squared error between targets and predictions.

    Parameters
    ----------
    y_true : array-like
        Reference target values.
    y_pred : array-like
        Predicted values, passed to ``mean_squared_error`` together with
        ``y_true``.

    Returns
    -------
    float
        Square root of ``mean_squared_error(y_true, y_pred)``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [39]:
# Read the train and test CSV files from the shared input directory.
train, test = map(pd.read_csv, ('../input/train.csv', '../input/test.csv'))

# MSSubClass holds numeric class codes; cast it to object on the training
# frame only (the test frame is left as loaded).
train["MSSubClass"] = train["MSSubClass"].astype('object')

In [71]:
class CategoricalFeatures(BaseEstimator, TransformerMixin):
    """Select the modelling columns and cast the categorical ones to pandas categoricals.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    always applies the fixed column list and category tables defined below.
    Because the category lists are fixed, values that are not listed for a
    column become missing (NaN) after the cast.
    """

    # Columns kept by ``transform``, in output order.
    _COLUMNS = [
        '1stFlrSF', '2ndFlrSF', '3SsnPorch', 'Alley', 'BedroomAbvGr', 'BldgType', 'BsmtCond',
        'BsmtExposure', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtFinType1', 'BsmtFinType2', 'BsmtFullBath',
        'BsmtHalfBath', 'BsmtQual', 'BsmtUnfSF', 'CentralAir', 'Condition1', 'Condition2',
        'Electrical', 'EnclosedPorch', 'ExterCond', 'ExterQual', 'Exterior1st', 'Exterior2nd',
        'Fence', 'FireplaceQu', 'Fireplaces', 'Foundation', 'FullBath', 'Functional',
        'GarageArea', 'GarageCars', 'GarageCond', 'GarageFinish', 'GarageQual', 'GarageType',
        'GarageYrBlt', 'GrLivArea', 'HalfBath', 'Heating', 'HeatingQC', 'HouseStyle',
        'KitchenAbvGr', 'KitchenQual', 'LandContour', 'LandSlope', 'LotArea', 'LotConfig',
        'LotFrontage', 'LotShape', 'LowQualFinSF', 'MSSubClass', 'MSZoning', 'MasVnrArea',
        'MasVnrType', 'MiscFeature', 'MiscVal', 'MoSold', 'Neighborhood', 'OpenPorchSF',
        'OverallCond', 'OverallQual', 'PavedDrive', 'PoolArea', 'PoolQC', 'RoofMatl',
        'RoofStyle', 'SaleCondition', 'SaleType', 'ScreenPorch', 'Street', 'TotRmsAbvGrd',
        'TotalBsmtSF', 'Utilities', 'WoodDeckSF', 'YearBuilt', 'YearRemodAdd', 'YrSold']

    # Ordered categoricals: the position of a value in its list is the rank
    # (and therefore the integer code) the column gets.
    _ORDERED = {
        "Alley": ["Grvl", "Pave"],
        "BsmtCond": ["Po", "Fa", "TA", "Gd"],
        "BsmtExposure": ["No", "Mn", "Av", "Gd"],
        "BsmtFinType1": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
        "BsmtFinType2": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
        "BsmtQual": ["Fa", "TA", "Gd", "Ex"],
        "CentralAir": ["N", "Y"],
        "Electrical": ["FuseP", "FuseF", "FuseA", "Mix", "SBrkr"],
        "ExterCond": ["Po", "Fa", "TA", "Gd", "Ex"],
        "ExterQual": ["Fa", "TA", "Gd", "Ex"],
        "Fence": ["MnWw", "GdWo", "MnPrv", "GdPrv"],
        "FireplaceQu": ["Po", "Fa", "TA", "Gd", "Ex"],
        'Functional': ['Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ'],
        "GarageCond": ["Po", "Fa", "TA", "Gd", "Ex"],
        "GarageFinish": ["Unf", "RFn", "Fin"],
        "GarageQual": ["Po", "Fa", "TA", "Gd", "Ex"],
        "HeatingQC": ["Po", "Fa", "TA", "Gd", "Ex"],
        "KitchenQual": ["Fa", "TA", "Gd", "Ex"],
        "LandSlope": ["Sev", "Mod", "Gtl"],
        "LotShape": ["IR3", "IR2", "IR1", "Reg"],
        "PavedDrive": ["N", "P", "Y"],
        "PoolQC": ["Fa", "Gd", "Ex"],
        "Street": ["Grvl", "Pave"],
        "Utilities": ["NoSeWa", "AllPub"]}

    # Unordered categoricals: only the set of allowed values matters.
    _UNORDERED = {
        "BldgType": ["1Fam", "2fmCon", "Duplex", "TwnhsE", "Twnhs"],
        "Exterior1st": ["VinylSd", "MetalSd", "Wd Sdng", "HdBoard", "BrkFace", "WdShing", "CemntBd", "Plywood", "AsbShng", "Stucco", "BrkComm", "AsphShn", "Stone", "ImStucc", "CBlock"],
        "Exterior2nd": ["VinylSd", "MetalSd", "Wd Shng", "HdBoard", "Plywood", "Wd Sdng", "CmentBd", "BrkFace", "Stucco", "AsbShng", "Brk Cmn", "ImStucc", "AsphShn", "Stone", "Other", "CBlock"],
        "Condition1": ["Norm", "Feedr", "PosN", "Artery", "RRAe", "RRNn", "RRAn", "PosA", "RRNe"],
        "Condition2": ["Norm", "Artery", "RRNn", "Feedr", "PosN", "PosA", "RRAn", "RRAe"],
        "Foundation": ["PConc", "CBlock", "BrkTil", "Wood", "Slab", "Stone"],
        "GarageType": ["Attchd", "Detchd", "BuiltIn", "CarPort", "Basment", "2Types"],
        "Heating": ["GasA", "GasW", "Grav", "Wall", "OthW", "Floor"],
        "HouseStyle": ["2Story", "1Story", "1.5Fin", "1.5Unf", "SFoyer", "SLvl", "2.5Unf", "2.5Fin"],
        "LandContour": ["Lvl", "Bnk", "Low", "HLS"],
        "LotConfig": ["Inside", "FR2", "Corner", "CulDSac", "FR3"],
        "MasVnrType": ["BrkFace", "None", "Stone", "BrkCmn"],
        "MSSubClass": [20, 30, 40, 45, 50, 60, 70, 75, 80, 85, 90, 120, 150, 160, 180, 190],
        "MSZoning": ["RL", "RM", "C (all)", "FV", "RH"],
        "Neighborhood": ["CollgCr", "Veenker", "Crawfor", "NoRidge", "Mitchel", "Somerst", "NWAmes", "OldTown", "BrkSide", "Sawyer", "NridgHt", "NAmes", "SawyerW", "IDOTRR", "MeadowV", "Edwards", "Timber", "Gilbert", "StoneBr", "ClearCr", "NPkVill", "Blmngtn", "BrDale", "SWISU", "Blueste"],
        "MiscFeature": ["Shed", "Gar2", "Othr", "TenC"],
        "RoofMatl": ["CompShg", "WdShngl", "Metal", "WdShake", "Membran", "Tar&Grv", "Roll", "ClyTile"],
        "RoofStyle": ["Gable", "Hip", "Gambrel", "Mansard", "Flat", "Shed"],
        "SaleCondition": ["Normal", "Abnorml", "Partial", "AdjLand", "Alloca", "Family"],
        "SaleType": ["WD", "New", "COD", "ConLD", "ConLI", "CWD", "ConLw", "Con", "Oth"]}

    def fit(self, X, y=None):
        """Learn nothing and return ``self`` (kept for the transformer API)."""
        return self

    def transform(self, X, y=None):
        """Return a new frame restricted to the modelling columns with typed categoricals.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every column listed in ``_COLUMNS``; extra columns
            are dropped. A missing column raises ``KeyError``.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            Copy of the selected columns; the input frame is not modified.
        """
        # Select first, then copy: avoids duplicating columns that are about
        # to be dropped and guarantees the assignments below write to an
        # independent frame rather than a view of the caller's data.
        X = X[self._COLUMNS].copy()

        # MSSubClass is a categorical and needs to be cast to object
        X["MSSubClass"] = X["MSSubClass"].astype('object')

        # pd.Categorical takes the category list directly. The previous
        # ``astype("category", categories=..., ordered=...)`` form has been
        # deprecated since pandas 0.21 and is not accepted by current pandas.
        for c in X.columns:
            if c in self._ORDERED:
                X[c] = pd.Categorical(X[c], categories=self._ORDERED[c], ordered=True)
            elif c in self._UNORDERED:
                X[c] = pd.Categorical(X[c], categories=self._UNORDERED[c], ordered=False)

        return X

In [70]:
class TreeFeatures(BaseEstimator, TransformerMixin):
    """Stateless feature builder used ahead of the tree-based models.

    Adds two indicator columns, replaces ordered categoricals with their
    integer codes, and passes the result through ``pd.get_dummies``.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return an engineered copy of ``X``; the input frame is left untouched.

        ``X`` must provide the ``FireplaceQu`` and ``GarageType`` columns.
        """
        frame = X.copy()

        # Indicator columns: 1 when a fireplace quality is recorded / when the
        # garage type equals "Attchd", otherwise 0.
        frame["HasFireplace"] = frame["FireplaceQu"].notnull() * 1
        frame["AttchdGarage"] = (frame['GarageType'] == "Attchd") * 1

        # Ordered categoricals are swapped for their integer codes
        # (pandas uses -1 as the code for a missing value).
        ordered_columns = [
            name for name in frame.columns
            if frame[name].dtype.name == 'category' and frame[name].cat.ordered
        ]
        for name in ordered_columns:
            frame[name] = frame[name].cat.codes

        return pd.get_dummies(frame)

In [75]:
class LinearFeatures(BaseEstimator, TransformerMixin):
    """Stateless feature builder used ahead of the linear models.

    Adds two indicator columns, replaces ordered categoricals with their
    integer codes, applies ``log1p`` to a fixed list of skewed columns, and
    passes the result through ``pd.get_dummies(..., drop_first=True)``.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return an engineered copy of ``X``; the input frame is left untouched.

        ``X`` must provide ``FireplaceQu``, ``GarageType`` and every column
        named in the skewed-column list below.
        """
        frame = X.copy()

        # Indicator columns: 1 when a fireplace quality is recorded / when the
        # garage type equals "Attchd", otherwise 0.
        frame["HasFireplace"] = frame["FireplaceQu"].notnull() * 1
        frame["AttchdGarage"] = (frame['GarageType'] == "Attchd") * 1

        # Ordered categoricals are swapped for their integer codes
        # (pandas uses -1 as the code for a missing value).
        ordered_columns = [
            name for name in frame.columns
            if frame[name].dtype.name == 'category' and frame[name].cat.ordered
        ]
        for name in ordered_columns:
            frame[name] = frame[name].cat.codes

        # Columns treated as skewed: compress them with log(1 + x).
        skewed_columns = ('LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF2', '1stFlrSF',
                          'GrLivArea', 'KitchenAbvGr', 'OpenPorchSF', 'PoolArea', 'MiscVal')
        for name in skewed_columns:
            frame[name] = np.log1p(frame[name])

        # drop_first=True drops one dummy level per encoded column.
        return pd.get_dummies(frame, drop_first=True)

In [126]:
# Lasso pipeline: categorical typing -> linear feature engineering ->
# imputation (Imputer defaults) -> standardisation -> Lasso.
# NOTE(review): despite the name, no bagging step is visible in this pipeline.
model_bagged_lasso = make_pipeline(
    CategoricalFeatures(),
    LinearFeatures(),
    Imputer(),
    StandardScaler(),
    Lasso(),
)

# Candidate regularisation strengths for the pipeline's Lasso step.
params = dict(lasso__alpha=[0.002, 0.004, 0.006])

# 5-fold grid search scored with negated mean squared error.
grid = GridSearchCV(
    estimator=model_bagged_lasso,
    param_grid=params,
    scoring="neg_mean_squared_error",
    cv=5,
)

In [127]:
# Fit against the natural log of SalePrice, so the reported error is on the log scale.
grid.fit(train, np.log(train.SalePrice))

# The grid is scored with a negated MSE, so flip the sign before the square root.
# print(...) with a single pre-formatted argument emits the same text on
# Python 2 and Python 3 (the bare print statement is a SyntaxError on Python 3).
print("RMSE: {}".format(np.sqrt(-grid.best_score_)))
print("params: {}".format(grid.best_params_))

RMSE: 0.129986652351
params: {'lasso__alpha': 0.004}


In [17]:
# NOTE(review): ProcessTreeData / ProcessLinearData are not defined in the
# visible cells — presumably feature transformers defined elsewhere (or earlier
# names of them); confirm they are in scope after Restart & Run All.

# Gradient-boosted trees (xgboost).
model_xgb = make_pipeline(ProcessTreeData(),
                          Imputer(strategy='most_frequent'),
                          XGBRegressor(silent=True,
                                       objective='reg:linear',
                                       seed=1773,
                                       max_depth=5,
                                       nthread=8,
                                       learning_rate=0.05,
                                       n_estimators=500,
                                       min_child_weight=1,
                                       subsample=0.65,
                                       colsample_bytree=0.65))

# Gradient-boosted trees (scikit-learn).
# An earlier definition of model_gbm that differed only in min_samples_leaf=2
# used to precede this one; it was overwritten immediately and never used, so
# only the effective definition is kept.
model_gbm = make_pipeline(ProcessTreeData(),
                          Imputer(strategy='most_frequent'),
                          GradientBoostingRegressor(random_state=1773,
                                                    learning_rate=0.1,
                                                    max_depth=4,
                                                    max_features=0.7,
                                                    min_samples_leaf=1,
                                                    n_estimators=250,
                                                    subsample=0.75))

# Extremely randomised trees.
model_et = make_pipeline(ProcessTreeData(),
                         Imputer(strategy='most_frequent'),
                         ExtraTreesRegressor(n_estimators=250,
                                             max_depth=14,
                                             n_jobs=8,
                                             random_state=1773,
                                             max_features=0.7))

# Elastic net on standardised linear features.
model_en = make_pipeline(ProcessLinearData(),
                         Imputer(strategy='most_frequent'),
                         StandardScaler(),
                         ElasticNet(l1_ratio=0.4, alpha=0.0009))

In [18]:
# Elastic net with 5 and 10 folds. get_oof_preds is defined outside the visible
# cells; element [0] of its result is scored against y here.
results_en_01 = get_oof_preds(model_en, X, y, X_submission, n_folds=5, n_iter=3, seed=1337)
results_en_02 = get_oof_preds(model_en, X, y, X_submission, n_folds=10, n_iter=3, seed=1337)

# Pre-formatted single-argument print: same text as the old print statement on
# Python 2 (label, separating space, value) and valid syntax on Python 3.
print('rmse - en  5:  {}'.format(rmse(y, results_en_01[0])))
print('rmse - en 10:  {}'.format(rmse(y, results_en_02[0])))

rmse - en  5:  0.133634724721
rmse - en 10:  0.133665158334


In [19]:
# XGBoost with 5 and 10 folds. get_oof_preds is defined outside the visible
# cells; element [0] of its result is scored against y here.
results_xgb_01 = get_oof_preds(model_xgb, X, y, X_submission, n_folds=5, n_iter=3, seed=1337)
results_xgb_02 = get_oof_preds(model_xgb, X, y, X_submission, n_folds=10, n_iter=3, seed=1337)

# Pre-formatted single-argument print: same text as the old print statement on
# Python 2 (label, separating space, value) and valid syntax on Python 3.
print('rmse - xgb  5:  {}'.format(rmse(y, results_xgb_01[0])))
print('rmse - xgb 10:  {}'.format(rmse(y, results_xgb_02[0])))

rmse - xgb  5:  0.120093131042
rmse - xgb 10:  0.120525114003


In [20]:
# Extra trees with 5 and 10 folds. get_oof_preds is defined outside the visible
# cells; element [0] of its result is scored against y here.
results_et_01 = get_oof_preds(model_et, X, y, X_submission, n_folds=5, n_iter=3, seed=1337)
results_et_02 = get_oof_preds(model_et, X, y, X_submission, n_folds=10, n_iter=3, seed=1337)

# Pre-formatted single-argument print: same text as the old print statement on
# Python 2 (label, separating space, value) and valid syntax on Python 3.
print('rmse - et   5:  {}'.format(rmse(y, results_et_01[0])))
print('rmse - et  10:  {}'.format(rmse(y, results_et_02[0])))

rmse - et   5:  0.136789995158
rmse - et  10:  0.136847957427


In [21]:
# Gradient boosting with 5 and 10 folds. get_oof_preds is defined outside the
# visible cells; element [0] of its result is scored against y here.
results_gbm_01 = get_oof_preds(model_gbm, X, y, X_submission, n_folds=5, n_iter=3, seed=1337)
results_gbm_02 = get_oof_preds(model_gbm, X, y, X_submission, n_folds=10, n_iter=3, seed=1337)

# Pre-formatted single-argument print: same text as the old print statement on
# Python 2 (label, separating space, value) and valid syntax on Python 3.
print('rmse - gbm  5:  {}'.format(rmse(y, results_gbm_01[0])))
print('rmse - gbm 10:  {}'.format(rmse(y, results_gbm_02[0])))

rmse - gbm  5:  0.121050217984
rmse - gbm 10:  0.12098865258


## blend results

In [23]:
def blend_results(model_results, y, scorer):
    """Blend several models' predictions with weights fitted by COBYLA.

    The weights minimise ``scorer(y, blend)`` on the train-side predictions,
    subject to every weight being non-negative. The blend is the weighted sum
    of the predictions divided by the sum of the weights, so only the relative
    size of the weights matters. The normalised weights are printed.

    Parameters
    ----------
    model_results : sequence of (train_preds, test_preds)
        One pair per model. ``train_preds`` must be comparable with ``y`` by
        ``scorer``; all ``test_preds`` must be mutually compatible for
        element-wise arithmetic.
    y : array-like
        Targets the train-side predictions are scored against.
    scorer : callable
        ``scorer(y, preds) -> float``; lower is better.

    Returns
    -------
    tuple
        ``(blended_train_preds, blended_test_preds)``.

    Raises
    ------
    ValueError
        If ``model_results`` is empty.
    """
    if len(model_results) == 0:
        raise ValueError("model_results must contain at least one (train, test) pair")

    results_train = [result[0] for result in model_results]
    results_test = [result[1] for result in model_results]

    def blended(p, x):
        """blend model results using weights(p)"""
        total = p[0] * x[0]
        for i in range(1, len(p)):
            total = total + p[i] * x[i]
        return total / sum(p)

    def constraint(p, *args):
        """constrain to positive weights"""
        # COBYLA treats a constraint as satisfied when its value is >= 0.
        # The extra positional arguments it forwards are not needed here.
        return min(p)

    def error(p, x, y):
        """error function to optimize"""
        return scorer(y, blended(p, x))

    # initial weights: uniform
    p0 = np.array([1.] * len(model_results)) / len(model_results)

    p = fmin_cobyla(error, p0, args=(results_train, y), cons=[constraint], rhoend=1e-5)
    # Single pre-formatted argument: same text on Python 2 as the old print
    # statement, and valid syntax on Python 3.
    print('weights: {}'.format(np.array(p) / sum(p)))

    return blended(p, results_train), blended(p, results_test)

In [24]:
# Model results handed to the blender. Both fold settings are used for xgb and
# gbm; only the 10-fold runs are used for extra trees and the elastic net.
results_all = [
    results_xgb_01,
    results_xgb_02,
    results_gbm_01,
    results_gbm_02,
    results_et_02,
    results_en_02,
]

# Fit the blend weights against y using RMSE as the objective.
results_blended = blend_results(results_all, y, rmse)

weights: [ 0.30410503  0.06390163  0.13652156  0.24959252  0.0089372   0.23694207]


In [None]:
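# NOTE(review): unlabelled scores kept from an earlier run — presumably RMSE values; confirm what each line refers to, or remove this cell.
# 0.121962634859
# 0.12366116241
# 0.126804423909
# 0.124402174902
# 0.137316252924
# 0.119642192352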

In [25]:
# RMSE of each individual model's train-side predictions, then of the blend.
# print(...) with one argument behaves the same on Python 2 and Python 3
# (the bare print statement is a SyntaxError on Python 3).
for x in results_all:
    print(rmse(y, x[0]))
print(rmse(y, results_blended[0]))

0.120093131042
0.120525114003
0.121050217984
0.12098865258
0.136847957427
0.133665158334
0.11703525582


In [26]:
# Largest absolute difference between y and the blended train-side predictions.
np.amax(np.absolute(y - results_blended[0]))

1.1402416018561308

In [487]:
# Write the submission file: the blended test-side predictions are passed
# through exp() before being saved as SalePrice.
(pd.DataFrame({'Id': ids_submission,
               'SalePrice': np.exp(results_blended[1])})
   .to_csv('../ensemble/models/blended_sub.csv', index=False))