In [None]:
#! pip install git+git://github.com/hyperopt/hyperopt-sklearn.git

In [3]:
from __future__ import print_function
import numpy as np
from sklearn import datasets
from sklearn.cross_validation import train_test_split
from hyperopt import tpe
import hpsklearn
import sys
import pandas as pd
from sklearn.linear_model import Lasso, ElasticNet, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, cross_val_predict , KFold, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer, StandardScaler
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from scipy.stats import hmean
from tqdm import tnrange, tqdm_notebook, tqdm
import xgboost as xgb
import forum_features

## scoring methods

In [4]:
def rmse_cv(model, X, y, n_folds=5):
    cv_scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', 
                                cv=KFold(n_folds, shuffle=True, random_state=42))
    print('mean: {:.4f}'.format(np.mean(np.sqrt(-cv_scores))))
    print('std:  {:.4f}'.format(np.std(np.sqrt(-cv_scores))))

In [5]:
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [6]:
def rmse_cv_avg(model, X, y, n_folds=5):
    y_pred = cross_val_predict(model, X, y, cv=KFold(n_folds, shuffle=True, random_state=42))
    score = rmse(y, y_pred)
    print('RMSE: {:.4f}'.format(score))

## load data

In [7]:
X, y, X_submission, ids_submission = forum_features.load_data()

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, test_size=.25, random_state=1)

## find new models

In [35]:
estimator = hpsklearn.HyperoptEstimator(
    preprocessing=hpsklearn.components.any_preprocessing('pp'),
    regressor=hpsklearn.components.any_regressor('reg'),
    loss_fn=rmse,
    algo=tpe.suggest,
    trial_timeout=300.0,
    max_evals=120,
    seed=1337
)

In [36]:
iterator = estimator.fit_iter(X_train, y_train)
next(iterator)

best = np.infty

bar = tnrange(estimator.max_evals, desc='trial')

for n_trial in bar:
    iterator.send(1)  # -- try one more model
    trial_score = estimator.trials.losses()[-1]
    if trial_score < best:
        best = trial_score
        bar.write('Trial {} loss: {}'.format(n_trial, best))

Trial 0 loss: 12.0044842971
Trial 1 loss: 1.40082119653
Trial 2 loss: 0.373522174397
Trial 3 loss: 0.253603385891
Trial 4 loss: 0.229742816947
Trial 5 loss: 0.198690352756
Trial 6 loss: 0.159869649397
Trial 7 loss: 0.155089062591
Trial 8 loss: 0.148801063361
Trial 9 loss: 0.145863561449
Trial 12 loss: 0.142887522448
Trial 13 loss: 0.108873930901
Trial 20 loss: 0.107778105737
Trial 69 loss: 0.107019906189
Trial 78 loss: None



In [37]:
model = estimator.best_model()
model

{'ex_preprocs': (),
 'learner': SVR(C=1.95702079447, cache_size=512, coef0=0.0, degree=1,
   epsilon=0.0116084150734, gamma=0.00363369397057, kernel='rbf',
   max_iter=166519567.0, shrinking=True, tol=0.0016221625196, verbose=False),
 'preprocs': (StandardScaler(copy=True, with_mean=False, with_std=False),)}

## models to review

In [None]:
reg1 = BaggingRegressor(Lasso(alpha=0.00015, max_iter=5000),
                        random_state=1337,
                        n_estimators=100,
                        oob_score=True,
                        max_samples=0.4,
                        max_features=1.0)

In [None]:
reg1 = BaggingRegressor(Lasso(alpha=0.00015, max_iter=5000),
                        random_state=1337,
                        n_estimators=100,
                        oob_score=True,
                        max_samples=0.4,
                        max_features=1.0)

In [287]:
model1 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=8, max_features=0.7, min_samples_leaf=2),
                           n_estimators=150, 
                           learning_rate=0.8, 
                           loss='linear', 
                           random_state=1337)

In [288]:
#0.1333

In [289]:
%%time
rmse_cv_avg(model1, X, y, n_folds=5)

RMSE: 0.1325
CPU times: user 30.9 s, sys: 142 ms, total: 31.1 s
Wall time: 31.3 s


In [37]:
reg3=xgb.XGBRegressor(colsample_bytree=0.4,
                      gamma=0.045,
                      learning_rate=0.05, #0.07
                      max_depth=20,
                      min_child_weight=1.5,
                      n_estimators=1500, #300
                      reg_alpha=0.65,
                      reg_lambda=0.45,
                      subsample=0.95)

params = {'colsample_bytree': [0.4],
          'max_depth': [16],
          'min_child_weight': [3],
          'subsample': [0.95],
          'gamma': [0.04],
          'reg_alpha': [0.65],
          'reg_lambda': [0.5]
}

grid = GridSearchCV(reg3, params, scoring='neg_mean_squared_error', 
                    cv=KFold(10, shuffle=True, random_state=42))
grid.fit(X,y)

print('score: {}'.format(np.sqrt(-grid.best_score_)))
print('params: {}'.format(grid.best_params_))

score: 0.120812359806
params: {'reg_alpha': 0.65, 'colsample_bytree': 0.4, 'min_child_weight': 3, 'subsample': 0.95, 'reg_lambda': 0.5, 'max_depth': 16, 'gamma': 0.04}


In [19]:
#reg3.get_params()

In [None]:
#0.120812359806 - 1500
#0.120897432807 - 1200
#0.121032529194 - 900
#0.121150609378 - 700
#0.121244175186 - 600
#0.121350323311 - 500
#0.121530616329 - 400
#0.121825438415 - 300
#0.121774543292
#0.126984446284 - 0.15
#0.123554835113

In [52]:
model3 = make_pipeline(StandardScaler(copy=True, with_mean=False, with_std=False),
                       SVR(C=1.95702079447, cache_size=512, coef0=0.0, degree=1,
                           epsilon=0.0116084150734, gamma=0.00363369397057, kernel='rbf',
                           max_iter=166519567.0, shrinking=True, tol=0.0016221625196, verbose=False))

In [84]:
params = {'svr__C': [4.50, 4.75, 5.00],
          'svr__gamma': [0.00105, 0.00115, 0.00125],
          'svr__epsilon': [0.0415, 0.0425, 0.0435]}

# 'svr__C': 1.95702079447,
#  'svr__cache_size': 512,
#  'svr__coef0': 0.0,
#  'svr__degree': 1,
#  'svr__epsilon': 0.0116084150734,
#  'svr__gamma': 0.00363369397057,
#  'svr__kernel': 'rbf',
#  'svr__max_iter': 166519567.0,
#  'svr__shrinking': True,
#  'svr__tol': 0.0016221625196,

grid = GridSearchCV(model3, params, scoring='neg_mean_squared_error', 
                    cv=KFold(10, shuffle=True, random_state=42))
grid.fit(X,y)

print('score: {}'.format(np.sqrt(-grid.best_score_)))
print('params: {}'.format(grid.best_params_))

score: 0.108641420758
params: {'svr__gamma': 0.00115, 'svr__epsilon': 0.0435, 'svr__C': 5.0}


In [75]:
# score: 0.10864547849
# params: {'svr__gamma': 0.00115, 'svr__epsilon': 0.0425, 'svr__C': 4.75}

In [61]:
%%time
rmse_cv_avg(model3, X, y, n_folds=10)

RMSE: 0.1121
CPU times: user 4.8 s, sys: 34.4 ms, total: 4.83 s
Wall time: 4.84 s


## models to keep

In [62]:
def oob_predictions(model, X, y, X_sub, n_folds=5):
    folds = cv=KFold(n_folds, shuffle=True, random_state=42).split(X)
    
    if type(X).__name__ == 'SparseDataFrame':
        X = X.values
        X_sub = X.values
        
    if type(y).__name__ == 'Series':
        y = y.values
    
    train_pred = np.zeros(np.shape(X)[0])
    test_pred_i = np.zeros((np.shape(X_sub)[0], n_folds))
        
    for i in tnrange(n_folds, desc='split'):
        train_index, test_index = folds.next()
        X_train = X[train_index,:]
        X_test = X[test_index,:]
        y_train = y[train_index]
        model.fit(X_train, y_train)
        train_pred[test_index] = model.predict(X_test)
        test_pred_i[:,i] = model.predict(X_sub)
        
    test_pred = hmean(test_pred_i, axis=1)
    
    print('RMSE: {}'.format(rmse(y, train_pred)))
    
    return train_pred, test_pred

### \*\* bagged lasso **

In [39]:
reg1 = BaggingRegressor(Lasso(alpha=0.00013, #0.00015
                              max_iter=10000),
                        random_state=1337,
                        n_estimators=100,
                        oob_score=True,
                        max_samples=0.4,
                        max_features=1.0)

In [30]:
#0.109055823916

In [40]:
# %%time
# reg1.fit(X,y)
# print('RMSE:{}'.format(rmse(y, reg1.oob_prediction_)))

In [12]:
# mean: 0.1084
# std:  0.0174
# CPU times: user 2min 22s, sys: 898 ms, total: 2min 23s
# Wall time: 2min 23s

In [32]:
# %%time
# rmse_cv(reg1, X, y, n_folds=10)

mean: 0.1084
std:  0.0174
CPU times: user 2min 39s, sys: 1.22 s, total: 2min 40s
Wall time: 2min 41s


### \*\* bagged elastic net **

In [41]:
reg2 = BaggingRegressor(ElasticNet(alpha=0.00013, l1_ratio=1.0, max_iter=3000),
                        random_state=1337,
                        n_estimators=100,
                        oob_score=True,
                        max_samples=0.4,
                        max_features=1.0)

In [None]:
# mean: 0.1094
# std:  0.0157
# CPU times: user 2min 40s, sys: 854 ms, total: 2min 41s
# Wall time: 2min 41s

In [34]:
%%time
rmse_cv(reg2, X, y, n_folds=10)

mean: 0.1084
std:  0.0174
CPU times: user 2min 33s, sys: 1.07 s, total: 2min 34s
Wall time: 2min 35s


### \*\* xgboost **

In [42]:
reg3=xgb.XGBRegressor(colsample_bytree=0.4,
                      gamma=0.04,
                      learning_rate=0.05,
                      max_depth=16,
                      min_child_weight=3,
                      n_estimators=1500,
                      reg_alpha=0.65,
                      reg_lambda=0.5,
                      subsample=0.95)

In [212]:
# mean: 0.1233
# std:  0.0136
# CPU times: user 1min 19s, sys: 214 ms, total: 1min 19s
# Wall time: 1min 19s

In [213]:
# %%time
# rmse_cv(reg3, X, y, n_folds=10)

### \*\* svm **

In [43]:
reg4 = make_pipeline(StandardScaler(copy=True, with_mean=False, with_std=False),
                     SVR(C=5.0, cache_size=512, coef0=0.0, degree=1,
                         epsilon=0.0435, gamma=0.00115, kernel='rbf',
                         max_iter=166519567.0, shrinking=True, tol=0.0016221625196, verbose=False))

In [216]:
# mean: 0.1104
# std:  0.0154
# CPU times: user 10.4 s, sys: 67.2 ms, total: 10.5 s
# Wall time: 10.5 s

In [86]:
%%time
rmse_cv(reg4, X, y, n_folds=10)

mean: 0.1072
std:  0.0173
CPU times: user 7.76 s, sys: 49.1 ms, total: 7.81 s
Wall time: 7.85 s


## blend results

In [44]:
regs = [reg1, reg3, reg4]

In [45]:
%%time
reg_preds = [oob_predictions(reg, X, y, X_submission, 10) for reg in regs]


RMSE: 0.109832474852

RMSE: 0.120812359806

RMSE: 0.108641420758
CPU times: user 9min 4s, sys: 2.47 s, total: 9min 7s
Wall time: 9min 8s


In [46]:
reg_train_preds, reg_test_preds = [x for x in zip(*reg_preds)]
reg_train_preds = np.exp(np.transpose(reg_train_preds))
reg_test_preds = np.exp(np.transpose(reg_test_preds))
target = np.exp(y)

In [47]:
#[ 0.39275028  0.01606502  0.60732489]

In [48]:
metalearner = Lasso(positive=True)
metalearner.fit(reg_train_preds, target);
print(metalearner.coef_)

[ 0.39393419  0.0464162   0.57729749]


In [49]:
full_preds = [reg.fit(X,y).predict(X_submission) for reg in regs]

In [64]:
meta_preds = cross_val_predict(metalearner, X, y, cv=10)

In [65]:
print('blended RMSE: {}'.format(rmse(y, np.log(meta_preds))))

blended RMSE: 9.54547092273


In [67]:
meta_preds = oob_predictions(metalearner, reg_train_preds, target, reg_test_preds, n_folds=10)
rmse(y, np.log(meta_preds[0]))


RMSE: 19560.4617677


0.10817783184163307

In [69]:
pd.DataFrame({"id": ids_submission, "SalePrice": meta_preds[1]})\
  .to_csv("metalearner_full_submission_4.csv", index = False)

In [None]:


metalearner reg_train_preds, target)

In [52]:
# weights LASSO-0.9172 EN-0.0002 XGB-0.0909
# blended RMSE: 0.110123029094 - actual RMSE 0.11460
# blended RMSE: 0.109102465079 - actual RMSE 0.11586
# blended RMSE: 0.108305024878 - actual RMSE 0.11429
print('blended RMSE: {}'.format(rmse(y, np.log(metalearner.predict(reg_train_preds)))))

blended RMSE: 0.107699283544


In [None]:
#final_result = 0.50*lasso_preds + 0.24*xgb_preds+0.26*elas_preds
#solution = pd.DataFrame({"id":ids_submission, "SalePrice":final_result}, columns=['id', 'SalePrice'])
#solution.to_csv("blended_submission.csv", index = False)

In [92]:
pd.DataFrame({"id":ids_submission, "SalePrice":metalearner.predict(reg_test_preds)})\
  .to_csv("metalearner_submission_4.csv", index = False)

In [50]:
pd.DataFrame({"id": ids_submission, "SalePrice": metalearner.predict(np.exp(np.transpose(full_preds)))})\
  .to_csv("metalearner_full_submission_1.csv", index = False)

In [None]:
#0.11443