In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.ensemble import BaggingRegressor
from scipy.stats import hmean
from tqdm import tnrange, tqdm_notebook, tqdm
from xgboost import XGBRegressor
import forum_features



## scoring methods

In [2]:
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``.

    Thin wrapper: takes the square root of scikit-learn's
    ``mean_squared_error`` for the two inputs.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [3]:
def oob_predictions(model, X, y, X_sub, n_folds=5):
    """Collect out-of-fold predictions for ``model`` plus fold-averaged
    predictions for a held-out submission set.

    The model is refit once per fold on the training part of the split.
    Each refit predicts the held-out rows of ``X`` (filling one slice of the
    out-of-fold vector) and every row of ``X_sub``. Note that ``model`` is
    mutated: on return it is left fitted on the last fold's training rows.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X : 2-D array-like of training features. pandas DataFrames are
        converted with ``.values``; anything else is indexed as
        ``X[rows, :]`` and must support that.
    y : 1-D training target (pandas Series are converted with ``.values``).
    X_sub : 2-D array-like of features to predict for (same handling as ``X``).
    n_folds : number of K-fold splits (default 5).

    Returns
    -------
    (train_pred, test_pred) : tuple of 1-D numpy arrays
        ``train_pred`` has one out-of-fold prediction per row of ``X``;
        ``test_pred`` is the harmonic mean, across folds, of the predictions
        for each row of ``X_sub``.

    Side effects
    ------------
    Prints the RMSE of ``train_pred`` against ``y``.
    """
    # Fixed seed: every model passed through here is evaluated on the same splits.
    folds = KFold(n_folds, shuffle=True, random_state=42).split(X)

    # The loop below indexes rows positionally, which pandas objects do not
    # support through plain ``[]``; unwrap each input independently.
    if isinstance(X, pd.DataFrame):
        X = X.values
    if isinstance(X_sub, pd.DataFrame):
        X_sub = X_sub.values
    if isinstance(y, pd.Series):
        y = y.values

    train_pred = np.zeros(np.shape(X)[0])
    test_pred_i = np.zeros((np.shape(X_sub)[0], n_folds))

    for i in tnrange(n_folds, desc='split'):
        # Built-in next() works on Python 2 and 3; generators have no
        # ``.next()`` method on Python 3.
        train_index, test_index = next(folds)
        X_train = X[train_index, :]
        X_test = X[test_index, :]
        y_train = y[train_index]
        model.fit(X_train, y_train)
        train_pred[test_index] = model.predict(X_test)
        test_pred_i[:, i] = model.predict(X_sub)

    # One column per fold -> a single prediction per submission row.
    # NOTE(review): the harmonic mean is only meaningful for positive
    # predictions; confirm that holds for every model used with this helper.
    test_pred = hmean(test_pred_i, axis=1)

    print('RMSE: {}'.format(rmse(y, train_pred)))

    return train_pred, test_pred

## load data

In [4]:
# Unpack the four objects returned by the project-local forum_features loader:
# features and target used for fitting, plus the features and ids of the rows
# that end up in the submission file.
# NOTE(review): y is presumably on a log scale -- a later cell builds
# `target = np.exp(y)`; confirm inside forum_features.
X, y, X_submission, ids_submission = forum_features.load_data()

## stack results

In [5]:
# Base regressors whose out-of-fold predictions feed the meta-learner.
regs = [
    # Bagged Lasso: 100 Lasso fits, each on 40% of the rows and all features.
    BaggingRegressor(Lasso(alpha=0.00013, max_iter=10000),
                     random_state=1337, n_estimators=100, oob_score=True,
                     max_samples=0.4, max_features=1.0),
    # Gradient-boosted trees.
    XGBRegressor(colsample_bytree=0.4, gamma=0.04, learning_rate=0.05, max_depth=16,
                 min_child_weight=3, n_estimators=1500, reg_alpha=0.65,
                 reg_lambda=0.5, subsample=0.95),
    # RBF-kernel SVR. The scaler has both centering and scaling switched off,
    # so it passes the features through unchanged.
    make_pipeline(StandardScaler(copy=True, with_mean=False, with_std=False),
                  SVR(C=5.0, cache_size=512, coef0=0.0, degree=1,
                      epsilon=0.0435, gamma=0.00115, kernel='rbf',
                      # max_iter is documented as an int; scikit-learn releases
                      # that validate parameter types reject a float here.
                      max_iter=166519567, shrinking=True,
                      tol=0.0016221625196, verbose=False)),
]

In [6]:
%%time
reg_preds = [oob_predictions(reg, X, y, X_submission, 10) for reg in regs]


RMSE: 0.109832474852

RMSE: 0.120812359806

RMSE: 0.108641420758
CPU times: user 9min 4s, sys: 2.56 s, total: 9min 7s
Wall time: 9min 8s


In [7]:
# Regroup the per-model (train, test) pairs into one tuple of train vectors
# and one tuple of test vectors.
reg_train_preds, reg_test_preds = tuple(zip(*reg_preds))

# Stack into 2-D arrays with one column per base model, then exponentiate.
# The same transform is applied to y, so the meta-learner's inputs and target
# share one scale.
reg_train_preds = np.exp(np.asarray(reg_train_preds).T)
reg_test_preds = np.exp(np.asarray(reg_test_preds).T)
target = np.exp(y)

## blend results

In [8]:
# Meta-learner: a Lasso with coefficients constrained to be non-negative,
# fitted on the base models' out-of-fold predictions. fit() returns the
# estimator itself, so construction and fitting can be chained.
metalearner = Lasso(positive=True).fit(reg_train_preds, target)
# Show the weight given to each base model (same order as `regs`).
print(metalearner.coef_)

[ 0.39393419  0.0464162   0.57729749]


In [9]:
# Run the meta-learner through the same out-of-fold procedure.
# oob_predictions prints an RMSE computed against `target` (the exp scale).
meta_preds = oob_predictions(
    metalearner, reg_train_preds, target, reg_test_preds, n_folds=10
)
# Second RMSE: take the log of the out-of-fold predictions so they are
# compared against y on its own scale.
meta_oof_log = np.log(meta_preds[0])
print("RMSE: {}".format(rmse(y, meta_oof_log)))


RMSE: 19560.4617677
RMSE: 0.108177831842


In [10]:
# Pair every submission id with the meta-learner's fold-averaged prediction
# and write the result to disk without the DataFrame index.
submission = pd.DataFrame({"id": ids_submission, "SalePrice": meta_preds[1]})
submission.to_csv("metalearner_submission.csv", index=False)