In [1]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer

import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn import svm
import lightgbm as lgb
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
import matplotlib
import matplotlib.pyplot as plt
from scipy.stats import skew
from scipy.stats.stats import pearsonr
from scipy import stats
import xgboost as xgb
from scipy.stats import norm
#from pyglmnet import GLM # Marco: need to understand how to install this 
from sklearn.preprocessing import StandardScaler
from subprocess import call
from sklearn.cross_validation import KFold
#from sklearn.model_selection import KFold

from IPython import get_ipython
get_ipython().run_line_magic('matplotlib', 'inline')



In [2]:
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))


# Create Ensemble class

In [3]:
class Ensemble(object):
    def __init__(self, n_folds, base_models,seed_value):
        self.n_folds = n_folds
        self.base_models = base_models
        self.seed_value = seed_value
        
    def fit_predict(self, X, y, T):
        X = np.array(X)
        y = np.array(y)
        T = np.array(T)
        
        # Let's define the folds based on the length of y and number of folds requested
        
        folds = list(KFold(len(y), n_folds=self.n_folds, shuffle=True, random_state=self.seed_value))
        #folds = KFold(n_splits=self.n_folds, shuffle=True, random_state=self.seed_value)
        
        print("folds=",folds)
        
        S_train = np.zeros((X.shape[0], len(self.base_models)))
        S_test = np.zeros((T.shape[0], len(self.base_models)))
        
        for i, reg in enumerate(self.base_models):
            S_test_i = np.zeros((T.shape[0], len(folds)))
            for j, (train_idx, test_idx) in enumerate(folds):
                X_train = X[train_idx]
                y_train = y[train_idx]
                
                # a part of the training set is held out
                X_holdout = X[test_idx]
                # y_holdout = y[test_idx]
                reg.fit(X_train, y_train)
                y_pred = reg.predict(X_holdout)[:]
                # Now the predicted values on the heldout set is used as the training set for the stacker model
                S_train[test_idx, i] = y_pred
                S_test_i[:, j] = reg.predict(T)[:]
            S_test[:, i] = S_test_i.mean(1)
        
        '''
        # Cross validate the stacker model
        stackermodel = xgb.XGBRegressor()

        # dict with tunning parameters
        param_grid = {
        'max_depth': [2, 4], 
        'learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3],
        'min_child_weight': range(1, 10, 2),
        'n_estimators': range(50, 300, 50),
        'objective': ['reg:linear']
        }

        #kfold = KFold(n_splits=nfold, random_state=seed)

        scorer = make_scorer(rmse, greater_is_better=False)
        grid_search = GridSearchCV(stackermodel, param_grid, n_jobs=-1, cv=5, verbose=1, scoring=scorer)
        grid_result = grid_search.fit(S_train, y)

        means = grid_result.cv_results_['mean_test_score']
        stds = grid_result.cv_results_['std_test_score']
        params = grid_result.cv_results_['params']

        for mean, stdev, param in zip(means, stds, params):
        #print("%f (%f) with: %r" % (mean, stdev, param))
            print("{:06.5f} ({:06.5f}) with {}".format(mean, stdev, param))

        # summarize results
        print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
        '''
        # Now fit the stacker model
        stackermodel = xgb.XGBRegressor(n_estimators=100,learning_rate=0.1,max_depth=2,min_child_weight=5,objective='reg:linear')
        stackermodel.fit(S_train, y)
        y_pred = stackermodel.predict(S_test)[:]
        return y_pred


# Read in the dataset

In [4]:
    
seed = 2017
nfold = 5
bmodels = ["elasticnet","et","lgb","rf","xgb"]
    
train = pd.read_csv("./data/X_train_v2.csv")
y = train['SalePrice']
X = train.loc[:,'MSSubClass':'SaleCondition_Partial']
    
test = pd.read_csv("./data/X_test_v2.csv")
id = test["Id"]
T = test.loc[:,'MSSubClass':'SaleCondition_Partial']
    
#Set the base models

base_models_name = []
for j in range(len(bmodels)):
        modelname = ("./models/single/model_" + bmodels[j] + ".py")
        print(modelname)
        base_models_name.append(modelname)

print(base_models_name)
base_models = []
    
for i, bm in enumerate(base_models_name):
        model = !grep "model =" {bm}
        model = model[0]
        model = model[12:]
        model = eval(model)
        base_models.append(model)
#print(model)
#print(base_models) 

./models/single/model_elasticnet.py
./models/single/model_et.py
./models/single/model_lgb.py
./models/single/model_rf.py
./models/single/model_xgb.py
['./models/single/model_elasticnet.py', './models/single/model_et.py', './models/single/model_lgb.py', './models/single/model_rf.py', './models/single/model_xgb.py']


In [5]:
#xgb = xgb.XGBRegressor(n_estimators=250,learning_rate=0.1,max_depth=4,min_child_weight=1,objective='reg:linear')
# Call stacking
    
ens = Ensemble(n_folds=nfold, base_models=base_models,seed_value=seed)

In [6]:
results = ens.fit_predict(X,y,T)

folds= [(array([   0,    1,    2, ..., 1436, 1437, 1438]), array([  17,   20,   25,   41,   54,   55,   56,   57,   61,   64,   93,
        100,  102,  114,  115,  116,  117,  119,  120,  123,  128,  160,
        166,  168,  179,  183,  188,  189,  191,  192,  196,  213,  216,
        222,  230,  232,  235,  238,  239,  243,  257,  261,  262,  263,
        270,  271,  273,  277,  282,  296,  299,  300,  313,  317,  319,
        331,  332,  333,  350,  353,  356,  358,  364,  371,  374,  375,
        377,  379,  383,  389,  391,  397,  401,  411,  413,  414,  416,
        421,  423,  426,  433,  434,  435,  436,  441,  443,  449,  451,
        457,  460,  461,  462,  463,  464,  465,  478,  480,  481,  484,
        487,  492,  497,  499,  505,  510,  522,  530,  534,  536,  539,
        544,  546,  554,  555,  556,  557,  570,  571,  575,  577,  601,
        609,  617,  623,  636,  639,  642,  649,  650,  651,  652,  657,
        668,  679,  681,  684,  692,  698,  701,  714,  717,  718



In [8]:
print("results=",results)
results = np.expm1(results)

results= [ 11.71511173  11.99144363  12.14565182 ...,  12.01550674  11.69173813
  12.33989429]


In [9]:
# Now prepare the final results for submission
print("results=",results)
results = pd.DataFrame(results)
results = pd.concat([id,results], axis=1)
results.columns =["Id","SalePrice"]
print(results.head())
results.to_csv("./ensembled_results.csv", index = False)

results= [ 122406.6015625  161367.140625   188272.65625   ...,  165297.25
  119578.671875   228636.78125  ]
     Id      SalePrice
0  1461  122406.601562
1  1462  161367.140625
2  1463  188272.656250
3  1464  195541.812500
4  1465  185674.156250
