## Ensembling our Different Models

### Initial Helper Functions

In [None]:
# From https://www.kaggle.com/arthurtok/introduction-to-ensembling-stacking-in-python/notebook/notebook

# Shared settings used by the out-of-fold helpers in this notebook.
# NOTE(review): `train` and `test` are not defined in any cell visible here;
# they must already exist in the kernel before this cell runs.
ntrain = train.shape[0]  # number of rows in the training data
ntest = test.shape[0]    # number of rows in the test data
SEED = 0    # for reproducibility
NFOLDS = 5  # number of folds for out-of-fold prediction

# KFold is imported from sklearn.model_selection, whose constructor takes
# `n_splits` (not the sample count and `n_folds` of the removed
# sklearn.cross_validation API). The splits are materialised once as a list of
# (train_indices, test_indices) pairs so that `kf` can be iterated directly and
# reused for every model. No random_state is passed: without shuffling the folds
# are already deterministic, and recent scikit-learn versions raise an error
# when random_state is set while shuffle is False.
kf = list(KFold(n_splits=NFOLDS).split(np.arange(ntrain)))

# Class to extend the Sklearn classifier
class SklearnHelper(object):
    """Uniform train/predict wrapper around an estimator class.

    The wrapped estimator is built as ``clf(**params)`` with a
    ``random_state`` entry added, and is stored on ``self.clf``.
    """

    def __init__(self, clf, seed=0, params=None):
        """Instantiate the estimator.

        Parameters
        ----------
        clf : callable
            Estimator class (or factory) accepting keyword arguments,
            including ``random_state``.
        seed : int, optional
            Value passed to the estimator as ``random_state``; overrides any
            ``random_state`` already present in ``params``.
        params : dict or None, optional
            Extra keyword arguments for the estimator. ``None`` means no
            extra arguments.
        """
        # Work on a copy: the caller's dict must not be modified, and the
        # default params=None must not crash on item assignment.
        kwargs = dict(params) if params is not None else {}
        kwargs['random_state'] = seed
        self.clf = clf(**kwargs)

    def train(self, x_train, y_train):
        """Fit the wrapped estimator on the given data; returns nothing."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the wrapped estimator's predictions for ``x``."""
        return self.clf.predict(x)

    def fit(self, x, y):
        """Fit the wrapped estimator and return whatever its ``fit`` returns."""
        return self.clf.fit(x, y)

    def feature_importances(self, x, y):
        """Refit the estimator on ``(x, y)`` and print its ``feature_importances_``."""
        print(self.clf.fit(x, y).feature_importances_)
    
# Class to extend XGBoost classifier

### Out of Fold Predictions

In [19]:
# From https://www.kaggle.com/arthurtok/introduction-to-ensembling-stacking-in-python/notebook/notebook

def get_oof(clf, x_train, y_train, x_test):
    """Compute out-of-fold predictions for one model.

    For every fold in the notebook-level ``kf``, the model is trained on the
    fold's training indices and used to predict both the held-out training
    rows and the whole of ``x_test``. Test predictions are averaged over folds.

    Parameters
    ----------
    clf : object
        Model exposing ``train(x, y)`` and ``predict(x)``.
    x_train, y_train : array-like
        Training inputs and targets; both are indexed with integer index
        arrays, so they must support NumPy-style fancy indexing.
    x_test : array-like
        Test inputs passed unchanged to ``clf.predict``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(oof_train, oof_test)`` as column vectors of shape ``(ntrain, 1)``
        and ``(ntest, 1)``.

    Notes
    -----
    Relies on the notebook globals ``ntrain``, ``ntest``, ``NFOLDS`` and ``kf``.
    """
    oof_train = np.zeros((ntrain,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    # sklearn.model_selection splitters are not iterable and must be asked for
    # their splits; a legacy cross_validation KFold or a precomputed list of
    # (train_indices, test_indices) pairs is iterated as-is.
    folds = kf.split(x_train) if hasattr(kf, 'split') else kf

    for i, (train_index, test_index) in enumerate(folds):
        clf.train(x_train[train_index], y_train[train_index])

        # Predictions for rows the model did not see during this fold.
        oof_train[test_index] = clf.predict(x_train[test_index])
        # One row of full test-set predictions per fold.
        oof_test_skf[i, :] = clf.predict(x_test)

    # Average the per-fold test predictions into a single vector.
    oof_test = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

### Import Libraries

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb


%matplotlib inline

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.model_selection import KFold

### Import Data

In [20]:
# Load the cleaned dataset; the 'timestamp' column is parsed as dates on read.
df = pd.read_csv(
    '../EDA/cleaned_df_no_scaling_or_outlier_removal_2017-05-28.csv',
    parse_dates=['timestamp'],
    index_col=False,
    low_memory=False
)

### Convert Categorical Data into Numerical

In [11]:
from sklearn import preprocessing

# Replace every object-dtype column of df with integer labels, in place.
for feature in df.columns:
    if df[feature].dtype != 'object':
        continue  # leave non-object columns untouched
    column_values = list(df[feature].values)
    # A fresh encoder per column: labels are learned from that column only.
    lbl = preprocessing.LabelEncoder().fit(column_values)
    df[feature] = lbl.transform(column_values)

### Features

In [6]:
# Names of the dataframe columns used in the rest of this notebook.
features = [
    'timestamp', 'floor', 'full_sqrt', 'area_km', 'density',
    'month', 'year', 'day',
    'rel_floor', 'material', 'age_imputed', 'num_room', 'work_share',
    'kitch_sq',
    'state_imputed', 'product_type', 'sub_area', 'indust_part',
    'sport_objects_raion', 'oil_chemistry_raion', 'metro_min_avto',
    'green_zone_km', 'industrial_km', 'kremlin_km', 'radiation_km',
    'ts_km', 'fitness_km',
    'stadium_km', 'park_km', 'price_doc_log',
]

In [9]:
# One boolean per entry of `features`, in order: is that column present in df?
[(column in df.columns) for column in features]

[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True]

All features are in the dataframe. Let's take a look at the data:

In [17]:
# Count the missing values in the 'sub_area' column.
df['sub_area'].isnull().sum()

0

### Missing Observations

In [18]:
# List every selected feature that has missing values, with its null count.
for feature in features:
    n_missing = df[feature].isnull().sum()  # computed once per feature
    if n_missing > 0:
        # print() on a single pre-formatted string behaves the same on
        # Python 2 and Python 3 and keeps the "<name> <count>" output format.
        print('{} {}'.format(feature, n_missing))

floor 169
full_sqrt 49
rel_floor 169
age_imputed 2929
state_imputed 2929
price_doc_log 7662
