## Meta Ensemble

Now we are going to use the random forest classifiers trained above to create a meta predictor.

In [None]:
# Replace the row labels of both splits with a fresh 0..n-1 index (the old
# index is discarded, not kept as a column).
train, test = (frame.reset_index(drop=True) for frame in (train, test))

In [None]:
# Preview the first rows of the training frame after the index reset.
train.head()

In [None]:
# Split the training frame into the feature matrix and the label vector.
# `.iloc` replaces `.ix`, which was deprecated in pandas 0.20 and removed in
# 1.0. The positional slice keeps every column except the last one.
# NOTE(review): assumes 'TARGET' is the last column of `train` -- confirm.
X_train, Y_train = train.iloc[:, :-1], train['TARGET']

In [None]:
# Preview the feature matrix (all columns of `train` except the last).
X_train.head()

In [None]:
# Build the meta features for the training split: append one column of class
# predictions per base forest to X_train.
# `.iloc` replaces the removed `.ix` indexer; the slice (every column except
# the last) does not change between iterations, so it is taken once.
# Each appended column keeps pandas' default label 0, so X_train ends up with
# several columns all named 0.
# NOTE(review): if `rfs` were fitted on these same rows, the predictions are
# in-sample and the meta learner will see optimistic features -- confirm.
base_train_features = train.iloc[:, :-1]
for rf in rfs:
    temp = pd.DataFrame(rf.predict(base_train_features))
    X_train = pd.concat([X_train, temp], axis=1)

In [None]:
# Put the augmented feature matrix and the 'TARGET' labels side by side in a
# single frame (column-wise concat, rows aligned on the index).
train_meta = pd.concat([X_train, train['TARGET']], axis=1)

In [None]:
# Preview the combined meta-training frame.
train_meta.head()

In [None]:
# Split the test frame the same way as the training frame.
# `.iloc` replaces `.ix`, which was deprecated in pandas 0.20 and removed in
# 1.0. The positional slice keeps every column except the last one.
# NOTE(review): assumes 'TARGET' is the last column of `test` -- confirm.
X_test, Y_test = test.iloc[:, :-1], test['TARGET']

In [None]:
# Build the meta features for the test split: append one column of class
# predictions per base forest to X_test, in the same forest order used for
# the training split.
# `.iloc` replaces the removed `.ix` indexer; the slice (every column except
# the last) does not change between iterations, so it is taken once.
# Each appended column keeps pandas' default label 0.
base_test_features = test.iloc[:, :-1]
for rf in rfs:
    temp = pd.DataFrame(rf.predict(base_test_features))
    X_test = pd.concat([X_test, temp], axis=1)

In [None]:
# Bind the test labels (the same column was already assigned to Y_test when
# X_test was created, so this is a re-assignment of the same values).
Y_test = test['TARGET']

In [None]:
# Preview the test feature matrix including the appended prediction columns.
X_test.head()

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Results of the random search in the next cell: maps each sampled number of
# trees (n_estimators) to the ROC AUC score computed for that setting.
cv = {}

In [None]:
# Random search over the size of the meta learner: 20 draws of n_estimators
# from [50, 300), each scored by ROC AUC on the test split.
# If the same n_estimators is drawn twice, the later score overwrites the
# earlier one in `cv`.
for _ in range(20):
    n_trees = np.random.randint(50, 300)
    # Despite the name `lg`, the meta learner is a random forest.
    lg = RandomForestClassifier(n_estimators=n_trees)
    lg.fit(X_train, Y_train)
    # ROC AUC needs a ranking score. Hard 0/1 labels from predict() collapse
    # the curve to a single operating point, so score with the predicted
    # probability of the positive class instead.
    # Assumes binary labels, so that column 1 of predict_proba is the
    # positive class -- TODO confirm.
    cv[n_trees] = roc_auc_score(Y_test, lg.predict_proba(X_test)[:, 1])
    print(cv[n_trees])

In [None]:
# Print every (n_estimators, score) pair, highest score first.
ranked = sorted(cv, key=cv.get)
for key in reversed(ranked):
    print(key, cv[key])

### Score analysis

In [None]:
# Reset X_test / Y_test to the plain test split: every column except the last
# as features, the last column as labels. This discards any prediction
# columns appended to X_test earlier.
# `.iloc` replaces `.ix`, which was deprecated in pandas 0.20 and removed in
# 1.0.
X_test, Y_test = test.iloc[:, :-1], test.iloc[:, -1]
n = 100       # number of train/score repetitions in the next cell
# The four settings below are passed straight to trainForests; their exact
# meaning is defined there (not visible in this section).
a = 0.25
w = 1
N_forest = 5
n_trees = 5

In [None]:
# Run the train/score cycle n times and collect the ROC AUC of each run in a
# one-column DataFrame. `rfs` and `Y_prob` keep the values from the final run
# once the loop ends.
auc_runs = []
for _ in range(n):
    rfs = trainForests(train, a, w, N_forest, n_trees)
    Y_prob = mean_ensemble(rfs, X_test)
    auc_runs += [roc_auc_score(Y_test, Y_prob)]
scores = pd.DataFrame(auc_runs)

In [None]:
# Summary statistics of the ROC AUC scores collected over the n runs.
scores.describe()

In [None]:
# Histogram of the ROC AUC scores collected over the n runs.
plt.title("Distribution of scores")
plt.hist(scores)
plt.show()

In [None]:
# If desired, transform probabilities into class labels.
def threshold(Y_prob, threshold = 0.5):
    """Turn a sequence of scores into 0/1 class labels.

    Parameters
    ----------
    Y_prob : iterable of numbers
        Scores to binarize, consumed in order.
    threshold : number, default 0.5
        Cut-off. A score less than or equal to it maps to 0, anything else
        (including a value that fails the comparison, such as NaN) maps to 1.

    Returns
    -------
    list of int
        One label per input score, in input order.
    """
    return [0 if score <= threshold else 1 for score in Y_prob]

In [None]:
# Evaluate class labels
# Y_prob is whatever the last iteration of the scoring loop left behind; it is
# binarized at 0.5 and compared against the test labels. The return value of
# eval_classification is discarded (bound to `_`); print_results=True is
# passed so the helper reports the metrics itself.
Y_pred = threshold(Y_prob, threshold = 0.5)
_ = eval_classification(test['TARGET'],Y_pred, print_results = True)

In [None]:
# Plot feature importance
def plot_features(forest):
    """Draw a bar chart of a fitted forest's feature importances.

    Bars are ordered from the most to the least important feature. Each bar
    carries an error bar equal to the standard deviation of that feature's
    importance across the forest's individual estimators. Tick labels are the
    features' positional indices.

    Parameters
    ----------
    forest : fitted ensemble
        Must expose `feature_importances_` and `estimators_`, where every
        estimator also exposes `feature_importances_`.
    """
    importances = forest.feature_importances_
    per_tree = [tree.feature_importances_ for tree in forest.estimators_]
    spread = np.std(per_tree, axis=0)

    # Feature positions sorted by descending importance.
    order = np.argsort(importances)[::-1]
    count = len(order)
    positions = range(count)

    plt.figure()
    plt.title("Feature importances")
    plt.bar(
        positions,
        importances[order],
        color="r",
        yerr=spread[order],
        align="center",
    )
    plt.xticks(positions, order)
    plt.xlim([-1, count])
    plt.show()

In [None]:
# Preview the training frame once more before building the submission.
train.head()

# Create Submission

In [None]:
# Retrain forest on the whole 'train.csv' data
# `a`, `w`, `N_forest` and `n_trees` are the score-analysis settings defined
# earlier. This rebinds `rfs`, replacing the forests from the scoring loop.
# NOTE(review): `data` is not defined in this section; presumably it is the
# full processed training frame -- confirm.
rfs = trainForests(data, a, w, N_forest, n_trees)

In [None]:
# Load the competition test set. This rebinds `test`, replacing the split
# used for validation above.
test = pd.read_csv('data/test.csv')
# Keep the IDs as a NumPy array before preprocessing. Plain column selection
# replaces `.ix`, which was deprecated in pandas 0.20 and removed in 1.0.
test_id = test['ID'].values
# The return value is ignored, so `process` presumably modifies `test` in
# place -- TODO confirm against its definition.
process(test)

In [None]:
# Score the competition test set with the retrained forests.
# NOTE(review): the next cell indexes the result with ['geometric'], so
# mean_ensemble presumably returns a frame with that column -- confirm.
Y_prob = mean_ensemble(rfs,test)

In [None]:
# Hand the test IDs and the 'geometric' column of the ensemble output to
# create_submission (presumably writes the submission file -- see its
# definition).
create_submission(test_id, Y_prob['geometric'])

## Ensemble RF and XGBOOST

In [None]:
# Load the XGBoost predictions from a sibling project directory.
# NOTE(review): the later rename treats this file as having exactly two
# columns (ID, score) -- confirm.
Y_boost = pd.read_csv('../Kaggle_Santander-master/simplexgbtest.csv')

In [None]:
# Preview the XGBoost predictions.
Y_boost.head()

In [None]:
# Load a previously saved random-forest ensemble submission; its 'TARGET'
# column is what gets blended with the XGBoost scores below.
Y_rf = pd.read_csv('submissions/rforest_ensemble2.csv')

In [None]:
# Preview the random-forest ensemble predictions.
Y_rf.head()

In [None]:
# Put the XGBoost frame and the random-forest 'TARGET' scores side by side.
# ignore_index=True relabels the resulting columns 0, 1, 2, ... so the next
# cell can rename them by position.
# Plain column selection replaces `.ix`, which was deprecated in pandas 0.20
# and removed in 1.0.
# NOTE(review): rows are matched on the index, i.e. by position in the two
# CSV files; this assumes both files list the same IDs in the same order.
Y_prob = pd.concat([Y_boost, Y_rf['TARGET']], axis=1, ignore_index=True)

In [None]:
# Give the positional column labels (0, 1, 2 after the ignore_index concat)
# readable names, in place: the ID, the XGBoost score and the random-forest
# ensemble score.
Y_prob.rename(columns ={0:'ID', 1:'xgb', 2: 'rfe' }, inplace = True)

In [None]:
# geometric mean ensemble
# The predictor columns are selected by name instead of "every column after
# the first": that positional slice relied on the removed `.ix` indexer and
# would also multiply in derived columns ('geometric', 'arithmetic', ...) if
# this cell were re-run after they were added.
predictors = ['xgb', 'rfe']
l = len(predictors) #number of predictors to ensemble
# Row-wise product of the predictor scores, then the l-th root.
temp = np.power(Y_prob[predictors].product(axis=1), 1. / l)
Y_prob['geometric'] = temp

In [None]:
# arithmetic mean ensemble
l = 2 #number of predictors to ensemble
# Row-wise mean of the two predictor scores. The l-th root that used to be
# applied here was copied from the geometric-mean cell; it belongs to the
# geometric mean only and turned this column into sqrt(mean) instead of the
# mean.
temp = Y_prob[['xgb', 'rfe']].mean(axis=1)
Y_prob['arithmetic'] = temp

In [None]:
# Per-row gap between the two models: XGBoost score minus random-forest
# ensemble score, stored under the label 'xgb - rfe'.
temp = Y_prob['xgb'].sub(Y_prob['rfe'])
Y_prob['xgb - rfe'] = temp

In [None]:
# Per-row gap between the two blends: 'geometric' column minus 'arithmetic'
# column, stored under the label 'geo - ari'.
temp = Y_prob['geometric'].sub(Y_prob['arithmetic'])
Y_prob['geo - ari'] = temp

In [None]:
# Preview the blended frame with the model, ensemble and difference columns.
Y_prob.head()

In [None]:
# Histogram of the per-row difference between the XGBoost and random-forest
# ensemble scores.
plt.title('Differences between XGB and RFE')
plt.hist(Y_prob['xgb - rfe'])
plt.show()

In [None]:
# Histogram of the per-row difference between the 'geometric' and
# 'arithmetic' ensemble columns.
plt.title('Differences between ensembles')
plt.hist(Y_prob['geo - ari'])
plt.show()

In [None]:
# Hand the test IDs and the 'arithmetic' ensemble column to create_submission.
# NOTE(review): test_id was read from data/test.csv while the scores come from
# the two prediction CSVs; this assumes all three share the same row order --
# verify.
create_submission(test_id, Y_prob['arithmetic'])