In [37]:
import pandas as pd
import numpy as np
from scipy.optimize import minimize
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.svm import SVC
import os
from sklearn.datasets import make_moons, make_circles, make_classification

In [38]:
### Load some data
### Synthetic binary-classification problem: 1000 samples, 20 features
### (12 informative, 2 redundant). random_state pins the generated dataset so
### every score below is reproducible on Restart & Run All.
X, y = make_classification(n_samples=1000, n_features=20, n_informative=12,
                           n_redundant=2, n_repeated=0, n_classes=2,
                           random_state=1234)

In [39]:
# alias for the target vector; the cells below refer to it as `labels`
labels = y

In [40]:
### hold out a small stratified test set (5%) that the classifiers never train on;
### it is used further down to find the best weights for combining them
# NOTE(review): this constructor signature (labels passed in, object iterated
# directly) belongs to the legacy sklearn.cross_validation module imported at
# the top; newer scikit-learn releases expose the splitter through
# sklearn.model_selection with a different call shape -- confirm the installed
# version before re-running.
sss = StratifiedShuffleSplit(labels, test_size=0.05, random_state=1234)
# only the first of the generated splits is needed
train_index, test_index = next(iter(sss))

In [41]:
# materialise features and labels for each side of the split
train_x = X[train_index]
train_y = labels[train_index]
test_x = X[test_index]
test_y = labels[test_index]

In [42]:
### building the classifiers
### usually you'd use xgboost and neural nets here
def fit_and_report(name, clf):
    ''' fit clf on the training split, print its log loss on the held-out split, return the fitted clf '''
    clf.fit(train_x, train_y)
    score = log_loss(test_y, clf.predict_proba(test_x))
    print('{name} LogLoss {score}'.format(name=name, score=score))
    return clf

# probability=True is required for predict_proba; random_state makes the
# shuffling used for SVC's probability estimates repeatable between runs
svc = fit_and_report('SVC', SVC(probability=True, random_state=1337))
rfc = svc  # legacy name: this model used to be stored (and reported) as `rfc`

logreg = fit_and_report('LogisticRegression', LogisticRegression())

rfc2 = fit_and_report('RFC2', RandomForestClassifier(n_estimators=50, random_state=1337, n_jobs=-1))

# order matters: the optimised weights are matched to the classifiers by position
clfs = [svc, logreg, rfc2]

RFC LogLoss 0.296359097585
LogisticRegression LogLoss 0.41308029188
RFC2 LogLoss 0.285335822671


In [43]:
### finding the optimum weights
### one array of held-out class probabilities per classifier, in the same order as clfs
predictions = [clf.predict_proba(test_x) for clf in clfs]

In [44]:
# sanity check: one (rows, classes) probability array per classifier
np.asarray(predictions).shape

(3, 50, 2)

In [45]:
def log_loss_func(weights):
    ''' objective for scipy minimize: log loss of the weighted blend of predictions

    scipy minimize passes the weights as a numpy array; they are paired by
    position with the entries of the module-level `predictions`, and the
    weighted sum is scored against `test_y`.
    '''
    blended = sum(w * p for w, p in zip(weights, predictions))
    return log_loss(test_y, blended)

In [72]:
# the optimizer needs a starting point; right now every model gets the same
# weight, 1/len(predictions)
# it's better to choose many random starting points and run minimize a few times
starting_values = np.repeat(1.0, len(predictions)) / len(predictions)

In [73]:
# equality constraint for the solver: fun(w) must be 0, i.e. the weights sum to 1
cons = ({'type':'eq','fun':lambda w: 1-sum(w)})

# our weights are bound between 0 and 1
bounds = [(0,1)]*len(predictions)

# SLSQP is used because it supports both bounds and equality constraints
res = minimize(log_loss_func, starting_values, method='SLSQP', bounds=bounds, constraints=cons)

# the solver can stop without converging; the weights are then not trustworthy,
# so say so loudly instead of silently reporting them as the optimum
if not res['success']:
    print('WARNING: weight optimisation did not converge: {message}'.format(message=res['message']))

In [74]:
# objective value at the optimum, then the weight found for each classifier (same order as clfs)
print('Ensamble Score: {best_score}'.format(best_score=res.fun))
print('Best Weights: {weights}'.format(weights=res.x))

Ensamble Score: 0.151442691802
Best Weights: [  8.82275657e-01   1.44361519e-16   1.17724343e-01]


In [70]:
## This will combine the model probabilities using the optimized weights
# take the weights straight from the optimizer result so this cell does not
# depend on a `weights` variable left over from an earlier kernel session
weights = res['x']
y_prob = 0
for weight, prediction in zip(weights, predictions):
    y_prob += weight*prediction

In [71]:
# display the blended class probabilities for the held-out rows
y_prob

array([[ 0.00941795,  0.99058205],
       [ 0.97138036,  0.02861964],
       [ 0.9734544 ,  0.0265456 ],
       [ 0.95119985,  0.04880015],
       [ 0.61103592,  0.38896408],
       [ 0.95900268,  0.04099732],
       [ 0.25670592,  0.74329408],
       [ 0.01177273,  0.98822727],
       [ 0.98242386,  0.01757614],
       [ 0.94457383,  0.05542617],
       [ 0.10904413,  0.89095587],
       [ 0.93310384,  0.06689616],
       [ 0.98074935,  0.01925065],
       [ 0.02354563,  0.97645437],
       [ 0.97545943,  0.02454057],
       [ 0.978021  ,  0.021979  ],
       [ 0.80774007,  0.19225993],
       [ 0.00235449,  0.99764551],
       [ 0.00707635,  0.99292365],
       [ 0.01412693,  0.98587307],
       [ 0.00596532,  0.99403468],
       [ 0.94926899,  0.05073101],
       [ 0.01412694,  0.98587306],
       [ 0.03768656,  0.96231344],
       [ 0.92341051,  0.07658949],
       [ 0.04238078,  0.95761922],
       [ 0.93754411,  0.06245589],
       [ 0.01648142,  0.98351858],
       [ 0.97990512,