In [2]:
import pandas as pd
import numpy as np
from scipy.optimize import minimize
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
import os

os.system("ls ./")

train = pd.read_csv("./train.csv")
print train.shape
print("Training set has {0[0]} rows and {0[1]} columns".format(train.shape))

labels = train['target']
train.drop(['target', 'id'], axis=1, inplace=True)


#print("head result: "+str(train.head()))

### we need a test set that we didn't train on to find the best weights for combining the classifiers
sss = StratifiedShuffleSplit(labels, test_size=0.05, random_state=1234)
for train_index, test_index in sss:
    break

train_x, train_y = train.values[train_index], labels.values[train_index]
test_x, test_y = train.values[test_index], labels.values[test_index]


### building the classifiers
clfs = []

rfc = RandomForestClassifier(n_estimators=20, random_state=4141, n_jobs=-1)
rfc.fit(train_x, train_y)
print('RFC LogLoss {score}'.format(score=log_loss(test_y, rfc.predict_proba(test_x))))
clfs.append(rfc)

### usually you'd use xgboost and neural nets here

logreg = LogisticRegression()
logreg.fit(train_x, train_y)
print('LogisticRegression LogLoss {score}'.format(score=log_loss(test_y, logreg.predict_proba(test_x))))
clfs.append(logreg)

(61878, 95)
Training set has 61878 rows and 95 columns
RFC LogLoss 0.952919762767
LogisticRegression LogLoss 0.672480385582


In [7]:

### finding the optimum weights

predictions = []
for clf in clfs:
    predictions.append(clf.predict_proba(test_x))
    print 'the predictions size: ', len(predictions)
    print 'first row size: ', len(predictions[0])
    print 'element type: ', type(predictions[0])

temp = np.asarray(predictions)
print 'full shape: ', temp.shape

def log_loss_func(weights):
    ''' scipy minimize will pass the weights as a numpy array '''
    final_prediction = 0
    #print 'the weights: ', weights
    for weight, prediction in zip(weights, predictions):
        print weight, prediction.shape
        #print len(prediction)
        final_prediction += weight*prediction
        #print final_prediction.shape
    return log_loss(test_y, final_prediction)

#the algorithms need a starting value, right not we chose 0.5 for all weights
#its better to choose many random starting points and run minimize a few times
starting_values = [0.5]*len(predictions)
#print starting_values
#adding constraints  and a different solver as suggested by user 16universe
#https://kaggle2.blob.core.windows.net/forum-message-attachments/75655/2393/otto%20model%20weights.pdf?sv=2012-02-12&se=2015-05-03T21%3A22%3A17Z&sr=b&sp=r&sig=rkeA7EJC%2BiQ%2FJ%2BcMpcA4lYQLFh6ubNqs2XAkGtFsAv0%3D
cons = ({'type':'eq','fun':lambda w: 1-sum(w)})
#our weights are bound between 0 and 1
bounds = [(0,1)]*len(predictions)
#print type(starting_values)
res = minimize(log_loss_func, starting_values, method='SLSQP', bounds=bounds, constraints=cons)

print('Ensamble Score: {best_score}'.format(best_score=res['fun']))
print('Best Weights: {weights}'.format(weights=res['x']))


the predictions size:  1
first row size:  3094
element type:  <type 'numpy.ndarray'>
the predictions size:  2
first row size:  3094
element type:  <type 'numpy.ndarray'>
full shape:  (2, 3094, 9)
0.5 (3094, 9)
0.5 (3094, 9)
0.5 (3094, 9)
0.5 (3094, 9)
0.500000014901 (3094, 9)
0.5 (3094, 9)
0.5 (3094, 9)
0.500000014901 (3094, 9)
0.524485822767 (3094, 9)
0.475514177233 (3094, 9)
0.524485822767 (3094, 9)
0.475514177233 (3094, 9)
0.524485837668 (3094, 9)
0.475514177233 (3094, 9)
0.524485822767 (3094, 9)
0.475514192134 (3094, 9)
0.630896281451 (3094, 9)
0.369103718549 (3094, 9)
0.630896281451 (3094, 9)
0.369103718549 (3094, 9)
0.630896296352 (3094, 9)
0.369103718549 (3094, 9)
0.630896281451 (3094, 9)
0.36910373345 (3094, 9)
0.681274946234 (3094, 9)
0.318725053766 (3094, 9)
0.681274946234 (3094, 9)
0.318725053766 (3094, 9)
0.681274961135 (3094, 9)
0.318725053766 (3094, 9)
0.681274946234 (3094, 9)
0.318725068667 (3094, 9)
0.676185558958 (3094, 9)
0.323814441042 (3094, 9)
0.676185558958 (3094,