In [1]:
import pandas
import numpy as np
from sklearn import cross_validation
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import StratifiedKFold, train_test_split, KFold
# Column labels are supplied explicitly when the CSV is read; the last
# one ('class') is the column later used as the prediction target.
names = 'preg plas pres skin test mass pedi age class'.split()

# Remote location of the Pima Indians diabetes CSV.
url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/"
    "pima-indians-diabetes/pima-indians-diabetes.data"
)

# Keep both the labelled frame and its raw value matrix; later cells
# slice `array` by column position.
dataframe = pandas.read_csv(url, names=names)
array = dataframe.values




In [2]:
class Ensemble(object):
    """Two-level stacking ensemble.

    The predictions of several base models are used as the input features
    of a second-level model (the "stacker").
    """

    def __init__(self, n_folds, base_models, stacker):
        """Store the configuration; nothing is fitted here.

        n_folds     -- number of folds passed to KFold in ``fit_predict``.
        base_models -- sequence of objects exposing ``fit`` and ``predict``.
        stacker     -- object exposing ``fit`` and ``predict``; it is trained
                       on the base models' predictions.
        """
        self.n_folds = n_folds
        self.base_models = base_models
        self.stacker = stacker

    def predict(self, X):
        """Return the stacker's prediction for the rows of ``X``.

        One column of meta-features is built per base model from that
        model's ``predict(X)``. No fitting happens here: the base models
        and the stacker are used in whatever state they currently hold.
        ``X`` must expose ``.shape``.
        """
        n_rows = X.shape[0]
        meta_features = np.zeros((n_rows, len(self.base_models)))

        for column, model in enumerate(self.base_models):
            meta_features[:, column] = model.predict(X)

        return self.stacker.predict(meta_features)

    def fit_predict(self, train_X, train_y, test_X):
        """Fit the whole stack on the training data and predict ``test_X``.

        For every base model and every fold, the model is fitted on the
        fold's training rows, its predictions for the held-out rows become
        that model's column of training meta-features, and its predictions
        for ``test_X`` are recorded. The per-fold test predictions are then
        averaged into the model's column of test meta-features. Finally the
        stacker is fitted on the training meta-features and its prediction
        for the test meta-features is returned.

        ``fit`` is called once per fold on the same model object.
        """
        train_X, train_y, test_X = (
            np.array(data) for data in (train_X, train_y, test_X)
        )
        n_models = len(self.base_models)

        # Materialise the splits once so every base model sees the same folds.
        splits = list(
            KFold(len(train_y), n_folds=self.n_folds, shuffle=True, random_state=2016)
        )

        out_of_fold = np.zeros((train_X.shape[0], n_models))
        test_meta = np.zeros((test_X.shape[0], n_models))

        for column, model in enumerate(self.base_models):
            # One column of test predictions per fold, averaged afterwards.
            per_fold_test = np.zeros((test_X.shape[0], len(splits)))

            for fold_no, (fit_rows, holdout_rows) in enumerate(splits):
                model.fit(train_X[fit_rows], train_y[fit_rows])

                # Only rows the model was not fitted on feed the stacker.
                out_of_fold[holdout_rows, column] = model.predict(train_X[holdout_rows])
                per_fold_test[:, fold_no] = model.predict(test_X)

            test_meta[:, column] = per_fold_test.mean(1)

        self.stacker.fit(out_of_fold, train_y)
        return self.stacker.predict(test_meta)

In [3]:
# One fixed seed for the hold-out split and every base learner, so the
# printed accuracy is the same on each fresh run. 2016 matches the seed
# already hard-coded for the KFold inside Ensemble.fit_predict.
SEED = 2016

# Level-0 learners whose predictions become the stacker's input features.
clfs = [
        RandomForestClassifier(n_estimators=300, n_jobs=-1, criterion='gini',
                               random_state=SEED),
        ExtraTreesClassifier(n_estimators=300, n_jobs=-1, criterion='gini',
                             random_state=SEED),
        ExtraTreesClassifier(n_estimators=300, n_jobs=-1, criterion='entropy',
                             random_state=SEED),
        GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6,
                                   n_estimators=50, random_state=SEED)
       ]

# Level-1 model fitted on the base learners' predictions.
stacker = LogisticRegression()
n_folds = 10

# First eight columns are the features, the ninth is the target.
X = array[:,0:8]
y = array[:,8]

# Hold out 10% of the rows for the final accuracy estimate.
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=.1,
                                                    random_state=SEED)
en = Ensemble(n_folds, clfs, stacker)

y_test_predict = en.fit_predict(train_X, train_y, test_X)
score = metrics.accuracy_score(test_y, y_test_predict)
# Parenthesised form prints the same text on Python 2 and also parses on Python 3.
print(score)

0.753246753247
