In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
%matplotlib inline

In [127]:
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from time import time
import random

In [2]:
# Load the training table from 'train_data.csv' (path is relative to the
# notebook's working directory).
data = pd.read_csv('train_data.csv')

In [70]:
# Feature matrix: every column of `data` except 'label'.
X = data.drop('label', axis = 1)
# Target vector: the 'label' column.
y = data['label']

In [78]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows as a test set; random_state=0 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [79]:
def generate_batch(X=None, y=None, batch_num=5):
    """Split the training data into ``batch_num`` contiguous, ordered folds.

    Rows are taken positionally and in order, so concatenating the returned
    folds reproduces the input row order.

    Parameters
    ----------
    X : pandas.DataFrame, optional
        Feature rows to split. Defaults to the notebook-level ``X_train``.
    y : pandas.Series, optional
        Targets, row-aligned with ``X``. Defaults to the notebook-level
        ``y_train``.
    batch_num : int, optional
        Number of folds to produce (default 5).

    Returns
    -------
    (X_batches, y_batches) : tuple of two lists
        Each list has ``batch_num`` entries. Every fold except the last holds
        ``len(X) // batch_num`` rows; the last fold also takes the remainder.

    Raises
    ------
    ValueError
        If ``batch_num`` is smaller than 1.
    """
    if X is None:
        X = X_train
    if y is None:
        y = y_train
    if batch_num < 1:
        raise ValueError("batch_num must be a positive integer, got %r" % (batch_num,))

    # Floor division: plain `/` yields a float on Python 3, which is not a
    # valid slice bound.
    batch_size = X.shape[0] // batch_num

    X_batches = []
    y_batches = []
    for i in range(batch_num):
        start = i * batch_size
        # An open-ended slice lets the final fold absorb the leftover rows.
        stop = (i + 1) * batch_size if i < batch_num - 1 else None
        # .iloc makes the positional intent explicit regardless of index labels.
        X_batches.append(X.iloc[start:stop])
        y_batches.append(y.iloc[start:stop])
    return X_batches, y_batches

In [131]:
def train_clf(X_batches, y_batches, clf_index):
    """Train one model per fold and collect out-of-fold outputs.

    For fold ``i`` a fresh model of the family named by ``clf_index`` is fitted
    on all other folds, then asked for ``predict_proba`` on fold ``i``; column 0
    of that result is kept.

    Parameters
    ----------
    X_batches : list of pandas.DataFrame
        Feature folds.
    y_batches : list of pandas.Series
        Target folds, in the same order as ``X_batches``.
    clf_index : str
        Model family: 'LR', 'NN', 'RF' or 'GBDT'.

    Returns
    -------
    (clfs, outputs) : tuple
        ``clfs`` is the list of fitted models, one per fold. ``outputs`` is a
        Series (fresh 0..n-1 index) with the column-0 probabilities of every
        fold, concatenated in fold order.

    Raises
    ------
    ValueError
        If ``clf_index`` is not a known family, or if exactly one fold is given
        (there would be nothing left to train on).
    """
    batch_num = len(X_batches)
    if batch_num == 1:
        raise ValueError("train_clf needs at least two batches to hold one out")

    clfs = []
    fold_outputs = []
    for i in range(batch_num):
        # Build the model first so an unknown clf_index fails before any
        # data is copied.
        clf = _build_clf(clf_index)

        # Single concat per fold instead of repeated .append (removed in
        # pandas 2.0 and quadratic in the number of folds).
        X_current = pd.concat(
            [X_batches[j] for j in range(batch_num) if j != i],
            ignore_index=True,
        )
        y_current = pd.concat(
            [y_batches[j] for j in range(batch_num) if j != i]
        )

        clf.fit(X_current, y_current)
        fold_outputs.append(pd.Series(clf.predict_proba(X_batches[i])[:, 0]))
        clfs.append(clf)

    if fold_outputs:
        outputs = pd.concat(fold_outputs, ignore_index=True)
    else:
        # No folds at all: keep returning an empty Series rather than failing.
        outputs = pd.Series(dtype=float)
    return clfs, outputs


def _build_clf(clf_index):
    """Return a new, unfitted model for the family named by ``clf_index``."""
    if clf_index == 'LR':
        return LogisticRegression()
    if clf_index == 'NN':
        return MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(30, 20), random_state=random.randint(1, 100))
    if clf_index == 'RF':
        return RandomForestClassifier(n_estimators=20, random_state=random.randint(1, 100))
    if clf_index == 'GBDT':
        return GradientBoostingClassifier(n_estimators=20, learning_rate=1.0, max_depth=5, random_state=random.randint(1, 100))
    # Previously this case returned None, which only surfaced later as an
    # unrelated unpacking error in the caller.
    raise ValueError("unknown clf_index %r; expected 'LR', 'NN', 'RF' or 'GBDT'" % (clf_index,))

In [132]:
def stack_one(x_batches, y_batches):
    """Run the first stacking level for every model family.

    Calls ``train_clf`` once per family key ('LR', 'NN', 'RF', 'GBDT') on the
    given folds.

    Returns a pair ``(clfs, output)``: ``clfs`` maps each family key to the
    first value returned by ``train_clf`` for it, and ``output`` is a DataFrame
    with one column per family key holding the second returned value.
    """
    clfs = {'LR': [], 'NN': [], 'RF': [], 'GBDT': []}
    output = pd.DataFrame()
    for family in list(clfs):
        fold_models, fold_scores = train_clf(x_batches, y_batches, family)
        clfs[family] = fold_models
        output[family] = fold_scores
    return clfs, output

In [133]:
# Cut the training split into folds, then train the first-level models on them.
# X_mix has one column per model family, filled with that family's
# out-of-fold outputs.
X_batches, y_batches = generate_batch()
clfs, X_mix = stack_one(X_batches, y_batches) 

In [135]:
# Save the first-level outputs to 'mid_train.csv' without writing the row index.
X_mix.to_csv('mid_train.csv', index = False)

In [141]:
def stack_two(X_trian, y_train):
    """Fit the second-level model and return it.

    A ``LogisticRegression`` with default settings is fitted on ``X_trian``
    (features) and ``y_train`` (targets). The spelling of ``X_trian`` is part
    of the signature and is kept so keyword callers continue to work.
    """
    # fit() hands back the estimator itself, so the fitted model is returned directly.
    return LogisticRegression().fit(X_trian, y_train)

In [147]:
def test(clfs, clf_two, X_test, y_test):
    outputs = pd.DataFrame()
    for clf_index in clfs.keys():
        outputs[clf_index] = np.zeros(X_test.shape[0])
        for clf in clfs[clf_index]:
            outputs[clf_index] += clf.predict_proba(X_test)[:, 0]
        outputs[clf_index] /= len(clfs[clf_index])
    res = clf_two.predict_proba(outputs)
    print clf_two.score(outputs, y_test)
    return res

In [143]:
# Fit the second-level model on the first-level outputs. X_mix rows follow the
# fold order, and the folds are contiguous slices of the training split, so they
# line up positionally with y_train.
clf_two = stack_two(X_mix, y_train) 

In [148]:
# Evaluate the full stack on the held-out split: prints clf_two's score and
# keeps its predicted probabilities in `res`.
res = test(clfs, clf_two, X_test, y_test)

0.805881916446


In [150]:
# NOTE: log_loss is also imported in the sklearn imports cell near the top of
# the notebook; this re-import is redundant.
from sklearn.metrics import log_loss
# Log loss of the stacked model's predicted probabilities on the held-out split.
log_loss(y_test, res)

0.40862443709323809