In [3]:
from sklearn.metrics import log_loss
import numpy as np

from random_forest import RandomForest
from extra_trees import ExtraTrees
from logistic_regression import LogisticRegression
from stacked_generalization import StackedGeneralization
from generalizer import Generalizer
from sklearn import datasets

In [4]:
def load_bio_data(train_path='train.csv', test_path='test.csv'):
    """Load the training and test sets from comma-separated text files.

    Both files are read with ``np.loadtxt`` and their first line is skipped
    (treated as a header row). The first column of the training file is
    returned as the target; the remaining columns are the training features.

    Parameters
    ----------
    train_path : str, optional
        Path of the training file. Defaults to ``'train.csv'``.
    test_path : str, optional
        Path of the test file. Defaults to ``'test.csv'``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train_data, train_target, test_data)`` where ``train_data`` and
        ``test_data`` are always two-dimensional and ``train_target`` is
        one-dimensional.
    """
    # ndmin=2 keeps a file with a single data row (or a single column)
    # two-dimensional; without it loadtxt squeezes the result to 1-D and the
    # column slicing below raises IndexError.
    raw_train = np.loadtxt(train_path, delimiter=',', skiprows=1, ndmin=2)
    train_target = raw_train[:, 0]
    train_data = raw_train[:, 1:]
    test_data = np.loadtxt(test_path, delimiter=',', skiprows=1, ndmin=2)
    return train_data, train_target, test_data

In [5]:
def load_iris_data():
    """Return the iris dataset as ``(train_data, train_target, test_data)``.

    The dataset is obtained from ``datasets.load_iris()``. Its ``data``
    attribute is returned both as the training features and as the test
    data, and its ``target`` attribute as the training target.
    """
    bunch = datasets.load_iris()
    features, labels = bunch.data, bunch.target
    # No separate hold-out set is built here: the same feature array is
    # handed back as the test data.
    return features, labels, features

In [6]:
def train_layer0(sg, generalizers, save_predictions=True):
    """Collect layer-0 predictions from every generalizer.

    For each generalizer, ``guess_partial(sg)`` and ``guess_whole(sg)`` are
    called and the results are stacked into two arrays whose first axis
    follows the order of ``generalizers``. The log loss of every partial
    guess against ``sg.train_target`` is printed.

    Parameters
    ----------
    sg
        Object passed to each generalizer; its ``train_target`` attribute is
        used as the ground truth for the log loss.
    generalizers
        Sequence of objects exposing ``guess_partial``, ``guess_whole`` and
        ``name``.
    save_predictions : bool, optional
        When true, each partial and whole guess is also handed to
        ``Generalizer.save_partial`` / ``Generalizer.save_whole`` under the
        generalizer's name.

    Returns
    -------
    tuple of numpy.ndarray
        ``(layer0_partition_guess, layer0_whole_guess)``.
    """
    # All partial guesses are produced before anything is saved or printed.
    partial_stack = np.array([model.guess_partial(sg) for model in generalizers])

    for position, model in enumerate(generalizers):
        if save_predictions:
            Generalizer.save_partial(model.name(), partial_stack[position])
        label = model.name()
        # The explicit [position, :, :] index requires the stack to have at
        # least three dimensions.
        score = log_loss(sg.train_target, partial_stack[position, :, :])
        print(f"log loss for {label}: {score}")

    whole_stack = np.array([model.guess_whole(sg) for model in generalizers])
    if save_predictions:
        for position, model in enumerate(generalizers):
            Generalizer.save_whole(model.name(), whole_stack[position])

    return partial_stack, whole_stack

In [7]:
def load_layer0(filenames):
    """Reload previously saved layer-0 predictions.

    Every entry of ``filenames`` is passed to ``Generalizer.load_partial``
    and then to ``Generalizer.load_whole``; the results of each pass are
    stacked into one array in the order of ``filenames``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(layer0_partial_guess, layer0_whole_guess)``.
    """
    # All partial guesses are loaded first, then all whole guesses.
    partial_stack = np.array(list(map(Generalizer.load_partial, filenames)))
    whole_stack = np.array(list(map(Generalizer.load_whole, filenames)))
    return partial_stack, whole_stack

In [12]:
def initialize_sg():
    """Build a ``StackedGeneralization`` instance from the iris data.

    The data comes from ``load_iris_data()`` and is passed to the constructor
    together with a fixed split count of 3.
    """
    # presumably the number of cross-validation folds -- confirm against
    # StackedGeneralization
    fold_count = 3
    features, labels, holdout = load_iris_data()
    return StackedGeneralization(fold_count, features, labels, holdout)

In [15]:
# Build the stacking context and reload the layer-0 predictions that were
# saved under the names "random_forest" and "extra_trees".
sg = initialize_sg()
layer0_partition_guess, layer0_whole_guess = load_layer0(["random_forest", "extra_trees"])

# Layer 1: the per-model guesses are concatenated column-wise and handed to
# the logistic-regression generalizer together with the training target.
# NOTE(review): presumably guess() fits on the first two arguments and
# predicts for the third -- confirm against LogisticRegression.guess.
result = LogisticRegression().guess(
    np.hstack(layer0_partition_guess),
    sg.train_target,
    np.hstack(layer0_whole_guess),
)

# 1-based row ids, one per test sample.
id_columns = np.arange(1, len(sg.test_data) + 1)

# The original cell referenced an undefined name `id_column` here, which
# raised NameError on a fresh kernel; it now uses `id_columns` defined above.
# NOTE(review): `result[:, ]` is the same as `result[:]`, i.e. the whole
# array. The two-field fmt below only works if `result` is one-dimensional
# with one value per test row; if guess() returns per-class probabilities a
# column index is probably missing -- confirm the intended column.
np.savetxt('predicted.csv', np.array([id_columns, result[:, ]]).T,
           fmt='%d, %1.6f',
           header='MoleculeId, PredictedProbability',
           comments='')

ValueError: k-fold cross-validation requires at least one train/test split by setting n_splits=2 or more, got n_splits=1.