In [1]:
import torch

import numpy as np
import pandas as pd

from lib.models import BaseLearnerRegression, cyclic_cosine_annealing_lr
from sklearn.model_selection import KFold
from tqdm import tqdm
from lib.utils import check_dir
import copy
import time

In [2]:
# Experiment-wide settings used by the cells below.
# Number of times the full cross-validation is repeated per ensemble configuration.
ITERATIONS = 10
# Number of K-fold splits applied to every dataset.
K_SPLITS = 3
# Ensemble size passed as n_estimators to run_data_sampling_ensemble.
N_ESTIMATORS = 25
# Period (in epochs) handed to cyclic_cosine_annealing_lr; also the interval between snapshots.
ANNEALING = 10

In [3]:
class DataSamplingEnsemble():
    """Ensemble of ``BaseLearnerRegression`` models combined by averaging or stacking.

    Diversity between the base learners can come from:

    * row sampling (``max_samples`` / ``bootstrap``),
    * column sampling (``max_features``),
    * snapshots of a learner taken every ``ANNEALING`` epochs (``snapshot``),
    * a negative-correlation penalty (``ncl_lambda`` given to ``add_learner``).

    With ``stacking=True`` a meta-learner is fitted on the base predictions and
    used instead of the plain mean.

    Relies on the notebook-level names ``ANNEALING``, ``BaseLearnerRegression``
    and ``cyclic_cosine_annealing_lr``.
    """

    def __init__(self, max_samples=1.0, max_features=1.0, bootstrap=False, snapshot=False, stacking=False):
        """Store the sampling configuration; learners are added via ``add_learner``.

        Args:
            max_samples: fraction of training rows drawn for each learner.
            max_features: fraction of feature columns drawn for each learner.
            bootstrap: draw the rows with replacement when True.
            snapshot: keep periodic copies of every learner and predict with them.
            stacking: combine base predictions with a trained meta-learner.
        """
        self.max_samples = max_samples
        self.max_features = max_features
        self.bootstrap = bootstrap
        self.learners = []
        # learners_features[i] holds the feature columns learners[i] was trained on.
        self.learners_features = []
        self.snapshot = snapshot
        # snapshot_learners[i] holds the saved copies of learners[i].
        self.snapshot_learners = []
        self.ncl_lambda = None
        self.stacking = stacking
        # Un-sampled training frame, kept so the stacking model sees aligned rows.
        self._train_set = None

    def add_learner(self, n_learners, input_size, hidden_sizes, activation, loss_function, dropout_rates=None, lr=0.001, annealing=None, ncl_lambda=None):
        """Append ``n_learners`` identically configured base learners.

        The learner input width is ``int(input_size * max_features)`` so that it
        matches the number of columns selected in ``setup_data_loaders``.
        """
        self.ncl_lambda = ncl_lambda
        n_inputs = int(input_size * self.max_features)
        for _ in range(n_learners):
            self.learners.append(BaseLearnerRegression(n_inputs, hidden_sizes, activation, loss_function, dropout_rates, lr, annealing, ncl_lambda))

    def setup_data_loaders(self, df_train_set):
        """Draw an individual training sample for every learner.

        Args:
            df_train_set: training frame containing the features and a
                ``'target'`` column.
        """
        # Start from scratch so a second call keeps learners_features[i]
        # aligned with learners[i] instead of appending stale entries.
        self.learners_features = []
        self._train_set = df_train_set

        for learner in self.learners:
            _df_train_set = df_train_set.copy()

            # Sampling features
            if self.max_features < 1.0:
                features = _df_train_set.drop('target', axis=1)
                n_selected_features = int(len(features.columns) * self.max_features)
                _df_train_set = features.sample(n_selected_features, axis=1)
                _df_train_set['target'] = df_train_set['target']

            # Sampling samples (also reshuffles the rows when frac == 1.0)
            _df_train_set = _df_train_set.sample(frac=self.max_samples, replace=self.bootstrap)

            # Setup data loaders
            learner.setup_data_loaders(_df_train_set)
            self.learners_features.append(_df_train_set.drop(columns=['target']).columns)

    def _base_models_with_features(self):
        """Return ``(model, feature_columns)`` pairs for every predicting model.

        In snapshot mode these are all saved snapshots of every learner,
        otherwise the learners themselves. The order is learner-major and
        defines the ``base_learner_<k>`` columns.
        """
        pairs = []
        for i, learner in enumerate(self.learners):
            models = self.snapshot_learners[i] if self.snapshot else [learner]
            pairs.extend((model, self.learners_features[i]) for model in models)
        return pairs

    def _fit_stacking_model(self):
        """Fit the meta-learner on the base models' training-set predictions."""
        base_models = self._base_models_with_features()
        n_base = len(base_models)
        self.stacking_model = BaseLearnerRegression(input_size=n_base, hidden_sizes=[n_base], activation='relu', loss_function=torch.nn.MSELoss(), lr=0.02)

        if self._train_set is not None:
            # Every learner was fitted on its own independently sampled rows,
            # so their train_data_x tensors are not row-aligned. Predict on the
            # common un-sampled training frame so that each meta-feature row and
            # its target describe the same sample. This uses the same
            # DataFrame-based predict call as predict() below.
            predictions = [model.predict(self._train_set[features]) for model, features in base_models]
            target = self._train_set['target'].to_numpy()
        else:
            # Data loaders were configured outside this class: fall back to the
            # tensors stored on the learners themselves.
            predictions = [model.predict(model.train_data_x) for model, _ in base_models]
            target = base_models[-1][0].train_data_y.numpy()

        df_predictions = pd.DataFrame(predictions).T
        df_predictions['target'] = target
        self.stacking_model.setup_data_loaders(df_predictions)

        self.stacking_model.train(50)

    def train(self, epochs):
        """Train all learners for ``epochs`` epochs, one epoch at a time.

        The learning rate of every epoch comes from
        ``cyclic_cosine_annealing_lr``. In snapshot mode a copy of each learner
        is stored at the end of every ``ANNEALING``-epoch cycle and once more
        after the last epoch. With stacking enabled the meta-learner is fitted
        afterwards.
        """
        if self.snapshot:
            # Make sure every learner owns a snapshot list.
            while len(self.snapshot_learners) < len(self.learners):
                self.snapshot_learners.append([])
        elif epochs < ANNEALING:
            print(f'WARNING: Epochs ({epochs}) is less than annealing ({ANNEALING}) for a non-snapshot ensemble.')

        for epoch in range(epochs):
            # Calculating consensus for NCL
            consensus = None
            if self.ncl_lambda is not None:
                # NOTE(review): each learner predicts on its own sampled
                # train_data_x, so rows may not line up across learners;
                # confirm how BaseLearnerRegression.train consumes `consensus`.
                predictions = [learner.predict(learner.train_data_x) for learner in self.learners]
                consensus = np.array(predictions).mean(axis=0)

            # A cycle ends on its last epoch; the very last epoch is skipped
            # here because the final state is saved after the loop.
            save_snapshot = self.snapshot and ((epoch % ANNEALING) == (ANNEALING - 1)) and (epoch != 0) and (epoch != (epochs - 1))

            # Training
            for i, learner in enumerate(self.learners):
                # Adjusting learning rate
                lr_update = cyclic_cosine_annealing_lr(learner.lr, ANNEALING, 0, epoch)

                # Training for one epoch
                learner.train(1, consensus=consensus, lr_update=lr_update)

                # Saving snapshot
                if save_snapshot:
                    self.snapshot_learners[i].append(copy.deepcopy(learner))

        if self.snapshot:
            # Save the final state of EVERY learner (not only the last one).
            for i, learner in enumerate(self.learners):
                self.snapshot_learners[i].append(copy.deepcopy(learner))

        if self.stacking:
            self._fit_stacking_model()

    def predict(self, df_test_set):
        """Predict the target for ``df_test_set``.

        Returns:
            A tuple ``(prediction, df_predictions)``: the ensemble prediction
            (meta-learner output when stacking, otherwise the mean over base
            models) and a frame with one ``base_learner_<k>`` column per base
            model.
        """
        predictions = [model.predict(df_test_set[features]) for model, features in self._base_models_with_features()]

        df_predictions = pd.DataFrame(predictions, index=[f'base_learner_{i}' for i in range(len(predictions))]).T

        # Stacking
        if self.stacking:
            stacked_predictions = self.stacking_model.predict(df_predictions)
            return stacked_predictions, df_predictions

        mean_prediction = np.array(predictions).mean(axis=0)
        return mean_prediction, df_predictions

In [4]:
# Load the parameter-search results and keep the first row holding the smallest `value`.
df_parameters = pd.read_csv('results/parameter_search_results.csv')
best_value = df_parameters.value.min()
parameters = df_parameters[df_parameters.value == best_value].iloc[0]

# Print layer sizes, activation and learning rate of the four candidate models m0..m3.
for model_id in range(4):
    prefix = 'params_m' + str(model_id) + '_'
    print(f'm{model_id}', *(parameters[prefix + field] for field in ('n_units_l1', 'n_units_l2', 'activation', 'lr')))

m0 256 16 relu 0.020237275398132
m1 16 32 tanh 0.0692863240945767
m2 128 64 relu 0.0303691484592233
m3 32 32 tanh 0.0948870733809866


In [5]:
# Per-dataset, per-fold model choice recorded for the selected trial.
df_selected_model = pd.read_csv(f'results/trials/{parameters.number}.csv')

# Show ten random datasets with the chosen model of each fold side by side.
selected_model_by_fold = df_selected_model.pivot(index='dataset', columns='fold', values='model')
selected_model_by_fold.sample(10)

fold,0,1,2
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
data/train-datasets/geographical_origin_of_music.csv,m0_,m2_,m2_
data/train-datasets/health_insurance.csv,m2_,m2_,m0_
data/train-datasets/Moneyball.csv,m0_,m2_,m2_
data/train-datasets/solar_flare.csv,m1_,m2_,m1_
data/train-datasets/abalone.csv,m1_,m2_,m2_
data/train-datasets/cps88wages.csv,m2_,m0_,m0_
data/train-datasets/california_housing.csv,m2_,m1_,m0_
data/train-datasets/socmob.csv,m2_,m2_,m2_
data/train-datasets/white_wine.csv,m0_,m0_,m2_
data/train-datasets/energy_efficiency.csv,m3_,m2_,m0_


In [9]:
# Count how often each candidate model was chosen across folds 0, 1 and 2 of all datasets.
fold_models = df_selected_model.pivot(index='dataset', columns='fold', values='model')
pd.concat([fold_models[fold] for fold in range(3)]).value_counts()

m2_    50
m0_    39
m1_     9
m3_     7
Name: count, dtype: int64

In [None]:
class NegativeCorrelationLoss(torch.nn.Module):
    """Loss for negative correlation learning.

    Computes ``0.5 * MSE(outputs, labels) - ncl_lambda * MSE(outputs, consensus)``:
    the first term fits the labels, the second rewards predictions that differ
    from the ensemble consensus.
    """

    def __init__(self):
        super().__init__()

    def forward(self, outputs, labels, ncl_lambda, consensus):
        """Return the scalar loss tensor.

        Args:
            outputs: predictions of the learner being trained (tensor).
            labels: regression targets (tensor).
            ncl_lambda: weight of the diversity term.
            consensus: ensemble mean prediction; array-like or tensor. It is
                treated as a constant (no gradient flows through it).

        NOTE(review): ``mse_loss`` broadcasts when shapes differ, so
        ``consensus`` and ``labels`` should have the shape of ``outputs``.
        """
        # as_tensor accepts NumPy arrays and tensors without the copy-construct
        # warning of torch.tensor(), and places the data on the device of
        # `outputs`; detach() keeps the consensus out of the autograd graph.
        consensus = torch.as_tensor(consensus, dtype=torch.float, device=outputs.device).detach()
        fit_term = 0.5 * torch.nn.functional.mse_loss(outputs, labels)
        diversity_term = ncl_lambda * torch.nn.functional.mse_loss(outputs, consensus)
        return fit_term - diversity_term

In [None]:
def run_data_sampling_ensemble(df_parameters, ensemble_name, iterations, k_splits, n_estimators, max_samples=1.0, max_features=1.0, bootstrap=False, dropout_rates=None, snapshot=False, ncl_lambda=None, stacking=False):
    """Cross-validate one ensemble configuration on every dataset and save the results.

    For each repetition and each dataset listed in the notebook-level
    ``df_selected_model`` frame, one ``DataSamplingEnsemble`` is trained per
    K-fold split. The out-of-fold predictions are written to
    ``results/ensemble/regression/{ensemble_name}/{dataset}_{iteration}.csv``
    and a run log to ``results/ensemble/regression/training_logging.csv``.

    Relies on the notebook globals ``df_selected_model`` (model chosen per
    dataset and fold), ``parameters`` (hyper-parameters of the selected trial)
    and ``ANNEALING``.

    Args:
        df_parameters: not used by the function body; kept so existing calls keep working.
        ensemble_name: name of the output sub-directory and of the log entries.
        iterations: number of repetitions of the whole cross-validation.
        k_splits: number of K-fold splits.
        n_estimators: number of base learners, or of snapshots when
            ``snapshot`` is True.
        max_samples, max_features, bootstrap, snapshot, stacking: forwarded to
            ``DataSamplingEnsemble``.
        dropout_rates: forwarded to every base learner.
        ncl_lambda: when not None, ``NegativeCorrelationLoss`` replaces the MSE
            loss and this value is forwarded as the penalty weight.
    """
    output_dir = f'results/ensemble/regression/{ensemble_name}'
    logging_info = []
    test_id = -1
    for iteration in tqdm(range(iterations)):
        for fname in df_selected_model.dataset.unique():
            test_id += 1
            start_timer = time.time()

            df = pd.read_csv(fname)
            input_size = df.shape[1] - 1  # every column except the target

            # Fixed seed: a given dataset is split identically on every call.
            kf = KFold(n_splits=k_splits, shuffle=True, random_state=42)

            fold_outputs = []
            for fold_id, (train_idx, test_idx) in enumerate(kf.split(df)):
                # Hyper-parameters of the model selected for this dataset and fold.
                selection = df_selected_model[(df_selected_model.dataset == fname) & (df_selected_model.fold == fold_id)].iloc[0]
                bmodel = 'params_' + selection.model

                ensemble = DataSamplingEnsemble(max_samples=max_samples, max_features=max_features, bootstrap=bootstrap, snapshot=snapshot, stacking=stacking)

                loss_function = NegativeCorrelationLoss() if ncl_lambda is not None else torch.nn.MSELoss()

                # A snapshot ensemble trains ONE learner for n_estimators
                # annealing cycles (one snapshot per cycle); otherwise
                # n_estimators learners are trained for a single cycle.
                n_learners = 1 if snapshot else n_estimators
                epochs = ANNEALING * n_estimators if snapshot else ANNEALING

                ensemble.add_learner(
                    n_learners,
                    input_size,
                    [parameters[bmodel + 'n_units_l1'], parameters[bmodel + 'n_units_l2']],
                    parameters[bmodel + 'activation'],
                    loss_function,
                    dropout_rates,
                    parameters[bmodel + 'lr'],
                    ANNEALING,
                    ncl_lambda)

                ensemble.setup_data_loaders(df.iloc[train_idx].copy())
                ensemble.train(epochs=epochs)

                pred, base_pred = ensemble.predict(df.iloc[test_idx].copy())
                fold_outputs.append((test_idx, pred, base_pred))

            # Assemble the out-of-fold predictions in a single frame instead of
            # adding columns to `df` one at a time.
            base_columns = [f'base_learner_{k}' for k in range(fold_outputs[-1][2].shape[1])]
            oof = pd.DataFrame(np.nan, index=df.index, columns=['pred'] + base_columns)
            for test_idx, pred, base_pred in fold_outputs:
                oof.loc[test_idx, 'pred'] = pred
                oof.loc[test_idx, base_columns] = base_pred[base_columns].to_numpy()
            result = pd.concat([df[['target']], oof], axis=1)

            check_dir(output_dir)

            dataset_name = fname.split('/')[-1].split('.')[0]
            result.to_csv(f'{output_dir}/{dataset_name}_{iteration}.csv', index=False)

            logging_info.append({
                'test_id': test_id,
                'name': ensemble_name,
                'n_estimators': n_estimators,
                'max_samples': max_samples,
                'max_features': max_features,
                'bootstrap': bootstrap,
                'dropout_rates': dropout_rates,
                'snapshot': snapshot,
                'ncl_lambda': ncl_lambda,
                'stacking': stacking,
                # Exact-match rate; kept only so the log keeps its columns. It
                # carries no information for a regression target.
                'accuracy': (result['target'] == result['pred']).mean(),
                'time': time.time() - start_timer,
                # Out-of-fold mean squared error: the meaningful regression metric.
                'mse': float(((result['target'] - result['pred']) ** 2).mean()),
            })
            # NOTE(review): this path is shared by all ensembles, so every call
            # of this function overwrites the log written by the previous one.
            pd.DataFrame(logging_info).to_csv('results/ensemble/regression/training_logging.csv', index=False)

# Testing single model

In [None]:
# Baseline: n_estimators=1 with every ensemble option left at its default, i.e. one base learner per fold.
run_data_sampling_ensemble(df_parameters,
                           ensemble_name='single_model',
                           iterations=ITERATIONS,
                           k_splits=K_SPLITS,
                           n_estimators=1)

# Lv0

## Testing Ensembles with 25 Estimators (Lv0)

In [None]:
# Simple averaging: N_ESTIMATORS learners, each trained on all training rows and features; predictions are averaged.
run_data_sampling_ensemble(df_parameters,
                           ensemble_name=f'simple_average-{N_ESTIMATORS}',
                           iterations=ITERATIONS,
                           k_splits=K_SPLITS,
                           n_estimators=N_ESTIMATORS)

In [None]:
# Bagging: bootstrap=True makes every learner train on rows drawn with replacement.
run_data_sampling_ensemble(df_parameters,
                           ensemble_name=f'bagging-{N_ESTIMATORS}',
                           iterations=ITERATIONS,
                           k_splits=K_SPLITS,
                           n_estimators=N_ESTIMATORS,
                           bootstrap=True)

In [None]:
# Random subspaces: max_features=0.7 gives every learner its own random 70% of the feature columns.
run_data_sampling_ensemble(df_parameters,
                           ensemble_name=f'random_subspaces-{N_ESTIMATORS}',
                           iterations=ITERATIONS,
                           k_splits=K_SPLITS,
                           n_estimators=N_ESTIMATORS,
                           max_features=0.7)

In [None]:
# Pasting: max_samples=0.7 gives every learner a random 70% of the training rows, drawn without replacement.
run_data_sampling_ensemble(df_parameters,
                           ensemble_name=f'pasting-{N_ESTIMATORS}',
                           iterations=ITERATIONS,
                           k_splits=K_SPLITS,
                           n_estimators=N_ESTIMATORS,
                           max_samples=0.7)

In [None]:
# Dropout: dropout_rates is forwarded to every base learner (presumably one rate per hidden layer; confirm in lib.models).
run_data_sampling_ensemble(df_parameters,
                           ensemble_name=f'dropout-{N_ESTIMATORS}',
                           iterations=ITERATIONS,
                           k_splits=K_SPLITS,
                           n_estimators=N_ESTIMATORS,
                           dropout_rates=[0.2, 0.2])

In [None]:
# Snapshot ensemble: one learner is trained for 10 * N_ESTIMATORS epochs and a copy is saved every ANNEALING epochs.
run_data_sampling_ensemble(df_parameters,
                           ensemble_name=f'snapshot-{N_ESTIMATORS}',
                           iterations=ITERATIONS,
                           k_splits=K_SPLITS,
                           n_estimators=N_ESTIMATORS,
                           snapshot=True)

In [None]:
# Negative correlation learning: ncl_lambda=0.1 switches the loss to NegativeCorrelationLoss with that penalty weight.
run_data_sampling_ensemble(df_parameters,
                           ensemble_name=f'negative_correlation_learning-{N_ESTIMATORS}',
                           iterations=ITERATIONS,
                           k_splits=K_SPLITS,
                           n_estimators=N_ESTIMATORS,
                           ncl_lambda=0.1)

In [None]:
# Stacking: a meta-learner fitted on the base learners' predictions replaces the plain average.
run_data_sampling_ensemble(df_parameters,
                           ensemble_name=f'stacking-{N_ESTIMATORS}',
                           iterations=ITERATIONS,
                           k_splits=K_SPLITS,
                           n_estimators=N_ESTIMATORS,
                           stacking=True)

# Lv1

## Testing Ensembles with 25 Estimators (Lv1)

In [None]:
# Bagging + random subspaces: rows drawn with replacement and a random 70% of the feature columns per learner.
run_data_sampling_ensemble(df_parameters,
                           ensemble_name=f'bagging-random_subspaces-{N_ESTIMATORS}',
                           iterations=ITERATIONS,
                           k_splits=K_SPLITS,
                           n_estimators=N_ESTIMATORS,
                           bootstrap=True,
                           max_features=0.7)

In [None]:
# Bagging + pasting: every learner gets a sample of 70% of the training-set size, drawn with replacement.
run_data_sampling_ensemble(df_parameters,
                           ensemble_name=f'bagging-pasting-{N_ESTIMATORS}',
                           iterations=ITERATIONS,
                           k_splits=K_SPLITS,
                           n_estimators=N_ESTIMATORS,
                           bootstrap=True,
                           max_samples=0.7)

In [None]:
# Bagging + dropout: bootstrap row sampling, with dropout_rates forwarded to every base learner.
run_data_sampling_ensemble(df_parameters,
                           ensemble_name=f'bagging-dropout-{N_ESTIMATORS}',
                           iterations=ITERATIONS,
                           k_splits=K_SPLITS,
                           n_estimators=N_ESTIMATORS,
                           bootstrap=True,
                           dropout_rates=[0.2, 0.2])

In [None]:
# Bagging + snapshot: a single learner trained on one bootstrap sample, with a copy saved every ANNEALING epochs.
run_data_sampling_ensemble(df_parameters,
                           ensemble_name=f'bagging-snapshot-{N_ESTIMATORS}',
                           iterations=ITERATIONS,
                           k_splits=K_SPLITS,
                           n_estimators=N_ESTIMATORS,
                           bootstrap=True,
                           snapshot=True)

In [None]:
# Bagging + negative correlation learning: bootstrap row sampling, trained with NegativeCorrelationLoss (penalty weight 0.1).
run_data_sampling_ensemble(df_parameters,
                           ensemble_name=f'bagging-negative_correlation_learning-{N_ESTIMATORS}',
                           iterations=ITERATIONS,
                           k_splits=K_SPLITS,
                           n_estimators=N_ESTIMATORS,
                           bootstrap=True,
                           ncl_lambda=0.1)

In [None]:
# Bagging + stacking: bootstrap row sampling, with base predictions combined by a trained meta-learner.
run_data_sampling_ensemble(df_parameters,
                           ensemble_name=f'bagging-stacking-{N_ESTIMATORS}',
                           iterations=ITERATIONS,
                           k_splits=K_SPLITS,
                           n_estimators=N_ESTIMATORS,
                           bootstrap=True,
                           stacking=True)

In [None]:
# Random subspaces + pasting: every learner gets a random 70% of the feature columns and 70% of the rows (no replacement).
run_data_sampling_ensemble(df_parameters,
                           ensemble_name=f'random_subspaces-pasting-{N_ESTIMATORS}',
                           iterations=ITERATIONS,
                           k_splits=K_SPLITS,
                           n_estimators=N_ESTIMATORS,
                           max_features=0.7,
                           max_samples=0.7)

In [None]:
# Random subspaces + dropout: a random 70% of the feature columns per learner, with dropout_rates forwarded to every learner.
run_data_sampling_ensemble(df_parameters,
                           ensemble_name=f'random_subspaces-dropout-{N_ESTIMATORS}',
                           iterations=ITERATIONS,
                           k_splits=K_SPLITS,
                           n_estimators=N_ESTIMATORS,
                           max_features=0.7,
                           dropout_rates=[0.2, 0.2])

In [None]:
# Random subspaces + snapshot: a single learner on one random 70% feature subset, with a copy saved every ANNEALING epochs.
run_data_sampling_ensemble(df_parameters,
                           ensemble_name=f'random_subspaces-snapshot-{N_ESTIMATORS}',
                           iterations=ITERATIONS,
                           k_splits=K_SPLITS,
                           n_estimators=N_ESTIMATORS,
                           max_features=0.7,
                           snapshot=True)

In [None]:
# Random subspaces + negative correlation learning: 70% feature subsets, trained with NegativeCorrelationLoss (penalty weight 0.1).
run_data_sampling_ensemble(df_parameters,
                           ensemble_name=f'random_subspaces-negative_correlation_learning-{N_ESTIMATORS}',
                           iterations=ITERATIONS,
                           k_splits=K_SPLITS,
                           n_estimators=N_ESTIMATORS,
                           max_features=0.7,
                           ncl_lambda=0.1)

In [None]:
# Random subspaces + stacking: 70% feature subsets, with base predictions combined by a trained meta-learner.
run_data_sampling_ensemble(df_parameters,
                           ensemble_name=f'random_subspaces-stacking-{N_ESTIMATORS}',
                           iterations=ITERATIONS,
                           k_splits=K_SPLITS,
                           n_estimators=N_ESTIMATORS,
                           max_features=0.7,
                           stacking=True)

In [None]:
# Pasting + dropout: 70% of the rows per learner (no replacement), with dropout_rates forwarded to every learner.
run_data_sampling_ensemble(df_parameters,
                           ensemble_name=f'pasting-dropout-{N_ESTIMATORS}',
                           iterations=ITERATIONS,
                           k_splits=K_SPLITS,
                           n_estimators=N_ESTIMATORS,
                           max_samples=0.7,
                           dropout_rates=[0.2, 0.2])

In [None]:
# Pasting + snapshot: a single learner on one random 70% of the rows, with a copy saved every ANNEALING epochs.
run_data_sampling_ensemble(df_parameters,
                           ensemble_name=f'pasting-snapshot-{N_ESTIMATORS}',
                           iterations=ITERATIONS,
                           k_splits=K_SPLITS,
                           n_estimators=N_ESTIMATORS,
                           max_samples=0.7,
                           snapshot=True)

In [None]:
# Pasting + negative correlation learning: 70% of the rows per learner, trained with NegativeCorrelationLoss (penalty weight 0.1).
run_data_sampling_ensemble(df_parameters,
                           ensemble_name=f'pasting-negative_correlation_learning-{N_ESTIMATORS}',
                           iterations=ITERATIONS,
                           k_splits=K_SPLITS,
                           n_estimators=N_ESTIMATORS,
                           max_samples=0.7,
                           ncl_lambda=0.1)

In [None]:
# Pasting + stacking: 70% of the rows per learner, with base predictions combined by a trained meta-learner.
run_data_sampling_ensemble(df_parameters,
                           ensemble_name=f'pasting-stacking-{N_ESTIMATORS}',
                           iterations=ITERATIONS,
                           k_splits=K_SPLITS,
                           n_estimators=N_ESTIMATORS,
                           max_samples=0.7,
                           stacking=True)

In [None]:
# Dropout + snapshot: a single learner created with dropout_rates, with a copy saved every ANNEALING epochs.
run_data_sampling_ensemble(df_parameters,
                           ensemble_name=f'dropout-snapshot-{N_ESTIMATORS}',
                           iterations=ITERATIONS,
                           k_splits=K_SPLITS,
                           n_estimators=N_ESTIMATORS,
                           dropout_rates=[0.2, 0.2],
                           snapshot=True)

In [None]:
# Dropout + negative correlation learning: learners created with dropout_rates, trained with NegativeCorrelationLoss (penalty weight 0.1).
run_data_sampling_ensemble(df_parameters,
                           ensemble_name=f'dropout-negative_correlation_learning-{N_ESTIMATORS}',
                           iterations=ITERATIONS,
                           k_splits=K_SPLITS,
                           n_estimators=N_ESTIMATORS,
                           dropout_rates=[0.2, 0.2],
                           ncl_lambda=0.1)

In [None]:
# Dropout + stacking: learners created with dropout_rates, with base predictions combined by a trained meta-learner.
run_data_sampling_ensemble(df_parameters,
                           ensemble_name=f'dropout-stacking-{N_ESTIMATORS}',
                           iterations=ITERATIONS,
                           k_splits=K_SPLITS,
                           n_estimators=N_ESTIMATORS,
                           dropout_rates=[0.2, 0.2],
                           stacking=True)

In [None]:
# Snapshot + negative correlation learning: a single learner trained with NegativeCorrelationLoss (penalty weight 0.1), with a copy saved every ANNEALING epochs.
run_data_sampling_ensemble(df_parameters,
                           ensemble_name=f'snapshot-negative_correlation_learning-{N_ESTIMATORS}',
                           iterations=ITERATIONS,
                           k_splits=K_SPLITS,
                           n_estimators=N_ESTIMATORS,
                           snapshot=True,
                           ncl_lambda=0.1)

In [None]:
# Snapshot + stacking: the saved snapshots of a single learner, combined by a trained meta-learner.
run_data_sampling_ensemble(df_parameters,
                           ensemble_name=f'snapshot-stacking-{N_ESTIMATORS}',
                           iterations=ITERATIONS,
                           k_splits=K_SPLITS,
                           n_estimators=N_ESTIMATORS,
                           snapshot=True,
                           stacking=True)

In [None]:
# Negative correlation learning + stacking: learners trained with NegativeCorrelationLoss (penalty weight 0.1), combined by a trained meta-learner.
run_data_sampling_ensemble(df_parameters,
                           ensemble_name=f'negative_correlation_learning-stacking-{N_ESTIMATORS}',
                           iterations=ITERATIONS,
                           k_splits=K_SPLITS,
                           n_estimators=N_ESTIMATORS,
                           ncl_lambda=0.1,
                           stacking=True)

# Sensitivity analysis

In [None]:
# Sensitivity analysis: dropout + snapshot with n_estimators=10, i.e. a single learner trained for 100 epochs.
run_data_sampling_ensemble(df_parameters,
                           ensemble_name=f'dropout-snapshot-{10}',
                           iterations=ITERATIONS,
                           k_splits=K_SPLITS,
                           n_estimators=10,
                           dropout_rates=[0.2, 0.2],
                           snapshot=True)

In [None]:
# Sensitivity analysis: dropout + snapshot with n_estimators=100, i.e. a single learner trained for 1000 epochs.
run_data_sampling_ensemble(df_parameters,
                           ensemble_name=f'dropout-snapshot-{100}',
                           iterations=ITERATIONS,
                           k_splits=K_SPLITS,
                           n_estimators=100,
                           dropout_rates=[0.2, 0.2],
                           snapshot=True)

In [None]:
# Sensitivity analysis: dropout + snapshot with n_estimators=200, i.e. a single learner trained for 2000 epochs.
run_data_sampling_ensemble(df_parameters,
                           ensemble_name=f'dropout-snapshot-{200}',
                           iterations=ITERATIONS,
                           k_splits=K_SPLITS,
                           n_estimators=200,
                           dropout_rates=[0.2, 0.2],
                           snapshot=True)