In [1]:
# from data import *
from tools import train_kcv
from ensemble import train_ensemble

In [2]:
def show_top_weights(final_results, n_top=5):
    # Sort by mean_accuracy (top 5 accuracies)
    top_5_accuracies = sorted(final_results, key=lambda x: x['mean_accuracy'], reverse=True)[:n_top]

    # Sort by mean_log_loss (bottom 5 log-losses)
    bottom_5_loglosses = sorted(final_results, key=lambda x: x['mean_log_loss'])[:n_top]

    print(f"Top {n_top} Accuracies:")
    for i, result in enumerate(top_5_accuracies, 1):
        print(f"{i}. Weights: {[round(float(w), 2) for w in result['weights']]} | "
              f"Mean Accuracy: {result['mean_accuracy']:.3f} | "
              f"Std Accuracy: {result['std_accuracy']:.3f} | "
              f"Mean Log Loss: {result['mean_log_loss']:.3f} | "
              f"Std Log Loss: {result['std_log_loss']:.3f}")

    print(f"\nBottom {n_top} Log Losses:")
    for i, result in enumerate(bottom_5_loglosses, 1):
        print(f"{i}. Weights: {[round(float(w), 2) for w in result['weights']]} | "
              f"Mean Accuracy: {result['mean_accuracy']:.3f} | "
              f"Std Accuracy: {result['std_accuracy']:.3f} | "
              f"Mean Log Loss: {result['mean_log_loss']:.3f} | "
              f"Std Log Loss: {result['std_log_loss']:.3f}")


In [None]:
import json
with open('params_xgb.json', 'r') as file:
    xgb_params = json.load(file)

with open('params_rf.json', 'r') as file: 
    rf_params = json.load(file)

results, best_result = train_ensemble(
    X_train, y_train, 
    xgb_params=xgb_params, rf_params=rf_params,
    mean_type='arithmetic'
)

show_top_weights(results, n_top=100)


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Model trainings successful, proceeding to weight combination search...
Weight combination search done.
Top 100 Accuracies:
1. Weights: [0.03, 0.03, 0.37, 0.57] | Mean Accuracy: 0.696 | Std Accuracy: 0.014 | Mean Log Loss: 0.718 | Std Log Loss: 0.009
2. Weights: [0.03, 0.07, 0.37, 0.53] | Mean Accuracy: 0.696 | Std Accuracy: 0.015 | Mean Log Loss: 0.717 | Std Log Loss: 0.009
3. Weights: [0.03, 0.13, 0.37, 0.47] | Mean Accuracy: 0.696 | Std Accuracy: 0.012 | Mean Log Loss: 0.716 | Std Log Loss: 0.009
4. Weights: [0.0, 0.1, 0.33, 0.57] | Mean Accuracy: 0.696 | Std Accuracy: 0.013 | Mean Log Loss: 0.720 | Std Log Loss: 0.009
5

In [5]:
best_result

{'weights': (np.float64(0.23),
  np.float64(0.23),
  np.float64(0.27),
  np.float64(0.27)),
 'mean_accuracy': np.float64(0.6925999999999999),
 'std_accuracy': np.float64(0.008309031231136361),
 'mean_log_loss': np.float64(0.7075902518505988),
 'std_log_loss': np.float64(0.008891453327921791)}

In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter


def create_splits(X, y, train_size=0.8, bias_class=None, bias_ratio=0.6):
    """
    Creates train-validation splits with different class distributions.
    
    Parameters:
    - X: Feature DataFrame
    - y: Target labels (0, 1, 2) as a NumPy array
    - train_size: Proportion of data to be used for training
    - bias_class: Class to over-represent in training (None for balanced split)
    - bias_ratio: Ratio of bias_class in train set (if bias_class is set)
    
    Returns:
    - X_train, X_val, y_train, y_val: Train and validation splits
    """
    if bias_class is None:
        # Standard stratified split
        X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=train_size, stratify=y, random_state=42)
    else:
        # Split each class separately
        indices = np.arange(len(y))
        class_indices = {label: indices[y == label] for label in np.unique(y)}
        
        train_indices, val_indices = [], []
        for label, idx in class_indices.items():
            if label == bias_class:
                train_count = int(len(idx) * bias_ratio)
            else:
                train_count = int(len(idx) * (1 - bias_ratio) / 2)
            
            train_idx, val_idx = train_test_split(idx, train_size=train_count, random_state=42)
            train_indices.extend(train_idx)
            val_indices.extend(val_idx)
        
        X_train, X_val = X.iloc[train_indices], X.iloc[val_indices]
        y_train, y_val = y[train_indices], y[val_indices]
    
    return X_train, X_val, y_train, y_val


In [20]:
from data import X_train as X 
from data import y_train as y

# 1. Balanced split
X_train1, X_val1, y_train1, y_val1 = create_splits(X, y)

# 2. More class 0 in train
X_train2, X_val2, y_train2, y_val2 = create_splits(X, y, bias_class=0, bias_ratio=0.6)

# 3. More class 1 in train
X_train3, X_val3, y_train3, y_val3 = create_splits(X, y, bias_class=1, bias_ratio=0.6)

# 4. More class 2 in train
X_train4, X_val4, y_train4, y_val4 = create_splits(X, y, bias_class=2, bias_ratio=0.6)

# Print class distributions
for i, (y_tr, y_v) in enumerate([(y_train1, y_val1), (y_train2, y_val2), (y_train3, y_val3), (y_train4, y_val4)], 1):
    print(f"Split {i}\n     Train distribution: {Counter(y_tr)}\n     Validation distribution: {Counter(y_v)}\n")


Split 1
     Train distribution: Counter({np.int64(2): 1631, np.int64(0): 1262, np.int64(1): 1107})
     Validation distribution: Counter({np.int64(2): 408, np.int64(0): 315, np.int64(1): 277})

Split 2
     Train distribution: Counter({np.int64(0): 946, np.int64(2): 407, np.int64(1): 276})
     Validation distribution: Counter({np.int64(2): 1632, np.int64(1): 1108, np.int64(0): 631})

Split 3
     Train distribution: Counter({np.int64(1): 830, np.int64(2): 407, np.int64(0): 315})
     Validation distribution: Counter({np.int64(2): 1632, np.int64(0): 1262, np.int64(1): 554})

Split 4
     Train distribution: Counter({np.int64(2): 1223, np.int64(0): 315, np.int64(1): 276})
     Validation distribution: Counter({np.int64(0): 1262, np.int64(1): 1108, np.int64(2): 816})



In [None]:
from base_models import Model
from ensemble import soft_voting, train_ensemble
from model_nn import scale_data

In [None]:
data = {
    1: (X_train1, X_val1, y_train1, y_val1),
    2: (X_train2, X_val2, y_train2, y_val2),
    3: (X_train3, X_val3, y_train3, y_val3),
    4: (X_train4, X_val4, y_train4, y_val4)
}

import json
with open('params_xgb.json', 'r') as file:
    xgb_params = json.load(file)

with open('params_rf.json', 'r') as file: 
    rf_params = json.load(file)

In [28]:
simulation_results = {}

for data_key in data:
    print(f"RUNNING ITERATION {data_key}") 
    _X_train, _X_val, _y_train, _y_val = data[data_key]
    results, best_result = train_ensemble(
        _X_train, _y_train, xgb_params=xgb_params, rf_params=rf_params, mean_type='arithmetic'
    )
    print(show_top_weights(results), "\n")

    simulation_results[data_key] = (results, best_result)

RUNNING ITERATION 1
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Model trainings successful, proceeding to weight combination search...
Weight combination search done.
Top 5 Accuracies:
1. Weights: [0.0, 0.2, 0.37, 0.43] | Mean Accuracy: 0.692 | Std Accuracy: 0.002 | Mean Log Loss: 0.721 | Std Log Loss: 0.011
2. Weights: [0.0, 0.17, 0.33, 0.5] | Mean Accuracy: 0.692 | Std Accuracy: 0.002 | Mean Log Loss: 0.723 | Std Log Loss: 0.011
3. Weights: [0.0, 0.2, 0.33, 0.47] | Mean Accuracy: 0.692 | Std Accuracy: 0.004 | Mean Log Loss: 0.723 | Std Log Loss: 0.011
4. Weights: [0.0, 0.2, 0.4, 0.4] | Mean Accuracy: 0.692 | Std Accuracy: 0.003 | Mean Log Loss: 0.720 | Std Log Los

In [44]:
from typing import List, Tuple 

class Ensemble: 
    def __init__(self, models: List[Model] = None, model_weights: np.ndarray = None):
        if models is None: 
            self.xgb_model = Model(model_type='xgb', xgb_params=xgb_params)
            self.gnb_model = Model(model_type='gnb', selected_features=['x2', 'x3', 'x4', 'x6', 'x8', 'x9', 'x10', 'x11'])
            self.nn_model = Model(model_type='nn', nn_params=X.shape[1]) # TODO: replace with correct parameter later
            self.rf_model = Model(model_type='rf', rf_params=rf_params)
        else: 
            self.models = models
        if model_weights is None: 
            self.model_weights = np.array([0.25, 0.25, 0.25, 0.25])
        else: 
            self.model_weights = model_weights


    def fit(self, X_train, y_train, X_val, y_val):
        X_train_scaled, X_val_scaled = scale_data(X_train, X_val)

        self.xgb_model.fit(X_train, y_train)
        self.gnb_model.fit(X_train, y_train)
        self.nn_model.fit(X_train_scaled, y_train, X_val=X_val_scaled, y_val=y_val, verbose=0)
        self.rf_model.fit(X_train, y_train)
    
    def predict(self, X_pred, weights=None, mean_type='arithmetic') -> Tuple[np.ndarray, np.ndarray]:
        _, X_pred_scaled = scale_data(X_pred, X_pred)
        model_predictions = [
            self.xgb_model.predict(X_pred)[1], 
            self.gnb_model.predict(X_pred)[1], 
            self.nn_model.predict(X_pred_scaled)[1], 
            self.rf_model.predict(X_pred)[1]
        ]
        weights = self.model_weights if weights is None else weights
        return soft_voting(
            list_X_preds=model_predictions, weights=weights, mean_type=mean_type
        )

In [46]:
from sklearn.metrics import accuracy_score, log_loss

simulation_2_results = {}

for data_key in data: 
    print(f"RUNNING ITERATION {data_key}") 
    _X_train, _X_val, _y_train, _y_val = data[data_key]
    ensemble = Ensemble(model_weights=simulation_results[data_key][1]['weights'])
    ensemble.fit(_X_train, _y_train, _X_val, _y_val)
    
    _y_val_pred, _y_val_proba = ensemble.predict(_X_val)
    accuracy = accuracy_score(_y_val, _y_val_pred)
    logloss = log_loss(_y_val, _y_val_proba)

    print(f"Accuracy: {accuracy:.6f} | Logloss: {logloss:.6f} \n")
    simulation_2_results[data_key] = {'accuracy': accuracy, 'logloss': logloss}

RUNNING ITERATION 1
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Accuracy: 0.699000 | Logloss: 0.705590 

RUNNING ITERATION 2
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 806us/step
Accuracy: 0.568081 | Logloss: 0.957822 

RUNNING ITERATION 3
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 780us/step
Accuracy: 0.531032 | Logloss: 0.902228 

RUNNING ITERATION 4
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 780us/step
Accuracy: 0.516635 | Logloss: 1.020847 



In [36]:
data_key = 1
simulation_results[data_key][1]['weights']

(np.float64(0.23), np.float64(0.23), np.float64(0.27), np.float64(0.27))