In [1]:
from data import X_train as X 
from data import y_train as y

from tools import train_kcv, create_splits
from ensemble import Ensemble, train_ensemble, show_top_weights
from base_models import Model

from sklearn.metrics import accuracy_score, log_loss
from collections import Counter
import json

# Create Various Splits

In [2]:
# Value passed as bias_ratio for every class-biased split below.
BIAS_RATIO = 0.6

# 1. Balanced split (create_splits called without any bias arguments)
X_train1, X_val1, y_train1, y_val1 = create_splits(X, y)

# 2. More class 0 in train
X_train2, X_val2, y_train2, y_val2 = create_splits(
    X, y, bias_class=0, bias_ratio=BIAS_RATIO
)

# 3. More class 1 in train
X_train3, X_val3, y_train3, y_val3 = create_splits(
    X, y, bias_class=1, bias_ratio=BIAS_RATIO
)

# 4. More class 2 in train
X_train4, X_val4, y_train4, y_val4 = create_splits(
    X, y, bias_class=2, bias_ratio=BIAS_RATIO
)

# Print class distributions: pair each split's training labels with its
# validation labels and report the per-class counts of both.
train_label_sets = (y_train1, y_train2, y_train3, y_train4)
val_label_sets = (y_val1, y_val2, y_val3, y_val4)
for i, (y_tr, y_v) in enumerate(zip(train_label_sets, val_label_sets), 1):
    print(f"Split {i}\n     Train distribution: {Counter(y_tr)}\n     Validation distribution: {Counter(y_v)}\n")

Split 1
     Train distribution: Counter({np.int64(2): 1631, np.int64(0): 1262, np.int64(1): 1107})
     Validation distribution: Counter({np.int64(2): 408, np.int64(0): 315, np.int64(1): 277})

Split 2
     Train distribution: Counter({np.int64(0): 946, np.int64(2): 407, np.int64(1): 276})
     Validation distribution: Counter({np.int64(2): 1632, np.int64(1): 1108, np.int64(0): 631})

Split 3
     Train distribution: Counter({np.int64(1): 830, np.int64(2): 407, np.int64(0): 315})
     Validation distribution: Counter({np.int64(2): 1632, np.int64(0): 1262, np.int64(1): 554})

Split 4
     Train distribution: Counter({np.int64(2): 1223, np.int64(0): 315, np.int64(1): 276})
     Validation distribution: Counter({np.int64(0): 1262, np.int64(1): 1108, np.int64(2): 816})



In [3]:
# Index the four splits by split number so later cells can loop over them.
# Each value is the tuple (X_train, X_val, y_train, y_val) for that split.
data = {
    1: (X_train1, X_val1, y_train1, y_val1),
    2: (X_train2, X_val2, y_train2, y_val2),
    3: (X_train3, X_val3, y_train3, y_val3),
    4: (X_train4, X_val4, y_train4, y_val4)
}

# Load the XGBoost and random-forest parameters from their JSON files.
# The encoding is given explicitly: without it open() falls back to the
# platform's locale encoding, which is not guaranteed to be UTF-8.
with open('params_xgb.json', 'r', encoding='utf-8') as file:
    xgb_params = json.load(file)
with open('params_rf.json', 'r', encoding='utf-8') as file:
    rf_params = json.load(file)

# Feature subsets handed to the 'xgb' and 'gnb' models when they are built.
xgb_selected_features = ['x4', 'x8', 'x9', 'x10', 'x11']
gnb_selected_features = ['x2', 'x3', 'x4', 'x6', 'x8', 'x9', 'x10', 'x11']

In [21]:
# Sanity check on split 1: show the shapes of X_train, X_val and y_train
# (the first three entries of the split tuple), one output per shape.
display(*(split_part.shape for split_part in data[1][:3]))

(4000, 13)

(1000, 13)

(4000,)

## Train the ensemble
- For each data split (class distribution), search for the best model weights for the ensemble.
- Store the results and save them for the next part.

In [4]:
# CREATE PREDICTIONS
# Run the ensemble weight search once per split and keep the outcome in
# simulation_results, keyed by split number. Only the training portion of
# each split is passed to train_ensemble; the validation portion is unpacked
# but not used in this cell.
simulation_results = {}
for data_key, (_X_train, _X_val, _y_train, _y_val) in data.items():
    print(f"RUNNING ITERATION {data_key}")

    # A new set of four base models is built for every split.
    _models = [
        Model(
            model_type='xgb',
            selected_features=xgb_selected_features,
            xgb_params=xgb_params,
        ),
        Model(
            model_type='gnb',
            selected_features=gnb_selected_features,
        ),
        # NOTE(review): nn_params is the column count of the full X, not of
        # _X_train — presumably the network input width; confirm in base_models.
        Model(
            model_type='nn',
            nn_params=X.shape[1],
        ),
        Model(
            model_type='rf',
            rf_params=rf_params,
        ),
    ]

    _results, _best_result = train_ensemble(
        _X_train,
        _y_train,
        models=_models,
        k_fold_type='shuffle_split',
        mean_type='arithmetic',
    )
    show_top_weights(_results)
    print()

    # Keep the model objects next to the results so a later cell can reuse them.
    simulation_results[data_key] = dict(
        results=_results,
        best_result=_best_result,
        models=_models,
    )

RUNNING ITERATION 1
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 885us/step
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 584us/step
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 633us/step
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 567us/step
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 950us/step
Model trainings successful, proceeding to weight combination search...
Weight combination search done.
Top 5 Accuracies:
1. Weights: [0.23, 0.23, 0.27, 0.27] | Mean Accuracy: 0.687 | Std Accuracy: 0.005 | Mean Log Loss: 0.715 | Std Log Loss: 0.006
2. Weights: [0.2, 0.27, 0.27, 0.27] | Mean Accuracy: 0.687 | Std Accuracy: 0.005 | Mean Log Loss: 0.716 | Std Log Loss: 0.006
3. Weights: [0.2, 0.2, 0.3, 0.3] | Mean Accuracy: 0.686 | Std Accuracy: 0.005 | Mean Log Loss: 0.716 | Std Log Loss: 0.007
4. Weights: [0.2, 0.23, 0.27, 0.3] | Mean Accuracy: 0.686 | Std Accuracy: 0.005 | Mean Log Loss: 0.717 | Std

In [5]:
def _without_models(entry):
    """Return a copy of one split's result dict with the "models" entry removed."""
    return {field: payload for field, payload in entry.items() if field != "models"}


# Strip the model objects before writing to disk (presumably because they are
# not JSON-serializable — confirm). Note that json.dump writes the integer
# split keys as strings.
filtered_results = {
    split_id: _without_models(entry)
    for split_id, entry in simulation_results.items()
}
with open("results_simulation_1.json", "w") as outfile:
    json.dump(filtered_results, outfile)

## Check Accuracy and Log-Loss

In [None]:
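# Optional: reload the results saved above instead of re-running the training.
# Note: JSON object keys are strings, so the split keys load back as "1".."4".
# with open("results_simulation_1.json", 'r') as file:
#     results = json.load(file)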

In [15]:
# For every split: build an Ensemble from that split's models and best
# weights, fit it, then record accuracy and log-loss on the validation data.
simulation_2_results = {}

for data_key, (_X_train, _X_val, _y_train, _y_val) in data.items():
    print(f"RUNNING ITERATION {data_key}")

    _sim = simulation_results[data_key]
    ensemble = Ensemble(
        models=_sim['models'],
        model_weights=_sim['best_result']['weights'],
    )
    # NOTE(review): the validation data is passed to fit() and is also the
    # data scored below — confirm fit() does not use it in a way that biases
    # these metrics.
    ensemble.fit(_X_train, _y_train, _X_val, _y_val)

    # predict() returns the predicted labels and the class probabilities.
    _y_val_pred, _y_val_proba = ensemble.predict(_X_val)
    accuracy = accuracy_score(_y_val, _y_val_pred)
    logloss = log_loss(_y_val, _y_val_proba)

    print(f"Accuracy: {accuracy:.6f} | Logloss: {logloss:.6f} \n")
    simulation_2_results[data_key] = dict(accuracy=accuracy, logloss=logloss)

RUNNING ITERATION 1
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Accuracy: 0.700000 | Logloss: 0.699940 

RUNNING ITERATION 2
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 742us/step
Accuracy: 0.566894 | Logloss: 0.957540 

RUNNING ITERATION 3
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 833us/step
Accuracy: 0.531323 | Logloss: 0.899074 

RUNNING ITERATION 4
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 691us/step
Accuracy: 0.532643 | Logloss: 1.012205 

