In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from autogluon.tabular import TabularPredictor
from itertools import combinations
from diversity_measures import generalized_diversity, entropy_measure, KW_variance, ia_measure, difficulty_measure
from sklearn.metrics import roc_curve, roc_auc_score, accuracy_score, recall_score, precision_score, balanced_accuracy_score

In [2]:
data = pd.read_csv('adult-income.csv')
data.replace(['?'], np.nan, inplace=True)
data.income = data.income == '>50K'

In [3]:
train_data = data.sample(frac=0.7, random_state=123)
test_data = data.drop(train_data.index)
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

In [4]:
predictor = TabularPredictor(label='income', eval_metric='balanced_accuracy').fit(
    train_data=train_data,
    time_limit= 400,
    presets='good'
)

No path specified. Models will be saved in: "AutogluonModels/ag-20250519_131920"
Preset alias specified: 'good' maps to 'good_quality'.
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.3.0
Python Version:     3.12.3
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 24.3.0: Thu Jan  2 20:24:23 PST 2025; root:xnu-11215.81.4~3/RELEASE_ARM64_T8122
CPU Count:          8
Memory Avail:       1.70 GB / 8.00 GB (21.3%)
Disk Space Avail:   40.40 GB / 228.27 GB (17.7%)
Presets specified: ['good']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Note: `save_bag_folds=False`! This will greatly reduce peak disk usage during fit (by ~8x), but runs the risk of an out-of-memory error during model refit if memory is small relative to the data size.
	You can avoid this risk by set

In [5]:
predictor2 = TabularPredictor.load("/Users/ola/Library/CloudStorage/OneDrive-Personal/Dokumenty/licencjat/ensemble-diversity-measures/AutogluonModels/ag-20250519_131920")

In [6]:
# dlaczego modele L1 itd nie działają?
predictor2.leaderboard()

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,CatBoost_BAG_L2,0.813387,balanced_accuracy,5.913553,124.60616,0.022496,9.686992,2,False,17
1,WeightedEnsemble_L3,0.813387,balanced_accuracy,5.915235,125.378682,0.001682,0.772522,3,False,24
2,NeuralNetTorch_BAG_L2,0.811508,balanced_accuracy,6.153626,155.247118,0.262569,40.32795,2,False,22
3,WeightedEnsemble_L2,0.804464,balanced_accuracy,0.559399,36.645514,0.002194,0.405046,2,False,12
4,LightGBMLarge_BAG_L2,0.800603,balanced_accuracy,6.52875,118.391538,0.637693,3.47237,2,False,23
5,LightGBM_BAG_L2,0.800229,balanced_accuracy,6.102513,116.964352,0.211456,2.045184,2,False,14
6,RandomForestEntr_BAG_L2,0.799738,balanced_accuracy,6.406686,118.858066,0.515629,3.938898,2,False,16
7,XGBoost_BAG_L1,0.799313,balanced_accuracy,0.384804,5.682478,0.384804,5.682478,1,False,9
8,XGBoost_BAG_L2,0.799119,balanced_accuracy,6.078259,119.373345,0.187202,4.454177,2,False,21
9,RandomForestGini_BAG_L2,0.798786,balanced_accuracy,6.404413,118.666649,0.513356,3.747481,2,False,15


In [7]:
model_names = predictor2.model_names()
model_names.pop(model_names.index('WeightedEnsemble_L2_FULL')) 
model_names.pop(model_names.index('WeightedEnsemble_L3_FULL')) 
mdls = model_names[24:]

In [8]:
mdls

['LightGBMXT_BAG_L1_FULL',
 'LightGBM_BAG_L1_FULL',
 'RandomForestGini_BAG_L1_FULL',
 'RandomForestEntr_BAG_L1_FULL',
 'CatBoost_BAG_L1_FULL',
 'ExtraTreesGini_BAG_L1_FULL',
 'ExtraTreesEntr_BAG_L1_FULL',
 'NeuralNetFastAI_BAG_L1_FULL',
 'XGBoost_BAG_L1_FULL',
 'NeuralNetTorch_BAG_L1_FULL',
 'LightGBMLarge_BAG_L1_FULL',
 'LightGBMXT_BAG_L2_FULL',
 'LightGBM_BAG_L2_FULL',
 'RandomForestGini_BAG_L2_FULL',
 'RandomForestEntr_BAG_L2_FULL',
 'CatBoost_BAG_L2_FULL',
 'ExtraTreesGini_BAG_L2_FULL',
 'ExtraTreesEntr_BAG_L2_FULL',
 'NeuralNetFastAI_BAG_L2_FULL',
 'XGBoost_BAG_L2_FULL',
 'NeuralNetTorch_BAG_L2_FULL',
 'LightGBMLarge_BAG_L2_FULL']

In [7]:
comb7 = list(combinations(mdls, 7))
comb5 = list(combinations(mdls, 5))

In [9]:
mdls[12]

'LightGBM_BAG_L2_FULL'

In [8]:
predictor2.predict(test_data, model=model_names[0])
# modele bez FULL nigdy nie działają

AttributeError: 'NoneType' object has no attribute 'predict'

In [10]:
predictor2.predict(test_data, model=mdls[12]).to_numpy()
# po wczytaniu modeli z pliku to nie działa

array([False,  True, False, ..., False, False, False])

In [13]:
predictor2.predict_proba(test_data, model='LightGBM_BAG_L2_FULL').to_numpy()

array([[9.99081850e-01, 9.18148668e-04],
       [4.79624033e-01, 5.20375967e-01],
       [9.99242902e-01, 7.57073518e-04],
       ...,
       [6.52624846e-01, 3.47375125e-01],
       [7.42681444e-01, 2.57318556e-01],
       [9.79509294e-01, 2.04907097e-02]])

In [11]:
len(mdls)

22

In [12]:
# zapisanie predykcji pstwa i predykcji 0 1
pred_prob = []
pred = []
for mdl in mdls:
    pred_prob.append(predictor2.predict_proba(test_data.drop('income', axis = 1), model=mdl).to_numpy())
    p1 = predictor2.predict(test_data.drop('income', axis = 1), model=mdl).to_numpy()
    pred.append(p1)

In [16]:
import pickle

with open("pred_prob1905", "wb") as fp:   #Pickling
    pickle.dump(pred_prob, fp)

In [17]:
with open("pred_prob1905", "rb") as fp:   # Unpickling
    b = pickle.load(fp)

In [19]:
with open("pred1905", "wb") as fp:   #Pickling
    pickle.dump(pred, fp)

In [None]:
with open("pred1905", "rb") as fp:   # Unpickling
    b = pickle.load(fp)

In [20]:
num = [i for i in range(22)]
comb_num = list(combinations(num, 5))  

In [21]:
# zapisanie wartości miar dla komitetów
entropy_results = []
KW_results = []
ia_results = []
diff_results = []
gd_results = []

for i in range(len(comb_num)): 
    gd_results.append(generalized_diversity(test_data.income, 0.5, pred_prob[comb_num[i][0]], pred_prob[comb_num[i][1]], pred_prob[comb_num[i][2]], pred_prob[comb_num[i][3]], pred_prob[comb_num[i][4]]))
    entropy_results.append(entropy_measure(test_data.income, 0.5, pred_prob[comb_num[i][0]], pred_prob[comb_num[i][1]], pred_prob[comb_num[i][2]], pred_prob[comb_num[i][3]], pred_prob[comb_num[i][4]]))
    KW_results.append(KW_variance(test_data.income, 0.5, pred_prob[comb_num[i][0]], pred_prob[comb_num[i][1]], pred_prob[comb_num[i][2]], pred_prob[comb_num[i][3]], pred_prob[comb_num[i][4]]))
    ia_results.append(ia_measure(test_data.income, 0.5, pred_prob[comb_num[i][0]], pred_prob[comb_num[i][1]], pred_prob[comb_num[i][2]], pred_prob[comb_num[i][3]], pred_prob[comb_num[i][4]]))
    diff_results.append(difficulty_measure(test_data.income, 0.5, pred_prob[comb_num[i][0]], pred_prob[comb_num[i][1]], pred_prob[comb_num[i][2]], pred_prob[comb_num[i][3]], pred_prob[comb_num[i][4]]))
 

KeyboardInterrupt: 

In [None]:
# głosowanie
pred_t = [list(i) for i in zip(*pred)]
res = [[] for i in range (len(comb_num))]
for i in range(len(comb_num)):
    for j in range (len(test_data.income)):
        if sum([pred_t[j][comb_num[i][0]], pred_t[j][comb_num[i][1]], pred_t[j][comb_num[i][2]], pred_t[j][comb_num[i][3]], pred_t[j][comb_num[i][4]]]) > 2:
            res[i].append(1)
        else:
            res[i].append(0)

In [None]:
# zapisanie wartości metryk
scores = []
for i in range (len(comb_num)):
    scores.append([accuracy_score(test_data.income,res[i]), recall_score(test_data.income,res[i]),precision_score(test_data.income,res[i]), roc_auc_score(test_data.income,res[i]), balanced_accuracy_score(test_data.income,res[i])])

In [None]:
df1 = pd.DataFrame(scores)
df1['gd'] = gd_results
df2 = df1.sort_values(by='gd').reset_index(drop=True)
df2 = df2.rename(columns={0: "acc", 1: "recall", 2: "prec", 3:'auc', 4: 'bal_acc'})

In [None]:
df2.plot()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

plt.plot(1- df2.gd, label = "1 - gd")
plt.plot(df2.acc, label = 'acc')
plt.plot(df2.recall, label = 'recall')
plt.plot(df2.prec, label = 'precision')
plt.plot(df2.auc, label = 'auc')
plt.plot(df2.bal_acc, label = 'bal acc')

plt.legend(loc="upper right")
plt.show()

In [None]:
len(res)

In [None]:
max(gd_results)

In [None]:
predictor.model_info('WeightedEnsemble_L3')