In [1]:
from complexity_guided_ensemble import ComplexityGuidedEnsemble
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score

# Build a synthetic binary classification problem with class weights 0.9 / 0.1.
X, y = make_classification(
    n_samples=1000,
    n_classes=2,
    weights=[0.9, 0.1],
    random_state=42,
)

# Configure the ensemble that will be evaluated.
ensemble = ComplexityGuidedEnsemble(
    n_estimators=10,
    complexity_type='overlap',
    use_active_learning=True,
    n_jobs=-1,
    random_state=42,
)

# 5-fold cross-validation scored with the weighted F1.
scores = cross_val_score(ensemble, X, y, scoring='f1_weighted', cv=5)
print(f"F1: {scores.mean():.4f} ± {scores.std():.4f}")
# The resulting score is shown in the cell output; no expected value is
# hard-coded here because it changes with the library version.

F1: 0.9500 ± 0.0174


In [6]:
# Test on a single dataset

import os
import numpy as np

import pandas as pd

from sklearn.metrics import f1_score, roc_auc_score
from complexity_guided_ensemble import ComplexityGuidedEnsemble



pd.options.mode.chained_assignment = None  # Disable SettingWithCopyWarning
np.random.seed(42)

# complexity_types = pd.read_csv("complex_instances_var_0.1to0.9.csv", index_col=0)

# max_var_complex = complexity_types.idxmax(axis=1)


diretorio = './_extra/imb_multiclass/'
base_name = "glass"
results = []
# NOTE(review): `mus` is not referenced again in this cell - confirm whether
# another cell still needs it before removing.
mus = np.linspace(0, 1, 11).tolist()

# The folder holding the fold files does not depend on the fold index, so it is
# resolved once: prefer "<name>-transformed" when present, else "<name>-5-fold".
dataset_dir = os.path.join(diretorio, base_name)
if os.path.isdir(os.path.join(dataset_dir, f"{base_name}-transformed")):
    file_dir = f"{base_name}-transformed/"
else:
    file_dir = f"{base_name}-5-fold/"
fold_dir = os.path.join(dataset_dir, file_dir)

for i in range(1, 6):
    print(f"Processing {base_name} - {i}")

    # Fold files have no header row; '@' is passed to read_csv as the comment marker.
    data_train = pd.read_csv(
        os.path.join(fold_dir, f"{base_name}-5-{i}tra.dat"),
        header=None, delimiter=',', comment='@',
    )
    data_test = pd.read_csv(
        os.path.join(fold_dir, f"{base_name}-5-{i}tst.dat"),
        header=None, delimiter=',', comment='@',
    )

    np_data_train = data_train.to_numpy()
    np_data_test = data_test.to_numpy()

    # Last column is the label, everything before it is a feature.
    # Train and test go through the *same* dtype conversion so that the model
    # is queried with the representation it was fitted on and the predicted
    # labels are compared against integer ground truth.
    X_train_imb = np.asarray(np_data_train[:, :-1], dtype=np.float32)
    y_train_imb = np.asarray(np_data_train[:, -1], dtype=np.int32)
    X_test = np.asarray(np_data_test[:, :-1], dtype=np.float32)
    y_test = np.asarray(np_data_test[:, -1], dtype=np.int32)

    ensemble = ComplexityGuidedEnsemble(
        n_estimators=50,
        complexity_type='overlap',
        n_jobs=-1,
        random_state=42
    )

    ensemble.fit(X_train_imb, y_train_imb)
    y_pred = ensemble.predict(X_test)

    # One weighted-F1 value per fold.
    results.append(f1_score(y_test, y_pred, average="weighted"))

print("F1 Scores:", results)
print("Média F1:", np.mean(results))

Processing glass - 1
Processing glass - 2
Processing glass - 3
Processing glass - 4
Processing glass - 5
F1 Scores: [0.7172093023255814, 0.7443947525342876, 0.7704822309473472, 0.5420366455250176, 0.6626117609988578]
Média F1: 0.6873469384662183


In [None]:
# Test the algorithm on all datasets (the loop below currently takes only the first directory entry)

from complexity_guided_ensemble import ComplexityGuidedEnsemble

import numpy as np

import pandas as pd

import os 
from sklearn.metrics import f1_score, roc_auc_score

import json
np.random.seed(42)
pd.options.mode.chained_assignment = None

# Number of datasets to process. 1 keeps the original quick run on the first
# dataset only; set to None to evaluate every dataset found in `diretorio`.
MAX_DATASETS = 1

diretorio = './_extra/imb_multiclass/'
# os.listdir returns entries in arbitrary order, so sort them to make the
# selection below reproducible. Only directories are kept because every
# dataset is read from its own sub-folder; a stray file would otherwise make
# read_csv fail further down.
arquivos = sorted(
    entry for entry in os.listdir(diretorio)
    if os.path.isdir(os.path.join(diretorio, entry))
)
print(arquivos)

results_f1 = {}

results_auc = {}

errors = []


for base_name in arquivos[:MAX_DATASETS]:

    print(f"Testando base {base_name}")
    results_f1[base_name] = []
    # results_auc[base_name] = []

    # The folder holding the fold files does not depend on the fold index, so
    # it is resolved once per dataset: prefer "<name>-transformed" when it
    # exists, otherwise fall back to "<name>-5-fold".
    dataset_dir = os.path.join(diretorio, base_name)
    if os.path.isdir(os.path.join(dataset_dir, f"{base_name}-transformed")):
        file_dir = f"{base_name}-transformed/"
    else:
        file_dir = f"{base_name}-5-fold/"
    fold_dir = os.path.join(dataset_dir, file_dir)

    for i in range(1, 6):

        print(f"Fold {i} de 5 !!!!")

        # Fold files have no header row; '@' is passed to read_csv as the comment marker.
        data_train = pd.read_csv(
            os.path.join(fold_dir, f"{base_name}-5-{i}tra.dat"),
            header=None, delimiter=',', comment='@',
        )
        data_test = pd.read_csv(
            os.path.join(fold_dir, f"{base_name}-5-{i}tst.dat"),
            header=None, delimiter=',', comment='@',
        )

        np_data_train = data_train.to_numpy()
        np_data_test = data_test.to_numpy()

        # Last column is the label, everything before it is a feature.
        # Feature matrices are passed through unchanged for both splits;
        # labels are cast to int32 for both splits so predictions are scored
        # against ground truth of the same dtype the model was fitted on.
        X_train_imb = np_data_train[:, :-1]
        y_train_imb = np.asarray(np_data_train[:, -1], dtype=np.int32)
        X_test = np_data_test[:, :-1]
        y_test = np.asarray(np_data_test[:, -1], dtype=np.int32)

        ensemble = ComplexityGuidedEnsemble(
            n_estimators=50,
            complexity_type='neighborhood',
            # hardness_function=max_var_complex[base_name],
            n_jobs=-1,
            random_state=42,
        )

        ensemble.fit(X_train_imb, y_train_imb)
        y_pred = ensemble.predict(X_test)
        # y_pred_proba = ensemble.predict_proba(X_test)

        # One weighted-F1 value per fold, grouped by dataset name.
        results_f1[base_name].append(f1_score(y_test, y_pred, average="weighted"))
        # results_auc[base_name].append( roc_auc_score(y_test, y_pred_proba, multi_class='ovr', average='weighted'))

    # with open('results/results_new_code_f1_neighborhood_multiclass_2.json', 'w') as f:
    #     json.dump(results_f1, f)

    # with open('results/results_new_code_auc_with_hardness_multiclass.json', 'w') as f:
    #     json.dump(results_auc, f)

In [None]:
# Organizando os resultados de todas as bases 

import pandas as pd
import json

# 1. Load the results JSON file.
with open("results/results_new_code_f1_error_rate_multiclass.json", "r") as f:
    data = json.loads(f.read())

# 2. Turn it into a DataFrame: every top-level key becomes one column.
df = pd.DataFrame(data)

# 3. Column-wise mean, reshaped into a two-column table (dataset, media).
df_medias = (
    df.mean()
    .rename_axis("dataset")
    .reset_index(name="media")
)

print(df)
print(df_medias)

# 4. (Optional) persist the means as CSV, without the row index.
df_medias.to_csv("medias_datasets.csv", index=False)

      autos   balance  contraceptive  dermatology     ecoli     glass  \
0  0.774621  0.768056       0.559167     0.914756  0.860073  0.717618   
1  0.861025  0.759361       0.531461     0.959190  0.884215  0.730356   
2  0.843287  0.826730       0.489512     0.958122  0.870170  0.690928   
3  0.877059  0.820146       0.465091     0.944281  0.900420  0.583231   
4  0.782532  0.810552       0.512404     0.985827  0.875413  0.679110   

   hayes-roth  lymphography  new-thyroid  pageblocks  penbased   shuttle  \
0    0.924897      0.691111     0.820252    0.931864  0.900122  1.000000   
1    0.814815      0.731313     0.932325    0.934196  0.932165  0.996560   
2    0.769231      0.933814     0.953488    0.929264  0.876102  0.998076   
3    0.844600      0.862069     0.928121    0.953327  0.899939  0.995634   
4    0.766900      0.865287     0.977442    0.913251  0.890872  0.997694   

    thyroid      wine     yeast  
0  0.980886  0.972263  0.557838  
1  0.981889  0.944444  0.525838  
2 

In [None]:
# Combine the mean F1 of the three complexity-measure variants used by the model
# into a single table: one row per dataset, one column per complexity measure.

complexities = ["error_rate", "overlap", "neighborhood"]

# Collect one Series of per-dataset means for each complexity measure.
mean_f1_by_complexity = {}

for comp in complexities:
    with open(f"results/results_new_code_f1_{comp}_multiclass.json", "r") as f:
        results = json.load(f)

    results_pd = pd.DataFrame(results)
    mean_f1_by_complexity[comp] = results_pd.mean()

# Build the frame in one step from the dict of Series. Assigning the Series
# column by column into an empty DataFrame would pin the row index to the
# datasets of the first file and silently drop any dataset that appears only
# in a later file; constructing from the dict keeps the union of datasets and
# leaves NaN where a file has no entry.
df_medias = pd.DataFrame(mean_f1_by_complexity)


In [23]:
# Write the summary table to a comma-separated file.
df_medias.to_csv(path_or_buf="summary_results_new_code_complexities.csv", sep=",")