In [12]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

In [13]:
# Read every benchmark CSV from the local `datasets/` folder, in a fixed order,
# binding each resulting dataframe to its own module-level name.
(
    df_bank,
    df_bio,
    df_blood,
    df_breast_cancer,
    df_climate,
    df_credit,
    df_diabetes,
    df_eucalyptus,
    df_iris,
    df_phishing,
    df_transplant,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        'datasets/bank-marketing.csv',
        'datasets/bio-response.csv',
        'datasets/blood-transfusion-service-center.csv',
        'datasets/breast-cancer.csv',
        'datasets/climate-model-simulation-crashes.csv',
        'datasets/credit-g.csv',
        'datasets/diabetes.csv',
        'datasets/eucalyptus.csv',
        'datasets/iris.csv',
        'datasets/phishing-websites.csv',
        'datasets/transplant.csv',
    )
)

# Collection of the loaded frames. `df_iris` is loaded above but is not part
# of this list.
datasets = [
    df_bank,
    df_bio,
    df_blood,
    df_breast_cancer,
    df_climate,
    df_credit,
    df_diabetes,
    df_eucalyptus,
    df_phishing,
    df_transplant,
]

In [77]:
import openml
import pandas as pd

# OpenML IDs of the datasets to download.
dataset_ids = [13, 31, 1461]

# Maps each OpenML dataset ID to its processed dataframe.
dataframes = {}

le = LabelEncoder()

# Download each dataset, attach its encoded label as a `target` column and
# drop incomplete rows.
for dataset_id in dataset_ids:
    dataset = openml.datasets.get_dataset(dataset_id, download_data=True, download_qualities=True, download_features_meta_data=True)

    # When `target` is passed, get_data() returns the features and the target
    # separately. Both must be captured: keeping only the first element and
    # renaming its last column would label a feature as the class.
    features, labels, *_ = dataset.get_data(target=dataset.default_target_attribute)

    df = features.copy()
    # Encode the classes as integers, then map encoded class 1 to +1 and
    # every other class to -1.
    encoded = le.fit_transform(labels)
    df['target'] = [1 if code == 1 else -1 for code in encoded]

    # Keep only the rows without missing values.
    dataframes[dataset_id] = df.dropna()

# Save each processed dataframe as a CSV file in the working directory.
for dataset_id, df in dataframes.items():
    df.to_csv(f'dataset_{dataset_id}.csv', index=False)


In [79]:
# Show the dataframe stored under key 31 of `dataframes` as this cell's output.
dataframes[31]

In [84]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score


def run_cv(X,y,algs,nfolds=10, means_only=False, fit_params=None):
    """Score several algorithms with shuffled k-fold cross-validation.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows; sliced positionally with ``.iloc``.
    y : pandas.Series
        Labels aligned row-for-row with ``X``; sliced positionally with ``.iloc``.
    algs : iterable of (str, estimator)
        Pairs of display name and estimator. Each estimator must provide
        ``fit(X, y, **fit_params)`` and ``predict(X)``.
    nfolds : int, default 10
        Number of folds.
    means_only : bool, default False
        If True, return a single row holding the mean accuracy per algorithm
        instead of one row per fold.
    fit_params : dict, optional
        Keyword arguments forwarded to every ``fit`` call. Defaults to
        ``{"M": 40}``, the value that used to be hard-coded here.

    Returns
    -------
    pandas.DataFrame
        One column per algorithm name. ``nfolds`` rows of accuracies, or a
        single row of means when ``means_only`` is True.
    """
    if fit_params is None:
        fit_params = {"M": 40}

    # The fixed seed makes every algorithm see exactly the same folds.
    kf = KFold(n_splits=nfolds, shuffle=True, random_state=1111)

    results = {}
    for algo_name, algo in algs:
        fold_scores = []
        for train_idx, test_idx in kf.split(X):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
            # NOTE(review): the same estimator instance is re-fit on every
            # fold; this assumes fit() discards previously learned state.
            algo.fit(X_train, y_train, **fit_params)
            y_pred = algo.predict(X_test)
            fold_scores.append(accuracy_score(y_test, y_pred))
        results[algo_name] = fold_scores

    if not means_only:
        return pd.DataFrame.from_dict(results)

    # Derive the means from the collected scores rather than iterating `algs`
    # a second time, so one-shot iterables are supported as well.
    results_means = {name: [np.mean(scores)] for name, scores in results.items()}
    return pd.DataFrame.from_dict(results_means)

In [85]:
import matplotlib.pyplot as plt

def plot_cv(results_cv,metric='Accuracy', title="Cross-validation results for multiple algorithms in a single task"):
    """Show a box plot of cross-validation scores.

    Parameters
    ----------
    results_cv : pandas.DataFrame
        Scores to plot; its column labels are used as the x tick labels.
    metric : str
        Label for the y axis.
    title : str
        Title of the figure.
    """
    _, axes = plt.subplots()
    axes.boxplot(results_cv)
    # Label the boxes with the column labels of the results frame.
    axes.set_xticklabels(results_cv.columns)
    axes.set(ylabel=metric, title=title)
    plt.show()

In [87]:
from algorithms.adaboost import AdaBoost

# Candidate models as (display name, estimator) pairs, the format run_cv expects.
models = [
    ("AdaBoost Default", AdaBoost()),
    #("AdaBoost KNN", AdaBoost(knn=True))
]

# Cross-validation scores of every model, keyed by dataset ID.
results_by_dataset = {}

# Iterate over `dataframes` (dataset ID -> frame with a `target` column).
# `datasets` is a plain list of frames, so unpacking its items into
# (ds_id, ds) would iterate each frame's column labels instead.
for ds_id, ds in dataframes.items():
    X = ds.drop(columns=['target'])
    y = ds['target']
    results = run_cv(X, y, models)
    # Keep every dataset's scores rather than only the last iteration's.
    results_by_dataset[ds_id] = results
    #plot_cv(results, title=f"Cross-validation for the algorithms in the dataset with {ds_id}")