### Import and setup (only run once)

In [1]:
import time

from bayesian_decision_tree.classification import PerpendicularClassificationTree, HyperplaneClassificationTree
from lightgbm import LGBMClassifier
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Empty score table: one column per model; the index is labelled "problem".
results = pd.DataFrame(columns=["dt", "bdt", "rf", "xgb", "lgb"]).rename_axis("problem")
# The timing table has the same layout, so it starts as an independent copy.
times = results.copy()

def quad_weights(n: int) -> list:
    """Return ``n`` increasing weights ``2*i / (n**2 + n)`` for ``i = 1..n``.

    The weights sum to 1 because ``1 + 2 + ... + n == (n**2 + n) / 2``.
    Despite the function name, each weight grows linearly with its
    position ``i``; only the normalising constant is quadratic in ``n``.
    For ``n <= 0`` the result is an empty list.
    """
    normaliser = n**2 + n
    weights = []
    for position in range(1, n + 1):
        weights.append(2 * position / normaliser)
    return weights

### Parameters

In [2]:
# Benchmark problems, one row per synthetic classification task. Apart from
# "name", every column is forwarded as a keyword to make_classification;
# "weights" holds a label ("equal" / "quadratic") that the calculation loop
# converts before generating the data.
# NOTE(review): "Distribuded" looks like a typo for "Distributed"; it is left
# unchanged because the name is used as the row label of the stored results.
parameters = pd.DataFrame(
    columns=[
        "name",
        "n_samples",
        "n_features",
        "n_informative",
        "n_redundant",
        "n_repeated",
        "n_classes",
        "n_clusters_per_class",
        "flip_y",
        "weights",
        "random_state",
    ],
    data=[
        # Name                n    d  info  redund  repeat n_class  n_clust flip_y       weights  random_state
        ("Simple",        10000, 100,   30,      5,      1,      4,       2,  0.01,      "equal",          314),
        ("Complicated",   10000, 500,  450,      5,      1,     10,       5,  0.10,      "equal",          314),
        ("Small n",         500, 100,   30,      5,      1,      4,       2,  0.01,      "equal",          314),
        ("Large d",       10000, 999,   30,      5,      1,      4,       2,  0.01,      "equal",          314),
        ("Many classes",  10000, 100,   60,      5,      1,     50,       2,  0.01,      "equal",          314),
        ("Distribuded",   10000, 100,   30,      5,      1,      4,     100,  0.01,      "equal",          314),
        ("High noise",    10000, 100,   30,      5,      1,      4,       2,  0.30,      "equal",          314),
        ("Imbalanced S",  10000, 100,   30,      5,      1,      4,       2,  0.01,  "quadratic",          314),
        ("Imbalanced C",  10000, 500,  450,      5,      1,     10,       5,  0.10,  "quadratic",          314),
    ],
)

### Calculations

In [3]:
# Benchmark every model on every problem: generate the data set, fit each
# model while timing the fit, then score it on a held-out split with the
# macro-averaged one-vs-one ROC AUC.
for par in parameters.to_dict(orient="records"):
    print(par)

    # Re-use the problem's own seed for the split and for every estimator that
    # accepts one, so a fresh "Restart & Run All" reproduces the same scores.
    seed = int(par["random_state"])

    # make_classification receives every column except the display name; the
    # "weights" label is translated into the value it expects (None for
    # "equal", an explicit per-class list otherwise).
    data_kwargs = {key: value for key, value in par.items() if key != "name"}
    data_kwargs["weights"] = (
        None if par["weights"] == "equal" else quad_weights(par["n_classes"])
    )
    X, y = make_classification(**data_kwargs)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed)

    # Insertion order is the fitting order (bdt first, as before).
    models = {
        "bdt": PerpendicularClassificationTree(prior=np.ones(par["n_classes"])),
        "dt": DecisionTreeClassifier(random_state=seed),
        "rf": RandomForestClassifier(random_state=seed),
        "xgb": XGBClassifier(use_label_encoder=False, random_state=seed),
        "lgb": LGBMClassifier(random_state=seed),
    }

    # Fit all models first; only the fit itself is timed.
    fit_seconds = {}
    for label, model in models.items():
        # perf_counter is the clock intended for measuring short intervals.
        started = time.perf_counter()
        model.fit(X_train, y_train)
        fit_seconds[label] = time.perf_counter() - started

    scores = {
        label: roc_auc_score(
            y_test, model.predict_proba(X_test), average="macro", multi_class="ovo"
        )
        for label, model in models.items()
    }

    # Wrap the dicts in Series so values are matched to the table columns by
    # label rather than by position (the column order differs from the
    # fitting order).
    results.loc[par["name"]] = pd.Series(scores)
    times.loc[par["name"]] = pd.Series(fit_seconds)
    print(results)

{'name': 'Simple', 'n_samples': 10000, 'n_features': 100, 'n_informative': 30, 'n_redundant': 5, 'n_repeated': 1, 'n_classes': 4, 'n_clusters_per_class': 2, 'flip_y': 0.01, 'weights': 'equal', 'random_state': 314}
               dt       bdt        rf       xgb       lgb
problem                                                  
Simple   0.688688  0.782134  0.946932  0.975795  0.977155
{'name': 'Complicated', 'n_samples': 10000, 'n_features': 500, 'n_informative': 450, 'n_redundant': 5, 'n_repeated': 1, 'n_classes': 10, 'n_clusters_per_class': 5, 'flip_y': 0.1, 'weights': 'equal', 'random_state': 314}
                   dt       bdt        rf       xgb       lgb
problem                                                      
Simple       0.688688  0.782134  0.946932  0.975795  0.977155
Complicated  0.501369  0.500000  0.531468  0.602985  0.598926
{'name': 'Small n', 'n_samples': 500, 'n_features': 100, 'n_informative': 30, 'n_redundant': 5, 'n_repeated': 1, 'n_classes': 4, 'n_clusters_per

### Write out

In [4]:
# Persist both tables. reset_index() turns the "problem" index into an
# ordinary column so it is written as a regular CSV field.
for frame, csv_path in ((results, "../data/results.csv"), (times, "../data/times.csv")):
    frame.reset_index().to_csv(csv_path, index=False)

Run the Python script `main.py` in `scr/` to complete the results.

### Plots

In [23]:
# Grouped bar chart of the stored scores: one group per problem (x axis),
# one bar per column of the results file.
results = pd.read_csv("../data/results.csv").set_index("problem")
score_bars = [
    go.Bar(name=model, x=results.index, y=results[model])
    for model in results.columns
]
fig = go.Figure(data=score_bars)
fig.update_layout(
    title={"text": "ROC AUC are best for boosting algorithms"},
    yaxis={"title": {"text": "ROC"}},
    xaxis={"title": {"text": "Problem"}},
    # Draw the bars of one problem side by side
    barmode='group',
)
fig.show()

In [24]:
# Grouped bar chart of the stored training times: one group per problem
# (x axis), one bar per column of the times file.
times = pd.read_csv("../data/times.csv").set_index("problem")
time_bars = [
    go.Bar(name=model, x=times.index, y=times[model])
    for model in times.columns
]
fig = go.Figure(data=time_bars)
fig.update_layout(
    # Draw the bars of one problem side by side
    barmode='group',
    title={"text": "Long training times for XGBoost"},
    yaxis={"title": {"text": "Time [s]"}},
    xaxis={"title": {"text": "Problem"}},
)
fig.show()