    # ACC: Accuracy	-> (Number of Correct Predictions) / (Total Number of Predictions)
    # FPR: False Positive Rate	-> Number of FP / (FP +TN)
    # AUC: Area under the ROC curve	-> scikit-learn
    # Ya: Yield of actives 	-> TP/(TP+FP)
    # EF: Enrichment Factor	-> (TP/(TP+FP)) / ((TP+FN)/(TP+TN+FP+FN))
    # REF: Relative Enrichment Factor -> 100*tp/min(tp+fp,tp+fn)

In [6]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve, RocCurveDisplay, auc, confusion_matrix, ConfusionMatrixDisplay


import os 
from pathlib import Path

In [7]:
# Project directories, resolved relative to the current working directory.
base_dir = Path(os.getcwd()).joinpath("implementation")
result_dir = base_dir.joinpath("data/results/")


In [8]:
def _safe_ratio(numerator: float, denominator: float) -> float:
    """Return ``numerator / denominator``, or NaN when the denominator is zero."""
    if denominator == 0:
        return float("nan")
    return numerator / denominator


def calc_metrics_from_result_df(df: pd.DataFrame, name: str):
    """Compute binary-classification screening metrics for one result table.

    Args:
        df: Table with a ``LABEL`` column (ground truth) and a ``PRED`` column
            (prediction). The value 1 is treated as the positive class and
            0 as the negative class.
        name: Identifier stored in the ``name`` column of the returned row.

    Returns:
        A one-row DataFrame with the columns ``name``, ``ACC``, ``FPR``,
        ``AUC``, ``YA``, ``EF`` and ``REF``, in that order. A ratio whose
        denominator is zero (e.g. no predicted positives) is reported as NaN
        instead of raising ``ZeroDivisionError``. ``AUC`` is delegated to
        ``roc_auc_score``; whatever it raises for unusable input propagates.
    """
    # Work on plain arrays so rows are paired by position. Looking labels up
    # with ``label[i]`` is index-label based and breaks (KeyError or silently
    # wrong pairing) as soon as ``df`` does not carry a default 0..n-1 index.
    label = df["LABEL"].to_numpy()
    pred = df["PRED"].to_numpy()
    total = len(label)

    predicted_positive = pred == 1
    true_positive = int((predicted_positive & (label == 1)).sum())
    false_positive = int((predicted_positive & (label == 0)).sum())
    true_negative = int(((pred == 0) & (label == 0)).sum())
    # Every remaining row counts as a false negative, so the four counts
    # always add up to ``total`` (same as the catch-all branch used before).
    false_negative = total - true_positive - false_positive - true_negative

    predicted_actives = true_positive + false_positive
    real_actives = true_positive + false_negative

    yield_of_actives = _safe_ratio(true_positive, predicted_actives)
    active_fraction = _safe_ratio(real_actives, total)

    return pd.DataFrame(
        {
            "name": [name],
            "ACC": [_safe_ratio(true_positive + true_negative, total)],
            "FPR": [_safe_ratio(false_positive, false_positive + true_negative)],
            "AUC": [roc_auc_score(label, pred)],
            "YA": [yield_of_actives],
            # Enrichment: hit rate among predicted actives relative to the
            # fraction of actives in the whole set.
            "EF": [_safe_ratio(yield_of_actives, active_fraction)],
            "REF": [
                _safe_ratio(
                    100 * true_positive, min(predicted_actives, real_actives)
                )
            ],
        }
    )


def print_roc_curve(df: pd.DataFrame, path: Path):
    """Plot the ROC curve of ``df["PRED"]`` against ``df["LABEL"]`` and save it.

    Args:
        df: Table providing the ``LABEL`` and ``PRED`` columns.
        path: Destination handed to ``plt.savefig``.
    """
    y_true, y_score = df["LABEL"], df["PRED"]

    # The thresholds returned by roc_curve are not needed for the plot.
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_score)

    RocCurveDisplay(
        fpr=false_pos_rate,
        tpr=true_pos_rate,
        roc_auc=auc(false_pos_rate, true_pos_rate),
        estimator_name="ROC Curve",
    ).plot()

    # Written through pyplot, i.e. whatever figure is current after plot().
    plt.savefig(path)


def print_conf_matrix(df: pd.DataFrame, path: Path):
    """Plot the active/inactive confusion matrix of a result table and save it.

    Args:
        df: Table providing the ``LABEL`` and ``PRED`` columns.
        path: Destination handed to ``plt.savefig``.
    """
    class_names = ["active", "inactive"]

    def as_class_name(value):
        # Only the value 1 maps to "active"; anything else is "inactive".
        return "active" if value == 1 else "inactive"

    truth = [as_class_name(value) for value in df["LABEL"]]
    predicted = [as_class_name(value) for value in df["PRED"]]

    matrix = confusion_matrix(truth, predicted, labels=class_names)
    ConfusionMatrixDisplay(
        confusion_matrix=matrix, display_labels=class_names
    ).plot()
    plt.savefig(path)

    


### load results


In [9]:
# Per-target result tables of each model family, read from result_dir.
rf = {
    target: pd.read_csv(result_dir / csv_path)
    for target, csv_path in (
        ("ache", "ACHE/baseline_rf.csv"),
        ("cox1", "COX1/baseline_rf.csv"),
        ("dpp4", "DPP4/baseline_rf.csv"),
        ("maob", "MAOB/baseline_rf.csv"),
        ("seh", "SEH/baseline_rf.csv"),
    )
}
knn = {
    target: pd.read_csv(result_dir / csv_path)
    for target, csv_path in (
        ("ache", "ACHE/fe_rf_per_knn.csv"),
        ("cox1", "COX1/fe_rf_per_knn.csv"),
        ("dpp4", "DPP4/fe_rf_per_knn.csv"),
        ("maob", "MAOB/fe_rf_per_knn.csv"),
        ("seh", "SEH/fe_rf_per_knn.csv"),
    )
}
# NOTE(review): "ache" reads fe_smote_nn.csv while every other target reads
# baseline_nn.csv -- confirm this mix is intended.
nn = {
    target: pd.read_csv(result_dir / csv_path)
    for target, csv_path in (
        ("ache", "ACHE/fe_smote_nn.csv"),
        ("cox1", "COX1/baseline_nn.csv"),
        ("dpp4", "DPP4/baseline_nn.csv"),
        ("maob", "MAOB/baseline_nn.csv"),
        ("seh", "SEH/baseline_nn.csv"),
    )
}

# Model family -> {target -> result table}; insertion order is rf, nn, knn.
ml = dict(rf=rf, nn=nn, knn=knn)

In [10]:
# Build one summary row per model family: the mean of every metric over all
# of that family's targets.
summary_rows = []
for model_name, target_results in ml.items():
    # Concatenate the per-target rows once instead of growing a frame inside
    # the loop (repeated copies, and concatenation onto an empty frame).
    per_target = pd.concat(
        [
            calc_metrics_from_result_df(result_df, name=target)
            for target, result_df in target_results.items()
        ],
        ignore_index=True,
    )
    # "name" is the only non-numeric column; the rest are averaged in order.
    metric_means = per_target.drop(columns="name").mean()
    summary_rows.append([model_name, *metric_means.tolist()])

ml_metrics = pd.DataFrame(
    summary_rows, columns=["Name", "ACC", "FPR", "AUC", "YA", "EF", "REF"]
)

# Best accuracy first.
ml_metrics = ml_metrics.sort_values("ACC", ascending=False)

print(
    ml_metrics.to_latex(
        index=False,
        float_format="{:.4f}".format,
        escape=True,
        caption="machine learning algorithms comparison",
    )
)

\begin{table}
\caption{machine learning algorithms comparison}
\begin{tabular}{lrrrrrr}
\toprule
Name & ACC & FPR & AUC & YA & EF & REF \\
\midrule
rf & 0.7762 & 0.1380 & 0.6916 & 0.8276 & 2.3596 & 85.8631 \\
knn & 0.7352 & 0.2292 & 0.6684 & 0.6387 & 1.7419 & 69.6539 \\
nn & 0.7246 & 0.1937 & 0.6530 & 0.6215 & 1.6647 & 63.6876 \\
\bottomrule
\end{tabular}
\end{table}

