# Intro

This notebook visualizes and manipulates the results of the study. Run it only after the `dvc repro` command has completed successfully.

# Load libs

In [1]:
import os
import pandas as pd
import yaml

import autoroot
import autorootcwd

In [2]:
import aml_magic.src.consts as cc
import aml_magic.src.models.metrics as model_metrics
import aml_magic.src.utils.configs as cfg




# Load data

## Load config

In [3]:
# Read the main experiment parameters from "<cc.MAIN_PARAMS_FILE>.yaml" and
# build the experiment configuration object from them.
# The file is opened in a `with` block so the handle is closed deterministically
# instead of being left for the garbage collector.
with open(f"{cc.MAIN_PARAMS_FILE}.yaml", "r") as config_file:
    config_yaml = yaml.safe_load(config_file)
config = cfg.ExperimentConfig(**config_yaml)

## Load summary data

In [4]:
# Load the scores summary CSV stored under cc.STUDY_COMPARISON_PATH.
summary_csv_path = cc.STUDY_COMPARISON_PATH / "scores_summary.csv"
summary_data = pd.read_csv(summary_csv_path)
# Preview only the first three rows.
summary_data.head(n=3)

Unnamed: 0,dataset,metric,scope,MAGIC+XGB,GCN,SkipGCN,GCN+XGB,Nenn+XGB,Nenn,SkipGCN+XGB
0,AMLSim 101,F1,Ilicit,0.866 +/- 0.009,0.810 +/- 0.035,0.807 +/- 0.033,0.814 +/- 0.032,0.837 +/- 0.027,0.725 +/- 0.041,0.812 +/- 0.030
1,AMLSim 101,F1,Macro,0.926 +/- 0.005,0.894 +/- 0.020,0.892 +/- 0.019,0.898 +/- 0.018,0.910 +/- 0.015,0.844 +/- 0.024,0.896 +/- 0.017
2,AMLSim 101,Precision,Ilicit,0.930 +/- 0.013,0.724 +/- 0.058,0.719 +/- 0.056,0.786 +/- 0.052,0.828 +/- 0.035,0.592 +/- 0.051,0.779 +/- 0.050


## Load raw scores per dataset

In [5]:
# Collect the raw MAGIC+xgboost scores of every dataset found under
# cc.STUDY_RESULTS_PATH into a single frame, tagging each row with its dataset.
all_raw_scores = []
# os.listdir returns entries in arbitrary order; sorting makes the row order
# (and every table derived from it) reproducible across machines and runs.
for dataset in sorted(os.listdir(cc.STUDY_RESULTS_PATH)):
    # Only dataset sub-directories hold results; skip stray files
    # (e.g. .gitignore, .DS_Store) that would otherwise break the CSV lookup.
    if not os.path.isdir(cc.STUDY_RESULTS_PATH / dataset):
        continue
    # The first CSV column is used as the index; the 'model' column is dropped
    # so that only the score columns remain.
    dataset_raw_scores = pd.read_csv(
        cc.STUDY_RESULTS_PATH / dataset / f"{dataset}_MAGIC+xgboost_raw.csv",
        index_col=0,
    ).drop(columns='model')
    dataset_raw_scores["dataset"] = dataset
    all_raw_scores.append(dataset_raw_scores)

# ignore_index=True: the per-file indices are discarded in favour of one running index.
all_raw_scores_df = pd.concat(all_raw_scores, ignore_index=True)

# Show results

## Summaries

In [6]:
# Re-index the summary by (dataset, metric, scope); every remaining column is a model.
summary_index_cols = ['dataset', 'metric', 'scope']
summary_pivot = summary_data.pivot_table(index=summary_index_cols, aggfunc='max')
# Order rows by the first index level (dataset), descending.
summary_sorted = summary_pivot.sort_index(level=0, ascending=False)
# Highlight the row-wise maximum, i.e. the best model per (dataset, metric, scope).
# NOTE(review): cells are strings such as "0.866 +/- 0.009", so both 'max' and
# highlight_max compare lexicographically — confirm all scores share the same format.
summary_sorted.style.highlight_max(color='lightgreen', axis=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,GCN,GCN+XGB,MAGIC+XGB,Nenn,Nenn+XGB,SkipGCN,SkipGCN+XGB
dataset,metric,scope,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AMLSim 51,Recall,Macro,0.966 +/- 0.003,0.952 +/- 0.015,0.957 +/- 0.014,0.941 +/- 0.016,0.945 +/- 0.014,0.966 +/- 0.003,0.952 +/- 0.014
AMLSim 51,Recall,Ilicit,0.973 +/- 0.004,0.933 +/- 0.029,0.943 +/- 0.033,0.927 +/- 0.032,0.919 +/- 0.028,0.974 +/- 0.003,0.934 +/- 0.029
AMLSim 51,Precision,Macro,0.917 +/- 0.010,0.931 +/- 0.013,0.936 +/- 0.013,0.903 +/- 0.016,0.928 +/- 0.022,0.917 +/- 0.011,0.931 +/- 0.012
AMLSim 51,Precision,Ilicit,0.840 +/- 0.021,0.878 +/- 0.024,0.936 +/- 0.013,0.823 +/- 0.031,0.873 +/- 0.043,0.840 +/- 0.022,0.876 +/- 0.025
AMLSim 51,F1,Macro,0.939 +/- 0.007,0.941 +/- 0.012,0.946 +/- 0.009,0.921 +/- 0.014,0.936 +/- 0.016,0.939 +/- 0.008,0.941 +/- 0.011
AMLSim 51,F1,Ilicit,0.901 +/- 0.011,0.905 +/- 0.019,0.914 +/- 0.014,0.872 +/- 0.023,0.895 +/- 0.025,0.902 +/- 0.012,0.904 +/- 0.017
AMLSim 31,Recall,Macro,0.973 +/- 0.001,0.970 +/- 0.010,0.973 +/- 0.007,0.963 +/- 0.017,0.970 +/- 0.012,0.973 +/- 0.001,0.969 +/- 0.009
AMLSim 31,Recall,Ilicit,0.984 +/- 0.000,0.971 +/- 0.020,0.987 +/- 0.011,0.975 +/- 0.030,0.976 +/- 0.021,0.984 +/- 0.000,0.970 +/- 0.019
AMLSim 31,Precision,Macro,0.953 +/- 0.001,0.958 +/- 0.006,0.955 +/- 0.011,0.941 +/- 0.015,0.954 +/- 0.014,0.953 +/- 0.002,0.957 +/- 0.006
AMLSim 31,Precision,Ilicit,0.913 +/- 0.002,0.928 +/- 0.010,0.955 +/- 0.011,0.894 +/- 0.023,0.918 +/- 0.024,0.913 +/- 0.003,0.927 +/- 0.011


## Summarize raw scores

Summaries of raw scores below are done without the use of the Bootstrap method.

In [7]:
# Plain (non-bootstrapped) mean and standard deviation of each raw score column, per dataset.
# NOTE(review): in the recorded output 'Macro Precision' and 'Ilicit Precision' have
# identical mean/std for every dataset — verify the upstream metric computation.
raw_scores_by_dataset = all_raw_scores_df.groupby("dataset")
raw_scores_by_dataset.agg(['mean', 'std'])

Unnamed: 0_level_0,Macro Precision,Macro Precision,Macro Recall,Macro Recall,Macro F1,Macro F1,Ilicit Precision,Ilicit Precision,Ilicit Recall,Ilicit Recall,Ilicit F1,Ilicit F1
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
amlsim_101,0.930039,0.005641,0.923016,0.008222,0.926366,0.001997,0.930039,0.005641,0.858824,0.018166,0.866425,0.003798
amlsim_201,0.918473,0.010095,0.917959,0.011622,0.91804,0.006829,0.918473,0.010095,0.843627,0.023842,0.843801,0.013032
amlsim_31,0.95497,0.004924,0.972888,0.003144,0.963078,0.00407,0.95497,0.004924,0.987255,0.004736,0.950247,0.005372
amlsim_51,0.936239,0.005529,0.957233,0.00617,0.946187,0.003878,0.936239,0.005529,0.943137,0.014468,0.913577,0.006248


## Calculate bootstrap CIs

In [8]:
# Bootstrap estimates for every (dataset, metric) pair of the raw scores.
# Every column except the 'dataset' tag is treated as a metric.
metrics = [col for col in all_raw_scores_df.columns if col != 'dataset']
dataset_bootstrap_scores = {}
# sort=False keeps the datasets in order of first appearance in the frame.
for dataset, dataset_scores in all_raw_scores_df.groupby("dataset", sort=False):
    for metric in metrics:
        # get_confidence_intervals is unpacked into two values, stored as "avg" and "std".
        bootstrap_avg_ci, bootstrap_std_ci = model_metrics.get_confidence_intervals(
            dataset_scores[metric],
            n_repeats=config.n_repeats,
        )
        dataset_bootstrap_scores[(dataset, metric)] = dict(
            avg=bootstrap_avg_ci,
            std=bootstrap_std_ci,
        )
# The (dataset, metric) keys become columns; transposing turns them into the row index.
dataset_bootstrap_scores_df = pd.DataFrame(dataset_bootstrap_scores).T

In [9]:
# Display the bootstrap estimates rounded to three decimal places.
dataset_bootstrap_scores_df.round(decimals=3)

Unnamed: 0,Unnamed: 1,avg,std
amlsim_31,Macro Precision,0.955,0.011
amlsim_31,Macro Recall,0.973,0.007
amlsim_31,Macro F1,0.963,0.009
amlsim_31,Ilicit Precision,0.955,0.011
amlsim_31,Ilicit Recall,0.987,0.011
amlsim_31,Ilicit F1,0.95,0.012
amlsim_101,Macro Precision,0.93,0.013
amlsim_101,Macro Recall,0.923,0.019
amlsim_101,Macro F1,0.926,0.005
amlsim_101,Ilicit Precision,0.93,0.013
