# Plotting Few-Shot Model Evaluation Results

Assembling plots from summary files.

In [None]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm

# Setting up local details:
# FS_MOL_CHECKOUT_PATH should be the location of the checkout of the FS-Mol repository;
# FS_MOL_DATASET_PATH is the directory below which the results are looked up.
# os.path.expanduser("~") is used rather than os.environ['HOME'] because HOME is
# not guaranteed to be set (e.g. on Windows), in which case the lookup raises KeyError.
FS_MOL_CHECKOUT_PATH = os.path.join(os.path.expanduser("~"), "Projects", "FS-Mol")
FS_MOL_DATASET_PATH = os.path.join(os.path.expanduser("~"), "Datasets", "FS-Mol")

# Work from the checkout root so that relative paths such as fs_mol/plotting/... resolve,
# and put the checkout on sys.path so that the fs_mol package can be imported.
os.chdir(FS_MOL_CHECKOUT_PATH)
sys.path.insert(0, FS_MOL_CHECKOUT_PATH)

from fs_mol.plotting.utils import (
    highlight_max_all, 
    plot_all_assays, 
    load_data,
    expand_values,
    plot_task_performances_by_id,
    aggregate_by_class,
    box_plot,
    plot_by_size,
    get_aggregates_across_sizes
)

## Making summary files

Summary files are obtained by running `fs_mol/plotting/collect_eval_runs.py` on the outputs of evaluation runs. If the evaluation output directory is `evaluation_output_directory`, the summary files are created with:

In [None]:
# IPython shell escape: runs the collection script in a subprocess.
# NOTE(review): {model_name} and {evaluation_output_directory} are placeholders. IPython
# substitutes Python variables of those names, so define them (or replace them with
# literal values) before running this cell.
! python fs_mol/plotting/collect_eval_runs.py {model_name} {evaluation_output_directory} --plot

The option `--plot` results in a plot across support set sizes for each few-shot testing task. Final summarized results will be found in "evaluation_output_directory/summary/{model_name}_summary.csv"

## Loading the collated evaluation data

Create a dictionary of all model summary CSV files to be compared. These are the final summaries produced by `collect_eval_runs.py`.

In [None]:
# Configure this to contain all the models that you want to look at.
# Dict keys are human readable names, values are the path to the summary produced by collect_eval_runs.py
# The trailing separator on results_path is kept as in the original value.
results_path = os.path.join(FS_MOL_DATASET_PATH, "results/")

# A dictionary summarising all models to be compared. Add new paths here as desired.
model_summaries = {
    "GNN-MAML": os.path.join(results_path, "MAML-Support16_summary.csv"),
    # "PN": os.path.join(results_path, "PN_summary.csv"),
    "GNN-MT": os.path.join(results_path, "GNN-Multitask_summary.csv"),
    "ST": os.path.join(results_path, "random_forest_summary.csv"),
    "GNN-ST": os.path.join(results_path, "GNN-ST_summary.csv"),
    "kNN": os.path.join(results_path, "kNN_summary.csv"),
    # "MAT": os.path.join(results_path, "MAT_summary.csv"),
}
# Generated plots will be stored here, if you want to keep them. None disables saving.
plot_output_dir = os.path.join(results_path, "plots/")
# Only create the directory when saving is enabled: os.makedirs(None) raises TypeError.
if plot_output_dir is not None:
    os.makedirs(plot_output_dir, exist_ok=True)

In [None]:
# Load the summary files listed in model_summaries into `data`.
# NOTE(review): `data` is used like a pandas DataFrame further down (data.style,
# data.columns); its exact column layout is defined by load_data.
data = load_data(model_summaries)

## Highlight the best result for each task

In [None]:
# Style the table one row at a time (axis=1) with highlight_max_all, so that the
# best result for each task is highlighted.
styled_df = data.style.apply(highlight_max_all, axis=1)
# Export the highlighted table for use outside the notebook. This needs the
# xlsxwriter package; comment the export out if it is not installed.
# The export is skipped when plot_output_dir is None, which disables saving.
if plot_output_dir is not None:
    styled_df.to_excel(
        os.path.join(plot_output_dir, "all_model_highlighted_comparison.xlsx"),
        engine="xlsxwriter",
    )

styled_df

In [None]:
# expand out from val +/- error format, and calculate delta AUPRC
# NOTE(review): `data` is overwritten here, so re-running this cell passes the
# already-expanded table back into expand_values. Presumably it should run once
# after loading; confirm.
data = expand_values(data, model_summaries)

## Performance Overview over all Tasks

This compares with the trivial baseline of using a weighted coinflip according to the class imbalance in the training data.

In [None]:
# Per-task performance overview for every model in model_summaries,
# at a support set size of 16.
plot_task_performances_by_id(
    data,
    model_summaries,
    support_set_size=16,
)

### Incorporate protein information

Our test tasks have associated target protein information available. We can merge this data to allow plotting with specific EC number classes highlighted.

In [None]:
# Load the target-protein table for the test tasks from the repository checkout.
protein_path = os.path.join(FS_MOL_CHECKOUT_PATH, "datasets", "targets", "test_proteins.csv")
ecs = pd.read_csv(protein_path)
ecs["target_id"] = ecs["target_id"].astype(int).astype(str)
ecs["chembl_id"] = ecs["chembl_id"].astype(str)
# TASK_ID is chembl_id with its first six characters removed
# (presumably the "CHEMBL" prefix; confirm against the CSV).
# Vectorised .str slicing replaces the row-wise apply: it yields the same values
# and also behaves correctly on an empty frame.
ecs["TASK_ID"] = ecs["chembl_id"].str[6:]

# merge() defaults to an inner join, so only tasks present in both tables are kept.
data = ecs.merge(data, on="TASK_ID")

In [None]:
# highlight_class selects the EC class to be highlighted in the resulting plot;
# vary it to compare across different EC classes.
plot_task_performances_by_id(
    data,
    model_summaries,
    support_set_size=16,
    highlight_class=2,
)

## Plot for each task, comparing different models

This makes an individual plot for each few-shot testing task, comparing the models across all available support set sizes.

In [None]:
# One comparison plot per task, covering every model name in model_summaries;
# plot_output_dir is passed as the results directory.
plot_all_assays(
    data,
    model_summaries.keys(),
    results_dir=plot_output_dir,
)

# Summarise the overall performance in box plots

This reproduces the model comparison box plots in the manuscript.

In [None]:
# Box plots comparing the models in model_summaries at a support set size of 16.
box_plot(
    data,
    model_summaries,
    support_set_size=16,
)

## Aggregate as a function of the number of training points, across all categories

Here the results are aggregated according to EC class, and across all classes. This is used to plot the variation of performance with support set size, comparing all models in the model_summaries dictionary. 

In [None]:
# Build the aggregated table for the models in model_summaries (per the text above:
# aggregated by EC class and across all classes, as a function of support set size).
aggregate_df = get_aggregates_across_sizes(data, model_summaries)

In [None]:
# Display the aggregated table as the cell output.
aggregate_df

In [None]:
# Plot the aggregated results for all models in model_summaries.
# plot_by_size also offers an option to plot all classes separately.
plot_by_size(
    aggregate_df,
    model_summaries,
    plot_output_dir=plot_output_dir,
)

# Ranking

Here we use [autorank](https://pypi.org/project/autorank/) for an appropriate comparison between all methods when evaluated on multiple tasks.


In [None]:
from autorank import autorank

# Pick the columns autorank should compare: those whose name starts with the
# support set size and contains both "val" and "delta-auprc".
size = 16
ranking_columns = [
    column
    for column in data.columns
    if column.startswith(f"{size}") and "val" in column and "delta-auprc" in column
]
df = data[ranking_columns]

In [None]:
# Run autorank on the columns selected into `df`, with verbose output switched off.
result = autorank(df, verbose=False)
# Show the "meanrank" column of the ranking table (result.rankdf) as the cell output.
result.rankdf["meanrank"]