# To compare model outputs
This notebook compares the `computed_scenarios.json` outputs produced by different versions of the microCovid model. For example, you might want to compare the model output before and after a pull request is merged. To do that, run the tests before and after the change, save each resulting `computed_scenarios.json` under a different name, and enter the two file names in the cell below: `reference_file` is intended to be the output from before the change, and `test_file` the output from after it.

If you want to see or change the "test" that generates the computed_scenarios.json file, look at `src/data/__tests__/scenarios.test.ts`.

In [None]:
# Model output saved before the change: the baseline that the test output is compared against.
reference_file = "computed_scenarios_recent.json"
# Model output saved after the change: the version being checked.
test_file = "computed_scenarios_1421.json"

and then run the rest of this notebook to play with the data changes.

In [None]:
import json
import re

import matplotlib.pyplot as plt
import pandas as pd
from IPython.core.display import HTML
from pandas import NA

In [None]:
def load(filename):
    """Read a computed-scenarios JSON file into a flat, column-sorted DataFrame.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        A DataFrame built with ``pd.json_normalize`` (nested objects become
        dot-separated column names), converted to pandas' nullable dtypes via
        ``convert_dtypes`` and with its columns in sorted order.
    """
    # JSON is UTF-8 by specification; be explicit so the platform's locale
    # encoding is never used to decode the file.
    with open(filename, encoding="utf-8") as file:
        blob = json.load(file)
    table = pd.json_normalize(blob)
    return table.convert_dtypes()[sorted(table.columns)]

In [None]:
# Load the baseline ("reference") and changed ("test") outputs with the same loader.
reference, test = map(load, (reference_file, test_file))

In [None]:
# Columns that together identify one scenario row in both outputs.
JOIN_KEYS = ["scenario", "loc", "vaccination"]
# Zero-width space (U+200B), written as an escape so the character stays
# visible in the source instead of hiding inside a string literal.
ZERO_WIDTH_SPACE = "\u200b"

# Outer join so that rows present in only one of the two outputs are kept;
# overlapping non-key columns get a ".test" / ".reference" suffix.
combined = test.join(
    reference.set_index(JOIN_KEYS), lsuffix=".test", rsuffix=".reference", on=JOIN_KEYS, how="outer"
)
combined = combined[sorted(combined.columns)]
combined["ratio"] = combined["result.expectedValue.test"] / combined["result.expectedValue.reference"]
# Insert a zero-width space after a lowercase letter that is followed by "."
# or an uppercase letter -- presumably so long column headers can wrap in the
# rendered tables. The suffixes themselves still end in ".test"/".reference".
combined = combined.rename(
    columns=lambda name: re.sub(r"([a-z])([.A-Z])", rf"\1{ZERO_WIDTH_SPACE}\2", name)
)

In [None]:
def group_by_ratio(df, only_first):
    """Order scenario rows by their test/reference ratio.

    Args:
        df: Frame with a "ratio" column and "loc", "scenario" and
            "vaccination" columns.
        only_first: When true, keep a single row for each distinct ratio
            value and drop rows that have no ratio; when false, keep every row.

    Returns:
        A frame indexed by (loc, scenario, vaccination) with "ratio" as the
        first column, sorted by ratio; rows with equal ratios are ordered by
        the index.
    """
    by_ratio = df.sort_values("ratio", kind="stable")
    if only_first:
        # Keep the genuine first row of each ratio value. GroupBy.first()
        # would instead take the first *non-null* value of every column, which
        # can stitch cells from different scenarios into one row that is then
        # shown under the first row's keys.
        by_ratio = by_ratio.dropna(subset=["ratio"]).drop_duplicates(subset="ratio", keep="first")
    other_columns = [col for col in by_ratio.columns if col != "ratio"]
    by_ratio = by_ratio[["ratio", *other_columns]]
    return (
        by_ratio.set_index(["loc", "scenario", "vaccination"])
        .sort_index(kind="stable")
        .sort_values("ratio", kind="stable")
    )


def _is_missing(value):
    """Return True for pd.NA, None and float NaN; safe for any cell value."""
    return value is NA or value is None or (isinstance(value, float) and value != value)


def show_only_changes(df):
    """Blank out values that are identical in the test and reference outputs.

    For every pair of columns ending in ".test" / ".reference", cells whose
    two values are both present and equal are replaced by a missing value in
    both columns. A pair in which nothing is left is dropped entirely.

    Args:
        df: Frame containing paired "<name>.test" / "<name>.reference"
            columns; other columns are passed through unchanged.

    Returns:
        A modified copy; the input frame is not mutated.
    """
    df = df.copy()
    test_suffix, ref_suffix = ".test", ".reference"
    # Iterate over a snapshot because columns may be dropped along the way.
    for test_col in list(df.columns):
        if not test_col.endswith(test_suffix):
            continue
        # Swap only the trailing suffix, not every ".test" inside the name.
        ref_col = test_col[: -len(test_suffix)] + ref_suffix
        if ref_col not in df.columns:
            continue
        # Compare positionally so duplicate index labels cannot break lookups.
        flags = [
            bool(not _is_missing(test_val) and not _is_missing(ref_val) and test_val == ref_val)
            for test_val, ref_val in zip(df[test_col], df[ref_col])
        ]
        unchanged = pd.Series(flags, dtype=bool).to_numpy()
        # Whole-column assignment instead of chained `df[col][row] = NA`,
        # which is a silent no-op under pandas Copy-on-Write.
        df[test_col] = df[test_col].mask(unchanged, NA)
        df[ref_col] = df[ref_col].mask(unchanged, NA)
        if all(_is_missing(value) for col in (test_col, ref_col) for value in df[col]):
            df = df.drop(columns=[test_col, ref_col])
    return df


def blank_na(df):
    """Render ``df`` as an HTML table with every "<NA>" marker removed.

    Args:
        df: The DataFrame to display.

    Returns:
        An ``HTML`` display object wrapping the Styler-generated markup.
    """
    markup = df.style.to_html()
    cleaned = markup.replace("<NA>", "")
    return HTML(cleaned)

# The next table will show highlights of what's changed.
The "ratio" column holds the ratio between the test and reference "result.expectedValue" outputs. This table shows
one representative row for each distinct ratio value, sorted by that ratio, giving an overview that suggests which kinds of scenarios changed by how much.

In [None]:
pd.set_option("display.max_colwidth", 40)
pd.set_option("display.max_columns", 100)
blank_na(show_only_changes(group_by_ratio(combined, only_first=True)))

# The next table will show all changes.
As above, but all rows are included.

In [None]:
blank_na(show_only_changes(group_by_ratio(combined, only_first=False)))