## 2c. Evidence - Resilience QAS Measurements

Evidence collected in this section checks for the Resilence scenario defined in the previous step. Note that some functions will be loaded from external Python files.

### Initialize MLTE Context

MLTE contains a global context that manages the currently active _session_. Initializing the context tells MLTE how to store all of the artifacts that it produces. This import will also set up global constants related to folders and model to use.

In [7]:
# Sets up context for the model being used, sets up constants related to folders and model data to be used.
from demo.scenarios.session import *

### Helper Functions

General functions and external imports.

In [8]:
# General functions.
from demo.scenarios import garden
import pandas as pd


def calculate_base_accuracy(df_results: pd.DataFrame) -> pd.DataFrame:
    # Calculate the base model accuracy result per data label
    df_pos = (
        df_results[df_results["model correct"] == True].groupby("label").count()
    )
    # df_pos.drop(columns=["predicted_label"], inplace=True)
    df_neg = (
        df_results[df_results["model correct"] == False]
        .groupby("label")
        .count()
    )
    # df_neg.drop(columns=["predicted_label"], inplace=True)
    df_neg.rename(columns={"model correct": "model incorrect"}, inplace=True)
    df_res = df_pos.merge(
        df_neg, right_on="label", left_on="label", how="outer"
    )
    df_res.fillna(0, inplace=True)
    df_res["model acc"] = df_res["model correct"] / (
        df_res["model correct"] + df_res["model incorrect"]
    )
    df_res["count"] = df_res["model correct"] + df_res["model incorrect"]
    df_res.drop(columns=["model correct", "model incorrect"], inplace=True)
    df_res.head()

    return df_res


def calculate_accuracy_per_set(
    data_folder: str, df_results: pd.DataFrame, df_res: pd.DataFrame
) -> pd.DataFrame:
    # Calculate the model accuracy per data label for each blurred data set
    base_filename = "predictions_test"
    ext_filename = ".csv"
    set_filename = ["_blur2x8", "_blur5x8", "_blur0x8", "_noR", "_noG", "_noB"]

    col_root = "model acc"

    for fs in set_filename:
        filename = os.path.join(data_folder, base_filename + fs + ext_filename)
        colname = col_root + fs

        df_temp = pd.read_csv(filename)
        # print(df_temp.head())
        df_temp = df_temp[["model correct", "label"]]

        df_pos = (
            df_temp[df_temp["model correct"] == True].groupby("label").count()
        )
        # df_pos.drop(columns=["predicted_label"], inplace=True)
        df_neg = (
            df_results[df_results["model correct"] == False]
            .groupby("label")
            .count()
        )
        # df_neg.drop(columns=["predicted_label"], inplace=True)
        df_neg.rename(
            columns={"model correct": "model incorrect"}, inplace=True
        )
        df_res2 = df_pos.merge(
            df_neg,
            right_on="label",
            left_on="label",
            how="outer",
        ).fillna(0)
        df_res2.fillna(0, inplace=True)

        df_res2[colname] = df_res2["model correct"] / (
            df_res2["model correct"] + df_res2["model incorrect"]
        )
        df_res2.drop(columns=["model correct", "model incorrect"], inplace=True)

        # print(f"{fs}_DF_RES={df_res.tail()}")
        # print(f"{fs}_DF_RES2={df_res2.tail()}")
        df_res = df_res.merge(
            df_res2, right_on="label", left_on="label", how="outer"
        ).fillna(0)

    # df_res.head()
    return df_res


def print_model_accuracy(df_res: pd.DataFrame, key: str, name: str):
    model_acc = sum(df_res[key] * df_res["count"]) / sum(df_res["count"])
    print(name, model_acc)

In [9]:
# Prepare all data. Same as the case above, we will use CSV files that contain results of a previous execution of the model.
df_results = garden.load_base_results(DATASETS_DIR, "predictions_test.csv")
df_results = df_results[["model correct", "label"]]
df_res = calculate_base_accuracy(df_results)
df_res = calculate_accuracy_per_set(DATASETS_DIR, df_results, df_res)
df_info = garden.load_taxonomy(DATASETS_DIR)
df_all = garden.merge_taxonomy_with_results(df_res, df_info, "label", "Label")

# fill in missing model accuracy data
df_all["model acc_noR"] = df_all["model acc_noR"].fillna(0)
df_all["model acc_noG"] = df_all["model acc_noG"].fillna(0)
df_all["model acc_noB"] = df_all["model acc_noB"].fillna(0)

102 102 102


### Measurements

Now do the actual measurements. First simply see the model accuracy across channel loss.

In [10]:
# view changes in model accuracy
print_model_accuracy(df_res, "model acc", "base model accuracy")
print_model_accuracy(
    df_res, "model acc_noR", "model accuracy with no red channel"
)
print_model_accuracy(
    df_res, "model acc_noG", "model accuracy with no green channel"
)
print_model_accuracy(
    df_res, "model acc_noB", "model accuracy with no blue channel"
)

base model accuracy 0.947265625
model accuracy with no red channel 0.8943445335046898
model accuracy with no green channel 0.9276878720238095
model accuracy with no blue channel 0.8652988450351732


In [11]:
df_res

Unnamed: 0_level_0,model acc,count,model acc_blur2x8,model acc_blur5x8,model acc_blur0x8,model acc_noR,model acc_noG,model acc_noB
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,1.0,10.0,1.0,1.000000,1.000000,1.00,1.0,1.0
1,1.0,10.0,1.0,1.000000,1.000000,1.00,1.0,1.0
2,1.0,10.0,1.0,1.000000,1.000000,1.00,1.0,1.0
3,0.9,10.0,0.9,0.888889,0.666667,0.75,0.8,0.5
4,1.0,10.0,1.0,1.000000,1.000000,1.00,1.0,1.0
...,...,...,...,...,...,...,...,...
97,1.0,10.0,1.0,1.000000,1.000000,1.00,1.0,1.0
98,1.0,10.0,1.0,1.000000,1.000000,1.00,1.0,1.0
99,1.0,10.0,1.0,1.000000,1.000000,1.00,1.0,1.0
100,1.0,10.0,1.0,1.000000,1.000000,1.00,1.0,1.0


Measure the ranksums (p-value) for all blur cases, using `scipy.stats.ranksums` and the `ExternalMeasurement` wrapper.

In [14]:
import scipy.stats

from mlte.evidence.types.array import Array
from mlte.measurement.external_measurement import ExternalMeasurement

def run_ranksum(samp1, samp2):
    res = scipy.stats.ranksums(samp1,samp2)
    float_list = [float(x) for x in res]
    #print(float(res))
    return float_list

my_blur = ["R", "G", "B"]
for i in range(len(my_blur)):
    # Define measurements.
    ranksum_measurement = ExternalMeasurement(
        f"ranksums channel loss {my_blur[i]}", Array, scipy.stats.ranksums
    )

    # Evaluate.
    ranksum: Array = ranksum_measurement.evaluate(
        df_res["model acc"], df_res[f"model acc_no{my_blur[i]}"]
    )
    print(f"blur {my_blur[i]}: {ranksum}")

    # Inspect values
    print(ranksum)

    # Save to artifact store
    ranksum.save(force=True)

blur R: RanksumsResult(statistic=np.float64(0.8052918417241214), pvalue=np.float64(0.4206512885130542))
RanksumsResult(statistic=np.float64(0.8052918417241214), pvalue=np.float64(0.4206512885130542))
blur G: RanksumsResult(statistic=np.float64(0.6356942962652858), pvalue=np.float64(0.5249756947411197))
RanksumsResult(statistic=np.float64(0.6356942962652858), pvalue=np.float64(0.5249756947411197))
blur B: RanksumsResult(statistic=np.float64(0.7673399434396266), pvalue=np.float64(0.44287942555285786))
RanksumsResult(statistic=np.float64(0.7673399434396266), pvalue=np.float64(0.44287942555285786))
