## 2a. Evidence - Fairness QAS Measurements

Evidence collected in this section checks for the Fairness scenario defined in the previous step. Note that some functions will be loaded from external Python files.

### Initialize MLTE Context

MLTE contains a global context that manages the currently active _session_. Initializing the context tells MLTE how to store all of the artifacts that it produces.

In [1]:
import os
from mlte.session import set_context, set_store

# Artifacts are kept in a "store" folder under the current working directory.
store_path = os.path.join(os.getcwd(), "store")
os.makedirs(
    store_path, exist_ok=True
)  # Create the folder if it is missing; exist_ok avoids an error when it already exists.

# Set the active MLTE context and point the store at the local folder above.
# NOTE(review): the two arguments are presumably the model name and version — confirm against the MLTE docs.
set_context("OxfordFlower", "0.0.1")
set_store(f"local://{store_path}")

Define different folders that will be used as input or output for the data gathering process.

In [2]:
from pathlib import Path

# The path at which datasets are stored: a "data" folder under the current
# working directory. This value is passed to load_data in a later cell.
DATASETS_DIR = Path.cwd() / "data"

General functions and external imports.

In [3]:
# General functions.

import garden
import numpy as np


def load_data(data_folder: "str | os.PathLike[str]"):
    """Load the garden model results and the taxonomy categories.

    Args:
        data_folder: Folder holding the data files; it is forwarded unchanged
            to the ``garden`` loaders. The notebook passes a ``pathlib.Path``.

    Returns:
        A ``(df_info, df_all)`` tuple: the taxonomy table returned by
        ``garden.load_taxonomy`` and the result of merging it with the base
        results via ``garden.merge_taxonomy_with_results``.
    """
    df_results = garden.load_base_results(data_folder)

    # Load the taxonomic data and merge with results. The results column
    # "label" is renamed to "Label" before merging; a non-inplace rename is
    # used so the frame returned by the loader is not mutated.
    df_info = garden.load_taxonomy(data_folder)
    df_results = df_results.rename(columns={"label": "Label"})
    df_all = garden.merge_taxonomy_with_results(df_results, df_info)

    return df_info, df_all


def split_data(df_info, df_all, *, n_trials: int = 1000, population_size: int = 1000):
    """Build 3 random garden populations and sample model outcomes for each.

    Each population gets a random composition: a Dirichlet draw over the rows
    of ``df_info`` scaled to roughly ``population_size`` plants. For every
    trial, each flower's plants are drawn with replacement from that flower's
    recorded "model correct" outcomes in ``df_all``.

    Args:
        df_info: Table with one row per flower; must have a "Common Name"
            column.
        df_all: Per-sample results; must have "Common Name" and
            "model correct" columns.
        n_trials: Number of resampled gardens per population.
        population_size: Target number of plants per garden. Because each
            flower's share is rounded separately, the actual total can differ
            slightly, which is why member counts are returned as well.

    Returns:
        ``(gardenpops, gardenmems)``: two integer arrays of shape
        ``(3, n_trials)`` holding, per population and trial, the sum of the
        sampled "model correct" values and the number of samples drawn.

    Raises:
        IndexError: If a flower with a non-zero count has no rows in
            ``df_all`` (raised by ``random.choices`` on an empty list).
    """
    from random import choices

    pop_names = ["Population1", "Population2", "Population3"]

    df_gardenpop = df_info.copy()
    n_flowers = df_gardenpop.shape[0]
    for name in pop_names:
        shares = np.random.dirichlet(np.ones(n_flowers), size=1)[0]
        # Round to the nearest integer count. Truncating a float product such
        # as share * 1000 can silently drop a plant when the product lands
        # just below the integer.
        df_gardenpop[name] = np.rint(shares * population_size).astype(int)

    # The outcomes available for each flower do not change between trials, so
    # extract them once instead of filtering df_all inside the sampling loop.
    outcomes_by_flower = [
        df_all.loc[df_all["Common Name"] == flower, "model correct"].to_list()
        for flower in df_gardenpop["Common Name"]
    ]
    counts = df_gardenpop[pop_names].to_numpy()

    # Build populations from the test data set that match the garden compositions.
    gardenpops = np.zeros((len(pop_names), n_trials), int)
    gardenmems = np.zeros((len(pop_names), n_trials), int)

    for j in range(n_trials):
        for i, outcomes in enumerate(outcomes_by_flower):
            for g in range(len(pop_names)):
                n_choices = int(counts[i, g])
                my_selection = choices(outcomes, k=n_choices)

                gardenpops[g, j] += sum(my_selection)
                gardenmems[g, j] += len(my_selection)

    return gardenpops, gardenmems


def calculate_model_performance_acc(gardenpops, gardenmems):
    """Get the accuracy of the model across the garden populations.

    Args:
        gardenpops: 2-D array (populations x trials) with the sum of correct
            predictions per population and trial.
        gardenmems: Array of the same shape with the number of samples drawn
            per population and trial.

    Returns:
        A list with one entry per population: the accuracy averaged over all
        trials, rounded to 3 decimals.

    Side effects:
        Prints one line per population with its index and the average,
        standard deviation, minimum and maximum accuracy.
    """
    # Per-trial accuracy for every population; the shape follows the inputs
    # instead of being fixed to (3, 1000).
    gardenacc = np.asarray(gardenpops) / np.asarray(gardenmems)

    model_performance_acc = []
    for g, trial_acc in enumerate(gardenacc):
        avg = round(np.average(trial_acc), 3)
        std = round(np.std(trial_acc), 3)
        # Named lowest/highest so the builtins min() and max() are not shadowed.
        lowest = round(np.amin(trial_acc), 3)
        highest = round(np.amax(trial_acc), 3)
        model_performance_acc.append(avg)

        print(f"{g:1d} {avg:1.3f} {std:1.3f} {lowest:1.3f} {highest:1.3f}")

    return model_performance_acc

In [None]:
# Prepare the data. For this section, instead of executing the model, we will
# use CSV files containing the results of an already executed run of the model.
# `data` is the (df_info, df_all) tuple returned by load_data.
data = load_data(DATASETS_DIR)
# NOTE(review): this assignment rebinds the name `split_data` from the function
# to its returned tuple, so re-running this cell in the same kernel fails
# because a tuple is not callable. A later cell indexes this tuple as
# split_data[0] and split_data[1]; any rename must update those uses too.
split_data = split_data(data[0], data[1])

In this first example, we simply wrap the output of `calculate_model_performance_acc` with the custom `MultipleAccuracy` value type, to cope with an output (one accuracy per garden population) that is not supported by a MLTE builtin.

In [None]:
from values.multiple_accuracy import MultipleAccuracy
from mlte.measurement.external_measurement import ExternalMeasurement

# Evaluate accuracy, identifier has to be the same one defined in the Spec.
# The measurement is given calculate_model_performance_acc as the function to
# run and MultipleAccuracy as the value type (presumably used to wrap the
# function's output — confirm against the MLTE ExternalMeasurement docs).
accuracy_measurement = ExternalMeasurement(
    "accuracy across gardens", MultipleAccuracy, calculate_model_performance_acc
)
# split_data is the (gardenpops, gardenmems) tuple from the data preparation cell.
accuracy = accuracy_measurement.evaluate(split_data[0], split_data[1])

# Inspect value
print(accuracy)

# Save to artifact store
# NOTE(review): force=True presumably overwrites an existing artifact — confirm.
accuracy.save(force=True)