## 2k. Evidence - Repeatability QAS Measurements

Evidence collected in this section checks for the Repeatability scenario defined in the previous step. Note that some functions will be loaded from external Python files.

### Initialize MLTE Context

MLTE contains a global context that manages the currently active _session_. Initializing the context tells MLTE how to store all of the artifacts that it produces. This import also sets up global constants for the folders and the model to use.

In [1]:
# Sets up context for the model being used, sets up constants related to folders and model data to be used.
from demo.scenarios.session import *

Creating initial custom lists at URI: local:///Users/rbrowersinning/Documents/ResearchFolders/Continuum_LTP/GitRepos/mlte/demo/scenarios/../store
Loaded 7 qa_categories for initial list
Loaded 30 quality_attributes for initial list
Creating sample catalog at URI: StoreType.LOCAL_FILESYSTEM:local:///Users/rbrowersinning/Documents/ResearchFolders/Continuum_LTP/GitRepos/mlte/demo/scenarios/../store
Loading sample catalog entries.
Loaded 9 entries for sample catalog.


### Helper Functions
General functions and external imports.

In [2]:
# General functions.

from demo.scenarios import garden
import numpy as np
import pandas as pd
from scipy import stats

def load_data(data_folder: str):
    """Load the garden prediction results merged with their taxonomy data.

    Args:
        data_folder: Folder handed to the ``garden`` loaders; the results are
            read from ``predictions_test.csv`` in it.

    Returns:
        A ``(df_info, df_all)`` tuple: the taxonomy data as returned by
        ``garden.load_taxonomy``, and the results merged with that taxonomy
        by ``garden.merge_taxonomy_with_results``.
    """
    # Rename on a new frame instead of in place, so the object handed back by
    # the loader is left untouched. The results use "label" while the merge is
    # given "Label".
    df_results = garden.load_base_results(
        data_folder, "predictions_test.csv"
    ).rename(columns={"label": "Label"})

    # Load the taxonomic data and merge with results.
    df_info = garden.load_taxonomy(data_folder)
    df_all = garden.merge_taxonomy_with_results(df_results, df_info)

    return df_info, df_all




In [3]:
# Prepare the data. The model is not executed in this section; instead the CSV
# results of an already executed run are loaded, together with the taxonomy data.
df_info, df_all = load_data(data_folder=DATASETS_DIR)


102 102 102


In [4]:
# Per-sample correctness flags that the repeatability check resamples below.
df_all.loc[:, ["model correct"]]

Unnamed: 0,model correct
0,True
1,True
2,True
3,True
4,True
...,...
1019,True
1020,True
1021,True
1022,True


In [5]:
# Bootstrap settings: number of resampled "runs" and rows drawn per run.
N_RUNS = 50
SAMPLE_SIZE = 500

# Draw each run with replacement from the per-sample correctness flags. The run
# number doubles as the random seed, so every run is different but reproducible.
# Resetting the index lines the runs up row by row as columns run0..run{N_RUNS-1}.
run_columns = [
    df_all["model correct"]
    .sample(n=SAMPLE_SIZE, replace=True, random_state=run)
    .reset_index(drop=True)
    .rename(f"run{run}")
    for run in range(N_RUNS)
]

# Assemble all runs in a single concat instead of merging one column at a time.
res_df = pd.concat(run_columns, axis=1)

res_df

Unnamed: 0,run0,run1,run2,run3,run4,run5,run6,run7,run8,run9,...,run40,run41,run42,run43,run44,run45,run46,run47,run48,run49
0,True,False,True,True,True,True,True,True,True,True,...,True,True,True,False,True,True,True,True,True,True
1,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,False,True,True,True
2,False,True,True,True,True,True,False,True,True,True,...,True,True,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,True,True,True,True,True,True,True,True,True,True,...,True,False,True,True,True,True,True,True,True,True
496,True,True,False,True,False,True,True,True,True,False,...,True,True,True,True,True,True,True,True,True,True
497,True,True,True,True,True,False,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
498,True,True,True,True,True,True,True,True,True,True,...,False,True,True,True,True,True,True,True,True,True


In [6]:
# Kruskal-Wallis H-test across all resampled runs. The run columns are unpacked
# from res_df, so the call always covers every run instead of naming each one.
results = stats.kruskal(*(res_df[run] for run in res_df.columns))

### Measurements

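Here we wrap `scipy.stats.kruskal` (the Kruskal-Wallis H-test) in an `ExternalMeasurement` and store its output as `Array` evidence, to cope with the output of a third-party library that is not supported by a MLTE builtin. The test is applied across all of the resampled runs generated above; the resulting array holds the test statistic followed by the p-value.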

In [7]:
from mlte.evidence.types.array import Array
from mlte.measurement.external_measurement import ExternalMeasurement


# Wrap scipy's Kruskal-Wallis test as a MLTE measurement whose output is
# recorded as Array evidence under the "repeated results sampling" id.
kruskal_measurement = ExternalMeasurement(
    "repeated results sampling", Array, stats.kruskal
)

# Evaluate. Every run column of res_df is passed as one sample group; unpacking
# the columns keeps the call in step with however many runs were generated.
kruskal_res = kruskal_measurement.evaluate(
    *(res_df[run] for run in res_df.columns)
)

# Inspect values
print(kruskal_res)

# Save to artifact store
kruskal_res.save(force=True)

KruskalResult(statistic=np.float64(38.903978816768415), pvalue=np.float64(0.8487546281441082))


ArtifactModel(header=ArtifactHeaderModel(identifier='evidence.repeated results sampling', type='evidence', timestamp=1759166183, creator=None, level='version'), body=EvidenceModel(artifact_type=<ArtifactType.EVIDENCE: 'evidence'>, metadata=EvidenceMetadata(test_case_id='repeated results sampling', measurement=MeasurementMetadata(measurement_class='mlte.measurement.external_measurement.ExternalMeasurement', output_class='mlte.evidence.types.array.Array', additional_data={'function': 'scipy.stats._stats_py.kruskal'})), evidence_class='mlte.evidence.types.array.Array', value=ArrayValueModel(evidence_type=<EvidenceType.ARRAY: 'array'>, data=[np.float64(38.903978816768415), np.float64(0.8487546281441082)])))