### Initialize MLTE Context

MLTE contains a global context that manages the currently active _session_. Initializing the context tells MLTE how to store all of the artifacts that it produces. The import below also sets up global constants for the folders and the model to use.

In [30]:
# Sets up context for the model being used, sets up constants related to folders and model data to be used.
from demo.scenarios.session import *

### Helper Functions
General functions and external imports.

In [31]:
# General functions.

from demo.scenarios import garden
import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats import f_oneway

def load_data(data_folder: str, data_file: str):
    """Load one results file and merge it with the taxonomy data.

    Args:
        data_folder: Folder passed to the `garden` loaders for both the
            results file and the taxonomy data.
        data_file: Name of the results file handed to
            `garden.load_base_results`.

    Returns:
        A `(df_info, df_all)` tuple: the taxonomy frame returned by
        `garden.load_taxonomy`, and the results merged with it by
        `garden.merge_taxonomy_with_results`.
    """
    df_results = garden.load_base_results(data_folder, data_file)

    # Load the taxonomic data and merge with results.
    df_info = garden.load_taxonomy(data_folder)
    # Rename without `inplace=True` so the frame returned by the loader is
    # left untouched; the merge is given the column under the name "Label".
    df_results = df_results.rename(columns={"label": "Label"})
    df_all = garden.merge_taxonomy_with_results(df_results, df_info)

    return df_info, df_all

def load_results(data_folder: str):
    """Load the reproducibility test result runs.

    Args:
        data_folder: Folder that contains `ReproducibilityDataSet_CV.csv`.

    Returns:
        The contents of the CSV file as a pandas DataFrame.
    """
    # Imported locally so the function does not rely on a `path` name being
    # leaked into the notebook namespace by a star import.
    import os

    return pd.read_csv(os.path.join(data_folder, 'ReproducibilityDataSet_CV.csv'))


In [32]:
# Prepare the data. For this section, instead of executing the model, we will use CSV files containing the results of an already executed run of the model.
df_info, df_test = load_data(DATASETS_DIR, "predictions_test.csv")
df_info, df_new = load_data(DATASETS_DIR, "predictions_dall-e-2.csv")

# Tag every row with the dataset it was actually loaded from. The tags were
# previously swapped (the test predictions were marked 'DALL-E-2' and vice
# versa), which mislabelled the groups compared downstream.
df_test['dataset'] = 'Test'
df_new['dataset'] = 'DALL-E-2'
df_all = pd.concat([df_new, df_test], ignore_index=True)

102 102 102
102 102 102


In [33]:
# List the columns of the combined frame (results merged with taxonomy, plus the `dataset` tag).
df_all.columns

Index(['Label', 'predicted_label', 'label_prob', 'model correct', 'model_file',
       'model_hash', 'Unnamed: 0', 'Group', 'Count', 'Common Name',
       'Other Name', 'Risk', 'Kingdom', 'Phylum', 'Clade1', 'Clade2', 'Clade3',
       'Order', 'Family', 'Subfamily', 'Genus', 'Label Name', 'dataset'],
      dtype='object')

In [34]:
# Distinct `Label` values, read from the row index of the Label x dataset count table.
# NOTE(review): unstack() keeps a label even when it has rows in only one dataset
# (the missing cell becomes NaN), so this does not filter to labels present in both — confirm intent.
valid_labels = df_all.groupby(['Label', 'dataset']).size().unstack().index.tolist()

In [35]:
# Peek at the `Label` column of the combined frame.
df_all.Label

0         0
1         0
2         0
3         0
4         0
       ... 
2177    101
2178    101
2179    101
2180    101
2181    101
Name: Label, Length: 2182, dtype: int64

In [43]:
def run_anova_for_label(df, label):
    """Run a one-way ANOVA on `label_prob` between the two datasets for one label.

    Args:
        df: Frame with `Label`, `dataset` and `label_prob` columns.
        label: Value of `Label` whose rows are compared.

    Returns:
        A dict with the keys `label`, `f_stat` and `p_val`.
    """
    rows = df.loc[df['Label'] == label]
    # One series of probabilities per dataset tag.
    probs = {
        tag: rows.loc[rows['dataset'] == tag, 'label_prob']
        for tag in ('Test', 'DALL-E-2')
    }

    outcome = f_oneway(probs['Test'], probs['DALL-E-2'])

    return dict(label=label, f_stat=outcome[0], p_val=outcome[1])

def run_anova(df_all):
    """Run the per-label one-way ANOVA over every label found in `df_all`.

    Args:
        df_all: Frame with `Label`, `dataset` and `label_prob` columns.

    Returns:
        A DataFrame indexed by `label` (ascending) with the columns `f_stat`
        and `p_val`; it is empty when `df_all` contains no labels.
    """
    # Take the labels from the frame that was passed in rather than from a
    # notebook-level variable, so the result depends only on the argument.
    labels = df_all['Label'].dropna().unique().tolist()
    records = [run_anova_for_label(df_all, label) for label in labels]
    # Explicit columns keep the sort/index steps valid for an empty record list.
    results_df = pd.DataFrame(records, columns=['label', 'f_stat', 'p_val'])
    return results_df.sort_values(by='label').set_index('label')

def run_anova2(df_all):
    """Return the table produced by `run_anova` as a NumPy array."""
    return run_anova(df_all).to_numpy()

In [44]:
# Run the per-label one-way ANOVA and display the resulting table of
# F statistics and p-values, indexed by label.
results_df = run_anova(df_all)
# Optional: uncomment to flag labels whose p-value is below 0.05.
#results_df['significant'] = results_df['p_val'] < 0.05
results_df

Unnamed: 0_level_0,f_stat,p_val
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,35.980582,0.000007
1,1.282398,0.271543
2,4.059687,0.058292
3,0.000383,0.984583
4,26.633249,0.000056
...,...,...
97,13.678314,0.001525
98,32.823478,0.000016
99,3.832493,0.065120
100,22.057126,0.000157


In [49]:
# Re-run the ANOVA and display the table directly (same call as the one that produced `results_df`).
run_anova(df_all)

Unnamed: 0_level_0,f_stat,p_val
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,35.980582,0.000007
1,1.282398,0.271543
2,4.059687,0.058292
3,0.000383,0.984583
4,26.633249,0.000056
...,...,...
97,13.678314,0.001525
98,32.823478,0.000016
99,3.832493,0.065120
100,22.057126,0.000157


### Measurements

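In this example, we wrap the output of `scipy.stats.f_oneway` (a one-way ANOVA) with `ExternalMeasurement` to cope with the output of a third-party library that is not supported by a MLTE builtin. Each per-label result is wrapped in an `Array`, and the collected results are stored in a `MultipleRanksums` evidence object.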

In [60]:
from mlte.evidence.types.array import Array
from mlte.measurement.external_measurement import ExternalMeasurement
from demo.scenarios.evidence.multiple_ranksums import MultipleRanksums


def calculate_multiple_anova(df_all):
    """Run a one-way ANOVA per label through MLTE and collect the results.

    For every distinct value of `df_all.Label`, the `label_prob` values of
    the 'Test' rows and of the 'DALL-E-2' rows are passed to `f_oneway`
    through an `ExternalMeasurement` that produces an `Array`.

    Args:
        df_all: Frame with `Label`, `dataset` and `label_prob` columns.

    Returns:
        A list with one single-entry dict per label, mapping the evidence
        identifier to the value stored in the evidence's `array` attribute.
    """
    # Builtin `list` is used for the annotation; `List` is not imported here.
    evid: list = []

    for lab in df_all.Label.unique():
        subset = df_all[df_all['Label'] == lab]
        test_vals = subset[subset['dataset'] == 'Test']['label_prob']
        dalle_vals = subset[subset['dataset'] == 'DALL-E-2']['label_prob']

        # One measurement per label, named after the label it covers.
        anova_measurement = ExternalMeasurement(
            f"label {lab}",
            Array,
            f_oneway,
        )
        anova: Array = anova_measurement.evaluate(
            test_vals,
            dalle_vals,
        )

        evid.append({anova.identifier: anova.array})
    return evid


    
# Wrap `calculate_multiple_anova` as a single measurement named
# "running in new domain"; its output is stored as a `MultipleRanksums`.
# NOTE(review): the evidence type is named after rank-sums although the values
# computed here are ANOVA results — confirm this container is the intended one.
multiple_anova_meas = ExternalMeasurement(
    "running in new domain",
    MultipleRanksums,
    calculate_multiple_anova,
)
# Evaluate on the combined frame; `df_all` is forwarded to `calculate_multiple_anova`.
multiple_anova: MultipleRanksums = multiple_anova_meas.evaluate(
    df_all
)

# Save the evidence artifact.
# NOTE(review): force=True presumably overwrites a previously saved artifact — confirm.
multiple_anova.save(force=True)


###


#multiple_ranksums_meas = ExternalMeasurement(
#    f"effect of blur across families",
#    MultipleRanksums,
#    calculate_multiple_ranksums,
#)
#multiple_ranksums: MultipleRanksums = multiple_ranksums_meas.evaluate(
#    df_all, pops, blurs
#)
#multiple_ranksums.num_pops = len(pops)
#multiple_ranksums.save(force=True)

ArtifactModel(header=ArtifactHeaderModel(identifier='evidence.running in new domain', type='evidence', timestamp=1759188665, creator=None, level='version'), body=EvidenceModel(artifact_type=<ArtifactType.EVIDENCE: 'evidence'>, metadata=EvidenceMetadata(test_case_id='running in new domain', measurement=MeasurementMetadata(measurement_class='mlte.measurement.external_measurement.ExternalMeasurement', output_class='demo.scenarios.evidence.multiple_ranksums.MultipleRanksums', additional_data={'function': '__main__.calculate_multiple_anova'})), evidence_class='demo.scenarios.evidence.multiple_ranksums.MultipleRanksums', value=OpaqueValueModel(evidence_type=<EvidenceType.OPAQUE: 'opaque'>, data={'array': [{'evidence.label 0': F_onewayResult(statistic=np.float64(35.98058233033483), pvalue=np.float64(7.269476178488252e-06))}, {'evidence.label 1': F_onewayResult(statistic=np.float64(1.2823981355178078), pvalue=np.float64(0.2715430886274718))}, {'evidence.label 2': F_onewayResult(statistic=np.fl