# Evaluate ALM End to End

## 0. Scope
 * Collects Predictions from Observability Classifier
 * Prepares LFB outputs (re-indexes, formats as logits, joins with prior and calibrates)
 * Evaluates Each component.
 
Note that the script assumes that the Observability Classifier outputs cover all admissible samples.
 
### 0.1 Requires

#### Targets
 * `SNIPPET_LIST`: As generated by `Data/Build_Behaviour_Data.ipynb`
 * `GROUNDTRUTHS`: As generated by `Data/Build_Behaviour_Data.ipynb`
 * `BC_FITTING`: (Optional) For visualising distribution of behaviours.
 
#### Classifications
 * `OBSERVE_FEATS`: Raw Observability Features (as per `Observability/Extract_Observability_Data.ipynb`)
 * `BEHAVIOUR_LFB`: LFB Probability Outputs (raw csv from Cluster)
 * `BC_MODELS`: The `Prior.jlib` and `LFBCalibrator.jlib` models for the Behaviour Classifier
 * `AVA_FORMAT`: The LFB AVA data (for joining LFB outputs with) (as per `Data/Extract_AVA_DataFormat.ipynb`)
 

 ***Note***: The script is probably best run in parts, because the Observability Classifications are needed to generate the AVA Data-format for the Behaviour LFB outputs.

In [None]:
from mpctools.extensions import utils, npext, mplext # , skext, pdext, 
from IPython.display import display, HTML
from sklearn.dummy import DummyClassifier
from sklearn import metrics as skmetrics
from matplotlib import pyplot as plt
from collections import defaultdict
import scipy.stats as scstats
import pandas as pd
import numpy as np
import joblib
import sys
import os

# Add the Project Directories to the path
sys.path.append('../../../../')

# Add specific project tools
from Scripts.Constants import Const
from Tools.Parsers import BORISParser

# Finally Display Options
display(HTML("<style>.container { width:95% !important; }</style>"))

In [None]:
# ========= Data Formats ========= #
# Maps the integer mouse identifier found in the AVA files to its letter code.
ID2MSE = dict(enumerate(('R', 'G', 'B')))

# ========= Paths ========= #
# ---- Working Directory ---- #
BASE_PATH = os.path.join(Const['Results.Scratch'], 'End2End')
_DATA_DIR = os.path.join(BASE_PATH, 'Data')
_SUMMARY_DIR = Const['Results.Summary']
_MODELS_DIR = Const['Data.Models']

# ---- Summaries ---- #
SNIPPET_LIST = os.path.join(_SUMMARY_DIR, 'MRC_Q1_Behaviour_Summary.df')
GROUNDTRUTHS = os.path.join(_SUMMARY_DIR, 'MRC_Q1_Behaviour_Annotations.df')

BC_FITTING = os.path.join(_DATA_DIR, 'Statistical', 'GT.Tune.df')

# ---- Data/Models ---- #
# NOTE: the two 'PATH' values are placeholders to be filled in by the user.
OC_MODELS = os.path.join(_MODELS_DIR, 'ALM', 'OC')
OBSERVE_FEATS = 'PATH'

BC_MODELS = os.path.join(_MODELS_DIR, 'ALM', 'BC')
BEHAVIOUR_LFB = os.path.join(_DATA_DIR, 'LFB')
AVA_FORMAT = 'PATH'

# ---- Output ---- #
OC_TEMPORARY = os.path.join(_DATA_DIR, 'OC')
CLASSIFICATIONS = os.path.join(_DATA_DIR, 'ALM')
SCORES = os.path.join(BASE_PATH, 'Scores')
FIGURES = os.path.join(BASE_PATH, 'Figures')
# Make sure every output location exists before anything is written to it.
for _out_dir in (OC_TEMPORARY, CLASSIFICATIONS, SCORES, FIGURES):
    utils.make_dir(_out_dir)

# ========= Pre-Computations ========= #
# Column names of the per-behaviour probabilities, and the matching integer labels (1..7).
BEHAVIOUR_LIST = [f'BC.{name}' for name in BORISParser.BEHAVIOURS(True, True).values()]
BEHAVIOUR_LBLS = np.arange(1, 8)

# ========= Execution ========= #
# Switches gating the two halves of the notebook (Section 1 and Section 2 respectively).
RESOLVE_DATA = False
EVALUATE = True

In [None]:
# ========= Functions ========= #
def evaluate_observability(y_true, y_pred):
    """
    Score binary observability predictions against the ground-truth.

    Labels are compared against the literals 1 (observable) and 0 (not observable).

    :param y_true: Array-like of ground-truth labels.
    :param y_pred: Array-like of predicted labels, aligned with `y_true`.
    :return: Dict with keys:
        * 'Acc.': accuracy,
        * 'F1': macro-averaged F1 (undefined terms count as 0),
        * 'W': number of samples with y_true == 1 but y_pred == 0 ("wasteful"),
        * 'W (/Obs)': 'W' as a fraction of all samples with y_true == 1,
        * 'U': number of samples with y_true == 0 but y_pred == 1 ("unreliable"),
        * 'U (/NObs)': 'U' as a fraction of all samples with y_true == 0.
      The two fractions are NaN when the respective ground-truth class is empty.
    """
    # Element-wise comparisons below require arrays (a plain list would compare as a whole).
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    n_obs = (y_true == 1).sum()
    n_nobs = (y_true == 0).sum()
    wasteful = ((y_true == 1) & (y_pred == 0)).sum()
    unreliable = ((y_true == 0) & (y_pred == 1)).sum()
    return {
        'Acc.': skmetrics.accuracy_score(y_true, y_pred),
        'F1': skmetrics.f1_score(y_true, y_pred, average='macro', zero_division=0),
        'W': wasteful,
        # Guard the empty-class case explicitly: same NaN result, but without the 0/0 warning.
        'W (/Obs)': wasteful / n_obs if n_obs > 0 else np.nan,
        'U': unreliable,
        'U (/NObs)': unreliable / n_nobs if n_nobs > 0 else np.nan,
    }

def evaluate_behaviour(y_true, y_pred, y_prob):
    """
    Score behaviour predictions against the ground-truth.

    :param y_true: Ground-truth behaviour labels.
    :param y_pred: Predicted behaviour labels.
    :param y_prob: Predicted per-behaviour probabilities (one column per entry in BEHAVIOUR_LBLS).
    :return: Dict with accuracy ('Acc.'), macro F1 over BEHAVIOUR_LBLS ('F1') and log-loss ('LL').
    """
    scores = {}
    scores['Acc.'] = skmetrics.accuracy_score(y_true, y_pred)
    # Fix the label set so that behaviours absent from this subset still enter the macro average.
    scores['F1'] = skmetrics.f1_score(
        y_true, y_pred, labels=BEHAVIOUR_LBLS, average='macro', zero_division='warn'
    )
    scores['LL'] = skmetrics.log_loss(y_true, y_prob, labels=BEHAVIOUR_LBLS)
    return scores

def count_run_lengths(grp):
    """
    Build, for each distinct value in `grp`, a histogram of its run-lengths.

    :param grp: Sequence of values, passed as is to `npext.run_lengths`.
    :return: DataFrame with one column per distinct value and one row per observed run-length,
             each cell holding the number of runs of that length (NaN where none occurred).
    """
    # NOTE(review): presumably `npext.run_lengths(..., return_values=True)` yields the run-lengths
    #  and the value of each run, in that order - confirm against mpctools.
    lengths_by_value = defaultdict(list)
    for length, value in zip(*npext.run_lengths(grp, how='A', return_values=True)):
        lengths_by_value[value].append(length)
    # Series.value_counts replaces the deprecated top-level pd.value_counts.
    return pd.DataFrame(
        {value: pd.Series(lengths).value_counts() for value, lengths in lengths_by_value.items()}
    )

## 1. Load and Prepare Data

### 1.1 Load and Split Ground-Truths

In [None]:
if RESOLVE_DATA:
    # --- Ground-truths --- #
    # Move the outermost column level into the index, then keep only the admissible rows
    #  and the columns needed downstream.
    gts = pd.read_pickle(GROUNDTRUTHS, compression='bz2').stack(0)
    _keep_cols = ['GT.Behaviour', 'GT.Observable', 'GT.Source', 'GT.Ann.Behaviour']
    gts = gts.loc[gts['GT.Admissible'], _keep_cols]
    # Halve and truncate so that the labels match the 0/1 convention of the classifier.
    gts['GT.Observable'] = (gts['GT.Observable'] / 2).astype(int)

    # --- Data split --- #
    # Snippets without a split are treated as 'Test'; 'Train' and 'Validate' are merged into 'Tune'.
    ds_info = pd.read_pickle(SNIPPET_LIST, compression='bz2')['DataSet.Fixed']
    ds_info = ds_info.rename('DS.Fixed').fillna('Test')
    _split = ds_info.map({'Train': 'Tune', 'Validate': 'Tune', 'Test': 'Test'})
    # Nest everything under a top-level 'Target' column group.
    gts = pd.concat([gts.join(_split)], axis=1, keys=['Target'])

    # --- One frame per split (the split column itself is no longer needed) --- #
    data = {
        split: frame.drop(columns=('Target', 'DS.Fixed'))
        for split, frame in gts.groupby(by=('Target', 'DS.Fixed'))
    }

### 1.2 Load and Join Observability

In [None]:
if RESOLVE_DATA:
    # Load the Observability Models (feature normaliser and classifier)
    normaliser = joblib.load(os.path.join(OC_MODELS, 'FeatureXtract.jlib'))
    ob_clf = joblib.load(os.path.join(OC_MODELS, 'ObserveClassify.jlib'))
    for ds in ('Tune', 'Test'):
        # Load Features & classify
        raw = pd.read_pickle(os.path.join(OBSERVE_FEATS, f'{ds}.fix.df'), compression='bz2')
        feats = normaliser.transform(raw['Features'])
        obs = pd.DataFrame(ob_clf.predict(feats), index=raw.index, columns=['OC.Observe'])
        # Store temporarily for AVA format generation
        if OC_TEMPORARY is not None:
            obs.to_pickle(os.path.join(OC_TEMPORARY, f'{ds}.df'), compression='bz2')
        # Join with GroundTruths (under a top-level 'ALM' column group)
        data[ds] = data[ds].join(pd.concat([obs], axis=1, keys=['ALM']), how='left')
        # Check that all ok: a left-join leaves NaNs wherever the classifier outputs do not cover
        #  a ground-truth sample. Raise explicitly, since an `assert` is stripped under `python -O`.
        if not data[ds].notna().all().all():
            raise ValueError('Found Missing Data')

### 1.3 Load and Join Behaviour

In [None]:
if RESOLVE_DATA:
    # Load the Behaviour Models
    prior = joblib.load(os.path.join(BC_MODELS, 'Prior.jlib'))
    bc_cal = joblib.load(os.path.join(BC_MODELS, 'LFBCalibrator.jlib'))
    # Iterate over datasets
    for ds in ('Tune', 'Test'):
        # Load the AVA Source for Identity
        #  - The 'Video' name encodes CageID_Segment_Snippet, which is split into integer columns.
        ava = pd.read_csv(os.path.join(AVA_FORMAT, ds, 'AVA.Behaviours.csv'), header=None, names=['Video', 'BTI', 0, 1, 2, 3, 'Dummy', 'Mouse'])
        ava = ava.drop(columns=['Video', 'Dummy']).join(ava['Video'].str.split('_', expand=True).astype(int).rename(columns={0: 'CageID', 1: 'Segment', 2: 'Snippet'}))
        ava = ava.set_index(['CageID', 'Segment', 'Snippet', 'BTI', 0, 1, 2, 3])
        ava['Mouse'] = ava['Mouse'].map(ID2MSE)
        # Load LFB
        #  - Same key columns as the AVA file, with one (Behaviour, Score) row per sample/behaviour.
        lfb = pd.read_csv(os.path.join(BEHAVIOUR_LFB, f'{ds}.csv'), header=None, names=['Video', 'BTI', 0, 1, 2, 3, 'Behaviour', 'Score'])
        lfb = lfb.drop(columns=['Video']).join(lfb['Video'].str.split('_', expand=True).astype(int).rename(columns={0: 'CageID', 1: 'Segment', 2: 'Snippet'}))
        #  - Pivot to one column per behaviour
        lfb = lfb.set_index(['CageID', 'Segment', 'Snippet', 'BTI', 0, 1, 2, 3, 'Behaviour']).unstack(-1).droplevel(0, axis=1)
        #  - Attach the mouse identity, then drop the index levels named 0-3 (they served only as
        #    join keys), leaving (CageID, Segment, Snippet, BTI, Mouse).
        lfb = lfb.join(ava).set_index('Mouse', append=True).reset_index([0, 1, 2, 3], drop=True)
        # Resolve Behaviour
        #  - First for which we have LFB (clipped away from 0 before inverting the softmax)
        logits = pd.DataFrame(npext.invert_softmax(np.clip(lfb.to_numpy(), 1e-7, 1.0)), index=lfb.index, columns=BEHAVIOUR_LIST)
        # NOTE(review): the concat below needs `bc_lfb` to be a DataFrame indexed/labelled like
        #  `logits` - confirm that the calibrator's predict_proba retains the pandas structure.
        bc_lfb = bc_cal.predict_proba(logits)
        #  - Then the prior: only on Observed!
        # NOTE(review): assumes every LFB sample is among those classified as Observable, so that
        #  the count and the index difference below agree - TODO confirm.
        obs = data[ds][data[ds][('ALM', 'OC.Observe')] == 1]
        bc_prior = pd.DataFrame(prior.predict_proba(np.empty(len(obs) - len(lfb))), index=obs.index.difference(lfb.index), columns=BEHAVIOUR_LIST)
        #  - Now Join (the predicted label is the 1-based position of the most probable behaviour)
        bc = pd.concat([bc_lfb, bc_prior])
        bc['BC.Behaviour'] = pd.Series(np.argmax(bc.to_numpy(), axis=1) + 1, index=bc.index)
        # Join with GroundTruths
        data[ds] = data[ds].join(pd.concat([bc], axis=1, keys=['ALM']), how='left')

### 1.4 Store

In [None]:
if RESOLVE_DATA:
    # Persist one compressed frame per data split.
    for ds in data:
        _target = os.path.join(CLASSIFICATIONS, f'{ds}.df')
        data[ds].to_pickle(_target, compression='bz2')

## 2. Evaluate Models

### 2.1 Load Data

In [None]:
if EVALUATE:
    # Read back the stored classifications, one frame per data split.
    data = {}
    for _split in ('Tune', 'Test'):
        _source = os.path.join(CLASSIFICATIONS, f'{_split}.df')
        data[_split] = pd.read_pickle(_source, compression='bz2')

### 2.2 Train and Generate Data for Baselines

In [None]:
if EVALUATE:
    # --- Fit the baselines on the Tuning split --- #
    # Both ignore the features (hence X=None) and only model the label distribution.
    train = data['Tune']['Target']
    _annotated = (train['GT.Observable'] == 1) & (train['GT.Source'] == 'A')
    oc_clf = DummyClassifier(strategy='prior').fit(None, train['GT.Observable'])
    bc_clf = DummyClassifier(strategy='prior').fit(None, train.loc[_annotated, 'GT.Behaviour'])

    # --- Baseline predictions for every split, joined under a top-level 'Prior' group --- #
    for ds in data.keys():
        df = data[ds]['Target']
        # A column-less frame is enough: only the number of rows matters to the predictors.
        obs = pd.Series(oc_clf.predict(df[[]]), index=df.index, name='OC.Observe')
        # Behaviour is predicted only where the baseline says Observable.
        _seen = df.loc[obs == 1, []]
        lbl = pd.DataFrame(bc_clf.predict(_seen), index=_seen.index, columns=['BC.Behaviour'])
        beh = pd.DataFrame(bc_clf.predict_proba(_seen), index=_seen.index, columns=BEHAVIOUR_LIST)
        _baseline = obs.to_frame().join(lbl.join(beh))
        data[ds] = data[ds].join(pd.concat([_baseline], axis=1, keys=['Prior']))

### 2.3 Evaluate Observability

#### 2.3.1 Statistics

In [None]:
if EVALUATE:
    # Per split: absolute counts and rounded percentages of each ground-truth observability label.
    _counts, _percent = {}, {}
    for _name, _frame in data.items():
        _observable = _frame[('Target', 'GT.Observable')]
        _counts[_name] = _observable.value_counts()
        _percent[_name] = (_observable.value_counts(normalize=True) * 100).round(1)
    oc_count = pd.DataFrame(_counts)
    # One row per split, one column per label, plus the total number of samples.
    oc_stats = pd.DataFrame(_percent).rename({0: 'Not Obs.', 1: 'Observable'}).T
    oc_stats['Total'] = oc_count.sum(axis=0)
    display(oc_stats)

#### 2.3.2 Overall Counts

In [None]:
if EVALUATE:
    # Generate: score every model (i.e. every top-level column group other than 'Target') per split
    oc_scores = {}
    for ds, df in data.items():
        y_gt = df[('Target', 'GT.Observable')].to_numpy()
        # Enumerate the column groups directly: DataFrame.groupby(axis=1) is deprecated in pandas.
        _models = [grp for grp in df.columns.get_level_values(0).unique() if grp != 'Target']
        oc_scores[ds] = pd.DataFrame(
            {mdl: evaluate_observability(y_gt, df[(mdl, 'OC.Observe')].to_numpy()) for mdl in _models}
        )
    # Rows: model; Columns: (split, metric)
    oc_scores = pd.concat(oc_scores)[['Prior', 'ALM']].T
    oc_scores.to_pickle(os.path.join(SCORES, 'Scores.OC.df'), compression='bz2')
    
    # Display and Print as Latex
    display(oc_scores)
    latex = oc_scores.stack(0)
    # Merge each count with its percentage into a single 'count (pct)' text column
    latex['U (% NObs)'] = latex['U'].astype(int).astype(str) + ' (' + (latex['U (/NObs)'] * 100).round(1).astype(str) + ')'
    latex['W (% Obs)'] = latex['W'].astype(int).astype(str) + ' (' + (latex['W (/Obs)'] * 100).round(1).astype(str) + ')'
    latex = latex.drop(columns=['U', 'W', 'U (/NObs)', 'W (/Obs)']).unstack(-1).reorder_levels((1, 0), axis=1).loc[['Prior', 'ALM'], ['Tune', 'Test']]
    print(latex.to_latex(float_format='%.2f', multicolumn_format='c'))

#### 2.3.3 Sequences of Mistakes.

I will show this only on the Testing set, but will compare with the true runs of observables.

In [None]:
if EVALUATE:
    # Plots two histograms of run-lengths on the Test split: one for the ground-truth
    #  observability, and one for the outcome (correct/wasteful/unreliable) of the ALM classifier.
    # Prepare
    test = data['Test']
    
    # First Observability
    #   - Compute Observability
    obs = test[('Target', 'GT.Observable')].map({0: 'Not Observable', 1: 'Observable'})
    #     Runs are counted separately within each group of index levels (0, 1, 2, 4), and the
    #     resulting histograms are summed over all groups (rows: category, columns: run-length).
    #     NOTE(review): presumably the omitted level 3 is the time index - confirm.
    obs = obs.groupby(level=(0, 1, 2, 4)).apply(count_run_lengths).unstack(-1).sum().unstack()
    #     Convert to percentages of each category's runs (rows are now run-lengths)
    obs = obs.T * 100 / obs.T.sum()
    #     Run-lengths 11 to 115 are pooled into bins of 5 (labelled '[first:last]'); shorter and
    #     longer ones are kept individually.
    grouped = obs.loc[11:115].groupby((obs.loc[11:115].index - 1) // 5).sum()
    grouped = grouped.rename(lambda ix: f'[{ix*5+1}:{(ix+1)*5}]')
    grouped = pd.concat([obs.loc[:10], grouped, obs.loc[116:]])
    #   - Plot
    fig, ax = plt.subplots(1, 1, figsize=[18, 6], tight_layout=True)
    grouped.plot.bar(ax=ax, width=0.85)
    ax.tick_params(labelsize=19); ax.set_xlabel('Interval Length (s)', fontsize=22); ax.set_ylabel('Fraction (%)', fontsize=22); ax.legend(fontsize=22, loc=9)
    # Saves whichever figure is current, i.e. relies on `fig` above being the latest one created.
    plt.savefig(os.path.join(FIGURES, 'fig_beh_e2e_intervals.jpg'), bbox_inches='tight', dpi=200)
    
    # Now Errors/Classifications
    #   - Compute and Map Classifications
    #     Prediction and ground-truth are packed into one code (10 * predicted + true):
    #     predicted 0 / true 1 is 'Wasteful', predicted 1 / true 0 is 'Unreliable'.
    errors = (test[('ALM', 'OC.Observe')] * 10 + test[('Target', 'GT.Observable')]).map({0: 'Correct', 1: 'Wasteful', 10: 'Unreliable', 11: 'Correct'})
    #     Same run-length histogram, normalisation and binning as for the observability above
    errors = errors.groupby(level=(0, 1, 2, 4)).apply(count_run_lengths).unstack(-1).sum().unstack()
    errors = errors.T * 100 / errors.T.sum()
    grouped = errors.loc[11:115].groupby((errors.loc[11:115].index - 1) // 5).sum()
    grouped = grouped.rename(lambda ix: f'[{ix*5+1}:{(ix+1)*5}]')
    grouped = pd.concat([errors.loc[:10], grouped, errors.loc[116:]])
    #   - Plot
    fig, ax = plt.subplots(1, 1, figsize=[18, 6], tight_layout=True)
    grouped.plot.bar(ax=ax, width=0.85)
    ax.tick_params(labelsize=19); ax.set_xlabel('Interval Length (s)', fontsize=22); ax.set_ylabel('Fraction (%)', fontsize=22); ax.legend(fontsize=22, loc=9)
    plt.savefig(os.path.join(FIGURES, 'fig_beh_e2e_oc_intervals.jpg'), bbox_inches='tight', dpi=200)

#### 2.3.4 Distribution of Behaviours in Wasteful

In [None]:
if EVALUATE:
    # Analyses which ground-truth behaviours fall within the 'wasted' samples of the Test split,
    #  i.e. those truly observable but classified as not observable by the ALM.
    # Prepare
    test = data['Test']; wasted = ((test[('Target', 'GT.Observable')] == 1) & (test[('ALM', 'OC.Observe')] == 0))
    
    # === First the comparison between wasted and all === #
    # ---- Evaluate ---- #
    # Distribution (%) of behaviours among the wasted samples vs among all samples.
    # Label 0 is dropped from the latter (raises KeyError if the split contains no such label).
    # NOTE(review): presumably label 0 marks samples without a visible behaviour - confirm.
    beh_waste = test.loc[wasted, ('Target', 'GT.Behaviour')].value_counts(normalize=True).sort_index().rename('Within Wasted') * 100
    beh_total = test[('Target', 'GT.Behaviour')].value_counts(normalize=True).drop(0).sort_index().rename('Within All Samples') * 100
    # ---- Plot ---- #
    fig, ax = plt.subplots(1, 1, figsize=[11, 6], tight_layout=True)
    beh_waste.to_frame().join(beh_total).plot.bar(ax=ax, width=0.8, fontsize=18)
    ax.legend(fontsize=18); ax.set_xlabel('Behaviour', fontsize=22); ax.set_ylabel('Fraction of Total (%)', fontsize=22); ax.set_xlim([-0.5, 6.5]);
    # Tick labels are assigned by position: assumes one bar group per behaviour, in label order.
    ax.set_xticklabels(BORISParser.BEHAVIOURS(simplified=True, shorthand=True).values(), rotation=0, ha='center')
    # ---- Save Figure ---- #
    plt.savefig(os.path.join(FIGURES, 'fig_beh_e2e_oc_wasted_behs.jpg'), bbox_inches='tight', dpi=300)
    
    # === Now the Fraction of the Behaviour itself === #
    # ---- Evaluate ---- #
    # Percentage of each behaviour's samples that is wasted, with the overall wasted rate
    #  (over samples with a behaviour label above 0) as the nominal reference.
    beh_perc = test.loc[wasted, ('Target', 'GT.Behaviour')].value_counts() * 100 / test[('Target', 'GT.Behaviour')].value_counts().drop(0)
    nominal = wasted[test[('Target', 'GT.Behaviour')] > 0].mean() # Need to consider only non-hidden!
    # ---- Plot ---- #
    fig, ax = plt.subplots(1, 1, figsize=[7, 6], tight_layout=True)
    beh_perc.rename('% of Behaviour').plot.bar(ax=ax, width=0.7, color='green', fontsize=18)
    ax.axhline(nominal*100, ls='--', color='k', label=f'Nominal = {nominal*100:.1f}%')
    ax.legend(fontsize=18)
    # NOTE: the upper y-limit is hard-coded, so larger percentages would be cut off.
    ax.set_xlabel('Behaviour', fontsize=22); ax.set_ylabel('Fraction of Behaviour (%)', fontsize=22); ax.set_ylim(0, 23)
    ax.set_xticklabels(BORISParser.BEHAVIOURS(simplified=True, shorthand=True).values(), rotation=0, ha='center')
    # ---- Save Figure ---- #
    plt.savefig(os.path.join(FIGURES, 'fig_beh_e2e_oc_wasted_behs_frac.jpg'), bbox_inches='tight', dpi=300)

### 2.4 Evaluate Behaviour

#### 2.4.1 Statistics

##### 2.4.1.1 Raw Counts

In [None]:
if EVALUATE:
    # Count the samples of each behaviour (label 0 excluded), per data split and per label source.
    # The grouping key is passed as a single (tuple) column label rather than a one-element list:
    #  with the latter, recent pandas yields 1-tuples as group keys, which never compare equal
    #  to 'A' and hence collapse both sources into one 'BG' column.
    bc_stats = pd.concat({
        nm: pd.DataFrame({
            'Ann' if src == 'A' else 'BG': grp[('Target', 'GT.Behaviour')].value_counts().drop(0)
            for src, grp in ds.groupby(by=('Target', 'GT.Source'))
        })
        for nm, ds in data.items()
    })
    bc_stats['All'] = bc_stats.sum(axis=1)
    # Rows: (split, source); Columns: behaviour label, plus the row total
    bc_stats = bc_stats.unstack().stack(0)
    bc_stats['Total'] = bc_stats.sum(axis=1)
    display(bc_stats)

##### 2.4.1.2 Across Data splits

In [None]:
if EVALUATE:
    # Compares the distribution of behaviours in the data used to fit the Behaviour Classifier
    #  (one bar per value of its 'DataSet.Fixed' column) with that of the End-to-End splits.
    # Compute
    _tune = data['Tune'][('Target', 'GT.Behaviour')]
    _test = data['Test'][('Target', 'GT.Behaviour')]
    _fit = pd.read_pickle(BC_FITTING, compression='bz2')
    # Rows: behaviour label; Columns: fitting splits followed by Tune and Test (values in %).
    # Only labels above 0 are considered for the End-to-End splits.
    _dist = pd.concat([
        _fit.groupby('DataSet.Fixed')['GT.Behaviour'].value_counts(normalize=True).unstack(0).rename_axis(None).rename_axis(None, axis=1).T,
        _tune[_tune > 0].value_counts(normalize=True).to_frame('Tune').T,
        _test[_test > 0].value_counts(normalize=True).to_frame('Test').T
    ]).T * 100
    # Plot
    fig, ax = plt.subplots(1, 1, figsize=(15, 5), tight_layout=True)
    _dist.plot.bar(ax=ax, width=0.85, fontsize=18)
    # Each tick shows the behaviour name and its overall share within the fitting data.
    # Labels are assigned by position: assumes the rows of `_dist` follow the behaviour order.
    ax.set_xticklabels(
        [f'{l}\n({p*100:.1f}%)' for l, p in zip(BORISParser.BEHAVIOURS(simplified=True, shorthand=True).values(), _fit['GT.Behaviour'].value_counts(normalize=True).sort_index())], 
        rotation=0, ha='center')
    plt.xlabel('Behaviour', fontsize=20); plt.ylabel('Fraction (%)', fontsize=20)
    # Sort out Legend
    # Two separate legends: the first two bar series are titled 'Model Fit', the remaining ones
    #  'End to End'. The second `legend` call replaces the first, which is hence re-added.
    lgn = ax.get_legend_handles_labels()
    l1 = ax.legend(lgn[0][:2], lgn[1][:2], title='Model Fit', loc=[0.72, 0.645], fontsize=18, title_fontsize=20); 
    l2 = ax.legend(lgn[0][2:], lgn[1][2:], title='End to End', loc=1, fontsize=18, title_fontsize=20); ax.add_artist(l1)
    plt.savefig(os.path.join(FIGURES, 'fig_beh_behaviours_distribution.png'), bbox_inches='tight', dpi=300)
    
    # Show also Absolute number of Behaviours for Training
    display(_fit.groupby('DataSet.Fixed')['GT.Behaviour'].value_counts(normalize=False))

##### 2.4.1.3 Across Annotator/Best Guess

In [None]:
if EVALUATE:
    # Compares the distribution of behaviours between the label sources, per data split.
    # Requires `bc_stats` as computed by the Raw Counts cell (2.4.1.1).
    # Compute
    # Row-wise percentages (each count divided by its row total), without the 'All' rows
    bc_pct = (bc_stats.drop(columns=['Total']) / bc_stats[['Total']].to_numpy()).drop('All', level=1)*100
    bc_pct = bc_pct.reorder_levels((1, 0)).sort_index()
    # Rows: behaviour name; Columns: (source, split), with Tune before Test within each source
    bc_pct = bc_pct.rename(columns=BORISParser.BEHAVIOURS(True, True)).reindex(['Tune', 'Test'], level=1, axis=0).T
    # Plot
    fig, ax = plt.subplots(1, 1, figsize=(15, 5), tight_layout=True)
    bc_pct.plot.bar(ax=ax, width=0.85, fontsize=18); plt.xticks(rotation=0)
    plt.xlabel('Behaviour', fontsize=20); plt.ylabel('Fraction (%)', fontsize=20)
    # Sort out Legend
    # Legend labels are hard-coded by position: assumes the first two bar series are the 'Ann'
    #  source (Tune, Test) and the last two the 'BG' one, as follows from the sorting above.
    lgn = ax.get_legend_handles_labels()
    l1 = ax.legend(lgn[0][:2], ['Tune', 'Test'], title='Phenotyper', loc=[0.72, 0.67], fontsize=18, title_fontsize=20); 
    l2 = ax.legend(lgn[0][2:], ['Tune', 'Test'], title='Best-Guess', loc=1, fontsize=18, title_fontsize=20); ax.add_artist(l1)
    plt.savefig(os.path.join(FIGURES, 'fig_beh_behaviours_ann_v_guess.png'), bbox_inches='tight', dpi=300)

#### 2.4.2 Classification Scores when True Observables.

Note that, for a fair comparison, the concept of True-Observable depends on the Model itself, since we are evaluating End-to-End! Hence the number of samples over which the statistics are calculated differs in this case.

In [None]:
if EVALUATE:
    # Evaluate: behaviour scores per model and data split, computed only on the samples which are
    #  truly observable AND deemed observable by the model itself. Scores are reported separately
    #  for each label source, as well as for all such samples combined.
    _src_names = {'A': 'Annotator', 'M': 'Best-Guess'}
    bc_scores = defaultdict(dict)
    bc_sizes = defaultdict(dict)
    for ds, df in data.items():
        for mdl in ('Prior', 'ALM'):
            # Select Data (explicit comparison, as done for the observability masks elsewhere)
            _keep = (df[('Target', 'GT.Observable')] == 1) & (df[(mdl, 'OC.Observe')] == 1)
            mdl_df = df[_keep]
            # One subset per source, followed by the combined one
            _subsets = {_src_names[src]: src_df for src, src_df in mdl_df.groupby(by=('Target', 'GT.Source'))}
            _subsets['Combined'] = mdl_df
            # Score each subset in the same manner
            _scores, _sizes = {}, {}
            for _name, _sub in _subsets.items():
                y_true = _sub[('Target', 'GT.Behaviour')].to_numpy()
                y_pred = _sub[(mdl, 'BC.Behaviour')].to_numpy()
                # What remains after dropping the two label columns are the probabilities
                y_prob = _sub[mdl].drop(columns=['OC.Observe', 'BC.Behaviour'])
                _scores[_name] = evaluate_behaviour(y_true, y_pred, y_prob)
                _sizes[_name] = len(y_true)
            # Create as DataFrame(s)
            bc_scores[mdl][ds] = pd.DataFrame(_scores).stack()
            bc_sizes[mdl][ds] = pd.Series(_sizes)
    bc_scores = pd.concat([pd.concat(bcs) for bcs in bc_scores.values()], axis=1, keys=bc_scores.keys()).reorder_levels((0, 2, 1)).sort_index().T
    bc_scores.to_pickle(os.path.join(SCORES, 'Scores.BC.df'), compression='bz2')
    bc_sizes = pd.concat([pd.concat(bcs) for bcs in bc_sizes.values()], axis=1, keys=bc_sizes.keys()).T
    bc_sizes.to_pickle(os.path.join(SCORES, 'Sizes.BC.df'), compression='bz2')
    
    # Display and Latex
    display(bc_scores, bc_sizes)
    print(bc_scores.stack(0).reorder_levels((1, 0)).loc[['Tune', 'Test'],].to_latex(float_format='%.2f', multicolumn_format='c', multirow=True))
    print(bc_sizes.to_latex(float_format='%.0f', multicolumn_format='c'))

#### 2.4.3 Confusion Matrices when True Observables

In this case I will focus on the ALM, but show for A/M subsets on TEST set only.

In [None]:
if EVALUATE:
    # Plots one behaviour confusion matrix per label source, for the ALM on the Test split.
    # Select Data
    # Keeps the samples which are truly observable and classified as such by the ALM.
    # NOTE(review): the second term uses the 'OC.Observe' column directly as a mask, which
    #  presumably holds only 0/1 values - confirm (other cells compare explicitly with 1).
    alm_df = data['Test'][(data['Test'][('Target', 'GT.Observable')] == 1) & (data['Test'][('ALM', 'OC.Observe')])]
    beh_axis = BORISParser.BEHAVIOURS(True, True).values()
    
    # Iterate over Sources
    for src, src_df in alm_df.groupby(by=('Target', 'GT.Source')):
        # Compute
        # Raw counts over the fixed label set, so that every matrix has the same shape
        y_true = src_df[('Target', 'GT.Behaviour')].to_numpy()
        y_pred = src_df[('ALM', 'BC.Behaviour')].to_numpy()
        conf = skmetrics.confusion_matrix(y_true, y_pred, labels=BEHAVIOUR_LBLS, normalize=None)
        # Plot as Hinton
        # NOTE(review): presumably `sum_to_one(..., axis=1)` normalises each ground-truth row
        #  to sum to 1 - confirm against mpctools.
        fig, ax = plt.subplots(1, 1, figsize=[8, 8], tight_layout=True)
        mplext.plot_matrix(npext.sum_to_one(conf, axis=1), mode='hinton', x_labels=beh_axis, y_labels=beh_axis, x_rot=35, ax=ax, fs=22, buffer=0.6)
        ax.set_xlabel('Classified', fontsize=25); ax.set_ylabel('Groundtruth', fontsize=25)
        # One file per source, identified by the raw source code in the file name
        plt.savefig(os.path.join(FIGURES, f'fig_beh_e2e_bc_confusion_src={src}.jpg'), bbox_inches='tight', dpi=300)

#### 2.4.4 Distribution of Predictions under Unreliable

In [None]:
if EVALUATE:
    # --- Select the ALM behaviour predictions for the 'unreliable' Test samples --- #
    # These are the samples which are truly not observable but classified as observable.
    test = data['Test']
    _truly_hidden = test[('Target', 'GT.Observable')] == 0
    _deemed_visible = test[('ALM', 'OC.Observe')] == 1
    unreliable = _truly_hidden & _deemed_visible
    test_unrel = test.loc[unreliable, ('ALM', 'BC.Behaviour')]

    # --- Absolute and relative (%) frequency of each predicted behaviour --- #
    _frequency = test_unrel.value_counts(normalize=False)
    _percentage = test_unrel.value_counts(normalize=True) * 100
    bc_unrel = pd.concat([_frequency, _percentage], axis=1, keys=['Frequency', '(%)'])
    # Order by behaviour label, name the behaviours and lay them out as columns.
    bc_unrel = bc_unrel.sort_index().rename(BORISParser.BEHAVIOURS(simplified=True, shorthand=True)).T
    _target = os.path.join(SCORES, 'Distribution.Unreliables.df')
    bc_unrel.to_pickle(_target, compression='bz2')

    # --- Show in the notebook and as a LaTeX table --- #
    display(bc_unrel)
    print(bc_unrel.to_latex(float_format='%.1f'))