# Evaluate and Optimise Neural Models of Behaviour with Prior

## 0. Scope

This script:
 1. Compares various Activity-Detection models (LFB vs CACNF) on Old Data
 2. Evaluates LFB model on new data
 3. Explores and trains the Prior Probability model.
 
It is not concerned with Fusion-Level (GMD + LFB) analysis.

### 0.1 Requirements
 1. `GROUNDTRUTHS`: Set of Ground-truth labels according to old data schema (for re-generating Comparative Results) and the new one (as per `Extract_Behaviour_Subset.ipynb`)
 2. Old Predictions by LFB Model (`LFB.Old.<Train/Validate>`)
 3. Old Predictions by CACNF Model (`CACNF.Old.<Train/Validate>`)
 4. LFB Predictions on all DataSets (`LFB.Tuning.Fixed.<Train/Validate/Test>`)

In [None]:
# General Libraries
from mpctools.extensions import utils, mplext, npext, skext
from IPython.display import display, HTML
from sklearn.dummy import DummyClassifier
from sklearn import metrics as skmetrics
from collections import defaultdict
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import joblib
import shutil
import sys
import os

# Add the Project Directories to the path
# (appends the directory four levels above the current working directory to the import search path;
#  presumably this is the project root providing `Scripts` and `Tools` - TODO confirm)
sys.path.append('../../../../')

# Add specific project tools
from Scripts.Constants import Const
from Tools.Parsers import BORISParser

# Finally Display Options
# Widen the notebook container to 95% of the available width
display(HTML("<style>.container { width:95% !important; }</style>"))
# Show up to 50 columns when displaying DataFrames
pd.set_option('display.max_columns', 50)

In [None]:
# === Paths === #
# Every directory used by this notebook sits below the scratch 'Behaviour' results folder.
BASE_RESULTS = os.path.join(Const['Results.Scratch'], 'Behaviour')

# ---- Data read by the notebook ---- #
GROUNDTRUTHS = os.path.join(BASE_RESULTS, 'Groundtruths')
# Per-model probability tables (the prior baseline of Section 3.1 is also written here)
MODEL_PRED = os.path.join(BASE_RESULTS, 'Features', 'Formatted')

# ---- Directories created by the notebook ---- #
SCORES = os.path.join(BASE_RESULTS, 'Scores')
OUTPUT = os.path.join(BASE_RESULTS, 'Predictions')
MODELS = os.path.join(BASE_RESULTS, 'Models', 'Pipeline')

# === Execution Control === #
# Each flag switches the matching group of cells on or off.
COMPARE_MODELS = True  # Section 2: old-data comparison (Prior / STLT / LFB)
EVALUATE_LFB = True    # Section 3: LFB evaluation on the new data
EXPLORE_PRIOR = True   # Section 4: prior statistics and prior model

In [None]:
# === Functions === #
def compare_scores(y_true, y_pred, y_prob):
    """
    Score a set of predictions under the old (8-class, labels 1..8) schema.

    :param y_true: Ground-truth class labels.
    :param y_pred: Predicted (hard) class labels.
    :param y_prob: Predicted class probabilities, one column per class
                   (assumed ordered as labels 1..8 - TODO confirm against callers).
    :return: Dict with keys 'Acc.' (accuracy), 'F1' (macro-averaged F1 over labels 1..8, with
             zero_division=1) and 'NLL' (log-loss evaluated with labels 1..8).
    """
    class_ids = np.arange(1, 9)
    accuracy = skmetrics.accuracy_score(y_true, y_pred)
    macro_f1 = skmetrics.f1_score(y_true, y_pred, labels=class_ids, average='macro', zero_division=1)
    neg_log_lik = skmetrics.log_loss(y_true, y_prob, labels=class_ids)
    return {'Acc.': accuracy, 'F1': macro_f1, 'NLL': neg_log_lik}

def evaluate_lfb(y_true, y_pred, y_prob):
    """
    Score a set of predictions under the new (7-class, labels 1..7) schema.

    :param y_true: Ground-truth class labels.
    :param y_pred: Predicted (hard) class labels.
    :param y_prob: Predicted class probabilities, one column per class
                   (assumed ordered as labels 1..7 - TODO confirm against callers).
    :return: Dict with keys 'Acc.' (accuracy), 'F1' (macro-averaged F1 over labels 1..7, with
             zero_division=1) and 'NLL' (log-loss evaluated with labels 1..7).
    """
    class_ids = np.arange(1, 8)
    accuracy = skmetrics.accuracy_score(y_true, y_pred)
    macro_f1 = skmetrics.f1_score(y_true, y_pred, labels=class_ids, average='macro', zero_division=1)
    neg_log_lik = skmetrics.log_loss(y_true, y_prob, labels=class_ids)
    return {'Acc.': accuracy, 'F1': macro_f1, 'NLL': neg_log_lik}

## 1. First Steps

In [None]:
# Load the new-schema ground-truths (Tuning and Test sets) ...
gts = {
    ds: pd.read_pickle(os.path.join(GROUNDTRUTHS, f'GT.{ds}.df'), compression='bz2')
    for ds in ('Tune', 'Test')
}
# ... and expose the Train/Validate subsets of the Tuning set (as per 'DataSet.Fixed') alongside them
gts = {
    **{split: gts['Tune'][gts['Tune']['DataSet.Fixed'] == split] for split in ('Train', 'Validate')},
    'Tune': gts['Tune'],
    'Test': gts['Test'],
}

# Load the old-schema ground-truths: only the behaviour label is kept, per Train/Validate split
old = pd.read_pickle(os.path.join(GROUNDTRUTHS, 'GT.Old.Tune.df'), compression='bz2')
old = {split: old.loc[old['DataSet.Fixed'] == split, 'GT.Behaviour'] for split in ('Train', 'Validate')}

# Integer targets (single-column frames) per split:
#   y_all - every sample | y_lfb - samples where 'TIM.Det' is set | y_nod - samples where it is not
y_all = {
    ds: gt[['GT.Behaviour']].astype(int)
    for ds, gt in gts.items()
}
y_lfb = {
    ds: gt.loc[gt['TIM.Det'], ['GT.Behaviour']].astype(int)
    for ds, gt in gts.items()
}
y_nod = {
    ds: gt.loc[~gt['TIM.Det'], ['GT.Behaviour']].astype(int)
    for ds, gt in gts.items()
}

# Create the output directories
utils.make_dir(SCORES)
utils.make_dir(OUTPUT)
utils.make_dir(MODELS)

## 2. Re-Generate Old Results

This section attempts to regenerate the necessary tables/figures from the old data.

### 2.1 Model Comparison

In [None]:
if COMPARE_MODELS:
    # The Prior baseline is the only model fitted here (on the old Training labels);
    # the other models' probabilities are read from pre-computed files.
    prior_mdl = DummyClassifier(strategy='prior')
    prior_mdl.fit(None, old['Train'].values)

    # (display name, file prefix) per model: a prefix of None marks the Prior baseline
    model_specs = (('Prior', None), ('STLT', 'CACNF'), ('LFB', 'LFB'))

    # Score each model on each split
    comparative_scores = {}
    for ds, gt in old.items():
        split_scores = {}
        for m_name, m_file in model_specs:
            if m_file is None:
                # Same prior probabilities for every sample, one column per class (1..8)
                probs = pd.DataFrame(
                    prior_mdl.predict_proba(np.empty_like(gt)), columns=np.arange(1, 9), index=gt.index
                )
            else:
                probs = pd.read_pickle(os.path.join(MODEL_PRED, f'{m_file}.Old.{ds}.df'), compression='bz2')
            # Align the probabilities with the ground-truth on the index
            mdl_df = gt.to_frame().join(probs)
            y_true = mdl_df['GT.Behaviour'].to_numpy(int)
            y_prob = mdl_df.drop(columns=['GT.Behaviour']).to_numpy()
            y_pred = y_prob.argmax(axis=1) + 1  # column position -> 1-based class label
            split_scores[m_name] = compare_scores(y_true, y_pred, y_prob)
        comparative_scores[ds] = pd.DataFrame(split_scores)
    comparative_scores = pd.concat(comparative_scores).T

    # Store, then print as a LaTeX table
    comparative_scores.to_pickle(os.path.join(SCORES, 'Scores.LFB.Comparative.df'), compression='bz2')
    latex_table = comparative_scores.to_latex(float_format="%.2f", multicolumn_format='c', bold_rows=True)
    print(latex_table)

### 2.2 Calibration Curves

In [None]:
if COMPARE_MODELS:
    # Prepare
    fig, axs = plt.subplots(1, 2, figsize=[17, 7.4], tight_layout=True, sharex=True, sharey=True)
    # Calibration is reported on the Validation split only. The split is named explicitly
    # instead of relying on whatever value `ds` was left with by a loop in an earlier cell.
    cal_ds = 'Validate'
    gt = old[cal_ds]

    # Iterate over Models (LFB on the left axis, STLT on the right)
    for ax, m_name, m_file in ((axs[1], 'STLT', 'CACNF'), (axs[0], 'LFB', 'LFB')):
        # Prepare Data: align the stored probabilities with the ground-truth on the index
        mdl_df = gt.to_frame().join(
            pd.read_pickle(os.path.join(MODEL_PRED, f'{m_file}.Old.{cal_ds}.df'), compression='bz2')
        )
        y_true = mdl_df['GT.Behaviour'].to_numpy(int) - 1  # 1-based labels -> 0-based class indices
        y_prob = mdl_df.drop(columns=['GT.Behaviour']).to_numpy()

        # Plot
        skext.multi_class_calibration(
            y_true, y_prob, n_bins=8,
            names=['Imm', 'Feed', 'Drink', 'S-Grm', 'A-Grm', 'uMove', 'Loco', 'Other'], ax=ax,
        )
        # Per-axis labels are removed: shared figure-level labels are added below
        ax.set_xlabel(None)
        ax.set_ylabel(None)
        ax.tick_params(labelsize=15)
        ax.set_aspect('equal', 'box')
        ax.set_title(f'{m_name}', fontsize=16)

    # Clean Up: keep a single legend, placed to the right of the second axis
    axs[0].get_legend().remove()
    axs[1].legend(loc=2, fontsize=15, bbox_to_anchor=(1.025, 0.5, 1.0, 0.25))
    fig.supylabel('Fraction of Positives', fontsize=15)
    fig.supxlabel('Mean Predicted Probability', fontsize=15)

    # Make sure the output directory exists before saving (it is not created elsewhere in this notebook)
    fig_dir = os.path.join(BASE_RESULTS, 'Figures')
    os.makedirs(fig_dir, exist_ok=True)
    fig.savefig(os.path.join(fig_dir, 'fig_behaviour_comparison_calibration.png'), bbox_inches='tight', dpi=300)

## 3. Evaluate LFB Model

This now uses the new data.

In [None]:
if EVALUATE_LFB:
    # Targets used by the Section 3 cells: the with-detection labels for Train and Validate
    # (presumably `utils.subdict` returns a dict holding only the listed keys - TODO confirm)
    y = utils.subdict(y_lfb, ('Train', 'Validate'))

### 3.1 Overall Comparison with Prior

In [None]:
if EVALUATE_LFB:
    # Fit the prior baseline on the Training targets only
    prior_mdl = DummyClassifier(strategy='prior')
    prior_mdl.fit(None, y['Train'].to_numpy())

    # Store its (constant) class probabilities per split, in the same layout as the other models
    for ds, data in y.items():
        prior_probs = prior_mdl.predict_proba(np.empty_like(data))
        prior_df = pd.DataFrame(prior_probs, columns=np.arange(1, 8), index=data.index)
        prior_df.to_pickle(os.path.join(MODEL_PRED, f'DC.Fixed.{ds}.df'), compression='bz2')

    # Score the baseline ('DC') and LFB from their stored probabilities
    lfb_scores = {}
    for ds, gt in y.items():
        split_scores = {}
        for mdl in ('DC', 'LFB'):
            pred_file = os.path.join(MODEL_PRED, f'{mdl}.Fixed.{ds}.df')
            mdl_df = gt.join(pd.read_pickle(pred_file, compression='bz2'))
            y_true = mdl_df['GT.Behaviour'].to_numpy(int)
            y_prob = mdl_df.drop(columns=['GT.Behaviour']).to_numpy()
            y_pred = y_prob.argmax(axis=1) + 1  # column position -> 1-based class label
            split_scores[mdl] = evaluate_lfb(y_true, y_pred, y_prob)
        lfb_scores[ds] = pd.DataFrame(split_scores)
    lfb_scores = pd.concat(lfb_scores).T[['Train', 'Validate']]

    # Store and Display
    lfb_scores.to_pickle(os.path.join(SCORES, 'Scores.LFB.df'), compression='bz2')
    display(lfb_scores)

### 3.2 Per-Class F1-Scores

Report per-class F1-Score on each dataset

In [None]:
if EVALUATE_LFB:
    # Behaviour names label the per-class scores; classes are scored in label order 1..7
    behaviours = list(BORISParser.BEHAVIOURS(True, True).values())
    class_ids = np.arange(1, 8)

    # One single-row frame of per-class F1 per data split
    per_class = []
    for ds, gt in y.items():
        pred_file = os.path.join(MODEL_PRED, f'LFB.Fixed.{ds}.df')
        lfb_df = gt.join(pd.read_pickle(pred_file, compression='bz2'))
        y_true = lfb_df['GT.Behaviour'].to_numpy(int)
        # The predicted label is the name of the most probable column
        class_probs = lfb_df.drop(columns=['GT.Behaviour'])
        y_pred = class_probs.idxmax(axis=1).to_numpy()
        _score = skmetrics.f1_score(y_true, y_pred, labels=class_ids, average=None, zero_division='warn')
        per_class.append(pd.DataFrame(_score, index=behaviours, columns=(ds,)).T)
    per_class = pd.concat(per_class).loc[['Train', 'Validate']]

    # Store and display
    per_class.to_pickle(os.path.join(SCORES, 'Scores.LFB.pClass.df'), compression='bz2')
    display(per_class)

### 3.3 Confusion Matrix

This is done only on the Validation-Set

In [None]:
if EVALUATE_LFB:
    # ==== First Compute Confusion Matrix on Validation-Set ==== #
    lfb_df = y['Validate'].join(
        pd.read_pickle(os.path.join(MODEL_PRED, 'LFB.Fixed.Validate.df'), compression='bz2')
    )
    y_true = lfb_df['GT.Behaviour'].to_numpy(int)
    # idxmax returns the name of the most probable column, i.e. the predicted class label
    y_pred = lfb_df.drop(columns=['GT.Behaviour']).idxmax(axis=1).to_numpy()
    conf = skmetrics.confusion_matrix(y_true, y_pred, labels=np.arange(1, 8))

    # ==== Now Visualise (as separate plots) ==== #
    behaviours = list(BORISParser.BEHAVIOURS(True, True).values())
    # Make sure the output directory exists before saving (it is not created elsewhere in this notebook)
    fig_dir = os.path.join(BASE_RESULTS, 'Figures')
    os.makedirs(fig_dir, exist_ok=True)

    # First Hinton Plot (each row, i.e. each true behaviour, normalised to sum to one)
    fig, ax = plt.subplots(1, 1, figsize=[9, 8], tight_layout=True)
    mplext.plot_matrix(
        npext.sum_to_one(conf, axis=1), mode='hinton', ax=ax,
        x_labels=behaviours, y_labels=behaviours, fs=18, x_rot=90,
    )
    ax.set_ylabel('True Behaviour', fontsize=20)
    ax.set_xlabel('Predicted Behaviour', fontsize=20)
    fig.savefig(os.path.join(fig_dir, 'fig_behaviour_lfb_validate_confusion_hinton.png'), dpi=200)

    # And as Matrix (raw counts)
    fig, ax = plt.subplots(1, 1, figsize=[9, 8], tight_layout=True)
    mplext.plot_matrix(
        conf, mode='matrix', ax=ax,
        x_labels=behaviours, y_labels=behaviours, fs=18, x_rot=90, fmt='.0f',
    )
    ax.set_aspect('equal')
    ax.set_ylabel('True Behaviour', fontsize=20)
    ax.set_xlabel('Predicted Behaviour', fontsize=20)
    fig.savefig(os.path.join(fig_dir, 'fig_behaviour_lfb_validate_confusion_values.png'), dpi=200)

## 4. Explore Prior Distributions.

I will now explore the prior probabilities. The choice here is whether to use the overall distribution of behaviours, or the distribution observed only when the detection is missing (the latter being preferable if detections are not Missing at Random).

### 4.1 Statistics

Let us look at some statistics

In [None]:
if EXPLORE_PRIOR:
    # Sample Size
    n_det = [len(lbls) for lbls in y_lfb.values()]
    n_nod = [len(lbls) for lbls in y_nod.values()]
    print(f'Lengths: With Detection {n_det} | No Detection {n_nod}')
    pct_missing = (len(y_nod['Tune']) + len(y_nod['Test'])) * 100 / (len(y_all['Tune']) + len(y_all['Test']))
    print(f"Percentage Missing Detections: {pct_missing:.1f}%")

    # Behaviour names for the x-axis, and the matching class codes.
    # NOTE(review): assumes codes run 1..N in the order of the names, as in the per-class F1 cell - confirm.
    beh_names = list(BORISParser.BEHAVIOURS(True, True).values())
    beh_codes = np.arange(1, len(beh_names) + 1)

    # Make sure the output directory exists before saving (it is not created elsewhere in this notebook)
    fig_dir = os.path.join(BASE_RESULTS, 'Figures')
    os.makedirs(fig_dir, exist_ok=True)

    # Show Priors on Training and Validation Sets
    for ds in ('Train', 'Validate'):
        fig, ax = plt.subplots(1, 1, figsize=[9, 5], tight_layout=True)
        priors = pd.concat(
            {
                subset: targets[ds]['GT.Behaviour'].value_counts(normalize=True).sort_index()
                for subset, targets in zip(('All Data', 'No Det'), (y_all, y_nod))
            },
            axis=1,
        )
        # Align to the full set of codes so that a behaviour absent from this split still gets a
        # (zero-height) bar: otherwise the number of bars would not match the fixed tick labels.
        priors = priors.reindex(beh_codes).fillna(0) * 100
        priors.plot.bar(ax=ax, fontsize=22, width=0.8)
        ax.legend(fontsize=22, borderaxespad=0.2, borderpad=0.2)
        ax.set_xticklabels(beh_names, rotation=0, ha='center', fontsize=22)
        ax.set_ylabel('Fraction (%)', fontsize=22)
        ax.set_xlabel('Behaviour', fontsize=22)
        ax.set_ylim([0, 53])
        fig.savefig(os.path.join(fig_dir, f'fig_beh_behaviour_prior_{ds.lower()}.png'), dpi=300, bbox_inches='tight')

##### Comments
 1. At the outset, the number of samples with no detection is quite low (especially for the validation set), which could cause problems when estimating probabilities in such situations.
 2. At the same time, however, the trends in the distribution seem to be very robust. In going from All Data to No Detections:
     * Immobile and Feeding are under-represented
     * Allo-Grooming, Locomotion and Other are over-represented
     * Self-grooming seems stable
     * Drinking is too rare to have any impact.
 3. This suggests there may be scope to use the No-Detection data.
 
### 4.2 Create Model

I will create this as a dummy-classifier, that way I can just generate samples as required.

Note that this will be fit on the No-Detection samples of the entire Tuning Set (rather than just the Training split) for more statistical strength.

In [None]:
if EXPLORE_PRIOR:
    # One pseudo-observation per behaviour code (1..7), so that every class appears in the fit
    pseudo_counts = np.arange(1, 8)
    # NOTE: this rebinds the notebook-level `y` (previously the Train/Validate target dict of Section 3)
    y = np.append(y_nod['Tune'].to_numpy().squeeze(), pseudo_counts)

    # Fit the prior on the No-Detection labels of the Tuning set
    prior_mdl = DummyClassifier(strategy='prior')
    prior_mdl.fit(None, y)

    # Store Model
    utils.make_dir(MODELS)
    joblib.dump(prior_mdl, os.path.join(MODELS, 'Prior.jlib'))