# Optimise Models for Observability Classification

## 0. Scope

### 0.1 Aim
 1. Finalise a Classifier to Predict Observability

### 0.2 Requirements
 1. DataFrame of Reduced Feature-Set (as per `Explore_Features.ipynb`)
 2. List of Snippets (as per `Data/Build_Behaviour_Dataset.ipynb`)
 3. Original Data of Classifications (as per `Data/Build_Behaviour_Dataset.ipynb`) *[For Analysing Mistakes]*

In [None]:
# General Libraries
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict, StratifiedKFold
from mpctools.extensions import utils, mplext, skext, npext
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from IPython.display import display, HTML
from mpctools.parallel import ProgressBar
from sklearn.dummy import DummyClassifier
from sklearn import naive_bayes as sknb
from matplotlib import markers as mrks
from matplotlib import pyplot as plt
import sklearn.metrics as skmetrics
from collections import defaultdict
import seaborn as sns
import pandas as pd
import numpy as np
import joblib
import copy
import sys
import os

# Add the Project Directories to the path
sys.path.append('../../../../')

# Add specific project tools
from Tools.Features import ObservabilityFeatures

# Finally Display Options
display(HTML("<style>.container { width:95% !important; }</style>"))
pd.set_option('display.max_columns', 50)

In [None]:
# ========== Definitions ========== #
# ------ Paths ------ #
# NOTE(review): BASE_RESULTS is not defined in any cell visible here -- confirm it is set before this cell runs.
# Every output directory is created immediately after its path is built; FEATURES is only read from.
FEATURES = os.path.join(BASE_RESULTS, 'Features')
FIGURES = os.path.join(BASE_RESULTS, 'Figures')
utils.make_dir(FIGURES)
MODELS = os.path.join(BASE_RESULTS, 'Models')
utils.make_dir(MODELS)
RESULTS = os.path.join(BASE_RESULTS, 'Scores')
utils.make_dir(RESULTS)
PREDICT = os.path.join(BASE_RESULTS, 'Predictions')
utils.make_dir(PREDICT)

# File-name of the final (chosen) model
FINAL_MDL = 'OClass.jlib'

# ----- Data Definitions ----- #
RANDOM_STATE = 101  # Seed shared by all shuffles and stochastic models
CV_FOLDS = 10       # Number of cross-validation folds during hyper-parameter search

# ----- Model Parameters ----- #
# Logistic Regression: (inverse) regularisation strengths
LOGISTIC_C = [0.0001, 0.001, 0.01, 0.1, 0.5, 1, 5, 10, 100]

# Decision Tree grid
TREE_MAX_DEPTH = [5, 7, 9]
TREE_MIN_SPLIT = [16, 32, 64]
TREE_MIN_LEAF = [0.001, 0.01, 0.05, 0.1]
TREE_ALPHAS = [0.01, 0.1, 0.5, 1.0]

# Random Forest grid
RF_MAX_DEPTH = [5, 7, 9]
RF_MIN_SPLIT = [8, 16, 32]
RF_MIN_LEAF = [0.0001, 0.001, 0.01]
RF_ALPHAS = [0.001, 0.01, 0.1]

# Support Vector Machine grid
SVM_C = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0]
SVM_KERNEL = ['linear', 'rbf', 'sigmoid']

# Multi-Layer Perceptron grid
MLP_HIDDEN = [(25,), (20, 5), (15, 6), (15, 3)]
MLP_FUNC = ['logistic', 'relu']
MLP_ALPHA = [0.0001, 0.001, 0.01]
MLP_LR = [1e-5, 1e-4, 1e-3]

# AdaBoost grid (None selects the library's default weak learner)
ADA_CLASS = [None, LogisticRegression()]
ADA_N_EST = [30, 50, 100]

# Cost ratios (r) explored when choosing the operating point
COSTS = [1, 2, 3, 4, 5, 10]

# ========= Execution Control ========== #
OPTIMISE_MODELS = False
ANALYSE_OPTIMAL = True
TRAIN_OPTIMAL = False      # Also, generates Predictions
EVALUATE_TEST = True

In [None]:
# ========= Functions ========== #
def compare_models(y_true, y_pred):
    """
    Summarise hard predictions with the metrics used to compare architectures.

    :param y_true: Ground-truth labels
    :param y_pred: Predicted labels
    :return: Dictionary with the accuracy ('Acc.'), the balanced accuracy ('Acc (B)') and the
             macro-averaged F1 score ('F1', undefined cases counted as 0).
    """
    accuracy = skmetrics.accuracy_score(y_true, y_pred)
    balanced = skmetrics.balanced_accuracy_score(y_true, y_pred)
    macro_f1 = skmetrics.f1_score(y_true, y_pred, average='macro', zero_division=0)
    return {'Acc.': accuracy, 'Acc (B)': balanced, 'F1': macro_f1}

def score_models(y_true, y_pred):
    """
    Score hard predictions, including the Wasteful/Unreliable error rates.

    :param y_true: Ground-truth labels (1 is the positive class, 0 the negative one)
    :param y_pred: Predicted labels (same encoding)
    :return: Dictionary with:
              * 'Acc.': accuracy
              * 'F1': macro-averaged F1 score (undefined cases counted as 0)
              * 'W (%)': percentage of actual positives which were predicted negative
              * 'U (%)': percentage of actual negatives which were predicted positive
    """
    accuracy = skmetrics.accuracy_score(y_true, y_pred)
    macro_f1 = skmetrics.f1_score(y_true, y_pred, average='macro', zero_division=0)
    # Masks over the ground-truth: re-used for both the numerators and the denominators
    is_pos, is_neg = (y_true == 1), (y_true == 0)
    wasteful = (is_pos & (y_pred == 0)).sum() * 100 / is_pos.sum()
    unreliable = (is_neg & (y_pred == 1)).sum() * 100 / is_neg.sum()
    return {'Acc.': accuracy, 'F1': macro_f1, 'W (%)': wasteful, 'U (%)': unreliable}

def display_roc(X, y, axs, mdls, flip=False, scores=None, title=True):
    """
    Plot the ROC curve of each model, with one axes per dataset.

    :param X: Features per dataset (paired with `y` through `utils.dzip`; presumably dictionaries
              keyed by dataset name -- confirm against `utils.dzip`)
    :param y: Targets per dataset (1 is the positive class)
    :param axs: Iterable of matplotlib axes, consumed in step with the datasets
    :param mdls: Dictionary of name -> fitted model exposing `predict_proba`
    :param flip: If True, swap the axes (True-Rate on the x-axis, False-Rate on the y-axis)
    :param scores: If not None, indexable by model name: the value is appended to the legend as F_1
    :param title: If True, use the dataset name as the axes title
    :return: None (draws on the axes in place)
    """
    false_rate_name, true_rate_name = 'False Observable Rate', 'True Observable Rate'
    for ax, (ds_name, (_X, _y)) in zip(axs, utils.dzip(X, y)):
        # Iterate over Models
        for mdl_name, mdl in mdls.items():
            fpr, tpr, thr = skmetrics.roc_curve(_y, mdl.predict_proba(_X)[:, 1], pos_label=1)
            legend_text = f'{mdl_name}: $A_{{ROC}}$={skmetrics.auc(fpr, tpr):.3f}'
            if scores is not None:
                legend_text += f' F$_1$={scores[mdl_name]:.3f}'
            # Decide once what goes on which axis: everything below then plots (horiz, vert)
            horiz, vert = (tpr, fpr) if flip else (fpr, tpr)
            curve = skmetrics.RocCurveDisplay(fpr=horiz, tpr=vert).plot(lw=3, ax=ax, label=legend_text).line_
            # Mark the point whose threshold is closest to 0.5, in the same colour as the curve
            op_idx = np.abs(thr - 0.5).argmin()
            ax.plot(horiz[op_idx], vert[op_idx], 'X', ms=14, c=curve.get_c())
        # Chance-level diagonal
        ax.plot([0, 1], [0, 1], color="navy", lw=3, linestyle="--", label='Baseline')
        # Legend: add a (grey) proxy entry for the marker just before the last ('Baseline') entry
        handles, labels = ax.get_legend_handles_labels()
        handles.insert(-1, plt.Line2D([0], [0], linestyle='None', marker='X', ms=14, c='grey'))
        labels.insert(-1, 'Operating Threshold')
        ax.legend(handles=handles, labels=labels, loc=4, prop={'family': 'monospace', 'size': 23}, handlelength=1.5, handletextpad=0.5, borderaxespad=0.2)
        # Cosmetics
        if title:
            ax.set_title(ds_name, fontsize=23)
        ax.tick_params(labelsize=23)
        ax.set_aspect('equal')
        x_name, y_name = (true_rate_name, false_rate_name) if flip else (false_rate_name, true_rate_name)
        ax.set_xlabel(x_name, fontsize=23)
        ax.set_ylabel(y_name, fontsize=23)
        
def train_mlp(mlp, X, y, max_epochs=200, scorer=skmetrics.balanced_accuracy_score):
    """
    Train an MLP one epoch at a time, keeping the snapshot which scores best on held-out data.

    :param mlp: Model exposing `partial_fit(X, y, classes)` and `predict(X)`. It is trained in
                place (and hence ends up in the state of the last epoch, not necessarily the best).
    :param X: Pair of feature-sets: X[0] is trained on, X[1] is scored on
    :param y: Pair of binary (0/1) targets, matching `X`
    :param max_epochs: Number of `partial_fit` passes to run
    :param scorer: Callable `scorer(y_true, y_pred)`: higher is better
    :return: Tuple (best score, deep copy of the model at that epoch). If no epoch improves on
             -inf (e.g. `max_epochs` is 0), this is (-inf, None).
    """
    # N.B. `-np.inf` rather than `np.NINF`: the alias was removed in NumPy 2.0
    best_ = (-np.inf, None)
    classes = np.arange(2)  # Binary problem: partial_fit is told the full label-set explicitly
    for _ in range(max_epochs):
        s_ = scorer(y[1], mlp.partial_fit(X[0], y[0], classes).predict(X[1]))
        # Strict inequality: on ties, the earliest epoch is retained
        if s_ > best_[0]:
            best_ = (s_, copy.deepcopy(mlp))  # Snapshot, since further epochs keep mutating mlp
    return best_

def count_run_lengths(grp):
    """
    Histogram of run-lengths, separately for each value appearing in a sequence.

    :param grp: Sequence to analyse, passed as is to `npext.run_lengths`
    :return: DataFrame with one column per value and one row per run-length: each cell is the
             number of runs of that value with that length (NaN where there are none).
    """
    # presumably run_lengths yields (lengths, values) in parallel -- confirm against mpctools
    lengths_by_value = defaultdict(list)
    for length, value in zip(*npext.run_lengths(grp, how='A', return_values=True)):
        lengths_by_value[value].append(length)
    # N.B. `Series.value_counts` replaces the top-level `pd.value_counts`, deprecated in pandas 2.1
    return pd.DataFrame({value: pd.Series(lengths).value_counts() for value, lengths in lengths_by_value.items()})

## 1. First Steps

In [None]:
# Load the Data and Group
data = {ds: pd.read_pickle(os.path.join(FEATURES, f'{ds}.df'), compression='bz2') for ds in ('Tune', 'Test')}
# The Tuning Set is further split (and shuffled) according to its ('Target', 'DataSet') column.
#   N.B. The right-hand side is fully evaluated (on the dictionary as loaded) before `data` is rebound.
data = {
    **{
        split: data['Tune'][data['Tune'][('Target', 'DataSet')] == split].sample(frac=1, random_state=RANDOM_STATE)
        for split in ('Train', 'Validate')
    },
    'Tune': data['Tune'],
    'Test': data['Test'],
}

# Split as X/y:
#   N.B. The positive class is the Observable. This impacts the F_1/Jaccard Scores
X_all = {name: frame['Features'] for name, frame in data.items()}
y_all = {name: frame[('Target', 'Observable')].astype(int) for name, frame in data.items()}

# Prepare Placeholders for Scores: indexed as scores[dataset][model] -> metrics
scores = defaultdict(dict)

## 2. Optimise Classifiers

This is the search over Architectures and optimisation of Hyper-Parameters.

***N.B.***:
 * Optimisation is always on the Training Set, with model comparison on the Validation Set.
 * If hyper-parameter tuning is required, Training Set is split using 10-Fold CV.

In [None]:
if OPTIMISE_MODELS:
    # Extract Subset of Data: only the Training/Validation splits take part in model selection
    X = utils.subdict(X_all, ('Train', 'Validate'))
    y = utils.subdict(y_all, ('Train', 'Validate'))
    # Create the output folders for the fitted models and their chosen hyper-parameters
    utils.make_dir(os.path.join(MODELS, 'Predictors'))
    utils.make_dir(os.path.join(MODELS, 'Parameters'))

### 2.1 Baseline Model

This is just the prior distribution with majority-class prediction.

In [None]:
# Train the Baseline (prior-only) Classifier
if OPTIMISE_MODELS:
    clf = DummyClassifier(strategy='prior')
    clf.fit(X['Train'], y['Train'])
    joblib.dump(clf, os.path.join(MODELS, 'Predictors', 'Baseline.jlib'))

    # Score on each of the Training/Validation Sets
    for ds_name, (_X, _y) in utils.dzip(X, y):
        preds = clf.predict(_X)
        scores[ds_name]['Prior'] = compare_models(_y, preds)

    # Display (and store): rows are models, columns are (dataset, metric)
    scores_df = pd.concat(
        [pd.DataFrame(a) for a in scores.values()],
        axis=0,
        keys=scores.keys(),
    ).T
    scores_df.to_pickle(os.path.join(RESULTS, 'Scores.Compare.df'), compression='bz2')
    display(scores_df)

##### Comments
 1. From the point of view of raw accuracy, this will be hard to beat (due to the imbalanced nature of our problem).
 2. This discrepancy is however captured through the (macro-averaged) $F_1$ score.

### 2.2 Logistic Regression

Standard Classifier. I will optimise the regularisation parameter with and without balanced weighting.

#### 2.2.1 Imbalanced Classes

In [None]:
if OPTIMISE_MODELS:
    # Prepare for Search: this will hold the best score, as well as the associated parameters
    #   N.B. `-np.inf` rather than `np.NINF`: the alias was removed in NumPy 2.0
    best_mdl = (-np.inf, None)

    # Optimise Model: each C is judged by its *worst* fold (minimum balanced accuracy across CV folds)
    progress = ProgressBar(len(LOGISTIC_C)).reset('Optimising LR:')
    for c in LOGISTIC_C:
        cv_score = np.min(cross_val_score(LogisticRegression(C=c, max_iter=500), X['Train'], y['Train'], cv=CV_FOLDS, scoring='balanced_accuracy', n_jobs=-1))
        if cv_score > best_mdl[0]:
            best_mdl = (cv_score, c)
        progress.update()

    # Keep Best Model: re-fit on the entire Training Set (with a larger iteration budget than in the search)
    print(f'Re-Training LR Model with C={best_mdl[1]} on all Training Data ... ... ... ', end='')
    clf = LogisticRegression(C=best_mdl[1], max_iter=5000).fit(X['Train'], y['Train'])
    joblib.dump(clf, os.path.join(MODELS, 'Predictors', 'LogReg.Imb.jlib'))
    # NOTE(review): `Utilities` is not imported in the cells visible here -- confirm it is in scope.
    Utilities.one_shot_write(os.path.join(MODELS, 'Parameters', 'LogReg.Imb.best'), f'C={best_mdl[1]}')
    print('Done!')

    # Score Best Model
    print('Scoring Model on Training/Validation Set')
    for ds_name, (_X, _y) in utils.dzip(X, y):
        preds = clf.predict(_X)
        scores[ds_name]['LR (Imb)'] = compare_models(_y, preds)

    # Display (and store): rows are models, columns are (dataset, metric)
    scores_df = pd.concat([pd.DataFrame(a) for a in scores.values()], axis=0, keys=scores.keys()).T
    scores_df.to_pickle(os.path.join(RESULTS, 'Scores.Compare.df'), compression='bz2')
    display(scores_df)

##### Comments
 1. The good news is that this improves on the Baseline classifier:
     * It is a strict improvement on all fronts on the Training Set.
     * The performance is comparable (between models) on the validation set for Acc and improved for $F_1$/Balanced Accuracy
 2. Note that since this will be a hard gating mechanism for the behaviour, we do not care much about LL.

#### 2.2.2 Balanced Classes

In [None]:
if OPTIMISE_MODELS:
    # Prepare for Search: this will hold the best score, as well as the associated parameters
    #   N.B. `-np.inf` rather than `np.NINF`: the alias was removed in NumPy 2.0
    best_mdl = (-np.inf, None)

    # Optimise Model: each C is judged by its *worst* fold (minimum balanced accuracy across CV folds)
    progress = ProgressBar(len(LOGISTIC_C)).reset('Optimising LR:')
    for c in LOGISTIC_C:
        cv_score = np.min(cross_val_score(LogisticRegression(C=c, max_iter=500, class_weight='balanced'), X['Train'], y['Train'], cv=CV_FOLDS, scoring='balanced_accuracy', n_jobs=-1))
        if cv_score > best_mdl[0]:
            best_mdl = (cv_score, c)
        progress.update()

    # Keep Best Model: re-fit on the entire Training Set (with a larger iteration budget than in the search)
    print(f'Re-Training LR Model with C={best_mdl[1]} on all Training Data ... ... ... ', end='')
    clf = LogisticRegression(C=best_mdl[1], max_iter=5000, class_weight='balanced').fit(X['Train'], y['Train'])
    joblib.dump(clf, os.path.join(MODELS, 'Predictors', 'LogReg.Bal.jlib'))
    # NOTE(review): `Utilities` is not imported in the cells visible here -- confirm it is in scope.
    Utilities.one_shot_write(os.path.join(MODELS, 'Parameters', 'LogReg.Bal.best'), f'C={best_mdl[1]}')
    print('Done!')

    # Score Best Model
    print('Scoring Model on Training/Validation Set')
    for ds_name, (_X, _y) in utils.dzip(X, y):
        preds = clf.predict(_X)
        scores[ds_name]['LR (Bal)'] = compare_models(_y, preds)

    # Display (and store): rows are models, columns are (dataset, metric)
    scores_df = pd.concat([pd.DataFrame(a) for a in scores.values()], axis=0, keys=scores.keys()).T
    scores_df.to_pickle(os.path.join(RESULTS, 'Scores.Compare.df'), compression='bz2')
    display(scores_df)

##### Comments
 1. Performance is worse on Accuracy but better on Balanced Accuracy/F1
 2. This is probably solely down to them operating at different fronts.

#### 2.2.3 ROC Curves

Note:
 1. I plot these for the Training/Validation Set Independently
 2. I still use the point of view of the Observable as Positive Class.

In [None]:
if OPTIMISE_MODELS:
    # One panel per dataset (Training/Validation)
    fig, axs = plt.subplots(1, 2, figsize=[16, 8], tight_layout=True)
    # Load both flavours of the Logistic Regression from disk
    mdls = {
        'LR (Imb)': joblib.load(os.path.join(MODELS, 'Predictors', 'LogReg.Imb.jlib')),
        'LR (Bal)': joblib.load(os.path.join(MODELS, 'Predictors', 'LogReg.Bal.jlib')),
    }

    # Plot the curves
    display_roc(X, y, axs, mdls)

    # Save
    plt.savefig(os.path.join(FIGURES, 'fig_roc_logreg.jpg'), bbox_inches='tight', dpi=300)

##### Comments
 1. Performance is comparable on the Training Set, with the curves following each other.
 2. On the Validation-Set, performance is slightly shifted:
     * In particular, including a weighting on the training set seems to have an effect... note how the shape differs.
 3. Given that the region I will probably operate in is the top-right corner, I would prefer the Balanced Classifier going forwards.
     
#### 2.2.4 Feature Importance

Out of curiosity:

In [None]:
if OPTIMISE_MODELS:
    fig, ax = plt.subplots(1, 1, figsize=[20, 5])
    clf = joblib.load(os.path.join(MODELS, 'Predictors', 'LogReg.Bal.jlib'))
    # Magnitude of each coefficient, raw and scaled by the peak-to-peak range of its feature
    coefs = np.abs(clf.coef_.squeeze())
    coefs_scaled = coefs * np.ptp(X['Train'], axis=0)
    # Only the features whose scaled coefficient exceeds 2 get a tick-label
    feat_gt2 = np.where(coefs_scaled > 2)[0]
    # Short names: first and last '.'-separated components of the column name
    cnames = np.asarray([f'{parts[0]}.{parts[-1]}' for parts in X['Train'].columns.str.split('.')])
    # Plot (the scaled bars are drawn over the raw ones)
    ax.bar(np.arange(len(coefs)), coefs, label='Raw Coeffs')
    ax.bar(np.arange(len(coefs_scaled)), coefs_scaled, label='Scaled Coeffs')
    ax.set_xticks(feat_gt2)
    ax.set_xticklabels(cnames[feat_gt2], rotation=45, ha='right')
    ax.tick_params(axis='both', which='major', labelsize=15)
    ax.legend(fontsize=15)
    # Save
    plt.savefig(os.path.join(FIGURES, 'fig_features_logreg.jpg'), bbox_inches='tight', dpi=300)

##### Comments
 1. It seems the most important features are the visual features after all!
     * Note that I am 'scaling' the coefficients by the peak-to-peak (PTP) range of the feature in that dimension (orange bars), and hence can judge their relative importance better.
 2. There might be scope for some cross-products of features: for e.g. between TIM and RFID (to capture if there is synergy between detection and bbox) and between TIM and LFB.
     * However, I can't really go into depth at this stage.

### 2.3 Random Forests

The hope is that this is a non-linear classifier: note that I have skipped decision trees altogether, as they are sub-optimal in this case. Also, DTs provide more jagged thresholds.

#### 2.3.1 Train Model

Again, I will only focus on Balanced Class-weights.

In [None]:
if OPTIMISE_MODELS:
    # Prepare for Search: this will hold the best score, as well as the associated parameters
    #   N.B. `-np.inf` rather than `np.NINF`: the alias was removed in NumPy 2.0
    best_mdl = (-np.inf, None)

    # Optimise Model: exhaustive grid, each point judged by its *worst* fold (minimum balanced accuracy)
    progress = ProgressBar(len(RF_MAX_DEPTH) * len(RF_MIN_SPLIT) * len(RF_MIN_LEAF) * len(RF_ALPHAS)).reset('Optimising RF:')
    for d in RF_MAX_DEPTH:
        for s in RF_MIN_SPLIT:
            for l in RF_MIN_LEAF:
                for a in RF_ALPHAS:
                    mdl = RandomForestClassifier(max_depth=d, min_samples_split=s, min_samples_leaf=l, ccp_alpha=a, class_weight='balanced', random_state=RANDOM_STATE)
                    cv_score = np.min(cross_val_score(mdl, X['Train'], y['Train'], cv=CV_FOLDS, scoring='balanced_accuracy', n_jobs=-1))
                    if cv_score > best_mdl[0]:
                        best_mdl = (cv_score, (d, s, l, a))
                    progress.update()

    # Keep Best Model: re-fit on the entire Training Set with the winning configuration
    best_d, best_s, best_l, best_a = best_mdl[1]
    print(f'Re-Training Model with D={best_d}, S={best_s}, L={best_l}, A={best_a} on all Training Data ... ... ... ', end='')
    clf = RandomForestClassifier(max_depth=best_d, min_samples_split=best_s, min_samples_leaf=best_l, ccp_alpha=best_a, class_weight='balanced', random_state=RANDOM_STATE)
    clf.fit(X['Train'], y['Train'])
    joblib.dump(clf, os.path.join(MODELS, 'Predictors', 'RForest.Bal.jlib'))
    # NOTE(review): `Utilities` is not imported in the cells visible here -- confirm it is in scope.
    Utilities.one_shot_write(os.path.join(MODELS, 'Parameters', 'RForest.Bal.best'), f'D={best_d}, S={best_s}, L={best_l}, A={best_a}')
    print('Done!')

    # Score Best Model
    print('Scoring Model on Training/Validation Set')
    for ds_name, (_X, _y) in utils.dzip(X, y):
        preds = clf.predict(_X)
        scores[ds_name]['RF (Bal)'] = compare_models(_y, preds)

    # Display (and store): rows are models, columns are (dataset, metric)
    scores_df = pd.concat([pd.DataFrame(a) for a in scores.values()], axis=0, keys=scores.keys()).T
    scores_df.to_pickle(os.path.join(RESULTS, 'Scores.Compare.df'), compression='bz2')
    display(scores_df)

##### Comments
 1. This seems to be some of the best performance so far in terms of balancing predicting Observability/Not Observability.
 2. It also does pretty well on the Validation Set.

#### 2.3.2 ROC Curves

In [None]:
if OPTIMISE_MODELS:
    # One panel per dataset (Training/Validation)
    fig, axs = plt.subplots(1, 2, figsize=[16, 8], tight_layout=True)
    # Load the (balanced) Logistic Regression as reference, alongside the Random Forest
    mdls = {
        label: joblib.load(os.path.join(MODELS, 'Predictors', fname))
        for label, fname in (('LR', 'LogReg.Bal.jlib'), ('RF', 'RForest.Bal.jlib'))
    }

    # Plot the curves
    display_roc(X, y, axs, mdls)

    # Save
    plt.savefig(os.path.join(FIGURES, 'fig_roc_rf.jpg'), bbox_inches='tight', dpi=300)

##### Comments
 1. Performance is wonderful on the Training Set (AUC=0.96)
 2. Unfortunately, this is not really the case when looking at the validation set ROC --- performance is sub-par compared to LR at almost all thresholds.
 3. Might be a case of overfitting: it might be that there is not much more signal that can be extracted!

### 2.4 Naive Bayes

#### 2.4.1 Gaussian NB

This is one of the classic algorithms. Note that at least two assumptions in the model are wrong:
 1. Conditional Independence: especially for the One-Hot encoded RFID.Pos (might be that the PCA components are uncorrelated).
 2. Feature-Vectors are most probably not gaussian distributed (especially given the 0-1 bounding).
 
In this case, there is no need to optimise hyper-parameters.

In [None]:
if OPTIMISE_MODELS:
    # Fit the Model: no hyper-parameters are searched for, so a single fit on the Training Set suffices
    print('Training Gaussian NB model on all Training Data ... ... ... ', end='')
    clf = sknb.GaussianNB()
    clf.fit(X['Train'], y['Train'])
    joblib.dump(clf, os.path.join(MODELS, 'Predictors', 'NB.Gaus.jlib'))
    print('Done!')

    # Score the Model
    print('Scoring Model on Training/Validation Set')
    for ds_name, (_X, _y) in utils.dzip(X, y):
        preds = clf.predict(_X)
        scores[ds_name]['NB (Gaus)'] = compare_models(_y, preds)

    # Display (and store): rows are models, columns are (dataset, metric)
    scores_df = pd.concat(
        [pd.DataFrame(a) for a in scores.values()],
        axis=0,
        keys=scores.keys(),
    ).T
    scores_df.to_pickle(os.path.join(RESULTS, 'Scores.Compare.df'), compression='bz2')
    display(scores_df)

##### Comments
 1. Seems pretty Horrible.

#### 2.4.2 Multinomial NB

This is another classic algorithm. Again, there are two wrong assumptions:
 1. Conditional Independence (as before)
 2. Binary/Count Data: I actually have continuous real-valued data.
 
However, it might work better than the Gaussian NB.

In [None]:
if OPTIMISE_MODELS:
    # Fit the Model: no hyper-parameters are searched for, so a single fit on the Training Set suffices
    print('Training Multinomial NB model on all Training Data ... ... ... ', end='')
    clf = sknb.MultinomialNB()
    clf.fit(X['Train'], y['Train'])
    joblib.dump(clf, os.path.join(MODELS, 'Predictors', 'NB.Mult.jlib'))
    print('Done!')

    # Score the Model
    print('Scoring Model on Training/Validation Set')
    for ds_name, (_X, _y) in utils.dzip(X, y):
        preds = clf.predict(_X)
        scores[ds_name]['NB (Mult)'] = compare_models(_y, preds)

    # Display (and store): rows are models, columns are (dataset, metric)
    scores_df = pd.concat(
        [pd.DataFrame(a) for a in scores.values()],
        axis=0,
        keys=scores.keys(),
    ).T
    scores_df.to_pickle(os.path.join(RESULTS, 'Scores.Compare.df'), compression='bz2')
    display(scores_df)

##### Comments
 1. This basically reverts to almost the dummy classifier.

#### 2.4.3 ROC Curves

In [None]:
if OPTIMISE_MODELS:
    # One panel per dataset (Training/Validation)
    fig, axs = plt.subplots(1, 2, figsize=[16, 8], tight_layout=True)
    # Load the (balanced) Logistic Regression as reference, alongside both NB variants
    #   N.B. The reference label is padded with spaces (the legend is typeset in monospace)
    mdls = {
        label: joblib.load(os.path.join(MODELS, 'Predictors', fname))
        for label, fname in (
            ('LR     ', 'LogReg.Bal.jlib'),
            ('NB (Gaus)', 'NB.Gaus.jlib'),
            ('NB (Mult)', 'NB.Mult.jlib'),
        )
    }

    # Plot the curves
    display_roc(X, y, axs, mdls)

    # Save
    plt.savefig(os.path.join(FIGURES, 'fig_roc_nb.jpg'), bbox_inches='tight', dpi=300)

##### Comments
 1. The Gaussian is perhaps the better performer of the NB families...
 2. ... although at the region we are interested in, the Multinomial is probably better.
 3. A better algorithm to fit would be to have a different feature-probability for each dimension, but not sure if worth the effort.

### 2.5 Support Vector Machine

My motivation here is mostly to have access to kernel functions of the input (and hence some non-linearities).

#### 2.5.1 Train

***N.B.***
 1. For ROC Curve, will use the decision function as a pseudo-probability (standardised to lie in the range 0-1)

In [None]:
if OPTIMISE_MODELS:
    # Prepare for Search: this will hold the best score, as well as the associated parameters
    #   N.B. `-np.inf` rather than `np.NINF`: the alias was removed in NumPy 2.0
    best_mdl = (-np.inf, None)

    # Optimise Model: each (C, kernel) is judged by its *worst* fold (minimum balanced accuracy).
    #   During the search, probabilities are switched off and iterations are capped, to keep it fast.
    progress = ProgressBar(len(SVM_C) * len(SVM_KERNEL)).reset('Optimising SVC:')
    for c in SVM_C:
        for k in SVM_KERNEL:
            mdl = skext.SVCProb(C=c, kernel=k, probability=False, cache_size=1000, max_iter=1000, class_weight='balanced', random_state=RANDOM_STATE)
            acc = np.min(cross_val_score(mdl, X['Train'], y['Train'], cv=CV_FOLDS, scoring='balanced_accuracy', n_jobs=-1))
            if acc > best_mdl[0]:
                best_mdl = (acc, (c, k))
            progress.update()

    # Keep Best Model: re-fit on the entire Training Set, without the iteration cap.
    #   N.B. The class-weighting must match the one used in the search: the hyper-parameters were
    #        selected for the balanced model, and are not meaningful for an unweighted one.
    best_c, best_k = best_mdl[1]
    print(f'Re-Training Model with C={best_c}, K={best_k} on all Training Data ... ... ... ', end='')
    clf = skext.SVCProb(C=best_c, kernel=best_k, probability=True, cache_size=2000, class_weight='balanced', random_state=RANDOM_STATE).fit(X['Train'], y['Train'])
    joblib.dump(clf, os.path.join(MODELS, 'Predictors', 'SVM.jlib'))
    # NOTE(review): `Utilities` is not imported in the cells visible here -- confirm it is in scope.
    Utilities.one_shot_write(os.path.join(MODELS, 'Parameters', 'SVM.best'), f'C={best_c}, K={best_k}')
    print('Done!')

    # Score Best Model
    print('Scoring Model on Training/Validation Set')
    for ds_name, (_X, _y) in utils.dzip(X, y):
        preds = clf.predict(_X)
        scores[ds_name]['SVM'] = compare_models(_y, preds)

    # Display (and store): rows are models, columns are (dataset, metric)
    scores_df = pd.concat([pd.DataFrame(a) for a in scores.values()], axis=0, keys=scores.keys()).T
    scores_df.to_pickle(os.path.join(RESULTS, 'Scores.Compare.df'), compression='bz2')
    display(scores_df)

##### Comments
 1. This seems to again never be predicting the negative class.
 
#### 2.5.2 ROC Curve

In [None]:
if OPTIMISE_MODELS:
    # One panel per dataset (Training/Validation)
    fig, axs = plt.subplots(1, 2, figsize=[16, 8], tight_layout=True)
    # Load the (balanced) Logistic Regression as reference, alongside the SVM
    mdls = {
        label: joblib.load(os.path.join(MODELS, 'Predictors', fname))
        for label, fname in (('LR ', 'LogReg.Bal.jlib'), ('SVM', 'SVM.jlib'))
    }

    # Plot the curves
    display_roc(X, y, axs, mdls)

    # Save
    plt.savefig(os.path.join(FIGURES, 'fig_roc_svm.jpg'), bbox_inches='tight', dpi=300)

##### Comments
 1. It is comparable on the Training Set, and arguably worse on the Validation-Set: specifically, it is always worse than the LR.

### 2.6 Neural Network

Just for the sake of completeness, I will use a single or double hidden-layer NN, keeping in mind the need for a reduced number of parameters. I will use the SKLearn MLP for this.

#### 2.6.1 Train

Optimisation of Parameters is done using the early-stopping in-built into the SKLearn framework: however, when running the final model, I do this based on the validation set and taking the best possible point.

In [None]:
if OPTIMISE_MODELS:
    # Prepare for Search: this will hold the best score, as well as the associated parameters
    #   N.B. `-np.inf` rather than `np.NINF`: the alias was removed in NumPy 2.0
    best_mdl = (-np.inf, None)

    # Optimise Model: exhaustive grid, each point judged by its *worst* fold (minimum balanced accuracy).
    #   Within the search, the stopping point is left to the model's own early-stopping.
    progress = ProgressBar(len(MLP_HIDDEN) * len(MLP_ALPHA) * len(MLP_FUNC) * len(MLP_LR), prec=2).reset('Optimising MLP:')
    for h in MLP_HIDDEN:
        for a in MLP_ALPHA:
            for f in MLP_FUNC:
                for l in MLP_LR:
                    mdl = MLPClassifier(h, f, alpha=a, batch_size=64, learning_rate_init=l, random_state=RANDOM_STATE, early_stopping=True)
                    acc = np.min(cross_val_score(mdl, X['Train'].to_numpy(), y['Train'], cv=CV_FOLDS, scoring='balanced_accuracy', n_jobs=-1))
                    if acc > best_mdl[0]:
                        best_mdl = (acc, (h, f, a, l))
                    progress.update()

    # Keep Best Model: re-train epoch by epoch, retaining the epoch which scores best on the Validation Set
    #   (train_mlp returns (score, model): only the model is kept)
    best_h, best_f, best_a, best_l = best_mdl[1]
    print(f'Re-Training Model with H={best_h}, F={best_f}, A={best_a}, L={best_l} on all Training Data ... ... ... ', end='')
    clf = MLPClassifier(best_h, best_f, alpha=best_a, batch_size=64, learning_rate_init=best_l, random_state=RANDOM_STATE, early_stopping=False)
    clf = train_mlp(clf, (X['Train'], X['Validate']), (y['Train'], y['Validate']))[1]
    joblib.dump(clf, os.path.join(MODELS, 'Predictors', 'MLP.jlib'))
    # NOTE(review): `Utilities` is not imported in the cells visible here -- confirm it is in scope.
    Utilities.one_shot_write(os.path.join(MODELS, 'Parameters', 'MLP.best'), f'H={best_h}, F={best_f}, A={best_a}, L={best_l}')
    print('Done!')

    # Score Best Model
    print('Scoring Model on Training/Validation Set')
    for ds_name, (_X, _y) in utils.dzip(X, y):
        preds = clf.predict(_X)
        scores[ds_name]['MLP'] = compare_models(_y, preds)

    # Display (and store): rows are models, columns are (dataset, metric)
    scores_df = pd.concat([pd.DataFrame(a) for a in scores.values()], axis=0, keys=scores.keys()).T
    scores_df.to_pickle(os.path.join(RESULTS, 'Scores.Compare.df'), compression='bz2')
    display(scores_df)

#### 2.6.2 ROC Curves

In [None]:
if OPTIMISE_MODELS:
    # One panel per dataset (Training/Validation)
    fig, axs = plt.subplots(1, 2, figsize=[16, 8], tight_layout=True)
    # Load the (balanced) Logistic Regression as reference, alongside the MLP
    mdls = {
        label: joblib.load(os.path.join(MODELS, 'Predictors', fname))
        for label, fname in (('LR ', 'LogReg.Bal.jlib'), ('MLP', 'MLP.jlib'))
    }

    # Plot the curves
    display_roc(X, y, axs, mdls)

    # Save
    plt.savefig(os.path.join(FIGURES, 'fig_roc_mlp.jpg'), bbox_inches='tight', dpi=300)

##### Comments
 1. The MLP is marginally better on the Training Set
 2. On the validation set, it follows it quite closely, but is strictly worse throughout.

### 2.7 Boosted Classifiers

In [None]:
if OPTIMISE_MODELS:
    # Prepare for Search: this will hold the best score, as well as the associated parameters
    #   N.B. `-np.inf` rather than `np.NINF`: the alias was removed in NumPy 2.0
    best_mdl = (-np.inf, None)

    # Optimise Model: each (weak learner, size) is judged by its *worst* fold (minimum balanced accuracy)
    # NOTE(review): `base_estimator` was renamed to `estimator` in scikit-learn 1.2 (and later removed)
    #               -- update the keyword if the environment uses a newer release.
    progress = ProgressBar(len(ADA_CLASS) * len(ADA_N_EST), prec=2).reset('Optimising ADA')
    for c in ADA_CLASS:
        for n in ADA_N_EST:
            mdl = AdaBoostClassifier(base_estimator=c, n_estimators=n, random_state=RANDOM_STATE)
            acc = np.min(cross_val_score(mdl, X['Train'].to_numpy(), y['Train'], cv=CV_FOLDS, scoring='balanced_accuracy', n_jobs=-1))
            if acc > best_mdl[0]:
                best_mdl = (acc, (c, n))
            progress.update()

    # Keep Best Model: re-fit on the entire Training Set with the winning configuration
    best_c, best_n = best_mdl[1]
    print(f'Re-Training Model with C={best_c}, N={best_n} on all Training Data ... ... ... ', end='')
    clf = AdaBoostClassifier(base_estimator=best_c, n_estimators=best_n, random_state=RANDOM_STATE).fit(X['Train'], y['Train'])
    joblib.dump(clf, os.path.join(MODELS, 'Predictors', 'ADA.jlib'))
    # NOTE(review): `Utilities` is not imported in the cells visible here -- confirm it is in scope.
    Utilities.one_shot_write(os.path.join(MODELS, 'Parameters', 'ADA.best'), f'C={best_c} N={best_n}')
    print('Done!')

    # Score Best Model
    print('Scoring Model on Training/Validation Set')
    for ds_name, (_X, _y) in utils.dzip(X, y):
        preds = clf.predict(_X)
        scores[ds_name]['ADA'] = compare_models(_y, preds)

    # Display (and store): rows are models, columns are (dataset, metric)
    scores_df = pd.concat([pd.DataFrame(a) for a in scores.values()], axis=0, keys=scores.keys()).T
    scores_df.to_pickle(os.path.join(RESULTS, 'Scores.Compare.df'), compression='bz2')
    display(scores_df)

In [None]:
if OPTIMISE_MODELS:
    # One panel per dataset (Training/Validation)
    fig, axs = plt.subplots(1, 2, figsize=[16, 8], tight_layout=True)
    # Load the (balanced) Logistic Regression as reference, alongside the boosted model
    mdls = {
        label: joblib.load(os.path.join(MODELS, 'Predictors', fname))
        for label, fname in (('LR ', 'LogReg.Bal.jlib'), ('ADA', 'ADA.jlib'))
    }

    # Plot the curves
    display_roc(X, y, axs, mdls)

    # Save
    plt.savefig(os.path.join(FIGURES, 'fig_roc_ada.jpg'), bbox_inches='tight', dpi=300)

##### Comments
 1. Nope: consistently worse.

## 3. Choosing the Operating Point

The aim here is to analyse the level of wastefulness vs unreliability of the top-performing model, and in so doing, choose the best operating point.

### 3.1 Theory

#### 3.1.1 Definitions

Assume the following definitions:

|            |              |   **Predicted**   |                 |
|------------|:------------:|:-----------------:|-----------------|
|            |              |    Observed       | Not Observed    |
| **Actual** | Observed     | TP                | _Wasteful_ (FN) |
|            | Not Observed | _Unreliable_ (FP) | TN              |

Our errors are one of two kinds:
 1. The mouse is really `Not Observed` but we predict that it is `Observable`: this would yield potentially ***Unreliable*** behaviour labels, which we want to avoid.
 2. The mouse is actually `Observable` but we predict `Not Observable`: this is ***Wasteful*** of samples available for retrieving signal from.

#### 3.1.2 Motivation
Plotting these gives an alternative viewpoint to the typical ROC Curves.
 1. With ROC plots, the TPR and FPR are not directly comparable: this is because they are normalised to different ratios, and hence a unit increase in one is not equivalent to the same in the other.
 2. On the other hand, in the W/U curve, the scales are such that they are more comparable.

That being said, note that we can get the values directly from the ROC Curve. Specifically:
 * ***Wasteful*** = (1 - TPR) x Pos
 * ***Unreliable*** = FPR x Neg
 
#### 3.1.3 Cost Curves
Let us assume a specific cost ratio $r$ which can be used to weight the cost of Unreliable samples to Wasteful samples.
I define the cost of the classifier at an operating threshold with $W$ Wasted and $U$ unreliable samples (normalised by $N$ total samples) as:
$$ C(r) = \frac{1}{N}\left(W + Ur\right) .$$

We can plot these as a function of the threshold for various values of $r$.

### 3.2 Comparison of All Relevant Models

This is mostly to show on report.

In [None]:
if ANALYSE_OPTIMAL:
    # Load Data
    X = utils.subdict(X_all, ('Train', 'Validate'))
    y = utils.subdict(y_all, ('Train', 'Validate'))
    # Load the stored scores, keeping only the reported models, under their (3-character) display names
    scores_df = (
        pd.read_pickle(os.path.join(RESULTS, 'Scores.Compare.df'), compression='bz2')
        .loc[['Prior', 'LR (Bal)', 'RF (Bal)', 'NB (Mult)', 'SVM', 'MLP']]
        .rename({'LR (Bal)': 'LgR', 'RF (Bal)': 'RF ', 'NB (Mult)': 'NB ', 'MLP': 'NN '})
    )

    # Prepare Figures (one per dataset, so that they can be saved separately) & Load Models
    fig_1, ax_1 = plt.subplots(1, 1, figsize=[10, 10], tight_layout=True, num=1)
    fig_2, ax_2 = plt.subplots(1, 1, figsize=[10, 10], tight_layout=True, num=2)
    mdls = {
        label: joblib.load(os.path.join(MODELS, 'Predictors', fname))
        for label, fname in (
            ('LgR', 'LogReg.Bal.jlib'),
            ('NB ', 'NB.Mult.jlib'),
            ('RF ', 'RForest.Bal.jlib'),
            ('SVM', 'SVM.jlib'),
            ('NN ', 'MLP.jlib'),
        )
    }

    # Now Display ROC: the legend also reports the F1 on the Validation Set
    display_roc(X, y, [ax_1, ax_2], mdls, flip=False, scores=scores_df[('Validate', 'F1')], title=False)

    # Save: select each figure in turn, since plt.savefig acts on the current one
    plt.figure(1)
    plt.savefig(os.path.join(FIGURES, 'fig_beh_obs_roc_train.png'), bbox_inches='tight', dpi=200)
    plt.figure(2)
    plt.savefig(os.path.join(FIGURES, 'fig_beh_obs_roc_validate.png'), bbox_inches='tight', dpi=200)

    # Display also the Scores (as a LaTeX table)
    print(scores_df.to_latex(float_format="%.3f", multicolumn_format='c'))

##### Comments
 1. I think the candidates to explore are the LR and NB models
 2. The operating points are always too high (apart from the RF for the training-set)

### 3.3 Exploration of Operating Points

This will use the previous Training/Validation Split and the pre-saved models, for more unbiased estimates.

In [None]:
if ANALYSE_OPTIMAL:
    # Restrict features (X) and labels (y) to the Train/Validate splits used throughout Section 3.3
    _splits = ('Train', 'Validate')
    X = utils.subdict(X_all, _splits)
    y = utils.subdict(y_all, _splits)

#### 3.3.1 Cost Curves

These are the relative costs for different values of r

In [None]:
if ANALYSE_OPTIMAL:
    # Plot the average cost C(r) = (W + U*r) / N against the decision threshold, for every cost
    # ratio r in COSTS: one row of axes per dataset split and one column per model.
    fig, axs = plt.subplots(2, 2, figsize=[25, 10], tight_layout=True, sharey='row', sharex='col')
    mdls = {
        'LR': joblib.load(os.path.join(MODELS, 'Predictors', 'LogReg.Bal.jlib')),
        'NB': joblib.load(os.path.join(MODELS, 'Predictors', 'NB.Mult.jlib')),
    }

    # Now Iterate over DataSets
    for d_i, (ds_name, (_X, _y)) in enumerate(utils.dzip(X, y)):
        for m_i, (mdl_name, mdl) in enumerate(mdls.items()):
            ax = axs[d_i, m_i]
            fpr, tpr, thr = skmetrics.roc_curve(_y, mdl.predict_proba(_X)[:, 1], pos_label=1)
            # The first ROC point is skipped: its threshold is the artificial value roc_curve prepends.
            # Unreliable = false positives (count), Wasteful = false negatives (count).
            unreliable = fpr[1:] * (_y == 0).sum()
            wasteful = (1 - tpr[1:]) * (_y == 1).sum()
            # Iterate over Costs
            for r in COSTS:
                cost = (wasteful + unreliable * r) / len(_y)
                ax.plot(thr[1:], cost, label=f'r={r}')
            ax.legend(loc=0, prop={'family': 'monospace', 'size': 15})
            if d_i == 0:  # Only the top row carries the model name
                ax.set_title(f'{mdl_name}', fontsize=20)
            ax.tick_params(labelsize=15)
            ax.set_xlabel('THR', fontsize=18)
        axs[d_i, 0].set_ylabel(f'Average Cost ({ds_name})', fontsize=18)

    # Save: FIGURES already contains BASE_RESULTS, so it must not be prepended a second time
    # (doing so duplicates the directory whenever BASE_RESULTS is a relative path).
    plt.savefig(os.path.join(FIGURES, 'fig_mdls_costs.jpg'), bbox_inches='tight', dpi=300)

##### Comments
 1. The curves are neither monotonic nor are they convex.
 2. There is rarely a clear minimum for the lower cost-ratios: the curves are often quite flat.
 3. This gets a bit better for the higher cost ratios (5/10).

#### 3.3.2 Net Benefit Ratio

I will do the Net Benefit Analysis

In [None]:
if ANALYSE_OPTIMAL:
    # Plot the net-benefit curve of each candidate model, one axis per dataset split.
    fig, axs = plt.subplots(2, 1, figsize=[20, 10], tight_layout=True)
    mdls = {
        'LR': joblib.load(os.path.join(MODELS, 'Predictors', 'LogReg.Bal.jlib')),
        'NB': joblib.load(os.path.join(MODELS, 'Predictors', 'NB.Mult.jlib')),
    }

    # Now Iterate over DataSets
    for ax, (ds_name, (_X, _y)) in zip(axs, utils.dzip(X, y)):
        for mdl_name, mdl in mdls.items():
            # skext.net_benefit_curve is a project helper; it is used here as returning (net benefit, thresholds)
            nb, thr = skext.net_benefit_curve(_y, mdl.predict_proba(_X)[:, 1], pos_label=1)
            ax.plot(thr, nb, label=mdl_name, lw=2)
        ax.legend(loc=1, prop={'family': 'monospace', 'size': 15})
        ax.set_title(ds_name, fontsize=20)
        ax.tick_params(labelsize=15)
        ax.set_xlabel('Threshold (t)', fontsize=18)
        ax.set_ylabel('Net Benefit', fontsize=18)

    # Save: FIGURES already contains BASE_RESULTS, so it must not be prepended a second time
    # (doing so duplicates the directory whenever BASE_RESULTS is a relative path).
    plt.savefig(os.path.join(FIGURES, 'fig_mdls_net_benefit.jpg'), bbox_inches='tight', dpi=300)

##### Comments
 1. These are proving very problematic to interpret.
 2. I believe we would be operating in the range around t=0.8:
     * Assume that we are willing to trade 5 wasteful for 1 unreliable.
     * This means our B/H ratio is 1:5 or 0.2
     * This implies that t = 1 / (1 + B/H) = 1 / (1 + 0.2) = 0.8333
 3. In any case, this implies that the NB is the best model for most of the range we are interested in!
     * This is probably because of the relative number of samples.

#### 3.3.3 Wasteful/Unreliable Curves

This shows the ratio of wasteful to unreliable at different thresholds. I also mark:
  * Point at which the cost is minimal for cost ratio $r=5$: this seems reasonable.
  * Point at which # Negs predicted equals the true # Negs in the data
  * Point at which W is maximum we can tolerate: I am using 0.08, which is roughly equivalent to the ratio of Negatives in the data.
  
Note that the y-axis is scaled to 2x the x-axis for clarity.

In [None]:
if ANALYSE_OPTIMAL:
    # For each dataset split, plot Unreliable (false positives) against Wasteful (false negatives)
    # over all thresholds, and record the operating point where waste is closest to the tolerated maximum.
    mdls = {
        'LgR': joblib.load(os.path.join(MODELS, 'Predictors', 'LogReg.Bal.jlib')),
        'NB': joblib.load(os.path.join(MODELS, 'Predictors', 'NB.Mult.jlib')),
    }
    # What to plot - Cost-Ratio and Max Waste
    r = 5; w = 0.08

    # Now Iterate over DataSets
    costings = {'Train': defaultdict(dict), 'Validate': defaultdict(dict)}
    for ds_name, (_X, _y) in utils.dzip(X, y):
        fig, ax = plt.subplots(1, 1, figsize=[20, 5], tight_layout=True)
        # Tolerated waste expressed as a count: fraction w of the samples labelled 1
        w_pt = _y.value_counts()[1] * w
        # Iterate over Models
        for mdl_name, mdl in mdls.items():
            probs = mdl.predict_proba(_X)[:, 1]
            fpr, tpr, thr = skmetrics.roc_curve(_y, probs, pos_label=1)
            unreliable = fpr * (_y == 0).sum()
            wasteful = (1 - tpr) * _y.sum()
            cost = (wasteful + unreliable * r) / len(_y)
            # RocCurveDisplay is re-purposed as a plain curve plotter: x = Wasteful, y = Unreliable
            skmetrics.RocCurveDisplay(fpr=wasteful, tpr=unreliable).plot(lw=3, ax=ax, label=f'{mdl_name}')
            # Finally, point of max waste (threshold whose waste is nearest to w_pt)
            m = np.abs(wasteful - w_pt).argmin()
            costings[ds_name][mdl_name][f'W={w*100:.0f}%'] = {'Cost': cost[m], 'Waste': wasteful[m], 'UnRel': unreliable[m], 'Thr': thr[m]}
        ax.axvline(w_pt, 0., 0.95, ls='--', lw=3, c='k', label=f'Wasted={w * 100:.0f}%')
        # Add Legend Handles
        h, l = ax.get_legend_handles_labels()
        ax.legend(handles=h, labels=l, loc=1, ncol=1, prop={'family': 'monospace', 'size': 23})
        ax.tick_params(labelsize=23); ax.set_xlabel('Wasteful', fontsize=23); ax.set_ylabel('Unreliable', fontsize=23)
        ax.set_aspect(3.5)
        # FIGURES already contains BASE_RESULTS, so it must not be prepended a second time
        # (doing so duplicates the directory whenever BASE_RESULTS is a relative path).
        plt.savefig(os.path.join(FIGURES, f'fig_beh_obs_unrel_v_waste_{ds_name.lower()}.png'), bbox_inches='tight', dpi=200)

    # Display Min-Cost
    # Reshape the nested dict into a table: rows = (dataset, operating point), columns = (metric, model)
    costings = pd.concat({d: pd.concat({k: pd.DataFrame(m) for k, m in ds.items()}) for d, ds in costings.items()})
    costings = costings.T.stack(0).reorder_levels((1, 0)).reorder_levels((1, 0), 1).sort_index(axis=0).sort_index(axis=1)
    display(costings.drop(columns=['Thr']).style.format(precision=2))
    costings.to_pickle(os.path.join(RESULTS, 'Scores.Costings.df'), compression='bz2')

    # Also print
    # NOTE: only the 'W=8%' row is produced above; the 'Neg' and 'r=5' rename entries match nothing here.
    cost_val = costings.loc['Validate', ].rename(index={'Neg': 'GT Not Obs.', 'W=8%': 'W = 8%', 'r=5': 'Min @ r=5'}).drop(columns=['Thr'], level=0)
    cost_val = cost_val.rename(columns={'Cost': 'A_5', 'UnRel': '|U|', 'Waste': '|W|'})
    print(cost_val.T.unstack(0).to_latex(float_format='%.2f', multicolumn_format='c'))

##### Comments
 1. In terms of points of equal negatives, clearly these are not very good points.
     * This is also the only point at which the Costing is better for NB than LR
 2. The point at $r=5$ is not bad:
     * LR outperforms NB
     * Ratio of W to U seems a bit off
 3. The point at $w=.08$ is also reasonable
     * Actually, I chose it given the knowledge we had before (that LR did about .08 wasted -- I did not want to go to 0.1 as it might waste way too much)

### 3.4 Explore Mistakes

I will take the point at $w=0.08$. I will analyse this on the validation-set.

In [None]:
if ANALYSE_OPTIMAL:
    # Load the operating points stored by the Wasteful/Unreliable analysis (Section 3.3.3)
    cost_table = pd.read_pickle(os.path.join(RESULTS, 'Scores.Costings.df'), compression='bz2')
    # Per-model thresholds at the W=8% point on the validation-set.
    # Section 3.3.3 stores the logistic-regression entry under 'LgR', whereas the models below are keyed
    # 'LR': align the label so that the later `best_thr[mdl]` look-ups do not raise a KeyError.
    # (The rename is a no-op if the table already uses 'LR'.)
    best_thr = cost_table.loc[('Validate', 'W=8%'), 'Thr'].rename({'LgR': 'LR'})
    # Candidate models, keyed by the label shown on the plots
    m_r = {
        'LR': joblib.load(os.path.join(MODELS, 'Predictors', 'LogReg.Bal.jlib')),
        'NB': joblib.load(os.path.join(MODELS, 'Predictors', 'NB.Mult.jlib')),
    }

#### 3.4.1 Wastefulness

In [None]:
if ANALYSE_OPTIMAL:
    # Analyse Wasteful samples (predicted 0, labelled 1) on the validation-set:
    #  (a) distribution of the lengths of wasted runs and of the runs in between, and
    #  (b) fraction of each ground-truth behaviour which ends up wasted.
    # Prepare
    behaviours = pd.read_pickle(ANNOTATIONS, compression='bz2').stack(0)['GT.Behaviour']  # Load Behaviours

    # First the Distribution of Wasted/Useful
    fig, axs = plt.subplots(1, len(m_r), figsize=[25, 6], tight_layout=True, sharey=True)
    for i, (mdl, clf) in enumerate(m_r.items()):
        clf = skext.ThresholdedClassifier(clf, best_thr[mdl])
        wasteful = ((clf.predict(X['Validate']) == 0) & (y['Validate'] == 1))
        # map({True: 1}) leaves the other value as NaN, so that only one kind of run is marked per pass.
        # Runs are counted within groups of index levels (0, 1, 2, 4) - presumably one group per snippet/mouse (TODO confirm).
        wasted_runs = wasteful.map({True: 1}).groupby(level=(0, 1, 2, 4)).apply(lambda x: pd.value_counts(npext.run_lengths(x, how='I')))
        wasted_runs = wasted_runs.unstack(-1).sum()
        interrupted = wasteful.map({False: 1}).groupby(level=(0, 1, 2, 4)).apply(lambda x: pd.value_counts(npext.run_lengths(x, how='I')))
        interrupted = interrupted.unstack(-1).sum()
        waste_analysis = pd.concat([wasted_runs, interrupted], axis=1, keys=['Wasted', 'Usable']).fillna(0)
        waste_analysis /= waste_analysis.sum()  # Normalise each column to fractions
        waste_analysis.plot.bar(ax=axs[i], fontsize=13)
        axs[i].set_title(mdl, fontsize=15); axs[i].set_xlabel('Interval Length', fontsize=15); axs[i].set_ylabel('Fraction', fontsize=15); axs[i].legend(fontsize=15)
    # FIGURES already contains BASE_RESULTS, so it must not be prepended a second time
    # (doing so duplicates the directory whenever BASE_RESULTS is a relative path).
    plt.savefig(os.path.join(FIGURES, 'fig_waste_intervals_validate.jpg'), bbox_inches='tight', dpi=300)

    # Now the Distribution of Behaviours in the Wasted Category.
    # --- Compute --- #
    wasted = {}
    for mdl, clf in m_r.items():
        clf = skext.ThresholdedClassifier(clf, best_thr[mdl])
        wasteful = ((clf.predict(X['Validate']) == 0) & (y['Validate'] == 1)).to_frame('Wasted').join(behaviours, how='left')
        wasted[mdl] = wasteful.loc[wasteful['Wasted'], 'GT.Behaviour'].value_counts()
        wasted['All'] = wasteful['GT.Behaviour'].value_counts()
    wasted = pd.DataFrame(wasted).drop(0)  # Discard the behaviour coded as 0
    nominal = wasted.sum().drop('All') / wasted.sum()['All']  # Overall wasted fraction per model
    wasted = (wasted.T.drop('All') / wasted['All']).fillna(0)  # Wasted fraction per (model, behaviour)
    # --- Plot --- #
    fig, ax = plt.subplots(1, 1, figsize=[20, 6], tight_layout=True)
    bars = wasted.T.plot.bar(ax=ax).containers
    for mdl, bb in zip(wasted.index, bars):
        mplext.autolabel_bar(bb, [f'{f:.2f}' for f in wasted.T[mdl]], 13, ax=ax)
        ax.axhline([nominal[mdl]], c=bb[0].get_facecolor(), ls='--', lw=3, label=f'{mdl} Overall = {nominal[mdl]:.2f}')
    ax.legend(fontsize=16, ncol=2)
    ax.set_xticklabels(BORISParser.BEHAVIOURS(simplified=True).values(), rotation=30, ha='right'); ax.tick_params(labelsize=16)
    plt.savefig(os.path.join(FIGURES, 'fig_behwaste_validate.jpg'), bbox_inches='tight', dpi=300)

##### Comments

###### Interval Lengths
 1. Both seem to have similar trends.
 2. In general, the wasted samples seem to be concentrated in low-BTI lengths.
     * The majority of wasted samples are a single BTI in length, and hence, can in theory be filled in by a HMM.
     * The NB model does have a heavier tail it seems however --- and I can see some full-length runs lost.
 3. At the same time, note that the distance between wasted samples (as indicated by the run-length of Usable) is also quite peaked towards the lower ranges.
     * This implies that there are many instances of high-frequency switching between wasted and usable.
     * Note also that the Usable does not imply that all are actually correct: some may be Unreliable!
     
###### Distribution of Behaviours
 1. It does not follow the MAR assumption, but they are quite different between NB/LR
 2. In general, since Immobile is the most prevalent behaviour, it tends to dominate and provide the nominal rate, at least for the LR
 3. For the LR, Allo-Grooming is the worst offender, exhibiting more than double the nominal rate of wasted samples: 
     * There are other behaviours which are positively skewed, but at least, they are comparable or less than the wastage in the NB.
 4. On the other hand, for the NB, almost all behaviours are under-represented:
     * Most of the wastage comes from the Immobile behaviour which is not that relevant.

#### 3.4.2 Unreliability

In [None]:
if ANALYSE_OPTIMAL:
    # Analyse Unreliable samples (predicted 1, labelled 0) on the validation-set:
    # distribution of the lengths of unreliable runs and of the runs in between them.
    fig, axs = plt.subplots(1, len(m_r), figsize=[25, 6], tight_layout=True, sharey=True)
    for i, (mdl, clf) in enumerate(m_r.items()):
        clf = skext.ThresholdedClassifier(clf, best_thr[mdl])
        unreliable = ((clf.predict(X['Validate']) == 1) & (y['Validate'] == 0))
        # map({True: 1}) leaves the other value as NaN, so that only one kind of run is marked per pass.
        # Runs are counted within groups of index levels (0, 1, 2, 4) - presumably one group per snippet/mouse (TODO confirm).
        unrel_runs = unreliable.map({True: 1}).groupby(level=(0, 1, 2, 4)).apply(lambda x: pd.value_counts(npext.run_lengths(x, how='I')))
        unrel_runs = unrel_runs.unstack(-1).sum()
        interrupted = unreliable.map({False: 1}).groupby(level=(0, 1, 2, 4)).apply(lambda x: pd.value_counts(npext.run_lengths(x, how='I')))
        interrupted = interrupted.unstack(-1).sum()
        unrel_analysis = pd.concat([unrel_runs, interrupted], axis=1, keys=['Unreliable', 'Usable']).fillna(0)
        unrel_analysis /= unrel_analysis.sum()  # Normalise each column to fractions
        unrel_analysis.plot.bar(ax=axs[i], fontsize=13)
        axs[i].set_title(mdl, fontsize=15); axs[i].set_xlabel('Interval Length', fontsize=15); axs[i].set_ylabel('Fraction', fontsize=15); axs[i].legend(fontsize=15)
    # FIGURES already contains BASE_RESULTS, so it must not be prepended a second time
    # (doing so duplicates the directory whenever BASE_RESULTS is a relative path).
    plt.savefig(os.path.join(FIGURES, 'fig_unreliable_intervals_validate.jpg'), bbox_inches='tight', dpi=300)

##### Comments
 1. The `Unreliables` tend to be short-length spans (1 BTI or a few)
     * Although now the LR has a heavier tail: (but smaller number of such detections).
     * There are some full-length runs (these are probably instances where I marked as Not Observable due to all mice huddling).
 2. The interruptions seem to be generally longer this time round (heavier tail than for wasted samples):
     * This possibly also stems from the limited number of truly Not Observable instances.

### 3.5 Concluding Remarks

All things considered, I might just go with the LR at the threshold given by $w=8\%$:
 1. It yields fewer wasteful and fewer unreliable samples.
 2. It is a more rigorous method.
 3. I am a bit concerned about the larger amount of Allo-Grooming waste.

## 4. Generate Final Model

In [None]:
if TRAIN_OPTIMAL:
    # ---- Data: the final model works with the Tune/Test splits ---- #
    X = utils.subdict(X_all, ('Tune', 'Test'))
    y = utils.subdict(y_all, ('Tune', 'Test'))
    # ---- Parameters ---- #
    # w: tolerated fraction of wasted samples
    w = 0.08
    # c: value stored after the '=' in the best-parameter file (passed later as LogisticRegression's C)
    _best_params = Utilities.one_shot_read(os.path.join(MODELS, 'Parameters', 'LogReg.Bal.best'))
    c = float(_best_params.split('=')[1])
    # ---- Make sure that the prediction directory exists ---- #
    utils.make_dir(PREDICT)

### 4.1 Train Global Model

Training will be done on the entire Tuning-Set: however, in order to fit the threshold, I will use cross-validation predictions to generate out-of-sample data.

#### 4.1.1 Fit Threshold

In [None]:
if TRAIN_OPTIMAL:
    # Choose the LR decision threshold from out-of-sample (cross-validated) probabilities on the
    # Tuning-set: the threshold whose waste is closest to the tolerated fraction w.
    # ---- Shuffle for Randomness ---- #
    shuffle = np.random.default_rng(RANDOM_STATE).permutation(np.arange(len(X['Tune'])))
    _X, _y = X['Tune'].iloc[shuffle], y['Tune'].iloc[shuffle]
    # ---- Generate Predictions ---- #
    probs = cross_val_predict(LogisticRegression(C=c, max_iter=1000, class_weight='balanced'), _X, _y, cv=CV_FOLDS, n_jobs=-1, method='predict_proba')[:, 1]
    # ---- Plot Wastefulness/Unreliability Curve ---- #
    fig, ax = plt.subplots(1, 1, figsize=[25, 6])
    fpr, tpr, thr = skmetrics.roc_curve(_y, probs, pos_label=1)
    # The first ROC point (artificial threshold prepended by roc_curve) is skipped
    unreliable = fpr[1:] * (_y == 0).sum()
    wasteful = (1 - tpr[1:]) * (_y == 1).sum()
    # RocCurveDisplay is re-purposed as a plain curve plotter: x = Wasteful, y = Unreliable
    _line = skmetrics.RocCurveDisplay(fpr=wasteful, tpr=unreliable).plot(lw=2, ax=ax, label='W/U Curve').line_
    # m indexes the sliced arrays, hence the +1 when looking up the (unsliced) thresholds
    m = np.abs(wasteful - _y.value_counts()[1] * w).argmin()
    best_thr = thr[m+1]
    ax.axvline(wasteful[m], ls='--', lw=3, c='k', label=f'W={w*100:.0f}%')
    # Some Annotation
    ax.annotate(
        f'W={wasteful[m]:.0f}, U={unreliable[m]:.0f} @ thr={best_thr:.2f}', size=15, c = _line.get_c(),
        xy=(wasteful[m]+200, unreliable[m]+50), xycoords='data', 
        xytext=(50, 50), textcoords='offset points',
        arrowprops={'arrowstyle': '->', 'connectionstyle': 'angle3'}
    )
    ax.legend(loc=0, prop={'family': 'monospace', 'size': 15})
    ax.tick_params(labelsize=15); ax.set_xlabel('Wasteful', fontsize=18); ax.set_ylabel('Unreliable', fontsize=18)
    ax.set_aspect(2)
    # FIGURES already contains BASE_RESULTS, so it must not be prepended a second time
    # (doing so duplicates the directory whenever BASE_RESULTS is a relative path).
    plt.savefig(os.path.join(FIGURES, 'fig_nb_unrel_v_waste_tune.jpg'), bbox_inches='tight', dpi=300)

##### Comments
 1. It appears that using the whole data, even though we are using out-of-sample predictions, it gives a much smoother performance (although probably still not as optimal as if using the Training Set alone).
 
#### 4.1.2 Fit Model

In [None]:
if TRAIN_OPTIMAL:
    # Wrap a balanced Logistic Regression (regularisation c) in the thresholded classifier
    _base_lr = LogisticRegression(C=c, max_iter=1000, class_weight='balanced')
    clf = skext.ThresholdedClassifier(_base_lr, threshold=best_thr)
    # Train on the whole Tuning-set, in its original order
    clf.fit(X['Tune'], y['Tune'])
    # Persist the fitted model
    joblib.dump(clf, os.path.join(MODELS, 'Pipeline', 'ObserveClassify.jlib'))

### 4.2 Generate Predictions for all Data Splits

In [None]:
if TRAIN_OPTIMAL:
    # Reload the stored model, so that this cell can be run independently of the fitting cell
    clf = joblib.load(os.path.join(MODELS, 'Pipeline', 'ObserveClassify.jlib'))
    # Column layout: hard decision followed by the two class probabilities
    _columns = ['OC.Observe', 'OC.Prob.NotOb', 'OC.Prob.Obs']
    # One prediction table per split held in X
    for ds, _X in X.items():
        preds = clf.predict(_X)
        probs = clf.predict_proba(_X)
        _table = np.hstack([preds.reshape(-1, 1), probs])
        df = pd.DataFrame(_table, index=_X.index, columns=_columns)
        df.to_pickle(os.path.join(PREDICT, f'Fixed.{ds}.df'), compression='bz2')

### 4.3 Generate Predictions under NB

For Comparison

#### 4.3.1 Fit Threshold

In [None]:
if TRAIN_OPTIMAL:
    # ---- Shuffle the Tuning-set (seeded, hence reproducible) ---- #
    _rng = np.random.default_rng(RANDOM_STATE)
    shuffle = _rng.permutation(np.arange(len(X['Tune'])))
    _X = X['Tune'].iloc[shuffle]
    _y = y['Tune'].iloc[shuffle]
    # ---- Out-of-sample probabilities of class 1 under the Multinomial NB ---- #
    probs = cross_val_predict(sknb.MultinomialNB(), _X, _y, cv=CV_FOLDS, n_jobs=-1, method='predict_proba')[:, 1]
    # ---- Threshold whose waste is closest to the tolerated fraction w ---- #
    fpr, tpr, thr = skmetrics.roc_curve(_y, probs, pos_label=1)
    _n_pos = (_y == 1).sum()
    wasteful = (1 - tpr[1:]) * _n_pos  # first ROC point skipped
    _target = _y.value_counts()[1] * w
    m = np.abs(wasteful - _target).argmin()
    thr_nb = thr[m + 1]  # +1 undoes the offset introduced by the slicing above

#### 4.3.2 Tune Model and Store as Alternative

In [None]:
if TRAIN_OPTIMAL:
    # Multinomial NB wrapped with the threshold found on the cross-validated predictions
    _base_nb = sknb.MultinomialNB()
    clf = skext.ThresholdedClassifier(_base_nb, threshold=thr_nb)
    # Train on the whole Tuning-set, in its original order
    clf.fit(X['Tune'], y['Tune'])
    # Keep it alongside the other predictors as the alternative model
    joblib.dump(clf, os.path.join(MODELS, 'Predictors', 'ObserveClassify.Alt.jlib'))

## 5. Evaluate Test-Set

Finally, we evaluate the performance on the Test-Set

In [None]:
if EVALUATE_TEST:
    # ---- Data: Tune (for reference) and Test ---- #
    X = utils.subdict(X_all, ('Tune', 'Test'))
    y = utils.subdict(y_all, ('Tune', 'Test'))
    # ---- Models to compare ---- #
    mdls = {}
    # Baseline which only knows the class prior of the Tuning-set
    mdls['Prior'] = DummyClassifier(strategy='prior').fit(X['Tune'], y['Tune'])
    # Final (thresholded LR) model
    mdls['OC'] = joblib.load(os.path.join(MODELS, 'Pipeline', 'ObserveClassify.jlib'))
    # Alternative (thresholded NB) model
    mdls['NB'] = joblib.load(os.path.join(MODELS, 'Predictors', 'ObserveClassify.Alt.jlib'))

### 5.1 Evaluate Raw Statistics

In [None]:
if EVALUATE_TEST:
    # ---- Score every model on every split ---- #
    test_scores = defaultdict(dict)
    for ds_name, (_X, _y) in utils.dzip(X, y):
        per_split = test_scores[ds_name]
        for mdl, clf in mdls.items():
            preds = clf.predict(_X)
            per_split[mdl] = score_models(_y, preds)

    # ---- Assemble one table, with the Tune columns ahead of the Test ones ---- #
    _frames = [pd.DataFrame(a) for a in test_scores.values()]
    scores_df = pd.concat(_frames, axis=0, keys=test_scores.keys()).T
    scores_df = scores_df[['Tune', 'Test']]
    scores_df.to_pickle(os.path.join(RESULTS, 'Scores.Evaluate.df'), compression='bz2')
    display(scores_df.style.format(precision=3))

    # ---- LaTeX version for the report ---- #
    print(scores_df.to_latex(float_format="%.1f", multicolumn_format='c'))

##### Comments
 1. Performance has gone down somewhat.
     * Accuracy and W are comparable
     * F1 and U are significantly worse.
 2. There is a high degree of unreliability... in a way, maybe it is not so worthwhile after all?

### 5.2 Analysis of Mistakes

#### 5.2.1 To Appear in Report

In [None]:
if EVALUATE_TEST:
    # Analyse the mistakes of the final model ('OC') on the Test-set: run-lengths of each outcome
    # and the fraction of each ground-truth behaviour which is wasted.
    # Prepare
    behaviours = pd.read_pickle(ANNOTATIONS, compression='bz2').stack(0)['GT.Behaviour']  # Load Behaviours
    # Encode (prediction, label) as prediction * 10 + label: 0/11 agree, 1 = missed positive, 10 = false positive
    error_code = (mdls['OC'].predict(X['Test']) * 10 + y['Test']).map({0: 'Correct', 1: 'Wasteful', 10: 'Unreliable', 11: 'Correct'})

    # First the Distribution of Wasted/Unreliable
    fig, ax = plt.subplots(1, 1, figsize=[20, 6], tight_layout=True); ax2 = ax.twinx()
    err_runs = error_code.groupby(level=(0, 1, 2, 4)).apply(count_run_lengths).unstack(-1).sum().unstack()
    err_runs = err_runs.T / err_runs.T.sum()
    # Group for Clarity
    # NOTE(review): the [:15] / [15:] slices are positional; the bin labels are only right if the first
    # 15 rows are exactly the run-lengths 1..15 - confirm against the output of count_run_lengths.
    grouped = err_runs[15:].groupby((err_runs[15:].index - 1) // 5).sum()
    grouped = grouped.rename(lambda ix: f'{ix*5+1} - {(ix+1)*5}')
    grouped = pd.concat([err_runs[:15], grouped])
    grouped.plot.bar(ax=ax, width=0.75)
    # Find cumulative and plot as well
    grouped.cumsum().plot.line(ax=ax2, legend=False, fontsize=15, lw=3)
    ax.tick_params(labelsize=17); ax.set_xlabel('Interval Length', fontsize=19); ax.set_ylabel('Fraction', fontsize=19); 
    ax2.set_ylabel('Cumulative Fraction', fontsize=19); ax2.set_ylim(0, 1.05)
    ax.legend(fontsize=15, loc=(0.05, 0.875), ncol=3)
    # FIGURES already contains BASE_RESULTS, so it must not be prepended a second time
    # (doing so duplicates the directory whenever BASE_RESULTS is a relative path).
    plt.savefig(os.path.join(FIGURES, 'fig_intervals_test.jpg'), bbox_inches='tight', dpi=300)
    
    # Now the Distribution of Behaviours in the Wasted Category.
    fig, ax = plt.subplots(1, 1, figsize=[15, 6], tight_layout=True)
    # error_code becomes a frame here (columns 'Waste' and 'GT.Behaviour'); the NB comparison cell reuses it
    error_code = error_code.to_frame('Waste').join(behaviours, how='left')
    wasted = pd.DataFrame({
        'Model': error_code.loc[error_code['Waste'] == 'Wasteful', 'GT.Behaviour'].value_counts(),
        'All': error_code['GT.Behaviour'].value_counts()
    }).drop(0)  # Discard the behaviour coded as 0
    nominal = wasted['Model'].sum() / wasted['All'].sum()  # Overall wasted fraction
    wasted.loc[:, 'Model'] /= wasted['All']
    mplext.autolabel_bar(wasted['Model'].plot.bar(ax=ax).containers[0], None, 15, ax=ax)
    ax.axhline([nominal], ls='--', lw=3, label=f'Nominal = {nominal:.3f}')
    ax.legend(fontsize=16, ncol=2, loc=2)
    ax.set_xticks(np.arange(7)); ax.set_xlim(-0.5, 6.5)
    ax.set_xticklabels(BORISParser.BEHAVIOURS(simplified=True).values(), rotation=30, ha='right'); ax.tick_params(labelsize=16)
    plt.savefig(os.path.join(FIGURES, 'fig_behwaste_test.jpg'), bbox_inches='tight', dpi=300)

##### Comments
 1. The Intervals seem to follow the previous trend: 
     * Most errors are short-lived.
     * There are some long-lived unreliables (36s, 40s, 51s, 55s, 93s and 120s) which is problematic: however, these are probably instances of where the mice are all immobile.
 2. The wasted behaviours seems a bit off from the previous:
     * This time, there are quite a few behaviours with higher-than-nominal wastage.
     * The wastage on micro-motion is high, but this may not be a very significant behaviour.
     * The wastage on Allo-grooming is more concerning.
     * All this suggests dataset shift or a very brittle classifier.

#### 5.2.2 Comparison with NB (for our sake)

In [None]:
if EVALUATE_TEST:
    # Compare, per ground-truth behaviour, the wasted fraction of the final model ('OC') and of the NB alternative.
    # Relies on `behaviours` and on the frame-valued `error_code` left behind by the previous cell.
    # Prepare
    error_nb = (mdls['NB'].predict(X['Test']) * 10 + y['Test']).map({0: 'Correct', 1: 'Wasteful', 10: 'Unreliable', 11: 'Correct'})
    error_nb = error_nb.to_frame('Waste').join(behaviours, how='left')
    
    # Compute
    wasted = {}
    for mdl, errors in zip(('OC', 'NB'), (error_code, error_nb)):
        wasted[mdl] = errors.loc[errors['Waste'] == 'Wasteful', 'GT.Behaviour'].value_counts()
        wasted['All'] = errors['GT.Behaviour'].value_counts()
    wasted = pd.DataFrame(wasted).drop(0)  # Discard the behaviour coded as 0
    nominal = wasted.sum().drop('All') / wasted.sum()['All']  # Overall wasted fraction per model
    wasted = (wasted.T.drop('All') / wasted['All']).fillna(0)  # Wasted fraction per (model, behaviour)
    # Plot
    fig, ax = plt.subplots(1, 1, figsize=[15, 6], tight_layout=True)
    bars = wasted.T.plot.bar(ax=ax).containers
    for mdl, bb in zip(wasted.index, bars):
        mplext.autolabel_bar(bb, [f'{f:.2f}' for f in wasted.T[mdl]], 13, ax=ax)
        ax.axhline([nominal[mdl]], c=bb[0].get_facecolor(), ls='--', lw=3, label=f'{mdl} Overall = {nominal[mdl]:.2f}')
    ax.legend(fontsize=16, ncol=2)
    ax.set_xticklabels(BORISParser.BEHAVIOURS(simplified=True).values(), rotation=30, ha='right'); ax.tick_params(labelsize=16)
    # FIGURES already contains BASE_RESULTS, so it must not be prepended a second time
    # (doing so duplicates the directory whenever BASE_RESULTS is a relative path).
    plt.savefig(os.path.join(FIGURES, 'fig_behwaste_test_compare.jpg'), bbox_inches='tight', dpi=300)