# Generate the Features/Targets for the Observability Classifier

## 0. Scope

This script extracts the Feature-Vectors and Target Labels for the Observability Classifier.

### 0.1 Feature-Set
 * Uses three feature sources: TIM Detections, LFB Feature-Vectors and RFID Positions.
 
### 0.2 Requires
 1. Features:
     * TIM Detections (as provided by the TIM pipeline)
     * LFB Features (as generated by the LFB model on End2End Data [use `MMAction/bash/generate_fb.sh`])
     * RFID Pickups (as provided by MRC Harwell)
 2. Targets:
     * Observability Class
 3. Other Info:
     * List of Segments
     * Ground-Truth AVA Data (as generated using `Data/Generate_Q1_Data/Extract_AVA_Data_Format.ipynb`)
     
### 0.3 Filtering
 * Discards Inadmissible Samples, i.e. those which are either:
     * Tentative/Unidentified as per GT.Behaviour
     * Ambiguous as per GT.Observable
     
### 0.4 Outputs
This follows a simple structure:
 * One DataFrame each for Train and Test sets
 * Within each, there are Features & Targets

In [None]:
from mpctools.extensions import utils, pdext, cvext 
from IPython.core.display import display, HTML
from mpctools.parallel import ProgressBar
import pandas as pd
import numpy as np
import torch
import sys
import os

from Tools.Parsers import SnippetParser, BORISParser, RFIDParser
from Tools.Features import ObservabilityFeatures

# Display Options
display(HTML("<style>.container { width:95% !important; }</style>"))

In [None]:
# === Data Specifics === #
# Number of frames per BTI: frame indices are divided by this (and floored) to obtain the BTI index
BTI = 25
# Stride, in BTIs, between snippet starts: a snippet's first BTI is (snippet index * SNIPPET_BTI_MULT)
SNIPPET_BTI_MULT = 60
# Number of BTIs covered by each snippet
SNIPPET_BTI_LEN = 120
# Maps the integer mouse id (read from column 7 of the AVA ground-truth CSV) to the mouse label
MICE = {0: 'R', 1: 'G', 2: 'B'}
# Code returned by BORISParser.obs2int for 'Obs': compared against GT.Observable to build the binary target
OBSERVED = BORISParser.obs2int('Obs')

In [None]:
# === Functions === #
def average_area(bboxes):
    """
    Mean area over the BBoxes that are actually present.

    Missing entries are dropped before averaging, so they do not contribute to the mean.
    If no BBox is available at all, the result is 0 (rather than NaN).
    """
    present = bboxes.dropna()
    # Guard: nothing detected -> define the area as 0
    if len(present) == 0:
        return 0
    areas = [box.area() for box in present]
    return np.nanmean(areas)

def average_bbox(bboxes):
    """
    Returns the Average BBox over the BTI

    Missing entries are dropped first; the remaining boxes are averaged by `cvext.average_bbox`.
    If no BBox is available, NaN is returned instead.
    """
    bboxes = bboxes.dropna()
    # NOTE: use `np.nan` - the `np.NaN` alias was removed in NumPy 2.0 and raises AttributeError there
    return cvext.average_bbox(bboxes) if len(bboxes) > 0 else np.nan

def average_iou(bboxes):
    """
    Returns the average IoU between adjacent BBoxes

    Missing entries are dropped first, so the IoU is computed between the pairs yielded by
    `utils.window(bboxes, 2)` over the *available* boxes only.
      * More than one box available: mean of the pairwise IoUs
      * Exactly one box available: 0
      * No box available: NaN
    """
    bboxes = bboxes.dropna()
    if len(bboxes) > 1:
        return np.mean([b1.iou(b2) for b1, b2 in utils.window(bboxes, 2)])
    elif len(bboxes) == 1:
        return 0
    else:
        # NOTE: use `np.nan` - the `np.NaN` alias was removed in NumPy 2.0 and raises AttributeError there
        return np.nan
    
def per_row_neighbours(row):
    """
    Computes the Neighbourhood for a single Sample (row)

    The row's own 'RFID' entry and all of its remaining entries are passed to `RFIDParser.occupancy`:
    the result is returned flattened.
    """
    own_position = row['RFID']
    other_entries = row.drop('RFID')
    occupancy = RFIDParser.occupancy(own_position, other_entries)
    return occupancy.ravel()

def per_grp_neighbours(grp):
    """
    Aggregates (sums) the per-row neighbourhoods over a group (BTI)

    The column named after the first unique value of index level 2 is removed before computing the
    row-wise neighbourhoods (presumably this is the group's own mouse - verify against the caller).
    """
    own_column = grp.index.unique(2)[0]
    others = grp.drop(columns=own_column)
    row_hoods = others.apply(per_row_neighbours, axis=1)
    return row_hoods.sum()

## 1. Load Data

### 1.1 Ready-Made Data

In [None]:
# Summary tables (bz2-compressed pickles)
segments = pd.read_pickle(SEGMENT_LIST, compression='bz2')
snippets = pd.read_pickle(SNIPPET_LIST, compression='bz2')

# Targets: keep only the admissible annotations, and flag those whose observability equals OBSERVED
annotations = pd.read_pickle(ANNOTATIONS, compression='bz2').stack(0)
annotations = annotations[annotations['GT.Admissible']]
_is_observed = annotations['GT.Observable'] == OBSERVED
_fold = snippets['DataSet.Fixed'].fillna('Test')  # Snippets without a fixed fold are treated as 'Test'
observables = _is_observed.to_frame('Observable').join(_fold)

# All directories are resolved from the path columns of the first snippet entry
_paths = snippets.iloc[0]

# Feature Sources
behave_path = os.path.join(_paths['Path.Drive'], _paths['Path.Dir.B'], 'End2End', 'All')
lfb_sources = os.path.join(_paths['Path.Drive'], _paths['Path.Dir.O'], 'LFB')

# Output Path
output_path = os.path.join(_paths['Path.Drive'], _paths['Path.Dir.O'], 'Features')

### 1.2 Detections

This will generate two sets of features:
   1. From TIM: the mean number of detections, mean area of BBox and average BBox.
   2. From RFID: antenna (mode over BTI), neighbourhood occupancy (summed over BTI)

In [None]:
# Per-dataset store of location features: loc_features[ds][(<segment index...>, snippet)] -> DataFrame of features
loc_features = {'Tune': {}, 'Test': {}}
progress = ProgressBar(len(snippets), prec=2).reset('Extracting Features:')
# Snippets are processed per segment (first two index levels), so that per-segment data is parsed only once
for seg_idx, seg_grp in snippets.groupby(level=(0, 1)):
    # Get-Segment level Data
    seg = pdext.dfmultiindex(segments, 2, seg_idx[0]); seg = seg[seg['Segment'] == seg_idx[1]].iloc[0]  # Get Relevant Segment
    ds = 'Test' if seg['DataSet'] == 'Test' else 'Tune'                                                 # Define DataSet (anything that is not 'Test' goes to 'Tune')
    # Retrieve Information
    bbs = SnippetParser.seg2boxes(seg, 'Trk.BB.KFs').rename_axis(['Frm', 'Mouse'])                        # Get TIM (we do not care about which col, just presence)
    bbs = RFIDParser(seg).antennas(True).stack().rename_axis(['Frm', 'Mouse']).to_frame('RFID').join(bbs) # Get RFIDs (as Antennas) and join (on RFID to ensure all)
    bbs['BTI'] = np.floor(bbs.index.get_level_values(0)/BTI).astype(int)                                  # Convert Frames to BTIs
    bbs = bbs.set_index('BTI', append=True).reorder_levels((2, 1, 0))                                     # Set Indices for convenience: now (BTI, Mouse, Frm)
    # Iterate over snippets within segment
    for snip_idx in seg_grp.index.get_level_values(-1):
        # Resolve Snippet
        _start_bti = snip_idx * SNIPPET_BTI_MULT                           # Relative Start of Snippet BTI
        snip_info = bbs.loc[_start_bti: _start_bti + SNIPPET_BTI_LEN - 1]  # Get relevant snippet (label-based slice on BTI: end is inclusive)
        # Generate TIM Features: all are aggregated per (BTI, Mouse)
        # NOTE(review): the 'BB' column is presumably the one produced by seg2boxes - confirm
        _dets = snip_info['BB'].groupby(level=(0, 1)).count().to_frame('TIM.Dets')               # Number of Detections (non-missing BBoxes)
        _area = snip_info['BB'].groupby(level=(0, 1)).apply(average_area).to_frame('TIM.Area')   # Average Area (ensuring No NaN)
        _bbox = snip_info['BB'].groupby(level=(0, 1)).apply(average_bbox).to_frame('TIM.BB')     # Average BBox (NaNs allowed)
        _ious = snip_info['BB'].groupby(level=(0, 1)).apply(average_iou).to_frame('TIM.IoU')     # Average IoU (NaNs allowed)
        # Generate RFID Features
        # Position: most frequent antenna per (BTI, Mouse) - on ties, the first value returned by mode() is used
        _pos = snip_info['RFID'].groupby(level=(0, 1)).agg(lambda grp: grp.mode().iloc[0]).to_frame('RFID.Pos')
        # Neighbourhood: each sample is joined with the RFIDs of all mice (one column per mouse) and the per-row
        #   neighbourhoods are summed within each group
        # NOTE(review): grouping by levels (0, 2) assumes that the join leaves Mouse as the third index level - confirm
        _ndf = snip_info[['RFID']].join(snip_info['RFID'].unstack(1)).groupby(level=(0, 2)).apply(per_grp_neighbours).to_frame('RFID.NHood')
        # Store (with BTIs re-based such that each snippet starts at BTI 0)
        loc_features[ds][(*seg_idx, snip_idx)] = pd.concat([_dets, _area, _bbox, _ious, _pos, _ndf], axis=1).rename(index=lambda x: x - _start_bti, level=0)
        progress.update()

### 1.3 LFB Features

#### 1.3.1 Extract the LFB Features

In [None]:
lfb_features = {}
# Handle each dataset split in turn
for ds in ('Tune', 'Test'):
    # Read the stored LFB features, together with the ground-truth columns used here (0: video, 1: BTI, 7: mouse id)
    lfb = torch.load(os.path.join(lfb_sources, f'lfb_{ds}.fix.pkl'), map_location='cpu')
    gts = pd.read_csv(os.path.join(behave_path, ds, 'AVA.Behaviours.csv'), header=None)[[0, 1, 7]]
    _records = lfb_features[ds] = []
    # Flatten the nested (video -> BTI -> features) structure into one record per feature-vector
    progress = ProgressBar(len(lfb)).reset(ds)
    for vid, vid_feat in lfb.items():
        # The video name encodes Cage, Segment and Snippet (underscore-separated integers)
        cid, seg, snip = (int(part) for part in vid.split('_'))
        for bti, bti_feat in vid_feat.items():
            for feat in bti_feat:
                # The n'th feature-vector must correspond to the n'th ground-truth row
                _gt_row = gts.iloc[len(_records)]
                assert (_gt_row[0] == vid) and (_gt_row[1] == bti)
                _records.append({
                    'CageID': cid,
                    'Segment': seg,
                    'Snippet': snip,
                    'BTI': bti,
                    'Mouse': MICE[_gt_row[7]],
                    'LFB.Raw': feat.numpy(),
                })
        progress.update()
    # Replace the list of records by an indexed DataFrame
    lfb_features[ds] = pd.DataFrame(_records).set_index(['CageID', 'Segment', 'Snippet', 'BTI', 'Mouse'])

## 2. Construct Feature/Target Set

In [None]:
# Each output set is built from the targets belonging to one or more of the fixed folds
_folds_per_set = {'Tune': ('Train', 'Validate'), 'Test': ('Test',)}
for ds, grp in _folds_per_set.items():
    # Location (TIM/RFID) features for this set, as a single frame
    _loc = pd.concat(loc_features[ds], names=['CageID', 'Segment', 'Snippet'])
    # Targets restricted to the folds making up this set
    _in_set = observables['DataSet.Fixed'].isin(grp)
    _obs = observables.loc[_in_set, 'Observable']
    # Combine targets with all features, keeping only the samples which have a target
    _features = _loc.join(lfb_features[ds])
    _df = pd.concat([_obs, _features], axis=1, keys=['Target', 'Features'])
    _df = _df.dropna(subset=[('Target', 'Observable')])
    _df.to_pickle(os.path.join(output_path, f'{ds}.fix.df'), compression='bz2')

In [None]:
# Construct normaliser
# NOTE(review): `data`, `joblib`, `BASE_RESULTS` and `MODELS` are neither defined nor imported in the visible cells.
#   Presumably `data['Tune']` is the 'Tune' DataFrame built in Section 2 (it has a top-level 'Features' column
#   group) - confirm, and define/import these names before running this cell.
# Fit the feature extractor/normaliser on the Tune features only (the positional constructor arguments are
#   defined by Tools.Features.ObservabilityFeatures), then persist the fitted object.
normaliser = ObservabilityFeatures(BTI, 60000, 1.5, 18, 30, as_frame=True).fit(data['Tune']['Features'])
joblib.dump(normaliser, os.path.join(BASE_RESULTS, MODELS, 'Pipeline', 'FeatureXtract.jlib'))