# Outlier Detection

The goal is to remove outlier cells from the `hpacellseg` output.

Outliers on the training set:
* (Shape) Cells whose minimum bounding rectangle has a height-to-width (h, w) ratio outside the range that covers 95% of the data.
* (Shape) Cells that are very large compared to the image size or the other cells in the image. (?)
* (Shape) TBD: Cells where the nucleus's distance to the center lies outside the 95% quantile. (deformed cells?)
* (Color) Cells that have atypical mean and std in their image channels.
* (Position) Cells that are touching the edge of the image.
* (Position) TBD: Cells where the nucleus is missing, or intersecting with the edge of the image.

Outliers on the testing set:
* (Position) TBD: Cells where the nucleus is missing, or intersecting with the edge of the image.

In [None]:
import os
import importlib

import numpy
import pandas
import sklearn
import matplotlib.pyplot as plt

import cv2
import skimage
import pycocotools

import json
import ast

import src.utils
importlib.reload(src.utils)

from tqdm import tqdm
import multiprocessing, logging
from joblib import Parallel, delayed

In [None]:
# Read the training table from disk and preview its first rows.
train = pandas.read_csv(filepath_or_buffer="./data/train_cells.csv")
train.head(n=5)

Functions for parsing the precomputed, compressed RLEs of the train and test datasets.

In [23]:
def get_rle_from_df(row):
    """Decode the encoded mask string stored on one table row.

    Args:
        row: Object exposing ``RLEmask``, ``ImageHeight`` and ``ImageWidth``
            attributes (e.g. a row passed in by ``DataFrame.apply``).

    Returns:
        Whatever ``src.utils.decode_b64_string`` returns for the row's mask
        string and image size.
    """
    return src.utils.decode_b64_string(
        row.RLEmask,
        row.ImageHeight,
        row.ImageWidth,
    )
    
def get_mask_from_rle(rle):
    """Decode a single RLE object into a 2-D mask array.

    Args:
        rle: One RLE object in the form accepted by ``pycocotools``.

    Returns:
        The decoded mask for ``rle`` as a 2-D array (the first and only
        slice of the stacked decode result).
    """
    # A bare ``import pycocotools`` does not load its submodules, so import
    # the public mask API here instead of reaching into the private
    # ``pycocotools._mask`` extension module.
    import pycocotools.mask

    # decode() stacks one mask per list entry along the last axis; we pass a
    # single RLE, so take slice 0.
    stacked = pycocotools.mask.decode([rle])
    return stacked[:, :, 0]

In [24]:
# Decode the mask string of every row (row-wise apply) and preview the result.
rles = train.apply(get_rle_from_df, axis="columns")
rles.head(n=5)

In [25]:
# Preview the first decoded RLE objects.
rles.head(n=5)

0    {'counts': b'dWTd1h0Wo12N2L6Bb0H2N2L6K9H2N2N4K...
1    {'counts': b'd^d6c0\\o12N2L4D=M2N2L6WL1`VNl0]i...
2    {'counts': b'ieZW11ko15Mh0ZO3M1O1O1O1O4LV1jN01...
3    {'counts': b'dQea0l1Sn12N2L6jMjNmTNT2cj1nM[UNT...
4    {'counts': b'TboU14lo10O200O6J00100N<UOb0N1O2N...
dtype: object

In [None]:
# Turn every RLE object into its mask array, then preview the first few.
masks = rles.apply(get_mask_from_rle)
masks.head(n=5)

### Generate Outlier Metrics
Calculate the **bounding box**.

In [None]:
def get_bbox_from_rle(rle):
    """Return the axis-aligned bounding box of a single RLE object.

    Args:
        rle: One RLE object in the form accepted by ``pycocotools``.

    Returns:
        Tuple ``(x, y, w, h)`` of ints: the box anchor followed by its width
        and height, in pixels.

    NOTE(review): the original note said "x,y = bottom left". pycocotools
    reports the corner with the smallest column/row index, which is the
    top-left corner in image coordinates (row 0 at the top) and only looks
    like "bottom left" on axes where y points up -- confirm against how
    callers draw the box.
    """
    # A bare ``import pycocotools`` does not load its submodules, so import
    # the public mask API here instead of reaching into the private
    # ``pycocotools._mask`` extension module.
    import pycocotools.mask

    # The previous body referenced an undefined name (``encoded_mask``)
    # instead of the ``rle`` argument and raised NameError on every call.
    bbox = pycocotools.mask.toBbox([rle])[0]

    # toBbox yields floating-point values; callers expect plain ints.
    x, y, w, h = (int(value) for value in bbox)
    return x, y, w, h

Calculate the **minimum bounding rectangle** (rotated bounding box).

In [None]:
def get_mbr_from_mask(mask):
    """Return the minimum-area (rotated) bounding rectangle of a mask.

    Args:
        mask: 2-D array; every non-zero element counts as part of the cell.

    Returns:
        Tuple ``(x, y, l1, l2, phi)``: the rectangle centre ``(x, y)``, its
        two side lengths ``(l1, l2)`` and its rotation angle ``phi``, exactly
        as reported by ``cv2.minAreaRect``. Side lengths are measured between
        pixel coordinates, so a single-pixel mask yields a 0 x 0 rectangle.
        NOTE(review): which side OpenCV reports first, and the range of the
        angle, depend on the OpenCV version -- do not rely on ``l1 >= l2``.

    Raises:
        ValueError: If the mask contains no non-zero element.
    """
    # cv2.findNonZero only accepts single-channel 8-bit input, so binarise
    # first (this also handles bool or float masks).
    binary = (numpy.asarray(mask) != 0).astype(numpy.uint8)
    points = cv2.findNonZero(binary)
    if points is None or len(points) == 0:
        raise ValueError("cannot compute a bounding rectangle for an empty mask")

    (x, y), (l1, l2), phi = cv2.minAreaRect(points)
    return x, y, l1, l2, phi

In [None]:
def get_hw_from_mbr(mbr):
    """Extract the two side lengths of a minimum bounding rectangle.

    Args:
        mbr: 5-item sequence ``(x, y, l1, l2, phi)`` describing a rotated
            rectangle (centre, two side lengths, angle).

    Returns:
        Tuple ``(h, w)`` where ``h`` is the longer and ``w`` the shorter of
        the two side lengths.

    NOTE(review): ordering the sides as (longer, shorter) is a convention
    chosen here so that ``h / w`` does not depend on how the rectangle is
    rotated; confirm this matches the intended (h, w) ratio definition.
    """
    _, _, l1, l2, _ = mbr
    h, w = (l1, l2) if l1 >= l2 else (l2, l1)
    return h, w

In [None]:
# NOTE(review): n_workers, num_cores, segment_image, segmentator,
# images_frame, test and images are not defined in the visible part of this
# notebook; they must already exist when this cell runs.
# Fall back to num_cores when no worker count was supplied.
n_workers = n_workers or num_cores
# One segment_image job per entry of images, with a progress bar over the
# submission loop.
processed_list = Parallel(n_jobs=int(n_workers))(
    delayed(segment_image)(item, segmentator, images_frame, test)
    for item in tqdm(images)
)

In [None]:
# Parse each stored literal in the "touches" column into a Python object.
touch = train["touches"].apply(ast.literal_eval)