In [2]:
!pip install numpy==1.21.3

In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import cv2
import matplotlib.pyplot as plt
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as px
from PIL import Image, ImageEnhance
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Loading Data

In [4]:
# Root folder of the competition data; every other path hangs off it.
DATA_DIR = "/kaggle/input/sartorius-cell-instance-segmentation"

# Competition dataset: annotation table plus the three image folders
TRAIN_CSV = os.path.join(DATA_DIR, "train.csv")
TRAIN_DIR, TEST_DIR, SEMI_DIR = (
    os.path.join(DATA_DIR, subdir)
    for subdir in ("train", "test", "train_semi_supervised")
)

# LIVECELL extra dataset
LIVECELL_DIR = os.path.join(DATA_DIR, "LIVECelldataset_2021")
LIVECELL_ANN_DIR, LIVECELL_IMG_DIR = (
    os.path.join(LIVECELL_DIR, subdir)
    for subdir in ("annotations", "images")
)

In [28]:
train_df = pd.read_csv(TRAIN_CSV)
train_df.head()

## Inspecting train dataframe

In [6]:
train_df.info()

In [7]:
train_df.shape

In [8]:
train_df['cell_type'].unique()

## Visualize distribution

In [9]:
def plot_distribution(x, df=None):
    """Show an interactive histogram of one column.

    Args:
        x (str): Name of the column to put on the x axis.
        df (pd.DataFrame, optional): Frame to read the column from. When
            omitted, the notebook-level ``train_df`` is used as it exists at
            call time. Pass a frame explicitly to make the plot independent
            of later cells that rebind ``train_df``.

    Returns:
        None; renders the Plotly figure.
    """
    if df is None:
        df = train_df
    fig = px.histogram(
        df,
        x=x,
        title="Distribution of {}".format(x),
        width=800,
        height=500,
    )
    fig.show()

In [10]:
plot_distribution('cell_type')

In [11]:
plot_distribution('plate_time')

In [12]:
plot_distribution('elapsed_timedelta')

In [13]:
plot_distribution('id')

## Analyze statistics of images and annotations

In [14]:
train_df['cell_type'].value_counts()

In [15]:
train_df.nunique()

In [16]:
train_df['id'].value_counts().describe()

In [17]:
train_df[train_df['cell_type'] == 'shsy5y'].nunique()

In [18]:
train_df[train_df['cell_type'] == 'astro'].nunique()

In [19]:
train_df[train_df['cell_type'] == 'cort'].nunique()

In [20]:
train_df[train_df['cell_type'] == 'shsy5y']['id'].value_counts().describe()

In [21]:
train_df[train_df['cell_type'] == 'astro']['id'].value_counts().describe()

In [22]:
train_df[train_df['cell_type'] == 'cort']['id'].value_counts().describe()

## Data Summary
* There are three classes: shsy5y, astro, cort
* There are 606 training images
* There are 240 hidden test images
* All images have a width of 704 and a height of 520
* Number of annotations: shsy5y=52286, astro=10522, cort=10777, total=73585
    * This is an imbalanced dataset
    * On average, there are 121 cell labels per image
        * Average labels/image for shsy5y: 337
        * Average labels/image for astro: 80
        * Average labels/image for cort: 33
* There are three types of cells represented in the images, but only one type per image
    * There are 320 images with CELL_TYPE=cort
    * There are 155 images with CELL_TYPE=shsy5y
    * There are 131 images with CELL_TYPE=astro

## Visualize dataset

### Helper Functions
From https://www.kaggle.com/dschettler8845/sartorius-segmentation-eda-effdet-tf-train#helper_functions
- RLE encoder/decoder
- Visualizer

In [23]:
# ref: https://www.kaggle.com/paulorzp/run-length-encode-and-decode
# modified from: https://www.kaggle.com/inversion/run-length-decoding-quick-start
def rle_decode(mask_rle, shape, color=1):
    """Decode a run-length-encoded mask string into a dense array.

    Args:
        mask_rle (str): Whitespace-separated "start length" pairs. Starts are
            1-indexed positions in the flattened (row-major) image.
        shape (tuple of ints): (height, width) or (height, width, depth) of
            the array to return.
        color (int or float, optional): Value written into every pixel that
            is covered by a run. Defaults to 1.

    Returns:
        np.ndarray: float32 array of the requested shape
            - `color` inside the encoded runs
            - 0 for background

    Raises:
        ValueError: If `mask_rle` does not hold an even number of values,
            i.e. some start has no matching length.
    """
    # Split the string by whitespace, then convert it into an integer array
    s = np.array(mask_rle.split(), dtype=int)

    # Without this check a stray token is silently mis-decoded: numpy
    # broadcasts a single length over several starts instead of failing.
    if s.size % 2 != 0:
        raise ValueError(
            "RLE string must contain an even number of values, got {}".format(s.size)
        )

    # Every even position is a (1-indexed) start, every odd one a run length
    starts = s[0::2] - 1
    lengths = s[1::2]
    ends = starts + lengths

    # The image is built flattened since RLE is a 1D "run"
    if len(shape) == 3:
        h, w, d = shape
        img = np.zeros((h * w, d), dtype=np.float32)
    else:
        h, w = shape
        img = np.zeros((h * w,), dtype=np.float32)

    # Paint each run with the requested value
    for lo, hi in zip(starts, ends):
        img[lo:hi] = color

    # Restore the requested (2D or 3D) shape
    return img.reshape(shape)

def get_img_and_mask(img_path, annotation, width, height, mask_only=False, rle_fn=rle_decode):
    """Build the instance mask for one image and optionally load the image.

    Args:
        img_path (str): Path of the image file; only read when `mask_only`
            is False.
        annotation (iterable of str): One run-length-encoded mask per instance.
        width (int): Image width in pixels.
        height (int): Image height in pixels.
        mask_only (bool, optional): If True, return just the mask and skip
            reading the image.
        rle_fn (callable, optional): Decoder called as
            ``rle_fn(rle_string, (height, width))``; non-zero entries of its
            result mark the pixels of the instance.

    Returns:
        np.ndarray: the (height, width) int32 instance mask when `mask_only`
        is True; otherwise a tuple ``(img, mask)`` where `img` is the first
        channel of the image after reversing the channel order returned by
        ``cv2.imread``.

    Raises:
        FileNotFoundError: If the image cannot be read.
    """
    # int32 so that images with more than 255 instances keep distinct labels
    img_mask = np.zeros((height, width), dtype=np.int32)

    # Labels start at 1: 0 is reserved for background, otherwise the first
    # instance would be indistinguishable from it. Where instances overlap,
    # the later annotation wins.
    for label, annot in enumerate(annotation, start=1):
        img_mask[rle_fn(annot, (height, width)) != 0] = label

    # Early Exit
    if mask_only:
        return img_mask

    # cv2.imread signals failure by returning None instead of raising
    img = cv2.imread(img_path)
    if img is None:
        raise FileNotFoundError("Could not read image: {}".format(img_path))

    # Reverse the channel order, then keep a single channel
    img = img[..., ::-1]
    return img[..., 0], img_mask

def plot_img_and_mask(img, mask, bboxes=None, invert_img=True, boost_contrast=True):
    """ Function to take an image and the corresponding mask and plot

    Draws three panels side by side: the (optionally inverted and
    contrast-boosted) image, the instance mask, and the image blended with a
    binary overlay of the mask.

    Args:
        img (np.arr): 1 channel np arr representing the image of cellular structures
        mask (np.arr): 1 channel np arr representing the instance masks
            (0 is treated as background, any other value as an instance)
        bboxes (list of tuples, optional): (tl, br) coordinates of enclosing bboxes
        invert_img (bool, optional): Whether or not to invert the base image
        boost_contrast (bool, optional): Whether or not to boost contrast of the base image

    Returns:
        None; Plots the two arrays and overlays them to create a merged image
    """
    plt.figure(figsize=(20,10))

    plt.subplot(1,3,1)
    # Replicate the single channel three times so the image can be blended
    # with a 3-channel overlay later on
    _img = np.tile(np.expand_dims(img, axis=-1), 3)

    # Flip black-->white ... white-->black
    if invert_img:
        _img = _img.max()-_img

    if boost_contrast:
        _img = np.asarray(ImageEnhance.Contrast(Image.fromarray(_img)).enhance(16))

    # Explicit None/length check: `if bboxes:` is ambiguous for numpy arrays
    if bboxes is not None and len(bboxes) > 0:
        # cv2.rectangle draws in place; work on a copy so the caller's mask
        # is left untouched
        mask = mask.copy()
        for i, bbox in enumerate(bboxes):
            mask = cv2.rectangle(mask, bbox[0], bbox[1], (i+1, 0, 0), thickness=2)

    plt.imshow(_img)
    plt.axis(False)
    plt.title("Cell Image", fontweight="bold")

    plt.subplot(1,3,2)
    plt.imshow(mask, cmap="inferno")
    plt.axis(False)
    plt.title("Instance Segmentation Mask", fontweight="bold")

    # Binary overlay in the first channel. Comparing against 0 (instead of
    # copying labels into the image dtype) keeps labels that are multiples of
    # 256 from wrapping around to 0 and vanishing from the overlay.
    _mask = np.zeros_like(_img)
    _mask[..., 0] = np.where(mask != 0, 255, 0)

    merged = cv2.addWeighted(_img, 0.75, _mask, 0.25, 0.0)
    plt.subplot(1,3,3)
    plt.imshow(merged)
    plt.axis(False)
    plt.title("Cell Image w/ Instance Segmentation Mask Overlay", fontweight="bold")

    plt.tight_layout()
    plt.show()
    


In [29]:
# Prepare dataframe for visualizing: collapse the per-annotation table into
# one row per image, with all of that image's RLE strings gathered in a list.
train_df = pd.read_csv(TRAIN_CSV)
train_df["img_path"] = train_df["id"].apply(lambda x: os.path.join(TRAIN_DIR, x + ".png")) # Capture Image Path As Well

# All annotations of each image, indexed by image id
annotations_by_id = train_df.groupby("id")["annotation"].agg(list)

tmp_df = train_df.drop_duplicates(subset=["id", "img_path"]).reset_index(drop=True)
# Attach the lists by id, not by position: groupby orders its result by id
# while drop_duplicates keeps first-appearance order, so a positional
# assignment only lines up when the CSV happens to be sorted by id.
tmp_df["annotation"] = tmp_df["id"].map(annotations_by_id)
train_df = tmp_df.copy()
train_df.head()

In [30]:
# Show one image (picked by position) next to its decoded instance mask
index = 0
sample_row = train_df.iloc[index]
display(sample_row)

# Keep only the fields get_img_and_mask takes as keyword arguments
params = sample_row[["img_path", "annotation", "width", "height"]].to_dict()
img, msk = get_img_and_mask(**params)
plot_img_and_mask(img, msk)

## Cell: shsy5y 

In [39]:
# Draw the 10 examples once, without replacement. Re-sampling inside the loop
# and then taking row i could show the same image several times.
# random_state is fixed so a re-run shows the same images.
shsy5y_samples = train_df.loc[
    train_df["cell_type"] == "shsy5y",
    ["img_path", "annotation", "width", "height"],
].sample(10, random_state=42)

for params in shsy5y_samples.to_dict("records"):
    img, msk = get_img_and_mask(**params)
    plot_img_and_mask(img, msk)

## Cell: astro 

In [40]:
# Draw the 10 examples once, without replacement. Re-sampling inside the loop
# and then taking row i could show the same image several times.
# random_state is fixed so a re-run shows the same images.
astro_samples = train_df.loc[
    train_df["cell_type"] == "astro",
    ["img_path", "annotation", "width", "height"],
].sample(10, random_state=42)

for params in astro_samples.to_dict("records"):
    img, msk = get_img_and_mask(**params)
    plot_img_and_mask(img, msk)

## Cell: cort 

In [41]:
# Draw the 10 examples once, without replacement. Re-sampling inside the loop
# and then taking row i could show the same image several times.
# random_state is fixed so a re-run shows the same images.
cort_samples = train_df.loc[
    train_df["cell_type"] == "cort",
    ["img_path", "annotation", "width", "height"],
].sample(10, random_state=42)

for params in cort_samples.to_dict("records"):
    img, msk = get_img_and_mask(**params)
    plot_img_and_mask(img, msk)

## Visualization Summary
* Image with SHSY5Y
    - Very dense, lots of cells per image
    - Come in a variety of shapes
    - Lots of overlapping cells
    - Tends to be small relative to image size
* Image with ASTRO
    - The cell is large relative to image size
    - The shape tends to be thin and long
    - Contains highly overlapping cells
* Image with CORT
    - The cell is small relative to image size
    - Sparse, not a lot of cells per image
    - The shape is round

## Overall Summary

* There are three classes: shsy5y, astro, cort
* There are 606 training images
* There are 240 hidden test images
* All images have a width of 704 and a height of 520
* Number of annotations: shsy5y=52286, astro=10522, cort=10777, total=73585
    * This is an imbalanced dataset
    * On average, there are 121 cell labels per image
        * Average labels/image for shsy5y: 337
        * Average labels/image for astro: 80
        * Average labels/image for cort: 33
* There are three types of cells represented in the images, but only one type per image
    * There are 320 images with CELL_TYPE=cort
    * There are 155 images with CELL_TYPE=shsy5y
    * There are 131 images with CELL_TYPE=astro


* Image with SHSY5Y
    - Very dense, lots of cells per image
    - Come in a variety of shapes
    - Lots of overlapping cells
    - Tends to be small relative to image size
* Image with ASTRO
    - The cell is large relative to image size
    - The shape tends to be thin and long
    - Contains highly overlapping cells
* Image with CORT
    - The cell is small relative to image size
    - Sparse, not a lot of cells per image
    - The shape is round