# requirements

In [None]:
import numpy as np
import pandas as pd
import os
from glob import glob
import tensorflow as tf
import numpy as np
from tensorflow import keras
import matplotlib.pyplot as plt
from pathlib import Path
from glob import glob
import pandas as pd
import cv2
from tqdm import tqdm
import albumentations as A 
from typing import Tuple
import keras.backend as K
from sklearn.model_selection import train_test_split
from skimage.io import imread, imshow, concatenate_images
from skimage import io, transform
from skimage.measure import label, regionprops
import seaborn as sns

In [None]:
# Project root as a relative path; the data paths used below are built from it.
PROJECT_PATH = "../"

# utils

In [None]:
def rle_decode(mask_rle, shape=(768, 768)):
    '''
    Decode a run-length-encoded mask into a 2-D array.

    mask_rle: run-length as string formatted "start length start length ...";
              starts are 1-based positions in the column-major flattened
              image. None, NaN and the empty string decode to an
              all-background mask.
    shape: (height, width) of array to return
    Returns a uint8 numpy array of that shape, 255 - mask, 0 - background
    '''
    flat = np.zeros(shape[0] * shape[1], dtype=np.uint8)

    # Rows without a mask hold NaN instead of an RLE string.
    if mask_rle is None or (isinstance(mask_rle, float) and np.isnan(mask_rle)):
        return flat.reshape(shape, order='F')

    tokens = mask_rle.split()
    starts = np.asarray(tokens[0::2], dtype=int) - 1  # 1-based -> 0-based
    lengths = np.asarray(tokens[1::2], dtype=int)
    for lo, hi in zip(starts, starts + lengths):
        flat[lo:hi] = 255

    # Column-major reshape aligns with the RLE direction and, unlike
    # reshape(shape).T, also yields (height, width) for non-square shapes.
    return flat.reshape(shape, order='F')
def apply_masks_to_img(img, _id, df):
    '''Apply masks to image given img, its id and the dataframe.

    img: image array indexed as [row, col, channel]
    _id: value of the ImageId column whose masks are applied
    df: dataframe with ImageId and EncodedPixels columns
    Returns an array shaped like img that keeps the pixels covered by any
    mask of that image and is zero elsewhere (all zeros when the image has
    no masks).
    '''
    # Rows without a mask hold NaN and cannot be decoded, so drop them first.
    rles = df.loc[df.ImageId == _id, 'EncodedPixels'].dropna()

    covered = np.zeros(img.shape[:2], dtype=bool)
    for rle in rles:
        covered |= rle_decode(rle, shape=img.shape[:2]) > 0

    # Multiply by a boolean mask: decoded masks hold 255, and multiplying a
    # uint8 image by 255 would wrap around instead of keeping the pixel.
    return img * covered[:, :, np.newaxis]
def get_img(imgid):
    '''Read the training image with the given ID and return it as an array.'''
    train_dir = Path(f'{PROJECT_PATH}/data/train_v2/')
    image_file = train_dir / '{}'.format(imgid)
    return plt.imread(image_file)
# Apply a particular mask over the image
def apply_mask(image, mask):
    '''
    Return a copy of the image with the masked pixels highlighted.

    image: array indexed as [row, col, channel]; it is not modified
    mask: 2-D array; pixels equal to 255 are treated as mask
    Returns the copy, with channels 0 and 1 set to 255 wherever mask == 255
    '''
    highlighted = image.copy()
    rows, cols = np.where(mask == 255)
    # One vectorised assignment instead of a Python loop over every pixel.
    highlighted[rows, cols, :2] = 255
    return highlighted
def get_mask(image_id):
    '''
    Build the combined mask of one image.

    image_id: value of the ImageId column to look up in the module-level
              df_train dataframe
    Returns a (768, 768) uint8 array, the bitwise OR of every decoded mask of
    that image (all zeros when the image has no masks)
    '''
    # Rows without a mask hold NaN and cannot be decoded, so drop them first.
    rle_codes = df_train.loc[df_train["ImageId"] == image_id, "EncodedPixels"].dropna().values
    mask = np.zeros((768, 768), dtype=np.uint8)
    for rle_code in rle_codes:
        mask_ship = rle_decode(rle_code)
        mask = cv2.bitwise_or(mask, mask_ship)
    return mask
def show_pixels_distribution(df, image_shape=(768, 768)):
    """
    Prints the amount of ship and no-ship pixels in the df

    df: dataframe with 'ImageId' and 'EncodedPixels' columns; rows whose
        'EncodedPixels' is missing contribute no ship pixels
    image_shape: (height, width) assumed for every image in df
    """
    # Total images in the df
    n_images = df['ImageId'].nunique()

    # Total pixels in the df
    total_pixels = n_images * image_shape[0] * image_shape[1]

    # Keep only rows with RLE codes. Every second RLE token is a run length,
    # so summing those gives the pixel count without decoding a full mask.
    ship_pixels = sum(
        int(run_length)
        for rle in df['EncodedPixels'].dropna()
        for run_length in rle.split()[1::2]
    )

    # An empty df has no pixels at all; report a zero ratio instead of dividing by zero.
    ratio = ship_pixels / total_pixels if total_pixels else 0.0
    print(f"Ship: {round(ratio, 3)} ({ship_pixels})")
    print(f"No ship: {round(1 - ratio, 3)} ({total_pixels - ship_pixels})")
    
def rle_codes_to_mask(rle_codes, image_size):
    '''
    Merge several run-length-encoded masks into a single mask.

    rle_codes: iterable of RLE strings; missing values (None/NaN) are skipped
    image_size: (height, width) of the mask to build
    Returns a uint8 array of shape image_size, the bitwise OR of all decoded masks
    '''
    mask = np.zeros(image_size, dtype=np.uint8)
    for rle_code in rle_codes:
        # Rows without a mask hold NaN instead of an RLE string.
        if pd.isna(rle_code):
            continue
        # Decode at the requested size so both bitwise_or operands match.
        mask_ship = rle_decode(rle_code, shape=image_size)
        mask = cv2.bitwise_or(mask, mask_ship)
    return mask

# load dataset

In [None]:
# Segmentation labels; the cells below read the ImageId and EncodedPixels columns.
train = pd.read_csv(f"{PROJECT_PATH}/data/train_ship_segmentations_v2.csv")

# Initial statistics

Let's find the number of images in each class.

In [None]:
# Split the image IDs by whether their rows carry an RLE mask.
has_mask = train.EncodedPixels.notna()
ships = train.loc[has_mask, 'ImageId'].unique()
noships = train.loc[~has_mask, 'ImageId'].unique()

class_names = ['Ships', 'No Ships']
class_sizes = [len(ships), len(noships)]
plt.bar(class_names, class_sizes);
plt.ylabel('Number of Images');

The classes are highly imbalanced, so metrics that are robust to class imbalance (for example, weighted ones) will be appropriate.

In [None]:
df_train = train
train_files = os.listdir(f"{PROJECT_PATH}/data/train_v2/")
test_files = os.listdir(f"{PROJECT_PATH}/data/test_v2/")

# Count number of ships per image (one row per image with at least one mask)
df_wships = (
    df_train.dropna()
    .groupby('ImageId')
    .size()
    .reset_index(name='counts')
)
# Rows without an RLE mask, i.e. images without ships
df_woships = df_train[df_train['EncodedPixels'].isna()]

n_wships = df_wships.shape[0]
n_woships = df_woships.shape[0]
n_train_files = len(train_files)

print('Number of images with ships :     %d \nNumber of images without ships : %d\n  \nProportion: %0.1f\n ' \
      % (n_wships, n_woships, n_wships / n_woships))

# Shares are taken relative to the number of files in the training folder.
print('Ratio with ships:     ' + str(round(n_wships / n_train_files, 2)))
print('Ratio without ships:  ' + str(round(n_woships / n_train_files, 2)))

# Make plots: absolute counts on the left, shares of the training files on the right
group_names = ['With ships', 'Without ships']
group_colors = ['lightblue', 'pink']

_, (ax_counts, ax_shares) = plt.subplots(1, 2, figsize=(15, 6))

ax_counts.bar(group_names, [n_wships, n_woships], color=group_colors)
ax_counts.set_ylabel('Number of images')
ax_counts.set_title('Unbalanced Training Data')
ax_counts.grid()

ax_shares.bar(group_names, [n_wships / n_train_files, n_woships / n_train_files], color=group_colors)
# This panel shows shares, not counts, so it must not reuse the count label.
ax_shares.set_ylabel('Fraction of images')
ax_shares.set_title('Unbalanced Training Data (Normalized)')
ax_shares.grid()

In [None]:
# Plot histogram
max_ships = df_wships['counts'].max()
# Bin edges at 0.5, 1.5, ..., max_ships + 0.5: every integer count gets its own
# bin, including the largest one (edges ending at max_ships - 0.5 would drop it).
hist = df_wships.hist(column='counts', bins=np.arange(max_ships + 1) + 0.5)
plt.xticks(range(1, max_ships + 1))
plt.title("Histogram of ships count")
plt.xlabel("Number of ships")
plt.ylabel("Number of images")
plt.show()

# Visualizations

Let's show a few images.

In [None]:
# Draw 25 random rows that carry a mask and show their images in a 5x5 grid.
sample = train[train.EncodedPixels.notna()].sample(25)

fig, ax = plt.subplots(5, 5, sharex='col', sharey='row')
fig.set_size_inches(20, 20)

# ax.flat walks the grid row by row, matching row = i // 5, col = i % 5.
for panel, imgid in zip(ax.flat, sample.ImageId):
    path = Path(f'{PROJECT_PATH}/data/train_v2/') / '{}'.format(imgid)
    img = plt.imread(path)
    panel.imshow(img)

In [None]:
# Image visualizations

In [None]:
# Plot some masks
w = 6
h = 2

_, axes_list = plt.subplots(h, w, figsize=(2*w, 2*h))
plt.subplots_adjust(wspace=0.4)

# The pool of RLE strings is the same for every panel, so build it once
# instead of calling dropna() on the whole frame in each iteration.
ship_rles = df_train.dropna()['EncodedPixels']
for ax in axes_list.flat:
    ax.axis('auto')
    mask = rle_decode(np.random.choice(ship_rles))
    ax.imshow(mask);

In [None]:
w = 6
h = 2

_, axes_list = plt.subplots(h, w, figsize=(2*w, 2*h))
plt.subplots_adjust(wspace=0.4)

# Candidate image IDs are the same for every panel, so build them once
# instead of calling dropna() on the whole frame in each iteration.
ship_image_ids = df_train.dropna()['ImageId']
for ax in axes_list.flat:
    ax.axis('off')
    image_id = np.random.choice(ship_image_ids)
    # All masks of the chosen image are merged into one mask before overlaying.
    rle_codes = df_train[df_train["ImageId"]==image_id]['EncodedPixels'].values
    mask = rle_codes_to_mask(rle_codes,(768,768))
    image = plt.imread(f"{PROJECT_PATH}/data/train_v2/{image_id}")
    image = apply_mask(image,mask)
    ax.imshow(image);

We can see that ships are mostly red or gray in color, while the sea or ocean is black, green, or blue. Let's compute color statistics.

# Statistics by color

In [None]:
# Boolean Series over the rows of `train`: True where EncodedPixels is missing.
mask = train.EncodedPixels.isna()

In [None]:
sample_size = 250
sample_images_ids_without_ships = train[mask]["ImageId"].sample(sample_size,random_state=42).values
# Deduplicate on ImageId: an image has one row per mask, so a plain
# drop_duplicates() over all columns removes nothing and an image with
# several masks could be sampled more than once.
sample_images_ids_with_ships = (
    train[~mask]
    .drop_duplicates(subset="ImageId")
    .sample(sample_size, random_state=42)["ImageId"]
    .values
)

In [None]:
# Load both samples into arrays; tqdm shows a progress bar for each.
sample_images_without_ships = np.array(
    list(map(get_img, tqdm(sample_images_ids_without_ships)))
)
sample_images_with_ships = np.array(
    list(map(get_img, tqdm(sample_images_ids_with_ships)))
)

In [None]:
fig, ax = plt.subplots(1, 2, sharex='col')
fig.set_size_inches(20, 6)

# (legend name, line colour) for channels 0, 1 and 2 of the last image axis
channel_styles = [('red', 'orangered'), ('green', 'yellowgreen'), ('blue', 'skyblue')]
image_sets = [sample_images_with_ships, sample_images_without_ships]
panel_titles = ["With ships", "Without ships"]

# The loop variable is deliberately not called `label`: that name is the
# function imported from skimage.measure and would be overwritten.
for panel, imgs, panel_title in zip(ax, image_sets, panel_titles):
    for channel, (name, colour) in enumerate(channel_styles):
        # minlength=256 gives every curve the same x-range even when the
        # brightest values are absent -- assumes 8-bit images, TODO confirm.
        counts = np.bincount(imgs[:, :, :, channel].ravel(), minlength=256)
        panel.plot(counts, color=colour, label=name, lw=2)
    panel.legend()
    panel.title.set_text(panel_title)
    panel.set_xlabel('Pixel value')
    panel.set_ylabel('Number of pixels')
As we can see from the plots above, the color distribution is more skewed for the images without ships. Also, in the left plot (images with ships) the red channel shows local peaks alongside the green and blue channels.

Results (considering the whole dataset):

In [None]:
# Ship vs. no-ship pixel share over every image listed in df_train.
show_pixels_distribution(df_train)

Results (considering only the images with ships):

In [None]:
# Same statistic after dropna(), i.e. without the rows that have a missing value.
show_pixels_distribution(df_train.dropna())

<b>Conclusion:</b>

The dataset is highly imbalanced, so training a U-Net for ship segmentation will require a careful selection of losses: Dice loss, and possibly focal loss or other options. Augmentation can vary the existing images and help the model learn the general distribution. The model needs to be trained for quite a long time to produce good results, so proof-of-concept weights will be presented.