In [1]:
import os
import random

import cv2
import numpy as np
import pandas as pd

After extracting the sea lion coordinates, we extracted the coordinates of the background patches. To do so, we divided each image into a grid of non-overlapping 96x96-pixel cells, then mapped each sea lion patch onto the grid cells it overlaps and excluded those cells. Finally, we simply retrieved the centers of all the remaining cells.

In [2]:
# Fix the seed of Python's `random` module so the shuffle performed later
# in the notebook gives the same result on every Restart & Run All
SEED = 42
random.seed(SEED)

In [3]:
# Load the sea lion patch coordinates of the three splits (train, validation, test)
sealions_train_df, sealions_validation_df, sealions_test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        './sealions_train.csv',
        './sealions_validation.csv',
        './sealions_test.csv',
    )
)

In [4]:
def get_empty_coordinates(sealions_df, images_dir="./kaggle_sea_lions/Train/", patch_size=96):
    """Return the centers of the grid cells that do not overlap any sea lion patch.

    Every image referenced by ``sealions_df`` is divided into a grid of
    non-overlapping ``patch_size`` x ``patch_size`` cells whose first cell
    starts at pixel (0, 0). Each grid cell touched by the ``patch_size``-sized
    square centered on a sea lion coordinate is discarded; the centers of all
    the remaining complete cells are returned as background patches.

    Parameters
    ----------
    sealions_df : pandas.DataFrame
        Sea lion patches; the columns ``filename``, ``coord_x`` and
        ``coord_y`` are read.
    images_dir : str, optional
        Directory containing the images named in ``filename``.
    patch_size : int, optional
        Side of the square patches/cells in pixels (assumed even, so that the
        cell center ``patch_size // 2`` is exact).

    Returns
    -------
    list of list
        One ``[coord_x, coord_y, "background", filename]`` record per kept
        cell. Files appear in order of first appearance in ``sealions_df``
        and, within a file, cells are in grid order (x outer loop, y inner
        loop), so the output order does not depend on set iteration order.

    Raises
    ------
    ValueError
        If ``patch_size`` is not positive.
    FileNotFoundError
        If an image cannot be read by ``cv2.imread``.
    """
    if patch_size <= 0:
        raise ValueError("patch_size must be a positive integer, got %r" % (patch_size,))
    half = patch_size // 2

    def cell_center(value):
        # Center (along one axis) of the grid cell that contains `value`
        return (value // patch_size) * patch_size + half

    empty = []
    # sort=False keeps the files in order of first appearance
    for file, df in sealions_df.groupby('filename', sort=False):
        image_path = os.path.join(images_dir, file)
        # The image is only decoded to learn its size
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals a missing/unreadable file by returning None
            raise FileNotFoundError("Could not read image: " + image_path)
        im_size_y, im_size_x = image.shape[:2]

        # Centers of the grid cells touched by at least one sea lion patch
        exclude = set()
        for raw_x, raw_y in zip(df['coord_x'], df['coord_y']):
            x, y = int(raw_x), int(raw_y)
            # A patch spans [x - half, x + half] x [y - half, y + half], so it
            # can touch up to two cells along each axis (four cells in total)
            centers_x = (cell_center(x + half), cell_center(x - half))
            centers_y = (cell_center(y + half), cell_center(y - half))
            exclude.update((cx, cy) for cx in centers_x for cy in centers_y)

        # Enumerate every complete cell; the "+ 1" keeps the last row/column
        # when the image size is an exact multiple of patch_size
        for i in range(half, im_size_x - half + 1, patch_size):
            for j in range(half, im_size_y - half + 1, patch_size):
                if (i, j) not in exclude:
                    empty.append([i, j, "background", file])
    return empty


# Train and validation sea lion patches are merged here, so `empty_train`
# holds the background patches of both splits at this point
sealions_train_validation_df = pd.concat([sealions_train_df, sealions_validation_df])
empty_train = get_empty_coordinates(sealions_train_validation_df)

# Background patches of the test split
empty_test = get_empty_coordinates(sealions_test_df)

As before, we saved the coordinates in three .csv files: one for training, one for validation and one for testing. The patches were divided between train and test according to the image they come from, while the validation set was extracted by random sampling from the background patches of the train and validation images, taking as many patches as there are sea lion validation patches.

In [5]:
# Hold out as many background patches for validation as there are sea lion
# validation patches (one row per patch in sealions_validation_df)
length = len(sealions_validation_df)

# NOTE: this cell is not idempotent. `random.shuffle` reorders `empty_train`
# in place and `empty_train` is then reassigned to a slice of itself, so
# re-running the cell reshuffles and shrinks the training list again.
random.shuffle(empty_train)
empty_validation, empty_train = empty_train[:length], empty_train[length:]

# Same column layout for the three output files
patch_columns = ["coord_x", "coord_y", "class", "filename"]
empty_train_df = pd.DataFrame(data=empty_train, columns=patch_columns)
empty_validation_df = pd.DataFrame(data=empty_validation, columns=patch_columns)
empty_test_df = pd.DataFrame(data=empty_test, columns=patch_columns)

# Write one .csv per split, without the index column
for patches_df, patches_csv in (
    (empty_train_df, "./empty_train.csv"),
    (empty_validation_df, "./empty_validation.csv"),
    (empty_test_df, "./empty_test.csv"),
):
    patches_df.to_csv(patches_csv, index=False)