In [1]:
import numpy as np
import cv2
import matplotlib.pyplot as plt
import skimage.feature
import pandas as pd
%matplotlib inline

In [2]:
# data initialization
mismatched = [3, 7, 9, 21, 30, 34, 71, 77, 81, 89, 97, 151, 184, 215, 234, 242, 268, 290, 311, 331, 344,
              380, 384, 406, 421, 469, 475, 489, 490, 499, 507, 530, 531, 593, 605, 607, 614, 621, 638,
              644, 687, 712, 721, 767, 779, 781, 794, 800, 811, 839, 840, 869, 882, 901, 903, 905, 909,
              913, 927, 946]
# other = [40, 287, 330, 468, 473, 616, 761, 813, 873, 912, 921]
misclassified = [280, 338, 816]
bad_images = mismatched + misclassified
file_names_train = [str(x) + '.jpg' for x in range(0, 750) if x not in bad_images]
file_names_test = [str(x) + '.jpg' for x in range(751, 947) if x not in bad_images]
classes = ["adult_males", "subadult_males", "adult_females", "juveniles", "pups"]
coordinates_train_df = pd.DataFrame(index=file_names_train, columns=classes)
coordinates_test_df = pd.DataFrame(index=file_names_test, columns=classes)

In [3]:
import joblib
from joblib import Parallel, delayed
from functools import reduce

def get_coordinates(filename):
    sealions = []
    # get coordinates
    # read the Train and Train Dotted images
    image_1 = cv2.imread("./kaggle_sea_lions/TrainDotted/" + filename)
    image_2 = cv2.imread("./kaggle_sea_lions/Train/" + filename)

    # absolute difference between Train and Train Dotted
    image_3 = cv2.absdiff(image_1, image_2)

    # mask out blackened regions from Train Dotted
    mask_1 = cv2.cvtColor(image_1, cv2.COLOR_BGR2GRAY)
    mask_1[mask_1 < 20] = 0
    mask_1[mask_1 > 0] = 255

    mask_2 = cv2.cvtColor(image_2, cv2.COLOR_BGR2GRAY)
    mask_2[mask_2 < 20] = 0
    mask_2[mask_2 > 0] = 255

    image_3 = cv2.bitwise_or(image_3, image_3, mask=mask_1)
    image_3 = cv2.bitwise_or(image_3, image_3, mask=mask_2)

    # convert to grayscale to be accepted by skimage.feature.blob_log
    image_3 = cv2.cvtColor(image_3, cv2.COLOR_BGR2GRAY)

    # detect blobs
    blobs = skimage.feature.blob_log(image_3, min_sigma=3, max_sigma=4, num_sigma=1, threshold=0.02)

    #adult_males = []
    #subadult_males = []
    #pups = []
    #juveniles = []
    #adult_females = []

    for blob in blobs:
        # get the coordinates for each blob
        y, x, s = blob
        # get the color of the pixel from Train Dotted in the center of the blob
        g, b, r = image_1[int(y)][int(x)][:]

        # decision tree to pick the class of the blob by looking at the color in Train Dotted
        if r > 200 and g < 50 and b < 50:  # RED
            #adult_males.append((int(x), int(y)))
            sealions.append((int(x), int(y), "adult_male", filename))
        elif r > 200 and g > 200 and b < 50:  # MAGENTA
            #subadult_males.append((int(x), int(y)))
            sealions.append((int(x), int(y), "subadult_male", filename))
        elif r < 100 and g < 100 and 150 < b < 200:  # GREEN
            #pups.append((int(x), int(y)))
            sealions.append((int(x), int(y), "pup", filename))
        elif r < 100 and  100 < g and b < 100:  # BLUE
            #juveniles.append((int(x), int(y)))
            sealions.append((int(x), int(y), "juvenile", filename))
        elif r < 150 and g < 50 and b < 100:  # BROWN
            #adult_females.append((int(x), int(y)))
            sealions.append((int(x), int(y), "adult_female", filename))

    #coordinates_df["adult_males"][filename] = adult_males
    #coordinates_df["subadult_males"][filename] = subadult_males
    #coordinates_df["adult_females"][filename] = adult_females
    #coordinates_df["juveniles"][filename] = juveniles
    #coordinates_df["pups"][filename] = pups
    return sealions


sealions_train = Parallel(n_jobs=4, verbose=1, backend="threading")(map(delayed(get_coordinates), file_names_train))
sealions_train = reduce(lambda x, y: x+y, sealions_train)
sealions_test = Parallel(n_jobs=4, verbose=1, backend="threading")(map(delayed(get_coordinates), file_names_test))
sealions_test = reduce(lambda x, y: x+y, sealions_test)

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  1.1min


[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  4.9min


[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed: 11.3min


[Parallel(n_jobs=4)]: Done 705 out of 705 | elapsed: 17.9min finished


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  1.1min


[Parallel(n_jobs=4)]: Done 178 out of 178 | elapsed:  4.4min finished


In [4]:
# save coordinates
length = len(sealions_train)
sealions_validation = sealions_train[int(length*0.8):]
sealions_train = sealions_train[:int(length*0.8)]

sealions_train_df = pd.DataFrame(data=sealions_train, columns=["coord_x", "coord_y", "class", "filename"])
sealions_train_df.to_csv("./sealions_train.csv", index=False)

sealions_validation_df = pd.DataFrame(data=sealions_validation, columns=["coord_x", "coord_y", "class", "filename"])
sealions_validation_df.to_csv("./sealions_validation.csv", index=False)

sealions_test_df = pd.DataFrame(data=sealions_test, columns=["coord_x", "coord_y", "class", "filename"])
sealions_test_df.to_csv("./sealions_test.csv", index=False)