# Whale identification from their tails - Kaggle contest

## Dependencies

In [None]:
import numpy as np
import glob
import pandas as pd
import os
import shutil
import random
import matplotlib.pyplot as plt
import cv2
import matplotlib.pyplot as plt

## Dataset reading

In [None]:
class Dataset:
    """
    Manage dataset loading, train/dev splitting and tail-crop preprocessing

    :param dataset_path: str, path to the dataset folder. "<dataset_path>/train.csv"
        is read at construction time; "<dataset_path>/train" is read when
        splitting; "<dataset_path>/test" is only stored.
    """

    def __init__(self, dataset_path):

        # Build and store dataset paths
        self.dataset_path_train = dataset_path + "/train"
        self.dataset_path_test = dataset_path + "/test"

        # Build train.csv path
        self.dataset_path_train_label = dataset_path + "/train.csv"

        # Generate pandas dataframe of whales id <-> file matching
        self.dataset_train_label = self.get_train_label()

        # Picture path listing is currently disabled; it can be done on demand with:
        #   self.get_im_path(self.dataset_path_train + "/*.jpg")
        #   self.get_im_path(self.dataset_path_test + "/*.jpg")

    def get_im_path(self, dataset_path):
        """
        Get the paths matching the given glob pattern
        :param dataset_path: str, glob pattern (e.g. "<folder>/*.jpg")

        :output les_im_path: list of string, matching paths sorted alphabetically
        """
        print("Getting images path from", dataset_path)
        return sorted(glob.glob(dataset_path))

    def get_train_label(self):
        """
        Load the train dataset annotation using pandas

        :return train_label: pandas dataframe, whales id <-> files matching
        """
        print("Loading", self.dataset_path_train_label)
        return pd.read_csv(self.dataset_path_train_label)

    def create_folder_is_needed(self, folder_path):
        """
        Create a folder if it doesn't already exist
        Missing parent folders are created too, so a nested path works even
        when none of its parents exist yet.
        :param folder_path: str
        """
        os.makedirs(folder_path, exist_ok=True)

    def split_in_classes_folders(self, root_classes_folder, remove_new_whale=True, remove_old=True, train_dev_ratio=0.2, crop=False, split=True):
        """
        Copy (or crop) the train pictures into "train" and "dev" subfolders
        :param root_classes_folder: str, output folder; "train" and "dev" are created inside
        :param remove_new_whale: boolean, set to true if not considering new_whale id
        :param remove_old: boolean, set to true to delete root_classes_folder before starting
        :param train_dev_ratio: float in [0, 1], probability of sending a picture to
            "dev" once its target folder exists in both "train" and "dev"
        :param crop: boolean, set to true to write the output of
            get_convex_masked_tail instead of copying the original file
        :param split: boolean, set to true to sort pictures into one subfolder per
            whale id; otherwise pictures go directly under "train" / "dev"

        OUTPUT:
            pictures sort by whale id into subfolders of the root_classes_folder
            (train ids that never received a dev picture are removed at the end)
        """

        if remove_old and os.path.isdir(root_classes_folder):
            print("Removing previous spliting")
            shutil.rmtree(root_classes_folder)

        sub_dataset_train_path = os.path.join(root_classes_folder, "train")
        sub_dataset_dev_path = os.path.join(root_classes_folder, "dev")

        # Create the folders if needed (the root is created along the way)
        self.create_folder_is_needed(sub_dataset_train_path)
        self.create_folder_is_needed(sub_dataset_dev_path)

        # If passing new whales, should remove previous folder
        if remove_new_whale:
            print("Removing new_whale folder")
            for sub_dataset_path in (sub_dataset_train_path, sub_dataset_dev_path):
                new_whale_path = os.path.join(sub_dataset_path, "new_whale")
                if os.path.isdir(new_whale_path):
                    shutil.rmtree(new_whale_path)

        # Sorting the images
        files_number = len(self.dataset_train_label)
        if split:
            print("Sorting and preprocessing", files_number, "images into", root_classes_folder)
        else:
            print("Preprocessing", files_number, "images into", root_classes_folder)

        # Count with enumerate so the progress display does not depend on the
        # dataframe index being a 0-based integer range
        for position, (_, row) in enumerate(self.dataset_train_label.iterrows(), start=1):
            print("#" + str(position) + "/" + str(files_number), end="\r")
            whale_file_name = row['Image']
            whale_id = row['Id'] if split else ""
            if remove_new_whale and whale_id == "new_whale":
                continue

            ## Choose if storing in train or dev dataset
            # 1st picture of an id -> train, 2nd -> dev (so every kept id has
            # both), following ones -> dev with probability train_dev_ratio
            train_id_path = os.path.join(sub_dataset_train_path, whale_id)
            dev_id_path = os.path.join(sub_dataset_dev_path, whale_id)
            if not os.path.isdir(train_id_path):
                sub_dataset_path = sub_dataset_train_path
            elif random.uniform(0, 1) >= train_dev_ratio and os.path.isdir(dev_id_path):
                sub_dataset_path = sub_dataset_train_path
            else:
                sub_dataset_path = sub_dataset_dev_path

            # Create folder and generate path name
            self.create_folder_is_needed(os.path.join(sub_dataset_path, whale_id))
            orig_path = os.path.join(self.dataset_path_train, whale_file_name)
            target_path = os.path.join(sub_dataset_path, whale_id, whale_file_name)
            if crop:
                cropped = self.get_convex_masked_tail(orig_path)
                # cv2.imwrite reports failure through its return value only
                if not cv2.imwrite(target_path, cropped):
                    print("Warning: could not write " + target_path)
            else:
                shutil.copy(orig_path, target_path)

        print("Done" + " "*20)

        ## Removing folder that doesn't have a dev equivalent
        for train_path in glob.glob(os.path.join(sub_dataset_train_path, "*", "")):
            # normpath drops the trailing separator whatever the OS separator is
            train_path = os.path.normpath(train_path)
            in_train_id = os.path.basename(train_path)
            if not os.path.isdir(os.path.join(sub_dataset_dev_path, in_train_id)):
                shutil.rmtree(train_path)

    def disp_image(self, im, gray=False):
        """
        Display an image in a plt plot
        :param im: image, numpy array format, used by cv2
        :param gray: bool, set to true to use the gray colormap
        """
        # NOTE(review): colour images are passed to matplotlib as loaded by
        # cv2 (presumably BGR), so red and blue look swapped - confirm if
        # faithful colours matter.
        if gray:
            plt.imshow(im, cmap='gray')
        else:
            plt.imshow(im)
        plt.show()

    def dual_threshold(self, im_temp, disp=False):
        """
        Build a mask of the brightest and darkest pixels of a gray image
        :param im_temp: gray image, numpy array format
        :param disp: bool, set to true to display intermediate images

        :return im_temp: merged binary mask smoothed by a 9x9 gaussian blur
        """

        # Work on python floats: arithmetic on a numpy uint8 scalar
        # (e.g. darker + 10) can wrap around depending on the numpy version
        lighter = float(np.max(im_temp))
        darker = float(np.min(im_temp))

        # Compute lighter threshold: keep pixels above 90% of the maximum
        _, light_mask = cv2.threshold(im_temp, lighter*0.9, 255, cv2.THRESH_BINARY)
        if disp:
            print("thres1")
            self.disp_image(light_mask, gray=True)

        # Compute darker threshold: keep pixels at or below (min + 10) * 1.5
        _, dark_mask = cv2.threshold(im_temp, (darker+10)*1.5, 255, cv2.THRESH_BINARY_INV)
        if disp:
            print("thres2")
            self.disp_image(dark_mask, gray=True)

        # Merging
        merged = cv2.add(light_mask, dark_mask)
        if disp:
            print("threshold merge")
            self.disp_image(merged, gray=True)

        return cv2.GaussianBlur(merged, (9, 9), 0)

    def otsu(self, im_temp, disp=False):
        """
        Apply an inverted binary Otsu threshold to a gray image
        :param im_temp: gray image, numpy array format
        :param disp: bool, set to true to display the result

        :return im_temp: thresholded image
        """
        _, im_temp = cv2.threshold(im_temp, 0, 255, cv2.THRESH_BINARY_INV+cv2.THRESH_OTSU)
        if disp:
            self.disp_image(im_temp, gray=True)

        return im_temp

    def compute_contour(self, im_temp):
        """
        Find the external contour with the biggest area
        :param im_temp: binary image, numpy array format

        :return max_area: area of the biggest contour, 0 if no contour was found
        :return cnt: the biggest contour, None if no contour was found
        """
        # Depending on the OpenCV version findContours returns
        # (contours, hierarchy) or (image, contours, hierarchy):
        # the contours are always the second to last element
        found = cv2.findContours(im_temp, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        contours = found[-2]
        if len(contours) == 0:
            return 0, None

        # get bigger area and store its contour
        areas = [cv2.contourArea(cnt) for cnt in contours]
        biggest = int(np.argmax(areas))
        return areas[biggest], contours[biggest]

    def get_convex_masked_tail(self, im_path, disp=False):
        """
        Return the image of the tail-convex-crop
        The picture is resized to a width of 200 pixels; pixels strictly inside
        the convex hull of the biggest thresholded contour keep their gray
        value, the others are set to 0. If no contour of area >= 1000 is
        found, the plain resized gray image is returned instead.
        :param im_path: str, path to the image
        :param disp: bool, set to true to display

        :return crop: numpy array, masked image (same shape as the resized
            colour image) or the 2D gray image when no crop was applied
        :raises IOError: if the image cannot be read
        """
        # Load image and resize without changing ratio
        im = cv2.imread(im_path)
        if im is None:
            # cv2.imread does not raise on a missing / unreadable file
            raise IOError("Could not read image: " + str(im_path))
        if disp:
            print("Original")
            self.disp_image(im)

        height, width = im.shape[:2]
        new_width = 200
        # Keep at least one row so very wide pictures can still be resized
        new_height = max(1, new_width*height//width)
        im = cv2.resize(im, (new_width, new_height), interpolation = cv2.INTER_CUBIC)

        # Change to gray and apply both gaussian and threshold filter
        im_gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
        im_temp = im_gray.copy()

        # default preprocessing
        use_dual_threshold = False
        use_otsu = True
        # Contours smaller than this area are considered irrelevant
        min_area = 1000

        max_area, cnt = 0, None

        if use_otsu:
            im_temp = self.otsu(im_temp, disp)
            max_area, cnt = self.compute_contour(im_temp)
            if max_area < min_area:
                use_dual_threshold = False
                use_otsu = False

        # NOTE(review): with the defaults above this fallback never runs, and
        # if both toggles were enabled it would receive the Otsu output rather
        # than the gray image - confirm the intent before enabling it.
        if use_dual_threshold:
            im_temp = self.dual_threshold(im_temp, disp)
            max_area, cnt = self.compute_contour(im_temp)
            if max_area < min_area:
                use_dual_threshold = False

        # Check if used preprocessing cropping
        if use_otsu or use_dual_threshold:
            # Get convex contour
            cnt_hull = cv2.convexHull(cnt)

            # Keep only the pixels strictly inside the hull
            crop = np.zeros_like(im)
            for x in range(crop.shape[1]):
                for y in range(crop.shape[0]):
                    if cv2.pointPolygonTest(cnt_hull, (x, y), False) == 1:
                        crop[y, x] = im_gray[y, x]

            if disp:
                print("im crop")
                print(max_area)
                self.disp_image(crop, gray=True)

                print("raw contour")
                im2 = cv2.drawContours(im.copy(), cnt, -1, (255,0,0), 4)
                self.disp_image(im2)

                print("convex contour")
                im3 = cv2.drawContours(im.copy(), cnt_hull, -1, (255,0,0), 10)
                self.disp_image(im3)
        else:
            # If preprocessing not relevant, keep gray image
            crop = im_gray

        return crop

# Instantiate the loader on the local "dataset" folder (reads dataset/train.csv)
dataset = Dataset("dataset")
# Preprocess every train picture (tail crop) into dataset/train_classes,
# without sorting by whale id and keeping the new_whale pictures
dataset.split_in_classes_folders(
    "dataset/train_classes",
    remove_new_whale=False,
    remove_old=True,
    train_dev_ratio=0.2,
    crop=True,
    split=False,
)

# a = dataset.get_convex_masked_tail("dataset/train/5422b5568.jpg", True)
# a = dataset.get_convex_masked_tail("dataset/train/0a0c1df99.jpg", True)
# a = dataset.get_convex_masked_tail("dataset/train/17f3cae5a.jpg", True)
# a = dataset.get_convex_masked_tail("dataset/train/00de0f4c8.jpg", True)
# "dataset/train/0a0c1df99.jpg"
# "dataset/train/0a1a0c3f7.jpg"