# Whale identification from their tails - Kaggle contest

## Dependencies

In [92]:
import numpy as np
import glob
import pandas as pd
import os
import shutil
import random
import matplotlib.pyplot as plt
import cv2
import matplotlib.pyplot as plt

## Dataset reading

In [115]:
class Dataset:
    """
    Manage dataset loading, train/dev splitting and tail cropping.

    The dataset folder is expected to contain a ``train`` folder, a ``test``
    folder and a ``train.csv`` file with an ``Image`` column (picture file
    name) and an ``Id`` column (whale id).

    :param dataset_path: str, path to the dataset folder
    """

    def __init__(self, dataset_path):

        # Build and store dataset paths
        self.dataset_path_train = dataset_path + "/train"
        self.dataset_path_test = dataset_path + "/test"

        # Build train.csv path
        self.dataset_path_train_label = dataset_path + "/train.csv"

        # Generate pandas dataframe of whales id <-> file matching
        self.dataset_train_label = self.get_train_label()

        ## Get pictures paths (currently disabled)
#         self.les_im_path_train = self.get_im_path(self.dataset_path_train + "/*.jpg")
#         self.les_im_path_test = self.get_im_path(self.dataset_path_test + "/*.jpg")

    def get_im_path(self, dataset_path):
        """
        Get the picture paths matching the given glob pattern
        :param dataset_path: str, glob pattern (e.g. "<folder>/*.jpg")

        :output les_im_path: list of string, sorted paths matching the pattern
        """
        print("Getting images path from", dataset_path)
        return sorted(glob.glob(dataset_path))

    def get_train_label(self):
        """
        Load the train dataset annotation using pandas

        :return train_label: pandas dataframe, whales id <-> files matching
        """
        print("Loading", self.dataset_path_train_label)
        return pd.read_csv(self.dataset_path_train_label)

    def create_folder_is_needed(self, folder_path):
        """
        Create a folder if it doesn't already exist

        Missing parent folders are created as well.
        :param folder_path: str
        """
        os.makedirs(folder_path, exist_ok=True)

    def split_in_classes_folders(self, root_classes_folder, remove_new_whale=True, remove_old=True, train_dev_ratio=0.2, crop=False):
        """
        Split the train dataset into per-whale-id folders, for train and dev
        :param root_classes_folder: str, folder receiving the "train" and "dev" subfolders
        :param remove_new_whale: boolean, set to true if not considering new_whale id
        :param remove_old: boolean, set to true to delete a previous split first
        :param train_dev_ratio: float in [0, 1], probability of sending a picture
            to dev once its id already has both a train and a dev folder
        :param crop: boolean, set to true to store the convex-masked tail
            instead of a plain copy of the picture

        OUTPUT:
            pictures sorted by whale id into
            root_classes_folder/train/<id> and root_classes_folder/dev/<id>.
            Ids that end up without a dev folder are removed from train.
        """

        if remove_old and os.path.isdir(root_classes_folder):
            print("Removing previous spliting")
            shutil.rmtree(root_classes_folder)

        sub_dataset_train_path = os.path.join(root_classes_folder, "train")
        sub_dataset_dev_path = os.path.join(root_classes_folder, "dev")

        # Create the folders if needed
        self.create_folder_is_needed(root_classes_folder)
        self.create_folder_is_needed(sub_dataset_train_path)
        self.create_folder_is_needed(sub_dataset_dev_path)

        # If skipping new whales, leftovers from a previous split must go
        if remove_new_whale:
            print("Removing new_whale folder")
            for sub_dataset_path in (sub_dataset_train_path, sub_dataset_dev_path):
                new_whale_path = os.path.join(sub_dataset_path, "new_whale")
                if os.path.isdir(new_whale_path):
                    shutil.rmtree(new_whale_path)

        # Sorting the images
        labels = self.dataset_train_label
        files_number = len(labels)
        print("Sorting", files_number, "images into", root_classes_folder)

        # Count positions explicitly so the progress display does not depend
        # on the dataframe index labels
        for position, (whale_file_name, whale_id) in enumerate(zip(labels['Image'], labels['Id']), start=1):
            print("#" + str(position) + "/" + str(files_number), end="\r")
            if remove_new_whale and whale_id == "new_whale":
                continue

            train_id_path = os.path.join(sub_dataset_train_path, whale_id)
            dev_id_path = os.path.join(sub_dataset_dev_path, whale_id)

            ## Choose if storing in train or dev dataset:
            ## first picture of an id -> train, second -> dev,
            ## then dev with probability train_dev_ratio
            if not os.path.isdir(train_id_path):
                sub_dataset_path = sub_dataset_train_path
            elif random.uniform(0, 1) >= train_dev_ratio and os.path.isdir(dev_id_path):
                sub_dataset_path = sub_dataset_train_path
            else:
                sub_dataset_path = sub_dataset_dev_path

            # Create folder and generate path name
            self.create_folder_is_needed(os.path.join(sub_dataset_path, whale_id))
            orig_path = os.path.join(self.dataset_path_train, whale_file_name)
            target_path = os.path.join(sub_dataset_path, whale_id, whale_file_name)
            if crop:
                cv2.imwrite(target_path, self.get_convex_masked_tail(orig_path))
            else:
                shutil.copy(orig_path, target_path)

        # Trailing spaces erase what is left of the "\r" progress line
        print("Done" + " "*20)

        ## Removing train folders that don't have a dev equivalent
        for in_train_id in os.listdir(sub_dataset_train_path):
            train_path = os.path.join(sub_dataset_train_path, in_train_id)
            equivalent_dev_id_path = os.path.join(sub_dataset_dev_path, in_train_id)
            if os.path.isdir(train_path) and not os.path.isdir(equivalent_dev_id_path):
                shutil.rmtree(train_path)

    def disp_image(self, im, gray=False):
        """
        Display an image in a plt plot
        :param im: image, numpy array format, used by cv2
        :param gray: bool, set to true to use the gray colormap
        """

        if gray:
            plt.imshow(im, cmap='gray')
        else:
            plt.imshow(im)
        plt.show()

    def get_convex_masked_tail(self, im_path, disp=False, new_width=500, threshold_ratio=0.8):
        """
        Return the image of the tail-convex-crop

        The picture is resized to new_width (ratio kept), thresholded below
        threshold_ratio times its average gray level, and the convex hull of
        the biggest external contour is used as a mask: pixels strictly
        inside the hull keep their gray value, the others are black.

        :param im_path: str, path to the image
        :param disp: bool, set to true to display
        :param new_width: int, width in pixels of the resized picture
        :param threshold_ratio: float, fraction of the average gray level used as threshold

        :return crop: numpy array, same shape as the resized color picture
        :raises ValueError: if the picture can't be read or no contour is found
        """

        # Load image and resize without changing ratio
        im = cv2.imread(im_path)
        if im is None:
            # cv2.imread returns None instead of raising
            raise ValueError("Could not read image: " + str(im_path))
        height, width = im.shape[:2]
        # Very wide pictures must keep at least one row
        new_height = max(1, new_width*height//width)
        im = cv2.resize(im, (new_width, new_height), interpolation=cv2.INTER_CUBIC)

        # Change to gray and apply both gaussian and threshold filter
        im_gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
        blurred_im = cv2.GaussianBlur(im_gray, (5, 5), 0)
        aver = np.average(im_gray)
        _, thresh = cv2.threshold(blurred_im, int(aver*threshold_ratio), 255, cv2.THRESH_BINARY_INV)

        # Compute contours; [-2] picks the contour list whether findContours
        # returns 2 values or 3 values (OpenCV 3)
        contours = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
        if len(contours) == 0:
            raise ValueError("No contour found in image: " + str(im_path))

        # Get the contour with the biggest area
        cnt = max(contours, key=cv2.contourArea)

        # Get convex contour
        cnt_hull = cv2.convexHull(cnt)

        # Keep the gray value of every pixel strictly inside the hull.
        # pointPolygonTest takes (x, y) = (column, row) while the arrays are
        # indexed [row, column]. Only the hull bounding box is scanned since
        # no inside point can lie out of it.
        crop = np.zeros_like(im)
        left, top, box_width, box_height = cv2.boundingRect(cnt_hull)
        for x in range(left, left + box_width):
            for y in range(top, top + box_height):
                if cv2.pointPolygonTest(cnt_hull, (x, y), False) == 1:
                    crop[y, x] = im_gray[y, x]

        if disp:
            print(cv2.contourArea(cnt))
            self.disp_image(crop, gray=True)
            # drawContours expects a list of contours
            im2 = cv2.drawContours(im.copy(), [cnt_hull], -1, (255, 0, 0), 4)
            self.disp_image(im2)

        return crop

# Instantiate the dataset manager on the local "dataset" folder (loads train.csv)
dataset = Dataset("dataset")
# Sort the cropped training pictures by whale id under dataset/train_classes,
# keeping the new_whale id and replacing any previous split
dataset.split_in_classes_folders(
    "dataset/train_classes",
    remove_new_whale=False,
    remove_old=True,
    train_dev_ratio=0.2,
    crop=True,
)

Loading dataset/train.csv
Removing previous spliting
Sorting 25361 images into dataset/train_classes
#133/25361

KeyboardInterrupt: 

## Tails cropping