# Whale identification from their tails - Kaggle contest

## Dependencies

In [43]:
import keras
import numpy as np
import glob
import pandas as pd
import os
import shutil

## Dataset reading

In [None]:
class Dataset:
    """
    Manage dataset loading

    Paths to the ``train`` folder, the ``test`` folder and the ``train.csv``
    annotation file are derived from the dataset folder, and ``train.csv``
    is loaded at construction time.

    :param dataset_path: str, path to the dataset folder
    """

    def __init__(self, dataset_path):
        # Build and store dataset paths
        self.dataset_path_train = os.path.join(dataset_path, "train")
        self.dataset_path_test = os.path.join(dataset_path, "test")

        # Build train.csv path
        self.dataset_path_train_label = os.path.join(dataset_path, "train.csv")

        # Generate pandas dataframe of whales id <-> file matching
        self.dataset_train_label = self.get_train_label()

        # NOTE: picture paths are not collected here; call get_im_path()
        # with a glob pattern (e.g. "<train folder>/*.jpg") when needed.

    def get_im_path(self, dataset_path):
        """
        Get the sorted paths matching the given glob pattern

        :param dataset_path: str, glob pattern handed to ``glob.glob``
            (e.g. ``"dataset/train/*.jpg"``)

        :return les_im_path: list of string, matching paths sorted
            in ascending order
        """
        print("Getting images path from", dataset_path)
        return sorted(glob.glob(dataset_path))

    def get_train_label(self):
        """
        Load the train dataset annotation using pandas

        :return train_label: pandas dataframe, whales id <-> files matching
        """
        print("Loading", self.dataset_path_train_label)
        return pd.read_csv(self.dataset_path_train_label)

    def create_folder_is_needed(self, folder_path):
        """
        Create a folder if it doesn't already exist

        Missing parent folders are created as well, and an already existing
        folder is left untouched.

        :param folder_path: str
        """
        # exist_ok avoids the check-then-create race of isdir() + mkdir()
        os.makedirs(folder_path, exist_ok=True)

    def split_in_classes_folders(self, root_classes_folder, pass_new_whale=True):
        """
        Split the dataset into classes folders

        Each picture listed in the train annotation (columns ``Image`` and
        ``Id``) is copied from the train folder into a sub-folder of
        ``root_classes_folder`` named after its whale id.

        :param root_classes_folder: str, folder receiving one sub-folder
            per whale id (created if needed)
        :param pass_new_whale: boolean, set to true if not considering new_whale id

        OUTPUT:
            pictures sorted by whale id into subfolders of the root_classes_folder
        """
        # Create the root folder if needed
        self.create_folder_is_needed(root_classes_folder)

        # If passing new whales, should remove previous folder
        new_whale_folder_path = os.path.join(root_classes_folder, "new_whale")
        if pass_new_whale and os.path.isdir(new_whale_folder_path):
            print("Removing new_whale folder")
            shutil.rmtree(new_whale_folder_path)

        labels = self.dataset_train_label
        files_number = len(labels)

        # Whale ids whose folder has already been ensured during this call,
        # so the filesystem is only touched once per class.
        created_ids = set()

        # Sorting the images; the counter comes from enumerate so progress
        # does not depend on the dataframe index being 0..n-1
        rows = zip(labels["Image"], labels["Id"])
        for position, (whale_file_name, whale_id) in enumerate(rows, start=1):
            print("#" + str(position) + "/" + str(files_number), end="\r")
            if pass_new_whale and whale_id == "new_whale":
                continue

            class_folder = os.path.join(root_classes_folder, whale_id)
            if whale_id not in created_ids:
                self.create_folder_is_needed(class_folder)
                created_ids.add(whale_id)

            shutil.copy(
                os.path.join(self.dataset_path_train, whale_file_name),
                os.path.join(class_folder, whale_file_name),
            )
        print(" - Done")
            
            
# Instantiate the dataset manager on the "dataset" folder (loads train.csv)
dataset = Dataset(dataset_path="dataset")
# Copy the training pictures into one sub-folder per whale id
dataset.split_in_classes_folders(root_classes_folder="dataset/train_classes")

Loading dataset/train.csv
#7869/9850

[]


<__main__.Dataset at 0x7efe1686cf28>