# Whale identification from tail photographs - Kaggle contest

## Dependencies

In [1]:
import keras
import numpy as np
import glob
import pandas as pd
import os
import shutil
import random
import matplotlib.pyplot as plt

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Dataset reading

In [2]:
class Dataset:
    """
    Manage dataset loading and its split into per-class folders

    :param dataset_path: str, path to the dataset folder; the "train" and "test"
        picture folders and the "train.csv" annotation file are looked up inside it
    """

    def __init__(self, dataset_path):

        # Build and store dataset paths
        self.dataset_path_train = dataset_path + "/train"
        self.dataset_path_test = dataset_path + "/test"

        # Build train.csv path
        self.dataset_path_train_label = dataset_path + "/train.csv"

        # Generate pandas dataframe of whales id <-> file matching
        self.dataset_train_label = self.get_train_label()

        # Picture paths are not listed eagerly: call get_im_path() with a glob
        # pattern such as self.dataset_path_train + "/*.jpg" when they are needed

    def get_im_path(self, dataset_path):
        """
        Get the sorted paths matching the given glob pattern
        :param dataset_path: str, glob pattern (e.g. "<folder>/*.jpg")

        :return les_im_path: list of str, matching paths in sorted order
        """
        print("Getting images path from", dataset_path)
        return sorted(glob.glob(dataset_path))

    def get_train_label(self):
        """
        Load the train dataset annotation using pandas

        :return train_label: pandas dataframe, whales id <-> files matching
        """
        print("Loading", self.dataset_path_train_label)
        return pd.read_csv(self.dataset_path_train_label)

    def create_folder_is_needed(self, folder_path):
        """
        Create a folder if it doesn't already exist
        :param folder_path: str
        """
        # makedirs also creates missing parent folders, which os.mkdir cannot do
        os.makedirs(folder_path, exist_ok=True)

    def split_in_classes_folders(self, root_classes_folder, pass_new_whale=True, remove_old=True, train_dev_ratio=0.2):
        """
        Split the dataset into classes folders
        :param root_classes_folder: str, folder receiving the "train" and "dev" sub-folders
        :param pass_new_whale: boolean, set to true if not considering new_whale id
        :param remove_old: boolean, delete root_classes_folder (if present) before splitting
        :param train_dev_ratio: float, probability of sending a picture to the dev
            set once its whale id already has a folder in the train set

        OUTPUT:
            pictures sorted by whale id into subfolders of
            root_classes_folder/train and root_classes_folder/dev;
            ids left without any dev picture are removed from train
        """

        # A first run has nothing to delete: only wipe an existing split
        if remove_old and os.path.isdir(root_classes_folder):
            print("Removing previous spliting")
            shutil.rmtree(root_classes_folder)

        sub_dataset_train_path = "/".join([root_classes_folder, "train"])
        sub_dataset_dev_path = "/".join([root_classes_folder, "dev"])

        # Create the folders if needed
        self.create_folder_is_needed(root_classes_folder)
        self.create_folder_is_needed(sub_dataset_train_path)
        self.create_folder_is_needed(sub_dataset_dev_path)

        # If passing new whales, remove any new_whale folder left by a previous
        # split. Class folders live under train/ and dev/; the root level is
        # also checked in case an older layout put one there.
        if pass_new_whale:
            for parent_path in (root_classes_folder, sub_dataset_train_path, sub_dataset_dev_path):
                new_whale_folder_path = "/".join([parent_path, "new_whale"])
                if os.path.isdir(new_whale_folder_path):
                    print("Removing new_whale folder")
                    shutil.rmtree(new_whale_folder_path)

        # Sorting the images
        files_number = len(self.dataset_train_label)
        print("Sorting", files_number, "images into", root_classes_folder)

        for index, row in self.dataset_train_label.iterrows():
            print("#" + str(index + 1) + "/" + str(files_number), end="\r")
            whale_file_name = row['Image']
            whale_id = row['Id']
            if pass_new_whale and whale_id == "new_whale":
                continue

            # The first picture of an id always goes to train; later ones are
            # sent to dev with probability train_dev_ratio
            train_class_path = "/".join([sub_dataset_train_path, whale_id])
            if os.path.isdir(train_class_path) and random.uniform(0, 1) < train_dev_ratio:
                sub_dataset_path = sub_dataset_dev_path
            else:
                sub_dataset_path = sub_dataset_train_path

            self.create_folder_is_needed("/".join([sub_dataset_path, whale_id]))
            shutil.copy(
                "/".join([self.dataset_path_train, whale_file_name]),
                "/".join([sub_dataset_path, whale_id, whale_file_name])
            )
        print("Done" + " "*20)

        # Remove train folders that don't have a dev equivalent
        for train_path in glob.glob(sub_dataset_train_path + "/*/"):
            # normpath drops the trailing separator whatever its flavour
            in_train_id = os.path.basename(os.path.normpath(train_path))
            equivalent_dev_id_path = "/".join([sub_dataset_dev_path, in_train_id])
            if not os.path.isdir(equivalent_dev_id_path):
                shutil.rmtree(train_path)
            
# Load the annotations found under the "dataset" folder
dataset = Dataset(dataset_path="dataset")
# Sort the training pictures by whale id under dataset/train_classes
dataset.split_in_classes_folders(root_classes_folder="dataset/train_classes")

Loading dataset/train.csv


FileNotFoundError: [Errno 2] File b'dataset/train.csv' does not exist: b'dataset/train.csv'

## Deep Learning training

In [None]:
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Activation, Dropout, Flatten, Dense
from keras import backend as K
from keras.utils import plot_model

class Model:
    """
    Small convolutional classifier trained from per-class picture folders

    :param dataset_path: str, folder holding the "train" and "dev" sub-folders,
        each of them containing one sub-folder of .jpg pictures per class
    """

    def __init__(self, dataset_path):
        self.img_width = 150
        self.img_height = 150
        self.dataset_path = dataset_path
        self.dataset_path_train = "/".join([dataset_path, "train"])
        self.dataset_path_dev = "/".join([dataset_path, "dev"])
        self.epochs = 50
        self.batch_size = 8
        # Picture counts, used to size the number of steps per epoch
        self.len_train = len(glob.glob(self.dataset_path_train + "/*/*.jpg"))
        self.len_dev = len(glob.glob(self.dataset_path_dev + "/*/*.jpg"))
        # Class counts (one sub-folder per class)
        self.n_train_label = len(glob.glob(self.dataset_path_train + "/*/"))
        self.n_dev_label = len(glob.glob(self.dataset_path_dev + "/*/"))

    def run_training(self):
        """
        Build the network, train it on the train/dev folders and save its weights

        The compiled network is stored in self.model, the training history in
        self.history, and the weights are written to 'first_try.h5'.
        """

        ## Build model architecture
        if K.image_data_format() == 'channels_first':
            input_shape = (3, self.img_width, self.img_height)
        else:
            input_shape = (self.img_width, self.img_height, 3)

        print("input_shape", input_shape)
        print("n train label", self.n_train_label)
        print("n dev label", self.n_dev_label)

        self.model = Sequential()
        self.model.add(Conv2D(32, (3, 3), input_shape=input_shape))
        self.model.add(Activation('relu'))
        self.model.add(MaxPooling2D(pool_size=(2, 2)))

        self.model.add(Conv2D(32, (3, 3)))
        self.model.add(Activation('relu'))
        self.model.add(MaxPooling2D(pool_size=(2, 2)))

        self.model.add(Conv2D(64, (3, 3)))
        self.model.add(Activation('relu'))
        self.model.add(MaxPooling2D(pool_size=(2, 2)))

        self.model.add(Flatten())
        # Regularise the features, not the class scores: ReLU/Dropout placed
        # between the last Dense layer and the softmax would zero out logits
        self.model.add(Dropout(0.5))
        self.model.add(Dense(self.n_train_label))
        self.model.add(Activation('softmax'))

        self.model.compile(loss='categorical_crossentropy',
                           optimizer='rmsprop',
                           metrics=['accuracy'])

        # plot_model(self.model, to_file='model.png')
        self.model.summary()

        ## Define data generation
        train_datagen = ImageDataGenerator(
            rescale=1. / 255,
            horizontal_flip=True)

        test_datagen = ImageDataGenerator(rescale=1. / 255)

        train_generator = train_datagen.flow_from_directory(
            self.dataset_path_train,
            target_size=(self.img_width, self.img_height),
            batch_size=self.batch_size,
            class_mode='categorical')

        validation_generator = test_datagen.flow_from_directory(
            self.dataset_path_dev,
            target_size=(self.img_width, self.img_height),
            batch_size=self.batch_size,
            class_mode='categorical')

        # Fewer pictures than one batch would otherwise give zero steps
        self.history = self.model.fit_generator(
            train_generator,
            epochs=self.epochs,
            steps_per_epoch=max(1, self.len_train // self.batch_size),
            validation_data=validation_generator,
            validation_steps=max(1, self.len_dev // self.batch_size))

        self.model.save_weights('first_try.h5')

    def _history_values(self, *keys):
        """
        Return the first series of self.history.history found among keys

        :param keys: str, candidate metric names, by order of preference
        :raises KeyError: if none of the names was recorded
        """
        recorded = self.history.history
        for key in keys:
            if key in recorded:
                return recorded[key]
        raise KeyError("None of " + ", ".join(keys) + " found in the training history")

    def plot_model(self):
        """
        Plot the accuracy and loss curves recorded by run_training
        """
        # summarize history for accuracy
        # (the metric is recorded as 'acc' or 'accuracy' depending on the Keras version)
        plt.plot(self._history_values('acc', 'accuracy'))
        plt.plot(self._history_values('val_acc', 'val_accuracy'))
        plt.title('model accuracy')
        plt.ylabel('accuracy')
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')
        plt.show()

        # summarize history for loss
        plt.plot(self._history_values('loss'))
        plt.plot(self._history_values('val_loss'))
        plt.title('model loss')
        plt.ylabel('loss')
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')
        plt.show()
        
# Train the classifier on the per-class folders produced by the dataset split
modeler = Model(dataset_path="dataset/train_classes")
modeler.run_training()

input_shape (150, 150, 3)
n train label 695
n dev label 695
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_16 (Conv2D)           (None, 148, 148, 32)      896       
_________________________________________________________________
activation_26 (Activation)   (None, 148, 148, 32)      0         
_________________________________________________________________
max_pooling2d_16 (MaxPooling (None, 74, 74, 32)        0         
_________________________________________________________________
conv2d_17 (Conv2D)           (None, 72, 72, 32)        9248      
_________________________________________________________________
activation_27 (Activation)   (None, 72, 72, 32)        0         
_________________________________________________________________
max_pooling2d_17 (MaxPooling (None, 36, 36, 32)        0         
_________________________________________________________________
conv2d_18 (Conv2

In [None]:
modeler.model