# Whale identification from tail pictures - Kaggle contest

## Dependencies

In [1]:
import keras
import numpy as np
import glob
import pandas as pd
import os
import shutil
import random
import matplotlib.pyplot as plt

Using TensorFlow backend.


## Dataset reading

In [19]:
class Dataset:
    """
    Manage dataset loading and its re-organisation into per-class folders.

    The paths used are ``<dataset_path>/train``, ``<dataset_path>/test`` and
    ``<dataset_path>/train.csv``. The csv file is loaded with pandas at
    construction time; its ``Image`` and ``Id`` columns are read when
    splitting the pictures into class folders.

    :param dataset_path: str, path to the dataset folder
    """

    def __init__(self, dataset_path):
        # Build and store dataset paths
        self.dataset_path_train = dataset_path + "/train"
        self.dataset_path_test = dataset_path + "/test"

        # Build train.csv path
        self.dataset_path_train_label = dataset_path + "/train.csv"

        # Generate pandas dataframe of whales id <-> file matching
        self.dataset_train_label = self.get_train_label()

        # Picture paths are not pre-computed here; call get_im_path() with a
        # pattern such as self.dataset_path_train + "/*.jpg" when needed.

    def get_im_path(self, dataset_path):
        """
        Get the sorted paths matching the given glob pattern

        :param dataset_path: str, glob pattern (e.g. "<folder>/*.jpg")

        :return les_im_path: list of str, sorted paths matching the pattern
        """
        print("Getting images path from", dataset_path)
        return sorted(glob.glob(dataset_path))

    def get_train_label(self):
        """
        Load the train dataset annotation using pandas

        :return train_label: pandas dataframe, whales id <-> files matching
        """
        print("Loading", self.dataset_path_train_label)
        return pd.read_csv(self.dataset_path_train_label)

    def create_folder_is_needed(self, folder_path):
        """
        Create a folder (and its missing parents) if it doesn't already exist

        :param folder_path: str
        """
        # exist_ok avoids the check-then-create race of isdir() + mkdir()
        os.makedirs(folder_path, exist_ok=True)

    def split_in_classes_folders(self, root_classes_folder, pass_new_whale=True, remove_old=True, train_dev_ratio=0.2):
        """
        Split the train pictures into per-class "train" and "dev" folders

        The first picture seen for a whale id always goes to the train
        folder. Every following picture of that id goes to the dev folder
        with probability ``train_dev_ratio`` and to the train folder
        otherwise. Once every picture is copied, the train class folders
        that have no dev counterpart are deleted.

        :param root_classes_folder: str, output folder; "train" and "dev"
            sub-folders are created inside it
        :param pass_new_whale: boolean, set to true if not considering
            new_whale id
        :param remove_old: boolean, delete a previous split found in
            root_classes_folder before starting
        :param train_dev_ratio: float in [0, 1], probability of sending a
            picture to the dev folder once its id already exists in train

        :raises ValueError: if train_dev_ratio is outside [0, 1]

        OUTPUT:
            pictures sorted by whale id into
            <root_classes_folder>/train/<id>/ and <root_classes_folder>/dev/<id>/
        """
        if not 0 <= train_dev_ratio <= 1:
            raise ValueError("train_dev_ratio must be between 0 and 1")

        # Only remove what actually exists: rmtree raises on a missing
        # folder, which made the very first run fail.
        if remove_old and os.path.isdir(root_classes_folder):
            print("Removing previous spliting")
            shutil.rmtree(root_classes_folder)

        sub_dataset_train_path = "/".join([root_classes_folder, "train"])
        sub_dataset_dev_path = "/".join([root_classes_folder, "dev"])

        # Create the folders if needed
        self.create_folder_is_needed(root_classes_folder)
        self.create_folder_is_needed(sub_dataset_train_path)
        self.create_folder_is_needed(sub_dataset_dev_path)

        # If passing new whales, remove any new_whale folder left by a
        # previous split. Class folders are created under train/ and dev/,
        # so those locations are checked in addition to the root.
        if pass_new_whale:
            for parent in (root_classes_folder, sub_dataset_train_path, sub_dataset_dev_path):
                new_whale_folder_path = "/".join([parent, "new_whale"])
                if os.path.isdir(new_whale_folder_path):
                    print("Removing new_whale folder")
                    shutil.rmtree(new_whale_folder_path)

        # Sorting the images
        files_number = len(self.dataset_train_label)
        print("Sorting", files_number, "images into", root_classes_folder)

        # enumerate() gives the row position, so the progress counter stays
        # correct even when the dataframe index is not 0..n-1.
        for position, (_, row) in enumerate(self.dataset_train_label.iterrows(), start=1):
            print("#" + str(position) + "/" + str(files_number), end="\r")
            whale_file_name = row['Image']
            whale_id = row['Id']
            if pass_new_whale and whale_id == "new_whale":
                continue

            # Choose between the train and the dev dataset
            already_in_train = os.path.isdir("/".join([sub_dataset_train_path, whale_id]))
            if already_in_train and random.uniform(0, 1) < train_dev_ratio:
                sub_dataset_path = sub_dataset_dev_path
            else:
                sub_dataset_path = sub_dataset_train_path

            self.create_folder_is_needed("/".join([sub_dataset_path, whale_id]))
            shutil.copy(
                "/".join([self.dataset_path_train, whale_file_name]),
                "/".join([sub_dataset_path, whale_id, whale_file_name])
            )
        # Trailing spaces overwrite the "\r" progress line
        print("Done" + " "*20)

        # Removing train folders that don't have a dev equivalent
        for train_path in glob.glob(sub_dataset_train_path + "/*/"):
            # normpath drops the trailing separator whatever its flavour
            in_train_id = os.path.basename(os.path.normpath(train_path))
            equivalent_dev_id_path = "/".join([sub_dataset_dev_path, in_train_id])
            if not os.path.isdir(equivalent_dev_id_path):
                shutil.rmtree(train_path)
            
# Load the annotations found in the "dataset" folder, then sort the train
# pictures into per-class folders under "dataset/train_classes"
dataset = Dataset(dataset_path="dataset")
dataset.split_in_classes_folders(root_classes_folder="dataset/train_classes")

Loading dataset/train.csv
Removing previous spliting
Sorting 9850 images into dataset/train_classes
Done                    
w_e41f2fa
w_e0cb9c5
w_11f6df1
w_c2c4f43
w_94b4478
w_a8990f1
w_63fc906
w_92d55a6
w_8cc9b05
w_cc699e6
w_c10ffe9
w_cf669dc
w_f6c23d5
w_c30959a
w_7e5cc5e
w_9dfbd27
w_7377b2b
w_d9aab0a
w_428d61d
w_9729e1e
w_771136b
w_c57623d
w_0caa554
w_6a3ca27
w_3fe1eb9
w_a91600a
w_1e4c0ec
w_0819271
w_b856fc1
w_5dab8df
w_1c432e7
w_1fd0d0e
w_2c68b75
w_1beadba
w_3572e7e
w_3c3267c
w_ebf9290
w_4e68ddc
w_2b1e2f5
w_57a137f
w_2ffed9c
w_0e30df6
w_d915632
w_cb7d1b5
w_900fab7
w_1743d93
w_77e1ae3
w_f054e7a
w_acc67ea
w_d6e5334
w_dbb786d
w_7c44934
w_19ca2c6
w_d465e44
w_9dcf002
w_f60c7e8
w_099ab25
w_45b90d9
w_8fab53d
w_ba2f2c4
w_5f0a14c
w_8d5ede1
w_beb62cc
w_479cb36
w_997582c
w_c07f119
w_bc93297
w_5d0a13f
w_6e7b28b
w_e7e1bd1
w_5c13154
w_26bd720
w_dd2bd69
w_d3ed80e
w_2392b4c
w_578a509
w_469f571
w_6affb63
w_4fcfa4d
w_463b450
w_14964c1
w_b7f2cde
w_a35191f
w_4b7b80b
w_70a31ce
w_6dc7db6
w_e44d512
w_941

w_735675e
w_d9f1ea6
w_24ad46c
w_3664a22
w_392bee3
w_de20ab0
w_06a6351
w_5b1fa1d
w_d89b29e
w_2c717fb
w_09c1e0b
w_c90feaa
w_984a3fa
w_daeb296
w_eec1133
w_da0372d
w_1da7080
w_c6b5519
w_43340c5
w_656afeb
w_797e546
w_cd65880
w_7538922
w_57186e5
w_b08520c
w_04c841c
w_ec67c71
w_8e4fa60
w_87c4190
w_494dd45
w_b318111
w_5470134
w_fa69bb8
w_5f82501
w_7c9fd46
w_efebfe8
w_1a229eb
w_85cdd9b
w_bb388ca
w_2d25d1f
w_6485cd4
w_8a3449f
w_616ca36
w_081dd6e
w_5e3d9d0
w_ca27f31
w_0ba62fd
w_f3bd33a
w_3320e76
w_a846944
w_17377c9
w_02e5407
w_8eceeb8
w_cb9220b
w_9df0865
w_60c5e7e
w_2a18a44
w_30d8376
w_2cadad2
w_9c3db0a
w_cacfe12
w_293b5e4
w_1717a13
w_a2633d4
w_7459706
w_fbbf84a
w_c2580ef
w_89e159a
w_dba0cd8
w_e9ac2a8
w_dace6e9
w_367b996
w_414a0d7
w_6d90f45
w_fe49bc4
w_fe424c3
w_224000c
w_ebf3f26
w_d224115
w_16ad10e
w_15eae33
w_48f649e
w_4759df3
w_71764b4
w_23cd105
w_4df0b12
w_f852d15
w_384e9ca
w_64830fa
w_ed10a36
w_b71eab2
w_dfd3f5e
w_930bc39
w_0134192
w_598b8ab
w_05b2ddd
w_8c1ec28
w_85b0d73
w_4a074b8
w_248af0b


w_2f8aed7
w_7bcc2d6
w_dc89c4c
w_bfe749f
w_c786765
w_794effc
w_d32adcd
w_9065f2e
w_1f00cb7
w_d249ed6
w_92c6140
w_13249f1
w_e21741a
w_65428d9
w_1000f90
w_6635a97
w_ecbf9fa
w_4c4c001
w_4981cba
w_4848a3c
w_e98432b
w_bcf23a8
w_048f7a9
w_e101deb
w_9cb529f
w_25871da
w_2fe43c7
w_e0e5c9e
w_d09e61a
w_551f5ca
w_3aa2073
w_76387b4
w_a93c8f9
w_71e851f
w_5f8b9b9
w_55132a6
w_7ed55c4
w_5b672ea
w_88e679c
w_4b33077
w_da0f481
w_8327e8d
w_ca813c2
w_dac2f44
w_2dbb0fe
w_bbf58a9
w_8643ba3
w_660195e
w_847b884
w_844f032
w_92ed15f
w_8103039
w_5abb7d5
w_a7ac222
w_3026ce2
w_86cc90f
w_060d2c8
w_f400702
w_3d3c0f9
w_7294a6a
w_b5fc897
w_62c548b
w_3222bdb
w_ac69cf7
w_94da90b
w_993d66c
w_09558d4
w_790c2aa
w_9890432
w_556e001
w_71cc46c
w_944f496
w_d0b3293
w_b75f378
w_4b95330
w_49c9417
w_e9592d4
w_47d2bc6
w_03a2ed7
w_361e290
w_6ba475f
w_82c9c67
w_ee24bac
w_7763134
w_83a922d
w_e0efc4f
w_2e4df76
w_f5eb6c6
w_23d3818
w_12d9132
w_f0b4252
w_4f38350
w_c8126db
w_5438813
w_7f999ff
w_648a9a8
w_69185bb
w_15d1235
w_c9abb54
w_046634b


w_5a29f9d
w_51cdd4b
w_419226b
w_f6722e7
w_1febbf3
w_6b03eb4
w_fdf60bb
w_fcfcf68
w_70db713
w_b624ed1
w_e6ced04
w_64783fe
w_8069cbb
w_ed32db9
w_f0f56dc
w_0ecff13
w_bc76fde
w_684ca15
w_cea99fe
w_ed8a846
w_58087ce
w_99c07e8
w_68fb716
w_528e753
w_fd3e88d
w_136337c
w_fbcb6e4
w_ca0ec7c
w_1f95205
w_f9083fe
w_771374c
w_f4224b9
w_eff7e35
w_65efe4d
w_cce912e
w_73f8bd3
w_cf3a12e
w_540fd73
w_b9e00eb
w_e9f85b7
w_a34c992
w_0e40867
w_94e071d
w_e6ddbd5
w_50e125b
w_1fc14e9
w_4e505cc
w_045d9fc
w_1652da1
w_883557a
w_5c62a56
w_cf7eebd
w_38a3f72
w_434ad6a
w_9a967cb
w_cc5fae7
w_6c23fcc
w_e55a554
w_bbc4e7c
w_ad87135
w_4bf5cf7
w_e0f6444
w_84feae3
w_3a9ee71
w_2270691
w_662a132
w_dab33f6
w_36a853c
w_d35355e
w_c2474a2
w_435ee6d
w_6384242
w_3f0f6ba
w_d8eae88
w_5557280
w_4c8cd1c
w_54c00ad
w_5d0666e
w_68a3df0
w_59461a4
w_53859b2
w_ba0d756
w_0467840
w_5021993
w_556df30
w_851a7f4
w_482d9d6
w_c0323f5
w_8242022
w_9562910
w_c20f21c
w_7240516
w_e69cef0
w_2c1dafa
w_b1a1d43
w_f86488a
w_c66638b
w_0981144
w_37372db
w_f2c9e07


## Deep Learning training

In [13]:
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Activation, Dropout, Flatten, Dense
from keras import backend as K
from keras.utils import plot_model

class Model:
    """
    Small convolutional classifier trained on per-class picture folders.

    :param dataset_path: str, folder holding a "train" and a "dev"
        sub-folder, each containing one sub-folder of pictures per class
    """

    def __init__(self, dataset_path):
        self.img_width = 150
        self.img_height = 150
        self.dataset_path = dataset_path
        self.dataset_path_train = "/".join([dataset_path, "train"])
        self.dataset_path_dev = "/".join([dataset_path, "dev"])
        self.epochs = 50
        self.batch_size = 8
        # Width of the hidden fully connected layer placed before the
        # classification layer
        self.dense_units = 64
        # Number of .jpg pictures and of class folders in each split
        self.len_train = len(glob.glob(self.dataset_path_train + "/*/*.jpg"))
        self.len_dev = len(glob.glob(self.dataset_path_dev + "/*/*.jpg"))
        self.n_train_label = len(glob.glob(self.dataset_path_train + "/*/"))
        self.n_dev_label = len(glob.glob(self.dataset_path_dev + "/*/"))

    def _steps_for(self, n_samples):
        """
        Number of batches needed to go through n_samples pictures

        :param n_samples: int, number of pictures
        :return: int, n_samples // batch_size, but never less than 1 so a
            split smaller than one batch still yields one step
        """
        return max(1, n_samples // self.batch_size)

    def run_training(self, weights_path='first_try.h5'):
        """
        Build the network, train it from the picture folders, save weights

        :param weights_path: str, file the trained weights are saved to

        :raises ValueError: if no class folder exists under the train folder
        """
        if self.n_train_label == 0:
            raise ValueError("No class folder found under " + self.dataset_path_train)

        ## Build model architecture
        if K.image_data_format() == 'channels_first':
            input_shape = (3, self.img_width, self.img_height)
        else:
            input_shape = (self.img_width, self.img_height, 3)

        print("input_shape", input_shape)
        print("n train label", self.n_train_label)
        print("n dev label", self.n_dev_label)

        self.model = Sequential()
        self.model.add(Conv2D(32, (3, 3), input_shape=input_shape))
        self.model.add(Activation('relu'))
        self.model.add(MaxPooling2D(pool_size=(2, 2)))

        self.model.add(Conv2D(32, (3, 3)))
        self.model.add(Activation('relu'))
        self.model.add(MaxPooling2D(pool_size=(2, 2)))

        self.model.add(Conv2D(64, (3, 3)))
        self.model.add(Activation('relu'))
        self.model.add(MaxPooling2D(pool_size=(2, 2)))

        # ReLU and Dropout belong to the hidden layer: applied to the class
        # scores themselves they clip and randomly zero the logits that the
        # softmax is supposed to normalise.
        self.model.add(Flatten())
        self.model.add(Dense(self.dense_units))
        self.model.add(Activation('relu'))
        self.model.add(Dropout(0.5))
        self.model.add(Dense(self.n_train_label))
        self.model.add(Activation('softmax'))

        self.model.compile(loss='categorical_crossentropy',
                           optimizer='rmsprop',
                           metrics=['accuracy'])

        # plot_model(self.model, to_file='model.png')
        self.model.summary()

        ## Define data generation
        # Training pictures are rescaled and randomly sheared/zoomed/flipped
        train_datagen = ImageDataGenerator(
            rescale=1. / 255,
            shear_range=0.2,
            zoom_range=0.2,
            horizontal_flip=True)

        # Validation pictures are only rescaled
        test_datagen = ImageDataGenerator(rescale=1. / 255)

        train_generator = train_datagen.flow_from_directory(
            self.dataset_path_train,
            target_size=(self.img_width, self.img_height),
            batch_size=self.batch_size,
            class_mode='categorical')

        # Reuse the training class order for validation. Left to infer its
        # own classes, the dev generator would one-hot encode over the dev
        # folders only, which mismatches the network output whenever the two
        # splits do not hold exactly the same class folders.
        class_names = sorted(train_generator.class_indices,
                             key=train_generator.class_indices.get)

        validation_generator = test_datagen.flow_from_directory(
            self.dataset_path_dev,
            target_size=(self.img_width, self.img_height),
            classes=class_names,
            batch_size=self.batch_size,
            class_mode='categorical')

        fit_arguments = {
            "epochs": self.epochs,
            "steps_per_epoch": self._steps_for(train_generator.samples),
        }
        # Only validate when the dev split actually provides pictures
        if validation_generator.samples:
            fit_arguments["validation_data"] = validation_generator
            fit_arguments["validation_steps"] = self._steps_for(validation_generator.samples)

        self.model.fit_generator(train_generator, **fit_arguments)

        self.model.save_weights(weights_path)

# Train the classifier on the per-class folders found under
# "dataset/train_classes"
modeler = Model(dataset_path="dataset/train_classes")
modeler.run_training()

input_shape (150, 150, 3)
n train label 4250
n dev label 682
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_10 (Conv2D)           (None, 148, 148, 32)      896       
_________________________________________________________________
activation_16 (Activation)   (None, 148, 148, 32)      0         
_________________________________________________________________
max_pooling2d_10 (MaxPooling (None, 74, 74, 32)        0         
_________________________________________________________________
conv2d_11 (Conv2D)           (None, 72, 72, 32)        9248      
_________________________________________________________________
activation_17 (Activation)   (None, 72, 72, 32)        0         
_________________________________________________________________
max_pooling2d_11 (MaxPooling (None, 36, 36, 32)        0         
_________________________________________________________________
conv2d_12 (Conv

KeyboardInterrupt: 