### This notebook creates a subset of the Happywhale training images and arranges them in the one-folder-per-species layout used by `ImageDataGenerator`

If we decide to use the full set, this code can easily be adapted to arrange all of the images for the data generator.

In [59]:
import os
import random
import pandas as pd
import shutil
import numpy as np
import tensorflow as tf
import keras

# Set to True to rebuild the subset: the cells guarded by `redo` create the
# species folders and copy the sampled images. Leave False to skip those cells.
redo = False

In [50]:
# Detect whether the notebook is running on Google Colab: the `google.colab`
# package can only be imported there.
try:
    import google.colab  # noqa: F401 -- imported only to probe for Colab
    IN_COLAB = True
except ImportError:
    # Catch only the "package not installed" case so that unrelated failures
    # (and KeyboardInterrupt) are not silently swallowed.
    IN_COLAB = False

# On Colab the data lives under /content/drive, so Google Drive must be mounted first.
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


#### Creating file structure

In [36]:
# Read the training labels: from the shared Drive on Colab, from the local
# data folder otherwise.
if IN_COLAB:
    wd = '/content/drive/Shareddrives/Whales-ML/'
    df = pd.read_csv(wd + 'train.csv')
else:
    df = pd.read_csv('data/train.csv')

# Correct the misspelled species labels (wrong spelling -> right spelling).
species_fixes = {
    'kiler_whale': 'killer_whale',
    'bottlenose_dolpin': 'bottlenose_dolphin',
}
df['species'] = df['species'].replace(species_fixes)

image_names = df['image']
sp_names = pd.unique(df['species'])
sp_names

array(['melon_headed_whale', 'humpback_whale', 'false_killer_whale',
       'bottlenose_dolphin', 'beluga', 'minke_whale', 'fin_whale',
       'blue_whale', 'gray_whale', 'southern_right_whale',
       'common_dolphin', 'killer_whale', 'pilot_whale', 'dusky_dolphin',
       'long_finned_pilot_whale', 'sei_whale', 'spinner_dolphin',
       'cuviers_beaked_whale', 'spotted_dolphin', 'globis',
       'brydes_whale', 'commersons_dolphin', 'white_sided_dolphin',
       'short_finned_pilot_whale', 'rough_toothed_dolphin',
       'pantropic_spotted_dolphin', 'pygmy_killer_whale',
       'frasiers_dolphin'], dtype=object)

In [None]:
# Destination folders for the training / validation subsets. On Colab they
# live on the shared Drive, otherwise under the local data folder.
train_subset_dir, validation_subset_dir = (
    (
        '/content/drive/Shareddrives/Whales-ML/subset/train_subset/',
        '/content/drive/Shareddrives/Whales-ML/subset/validation_subset/',
    )
    if IN_COLAB
    else (
        'data/subset/train_subset/',
        'data/subset/validation_subset/',
    )
)

In [45]:
# Create the subset roots and one folder per species inside each of them.
# Only runs when a rebuild was requested via `redo`.
if redo:
    for subset_dir in (train_subset_dir, validation_subset_dir):
        # makedirs also creates missing parent folders (e.g. `subset/`), and
        # exist_ok=True keeps a repeated run from failing on folders that
        # are already there.
        os.makedirs(subset_dir, exist_ok=True)
        for sp in sp_names:
            os.makedirs(os.path.join(subset_dir, sp), exist_ok=True)

#### Copying files into new directory structure

In [42]:
def _copy_images_by_species(subset_df, src_dir, dst_dir):
    """Copy the images listed in `subset_df` into per-species folders.

    Each file named in the 'image' column is copied from `src_dir` to
    `dst_dir/<species>/`, where <species> comes from the 'species' column.
    Rows whose species is missing are skipped (groupby drops them).
    """
    for species, species_df in subset_df.groupby('species'):
        for image_name in species_df['image']:
            shutil.copyfile(os.path.join(src_dir, image_name),
                            os.path.join(dst_dir, species, image_name))


if redo:

    train_dir = 'data/train_images/'
    # Sample over the rows of `df`, not over the files found in `train_dir`:
    # the sampled numbers are used to select rows, so they must all be valid
    # row positions or the subset silently ends up smaller than requested.
    train_nsamples = len(df)

    # randomly selecting 4000 training images and 1000 validation images
    n_train, n_validation = 4000, 1000
    seed_value = 71993
    random.seed(seed_value)
    sample_indices = random.sample(range(train_nsamples), n_train + n_validation)
    train_indices = sample_indices[:n_train]
    validation_indices = sample_indices[n_train:]

    # Positional selection, so this works whatever index `df` carries.
    train_subset_df = df.iloc[train_indices]
    validation_subset_df = df.iloc[validation_indices]

    _copy_images_by_species(train_subset_df, train_dir, train_subset_dir)
    _copy_images_by_species(validation_subset_df, train_dir, validation_subset_dir)


#### Looking at the representation of each species in the subset

In [46]:
# Print the number of files in each species folder of the training subset.
for species in sp_names:
    n_images = len(os.listdir(train_subset_dir + species))
    print(f'{species}: {n_images}')

melon_headed_whale: 125
humpback_whale: 590
false_killer_whale: 246
bottlenose_dolphin: 738
beluga: 606
minke_whale: 121
fin_whale: 115
blue_whale: 357
gray_whale: 79
southern_right_whale: 73
common_dolphin: 28
killer_whale: 127
pilot_whale: 18
dusky_dolphin: 256
long_finned_pilot_whale: 15
sei_whale: 37
spinner_dolphin: 135
cuviers_beaked_whale: 28
spotted_dolphin: 35
globis: 8
brydes_whale: 9
commersons_dolphin: 12
white_sided_dolphin: 16
short_finned_pilot_whale: 30
rough_toothed_dolphin: 5
pantropic_spotted_dolphin: 8
pygmy_killer_whale: 5
frasiers_dolphin: 3


## Data generator

In [None]:
def make_path(dataset, file_name, subset_dirs=None):
    """Return the path of `file_name` inside the subset directory of `dataset`.

    Parameters
    ----------
    dataset : str
        Prefix of a `<dataset>_subset_dir` variable, e.g. 'train' or 'validation'.
    file_name : str
        File name (or relative path) to append to that directory.
    subset_dirs : mapping, optional
        Namespace in which `<dataset>_subset_dir` is looked up. Defaults to the
        notebook's global variables.

    Raises
    ------
    KeyError
        If no `<dataset>_subset_dir` entry exists in the namespace.
    """
    # The directories are notebook-level globals. `locals()` inside a function
    # only sees the function's own arguments, so the lookup must use `globals()`.
    namespace = globals() if subset_dirs is None else subset_dirs
    key = dataset + '_subset_dir'
    if key not in namespace:
        raise KeyError(
            f'no subset directory for dataset {dataset!r} (expected variable {key!r})')
    # os.path.join avoids a doubled separator when the directory already ends in '/'.
    return os.path.join(namespace[key], file_name)

In [62]:
from keras.preprocessing.image import ImageDataGenerator

batch_size = 50
image_size = (150, 150)

# Both generators only multiply the pixel values by 1/255; no augmentation.
train_datagen = ImageDataGenerator(rescale=1./255)
validation_datagen = ImageDataGenerator(rescale=1./255)

# Each species has its own folder, so this is a multi-class problem.
# class_mode='sparse' yields one integer class index per image (labels of
# shape (batch,)); 'binary' is only meant for two-class data.
train_generator = train_datagen.flow_from_directory(
        train_subset_dir,
        target_size=image_size,
        batch_size=batch_size,
        class_mode='sparse')

validation_generator = validation_datagen.flow_from_directory(
        validation_subset_dir,
        target_size=image_size,
        batch_size=batch_size,
        class_mode='sparse')

# Batches per epoch, taken from the generators themselves so the counts are
# integers and stay correct if the subset size or batch size changes.
train_steps = len(train_generator)
validation_steps = len(validation_generator)

Found 4000 images belonging to 28 classes.
Found 1000 images belonging to 28 classes.


In [63]:
# Pull a single batch from the training generator and report its shapes.
first_batch = next(iter(train_generator), None)
if first_batch is not None:
    data_batch, labels_batch = first_batch
    print('data batch shape:', data_batch.shape)
    print('labels batch shape:', labels_batch.shape)

data batch shape: (50, 150, 150, 3)
labels batch shape: (50,)


In [64]:
from keras import models
from keras import layers

# One output unit per class folder found by the training generator.
num_classes = train_generator.num_classes

# Small convnet: four Conv2D + MaxPooling2D stages followed by a dense classifier.
model = models.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(150, 150, 3)),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Flatten(),
    layers.Dense(512, activation='relu'),
    # Multi-class head: softmax over all species. A single sigmoid unit can
    # only separate two classes.
    layers.Dense(num_classes, activation='softmax'),
])

# The generator yields one integer class index per image (labels of shape
# (batch,)), which is what the sparse variant of categorical cross-entropy
# expects; plain 'categorical_crossentropy' needs one-hot labels.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=tf.keras.optimizers.RMSprop(learning_rate=1e-4),
              metrics=['acc'])

In [None]:
# Train for 10 epochs, validating after each one. The step counts are numbers
# of batches and must be integers; they are coerced here because a count
# computed with true division (`/`) is a float.
history = model.fit(
      train_generator,
      steps_per_epoch=int(train_steps),
      epochs=10,
      validation_data=validation_generator,
      validation_steps=int(validation_steps))