In [10]:
import librosa 
import librosa.display
import numpy as np
import pandas as pd
from pathlib import Path
import seaborn as sns
from tensorflow import keras 
from tensorflow.keras.preprocessing import image_dataset_from_directory, image
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.applications import VGG19, ResNet50
from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, GlobalAveragePooling2D, InputLayer, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.utils.class_weight import compute_class_weight
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [3]:
# Locations are resolved against the directory the notebook is launched from.
# `path`     -> raw data directory
# `new_path` -> root of the per-split / per-class image folders
path = Path.cwd().joinpath('data')
new_path = Path.cwd().joinpath('data_split')

# Label table; later cells read its 'label', 'image_path2' and 'name' columns.
answers = pd.read_csv('answers.csv')

In [5]:
# Partition the label table by class, then hold out 20% of each class for validation.
# Each class is split separately with the same fixed seed so the split is repeatable.
whales = answers.loc[answers['label'] == 1]
no_whales = answers.loc[answers['label'] == 0]

(no_whales_x_train,
 no_whales_x_val,
 no_whales_y_train,
 no_whales_y_val) = train_test_split(
    no_whales[['image_path2']],
    no_whales['label'],
    test_size=0.2,
    random_state=10,
)

(whales_x_train,
 whales_x_val,
 whales_y_train,
 whales_y_val) = train_test_split(
    whales[['image_path2', 'name']],
    whales['label'],
    test_size=0.2,
    random_state=10,
)

In [263]:
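# One-off data preparation, kept commented out for reference only: these loops copied
# every image listed in the train/validation splits into new_path/<split>/<class>/.
# import subprocess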

# for i, row in whales_x_train.iterrows():
#     subprocess.call(["cp", row[0], new_path/'training'/'whale' ])
    
# for i, row in whales_x_val.iterrows():
#     subprocess.call(["cp", row[0], new_path/'validation'/'whale'])    


In [265]:
# for i, row in no_whales_x_train.iterrows():
#     subprocess.call(["cp", row[0], new_path/'training'/'no_whale'])

In [266]:
# for i, row in no_whales_x_val.iterrows():
#     subprocess.call(["cp", row[0], new_path/'validation'/'no_whale'])

In [29]:
# Batch iterators that read images from <images_directory>/<split>/<class>/ and emit
# binary (0/1) labels derived from the class sub-folder.
images_directory = new_path
datagen = ImageDataGenerator()

cb_training = datagen.flow_from_directory(
    images_directory / 'training',
    class_mode='binary',
)
cb_validation = datagen.flow_from_directory(
    images_directory / 'validation',
    class_mode='binary',
)


Found 23194 images belonging to 2 classes.
Found 3047 images belonging to 2 classes.


In [30]:
def import_data():
    """Create the training/validation image iterators and the whale-label split.

    Returns:
        tuple: ``(cb_training, whales_y_train, cb_validation, whales_y_val)``.
        The two ``cb_*`` items are directory iterators (binary class mode) over
        ``<cwd>/data_split/training`` and ``<cwd>/data_split/validation``.
        The two ``whales_y_*`` items are the 80/20 split of ``whales.label``.

    Notes:
        * Reads the notebook-level ``whales`` frame.
        * The validation iterator is created with ``shuffle=False`` so that the
          output of ``model.predict(cb_validation)`` lines up with
          ``cb_validation.classes``. Aggregate validation metrics do not depend
          on sample order.
        * NOTE(review): the returned label series only cover the ``whales`` rows;
          they are not the labels of the samples the iterators yield. Use
          ``cb_training.classes`` / ``cb_validation.classes`` for those.
    """
    datagen = ImageDataGenerator()
    images_directory = Path.cwd()/'data_split'

    cb_training = datagen.flow_from_directory(images_directory/'training', class_mode='binary')
    # Keep validation order stable so predictions can be compared against `.classes`.
    cb_validation = datagen.flow_from_directory(images_directory/'validation', class_mode='binary', shuffle=False)

    # Only the label halves of the whale split are returned. With the same row count,
    # test_size and random_state, splitting the labels alone gives the same partition
    # as splitting them alongside the feature columns.
    whales_y_train, whales_y_val = train_test_split(whales.label, test_size=0.2, random_state=10)
    return (cb_training, whales_y_train, cb_validation, whales_y_val)


def base_set_up(filename, patience=2):
    """Assemble the pieces shared by every training run.

    Args:
        filename: Path handed to ``ModelCheckpoint``; the best model is saved there.
        patience: Number of epochs without ``val_accuracy`` improvement that
            ``EarlyStopping`` tolerates before stopping.

    Returns:
        tuple: ``(class_weight, METRICS, earlystop, checkpoint)``.
        ``class_weight`` maps each distinct label to its 'balanced' weight.
        ``METRICS`` is the list of Keras metric objects to pass to ``compile``.
        The last two items are the early-stopping and checkpoint callbacks.

    Note:
        Class weights are computed from every label in the notebook-level
        ``answers`` frame, not only from the training split.
    """
    labels = answers.label
    distinct_labels = np.unique(labels)
    balanced_weights = compute_class_weight(
        class_weight='balanced',
        classes=distinct_labels,
        y=labels,
    )
    class_weight = {
        label: weight for label, weight in zip(distinct_labels, balanced_weights)
    }

    # Raw confusion counts first, then the derived rates and curve areas.
    METRICS = [
        keras.metrics.TruePositives(name='tp'),
        keras.metrics.FalsePositives(name='fp'),
        keras.metrics.TrueNegatives(name='tn'),
        keras.metrics.FalseNegatives(name='fn'),
        keras.metrics.BinaryAccuracy(name='accuracy'),
        keras.metrics.Precision(name='precision'),
        keras.metrics.Recall(name='recall'),
        keras.metrics.AUC(name='auc'),
        keras.metrics.AUC(name='prc', curve='PR'),  # area under the precision-recall curve
    ]

    # Both callbacks track the same quantity.
    earlystop = EarlyStopping(
        monitor='val_accuracy',
        verbose=1,
        patience=patience,
    )
    checkpoint = ModelCheckpoint(
        filename,
        monitor='val_accuracy',
        save_best_only=True,
        save_weights_only=False,
        verbose=1,
    )
    return (class_weight, METRICS, earlystop, checkpoint)

# CNN: let's deliberately overtrain it

In [None]:
# Start from a clean slate: drop any previous model and reset the Keras graph state.
cnn = None
keras.backend.clear_session()
filename = 'cnn_overfit2.h5'

cb_training, training_labels, cb_validation, val_labels = import_data()
class_weight, METRICS, earlystop, checkpoint = base_set_up(filename, patience=10)

# Five conv + max-pool stages with growing filter counts, a final conv layer reduced by
# global average pooling, then three 100-unit dense layers and a sigmoid output unit.
# NOTE(review): the iterators from import_data() are created without `target_size`, so
# Keras presumably resizes images to its default size rather than the 97x97 declared
# here. Confirm which input size is intended.
cnn = Sequential(
    [InputLayer(input_shape=(97, 97, 3))]
    + [
        stage_layer
        for n_filters in (5, 10, 20, 30, 40)
        for stage_layer in (
            Conv2D(filters=n_filters, kernel_size=3, activation='relu', padding='same'),
            MaxPooling2D(),
        )
    ]
    + [
        Conv2D(filters=50, kernel_size=3, activation='relu', padding='same'),
        GlobalAveragePooling2D(),
    ]
    + [layers.Dense(100, activation='relu') for _unused in range(3)]
    + [layers.Dense(1, activation='sigmoid')]
)

optimizer = Adam(learning_rate=1e-5)
cnn.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=METRICS,
)

history = cnn.fit(
    cb_training,
    validation_data=cb_validation,
    epochs=40,
    class_weight=class_weight,
    callbacks=[earlystop, checkpoint],
)


Found 23194 images belonging to 2 classes.
Found 3047 images belonging to 2 classes.
Epoch 1/40

Epoch 00001: val_accuracy improved from -inf to 0.81227, saving model to cnn_overfit2.h5
Epoch 2/40

Epoch 00002: val_accuracy did not improve from 0.81227
Epoch 3/40

Epoch 00003: val_accuracy did not improve from 0.81227
Epoch 4/40

Epoch 00004: val_accuracy improved from 0.81227 to 0.82803, saving model to cnn_overfit2.h5
Epoch 5/40

Epoch 00005: val_accuracy did not improve from 0.82803
Epoch 6/40

Epoch 00006: val_accuracy improved from 0.82803 to 0.86019, saving model to cnn_overfit2.h5
Epoch 7/40

Epoch 00007: val_accuracy did not improve from 0.86019
Epoch 8/40

Epoch 00008: val_accuracy did not improve from 0.86019
Epoch 9/40

Epoch 00009: val_accuracy did not improve from 0.86019
Epoch 10/40

Epoch 00010: val_accuracy did not improve from 0.86019
Epoch 11/40

Epoch 00011: val_accuracy did not improve from 0.86019
Epoch 12/40

Epoch 00012: val_accuracy improved from 0.86019 to 0.87


Epoch 00017: val_accuracy did not improve from 0.89038
Epoch 18/40

Epoch 00018: val_accuracy did not improve from 0.89038
Epoch 19/40

Epoch 00019: val_accuracy did not improve from 0.89038
Epoch 20/40

Epoch 00020: val_accuracy did not improve from 0.89038
Epoch 21/40

Epoch 00021: val_accuracy did not improve from 0.89038
Epoch 22/40

Epoch 00022: val_accuracy did not improve from 0.89038
Epoch 23/40

Epoch 00023: val_accuracy did not improve from 0.89038
Epoch 24/40

In [None]:
# Plot every training metric next to its validation counterpart, one panel per metric.
hist = history.history
all_keys = list(hist.keys())

# Only training-side keys get a panel; the matching 'val_' series is overlaid on it.
# Filtering explicitly avoids relying on the key order or on there being exactly
# ten training metrics.
train_keys = [key for key in all_keys if not key.startswith('val_')]

n_cols = 2
n_rows = max(1, -(-len(train_keys) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 15), squeeze=False)
panels = axes.flatten()

for ax, key in zip(panels, train_keys):
    val_key = 'val_' + key
    ax.plot(hist[key], label=key)
    if val_key in hist:
        ax.plot(hist[val_key], label=val_key)
    # Label this panel explicitly; plt.xlabel/plt.ylabel would only touch the
    # figure's current axes (the last subplot created).
    ax.set_xlabel('Epoch')
    ax.set_ylabel(key)
    ax.legend(loc='lower right')

# Hide any panels left over when the metric count is odd.
for ax in panels[len(train_keys):]:
    ax.set_visible(False)

fig.tight_layout()

In [None]:
# Confusion matrix on the validation images (rows = true class, columns = predicted class).
# A dedicated iterator with shuffle=False is used so that the order of predict() output
# matches the order of `.classes`; a shuffling iterator would misalign the two.
eval_gen = ImageDataGenerator().flow_from_directory(
    new_path / 'validation',
    class_mode='binary',
    shuffle=False,
)

# The model outputs sigmoid probabilities of shape (N, 1); flatten and threshold at 0.5
# (the same cut-off the BinaryAccuracy metric uses by default) to get hard class labels.
val_probabilities = cnn.predict(eval_gen).ravel()
val_predictions = (val_probabilities > 0.5).astype('int32')

# confusion_matrix takes the ground-truth labels first, then the predictions.
tf.math.confusion_matrix(
    labels=eval_gen.classes,
    predictions=val_predictions,
    num_classes=2,
)


In [171]:
# Score the trained model on the validation iterator. The result is the loss followed
# by the compiled metrics (tp, fp, tn, fn, accuracy, precision, recall, auc, prc).
# NOTE(review): the recorded output below shows 188 steps and 6000 samples in total,
# which does not match the 3047 validation images reported earlier. It presumably comes
# from an earlier run; re-execute this cell after training to refresh it.
cnn.evaluate(x=cb_validation, verbose=2)


188/188 - 4s - loss: 0.1867 - tp: 1250.0000 - fp: 327.0000 - tn: 4274.0000 - fn: 149.0000 - accuracy: 0.9207 - precision: 0.7926 - recall: 0.8935 - auc: 0.9712 - prc: 0.9094


[0.18670053780078888,
 1250.0,
 327.0,
 4274.0,
 149.0,
 0.9206666946411133,
 0.7926442623138428,
 0.8934953808784485,
 0.971164882183075,
 0.9094194173812866]