<h1>Dog Breed Classification</h1>
<a id='Intro'></a>
<h1>Continued</h1>

This is part two of an interactive notebook series, showcasing further modeling on the dog breed classification data. 
<br>
The part one notebook can be found at: https://github.com/matthewjchin/dogbreedclassification/blob/main/dog_breed_class.ipynb

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
# It would be recommended to run this notebook on Colab or Kaggle

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Summarise what is available under the input directory.
# One line per directory (instead of one per file) keeps the output short:
# the training folder alone contains thousands of images.
for dirname, _, filenames in os.walk('/kaggle/input'):
    print(f"{dirname}: {len(filenames)} files")

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [9]:
# Import basic libraries
import pandas as pd
import numpy as np
import os

# Import plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Seed NumPy's global random number generator for this session
# (TensorFlow is seeded separately, right after it is imported).
np.random.seed(94)

In [10]:
# Show the working directory before it is changed in the next cell
os.getcwd()

'/kaggle/working'

In [11]:
# Switch into the competition data folder so labels.csv can be read by a
# relative path. From here on, relative paths resolve inside the input
# directory, which the Kaggle notes in the first cell describe as read-only;
# write any outputs to an absolute path under /kaggle/working instead.
os.chdir("/kaggle/input/dog-breed-identification")
os.getcwd()

'/kaggle/input/dog-breed-identification'

In [12]:
# labels.csv is read by a relative path, which only works because the previous
# cell changed the working directory to the Kaggle dataset folder.
# Adjust the path when running this notebook outside Kaggle (e.g. offline).
labels = pd.read_csv("labels.csv")
labels.head()

Unnamed: 0,id,breed
0,000bec180eb18c7604dcecc8fe0dba07,boston_bull
1,001513dfcb2ffafc82cccf4d8bbaba97,dingo
2,001cdf01b096e06d78e9e5112d419397,pekinese
3,00214f311d5d2247d5dfe4fe24b2303d,bluetick
4,0021f9ceb3235effd7fcde7f7538ed62,golden_retriever


In [13]:
# Folder holding the training images; used below to build each image's full path
train_dir = '/kaggle/input/dog-breed-identification/train'

In [14]:
# Build the full file path of every image (<train_dir>/<id>.jpg) and store it
# as a new 'path' column, in the same row order as the 'id' column.
labels['path'] = [train_dir + "/" + image_id + '.jpg' for image_id in labels['id']]
labels.head()

Unnamed: 0,id,breed,path
0,000bec180eb18c7604dcecc8fe0dba07,boston_bull,/kaggle/input/dog-breed-identification/train/0...
1,001513dfcb2ffafc82cccf4d8bbaba97,dingo,/kaggle/input/dog-breed-identification/train/0...
2,001cdf01b096e06d78e9e5112d419397,pekinese,/kaggle/input/dog-breed-identification/train/0...
3,00214f311d5d2247d5dfe4fe24b2303d,bluetick,/kaggle/input/dog-breed-identification/train/0...
4,0021f9ceb3235effd7fcde7f7538ed62,golden_retriever,/kaggle/input/dog-breed-identification/train/0...


In [15]:
# Import TensorFlow, Keras, Scikit-learn libraries to be used

import tensorflow as tf # the installed version is printed in the next cell
import keras
from keras import layers, models, callbacks, optimizers
# NOTE(review): `keras.src.legacy` looks like an internal module path rather
# than a public one - confirm it stays importable when the Keras version changes.
from keras.src.legacy.preprocessing.image import ImageDataGenerator
from keras.preprocessing.image import img_to_array, load_img
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Activation, Dropout, Flatten, Dense
from keras import backend as K

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Seed TensorFlow's global random generator (NumPy is seeded in an earlier cell)
tf.random.set_seed(104)

In [16]:
# Report the installed TensorFlow and Keras versions
# (the saved output of this cell shows 2.18.0 and 3.8.0 respectively)
print(tf.__version__, keras.__version__)

2.18.0 3.8.0


In [17]:
# Constants shared by the modelling cells below

# Data
IMG_SIZE = 256  # images are resized to IMG_SIZE x IMG_SIZE when loaded
DOG_BREEDS = labels['breed'].nunique(dropna=False)  # number of distinct breed labels
RANDOM_STATE = 50

# Training
BATCH_SIZE = 32
LEARNING_RATE = 0.005
EPOCHS = 5  # may vary based on number of tests

In [18]:
# Report which GPUs TensorFlow can see (expected only on Colab or Kaggle);
# without one, fall back to CPU-friendly settings.

gpus = tf.config.list_physical_devices('GPU')

if not gpus:
    # CPU run: use a smaller batch size
    BATCH_SIZE, EPOCHS = 16, 5
    print("Using CPU")
else:
    print("GPU being used for training")
    for gpu in gpus:
        print(f"GPU Name: {gpu.name}")


GPU being used for training
GPU Name: /physical_device:GPU:0
GPU Name: /physical_device:GPU:1


In [19]:
# Fit an encoder on the breed names, then store each row's integer code
# in a new 'breed_id' column.
encoder = LabelEncoder().fit(labels['breed'])
labels['breed_id'] = encoder.transform(labels['breed'])
labels.head()


Unnamed: 0,id,breed,path,breed_id
0,000bec180eb18c7604dcecc8fe0dba07,boston_bull,/kaggle/input/dog-breed-identification/train/0...,19
1,001513dfcb2ffafc82cccf4d8bbaba97,dingo,/kaggle/input/dog-breed-identification/train/0...,37
2,001cdf01b096e06d78e9e5112d419397,pekinese,/kaggle/input/dog-breed-identification/train/0...,85
3,00214f311d5d2247d5dfe4fe24b2303d,bluetick,/kaggle/input/dog-breed-identification/train/0...,15
4,0021f9ceb3235effd7fcde7f7538ed62,golden_retriever,/kaggle/input/dog-breed-identification/train/0...,49


In [20]:
# Load every training image into a single array and one-hot encode the breeds.
#
#   x - numpy array of shape (n_images, IMG_SIZE, IMG_SIZE, 3)
#   y - DataFrame of dummy variables, one column per dog breed
#
# The array is allocated once and filled image by image. Collecting the images
# in a Python list and converting afterwards would keep two full copies of the
# pixel data in memory at the same time, which is significant at this size.
image_paths = labels['path'].tolist()

# dtype follows Keras' configured float type, which is what img_to_array is
# expected to return when no dtype is passed (float32 in the saved outputs).
x = np.empty((len(image_paths), IMG_SIZE, IMG_SIZE, 3), dtype=K.floatx())
for row, image_path in enumerate(image_paths):
    # load_img resizes each picture to IMG_SIZE x IMG_SIZE while reading it
    x[row] = img_to_array(load_img(image_path, target_size=(IMG_SIZE, IMG_SIZE)))

y = pd.get_dummies(labels['breed'])
print(x.shape, y.shape)

(10222, 256, 256, 3) (10222, 120)


In [22]:
# Do the train-test split
# Implement the training and validation sets for x, y so that train-test size is 80%-20%
#
# random_state pins the shuffle to RANDOM_STATE, so the same rows land in each
# subset no matter how often, or in what order, the notebook cells are run.
x_train, x_valid, y_train, y_valid = train_test_split(
    x, y, test_size=0.2, random_state=RANDOM_STATE
)

print(x_train.shape,y_train.shape)
print(x_valid.shape,y_valid.shape)

(8177, 256, 256, 3) (8177, 120)
(2045, 256, 256, 3) (2045, 120)


In [23]:
# Sanity check: container type of each split, then the dtype of the image arrays
print(*(type(part) for part in (x_train, x_valid, y_train, y_valid)))
print(*(images.dtype for images in (x_train, x_valid)))

<class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'pandas.core.frame.DataFrame'> <class 'pandas.core.frame.DataFrame'>
float32 float32


In [None]:
# len(base_model.layers)

In [None]:
# base_model.layers

<a id='Different'></a>
<h1>Trying a Different Model</h1>

Try another model to see if we can get better predictions.<br>
Use the VGG16 deep CNN for image classification; the name refers to its 16 weight layers (13 convolutional and 3 fully connected).<br>

First, perform some image augmentation using ImageDataGenerator on the train data.<br>
Also, set up a data generator for validation data.<br>

We will use VGG16 as the architecture to try and get better predictions.

In [21]:

# Random transformations applied to training images only
augmentation_options = {
    "rotation_range": 40,
    "width_shift_range": 0.2,
    "height_shift_range": 0.2,
    "shear_range": 0.2,
    "zoom_range": 0.2,
    "horizontal_flip": True,
}

# Both generators multiply pixel values by 1/255; only the training one augments.
# NOTE(review): ImageNet-pretrained VGG16 is usually paired with its own
# preprocess_input function rather than 1/255 scaling - confirm this is intended.
train_data_gen = ImageDataGenerator(rescale=1./255., **augmentation_options)

valid__data_gen = ImageDataGenerator(rescale=1./ 255.)

In [None]:
from keras import layers
from keras import models

from keras.applications import (ResNet50, ResNet101, 
                                InceptionResNetV2, InceptionV3,
                                EfficientNetB0, 
                                VGG16, VGG19)

# VGG16 with ImageNet weights and without its classifier top, sized for our images
img_shape = (IMG_SIZE, IMG_SIZE, 3)
base_model = VGG16(weights='imagenet', include_top=False, input_shape=img_shape)

# Freeze the whole pretrained base: only layers added on top of it will be trained
for layer in base_model.layers:
    layer.trainable = False


In [None]:
# Print the layer-by-layer summary of the frozen VGG16 base
base_model.summary()

In [None]:
# Look up the layer named 'block5_pool'; its output feeds the new classifier head
last_layer = base_model.get_layer('block5_pool')

In [None]:
# Symbolic output tensor of that layer, which the classifier head is built on.
# NOTE(review): the displayed name looks auto-generated, so the number in it
# will presumably differ from session to session.
last_layer_output = last_layer.output
last_layer_output.name

'keras_tensor_239'

In [None]:
# Shape of the feature map handed to the head (the saved output shows (None, 8, 8, 512))
last_layer_output.shape

(None, 8, 8, 512)

In [None]:
from keras import optimizers
# Build the classification head on top of the frozen VGG16 base and compile it.
#
# The intermediate tensor is called `head` rather than `x`, so that running this
# cell does not overwrite the image array `x` created by the data-loading cell.

# Get layers to be one single dimension
head = layers.Flatten()(last_layer_output)

# Add a fully connected layer with 256 units and ReLU
head = layers.Dense(256, activation='relu')(head)

# Normalize the batch
head = layers.BatchNormalization()(head)

# Add another fully connected layer with 256 units and ReLU
head = layers.Dense(256, activation='relu')(head)

# A Dense(1, activation='sigmoid') layer used to sit here. It squeezed all 256
# features into a single number, so the softmax layer had only one scalar to
# separate every breed with. It has been removed.

# Set dropout rate to 0.6
head = layers.Dropout(0.6)(head)

# Normalize the batch again
head = layers.BatchNormalization()(head)

# Output layer: one softmax probability per breed (DOG_BREEDS is derived from
# the labels, so it always matches the number of one-hot columns in y)
output = layers.Dense(DOG_BREEDS, activation='softmax')(head)

model = models.Model(base_model.input, output)

# categorical_crossentropy matches the one-hot encoded targets.
# The metric name is pinned so the training logs are keyed 'auc' / 'val_auc'
# even when this cell is executed more than once in the same session.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=[tf.keras.metrics.AUC(name='auc')]
)



In [None]:
# history = model.fit(train_ds, validation_data=val_ds,
#                     epochs=10, verbose=1,
#                    )

# from keras import losses

# # Model Architecture
# x = layers.Flatten()(last_layer_output)
# x = layers.Dense(256, activation='relu')(x)
# x = layers.BatchNormalization()(x)
# x = layers.Dense(256, activation='relu')(x)
# x = layers.Dropout(0.3)(x)
# x = layers.BatchNormalization()(x)
# output = layers.Dense(120, activation='softmax')(x)

# model = models.Model(base_model.input, output)

# # Model Compilation
# model.compile(
# 	optimizer='adam',
# 	loss=losses.CategoricalCrossentropy(from_logits=True),
# 	metrics=[tf.keras.metrics.AUC()]
# )

In [1]:
# class myCallback(tf.keras.callbacks.Callback):
# 	def on_epoch_end(self, epoch, logs={}):
# 		if logs.get('val_auc') is not None and logs.get('val_auc') > 0.99:
# 			print('\n Validation AUC has exceeded 0.99, so stopping further training.')
# 			self.model.stop_training = True

# es = EarlyStopping(patience=3,
#                 monitor='val_auc',
#                 restore_best_weights=True,
#                 mode='max')

# lr = ReduceLROnPlateau(monitor='val_loss',
# 					patience=2,
# 					factor=0.5,
# 					verbose=1)

In [None]:
# base_model = InceptionV3(
#     input_shape=(128, 128, 3),
#     weights='imagenet',
#     include_top=False
# )

In [None]:
# # Create data generators for ImageDataGenerators
# train_gen = train_gen.flow_from_dataframe(train_set, train_dir, 'id', 'breed', target_size=(IMG_SIZE, IMG_SIZE),
#                                           batch_size=BATCH_SIZE, class_mode='categorical')
# test_gen = test_gen.flow_from_dataframe(test_set, train_dir, 'id', 'breed', target_size=(IMG_SIZE, IMG_SIZE),
#                                           batch_size=BATCH_SIZE, class_mode='categorical')
# valid_gen = valid_gen.flow_from_dataframe(valid_set, val_dir, 'id', 'breed', target_size=(IMG_SIZE, IMG_SIZE),
#                                           batch_size=BATCH_SIZE, class_mode='categorical')
