### Imports

In [1]:
#General
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
import random
import time

#Image loading and processing
import os
from matplotlib import image
from skimage.transform import resize

#Model development
import tensorflow as tf
from tensorflow.keras import datasets, layers, models, optimizers
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from tensorflow.keras.preprocessing.image import ImageDataGenerator,load_img, img_to_array


from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input as VGG16_preprocess_input
from tensorflow.keras.applications.vgg16 import decode_predictions as VGG16_decode_predictions
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input as RN50_preprocess_input
from tensorflow.keras.applications.inception_resnet_v2 import InceptionResNetV2
from tensorflow.keras.optimizers import SGD

from sklearn.svm import SVC

#Model evaluation
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, roc_auc_score
import itertools
import seaborn as sns

### Define functions

In [2]:
def generate_image_set(breed_list, clean='y', threshold = 0, pic_size = (224,224,3)):

#Set up list of dog pics:
    path = 'dog_pics/'
    clean_path = 'clean_dog_pics/'
    file_list = sorted(os.listdir('dog_pics/'))
    file_list = file_list[1:]
    clean_file_list = sorted(os.listdir('clean_dog_pics/'))
    clean_file_list = clean_file_list[1:]

#Load pictures and labels from a specified breed list
    print('Loading pictures...')
    
    labels=[]
    images=list()
    for file in tqdm_notebook(clean_file_list):
        dog_breed=file[6:-7]
        pic_pc = int(file[:-4].split('_')[-1])
        if dog_breed in breed_list:
            if pic_pc >= threshold:
                if clean == 'y':
                    image = load_img(clean_path+file, target_size=pic_size)
                else:
                    load_filename = file[:-7]+'.jpg'
                    image = load_img(path+load_filename, target_size=pic_size)
                image = img_to_array(image)
                images.append(image)
                labels.append(dog_breed)                
    print('Setting up numpy array')
    images = np.array(images)
    
    print('Number of loaded images: ',len(images))
    print('Number of labels: ', len(labels))

#Set up label dictionaries and convert labels from text to integers for modelling
    breed_dict = {}
    for i in range (0, len(breed_list)):
        breed_dict[breed_list[i]]=i
    print(breed_dict)
    inv_breed_dict = {}
    for i in range (0, len(breed_list)):
        inv_breed_dict[i]=breed_list[i]
    print(inv_breed_dict)
    int_labels = [breed_dict[breed] for breed in labels]


#Set up train test split
    print('Setting up train/test split')
    train_images, test_images, train_labels, test_labels =\
    train_test_split(images, int_labels, test_size = 0.2, 
                     random_state = 1, shuffle = True, stratify = int_labels)
    

#Pre-process pictures for modelling
#    print('Pre-processing pictures')
#    train_images, test_images = train_images / 255.0, test_images / 255.0

#Reminder of baseline (updated to reflect number of loaded images):
    baseline_scores=[]
    for label in list(set(test_labels)):
        number_of_images = test_labels.count(label)
        pc_of_images = number_of_images / len(test_labels)
        print(inv_breed_dict[label],'; number of images: ',
              number_of_images, '; % of images', pc_of_images)
        baseline_scores.append(pc_of_images)
    baseline = max(baseline_scores)
    print('Baseline: ', baseline)
    
    return train_images, test_images, train_labels, test_labels, breed_dict, inv_breed_dict, baseline

In [3]:
#Define functions for reviewing results
def plot_image(i, predictions_array, true_label, img):
    predictions_array, true_label, img = predictions_array, true_label[i], img[i]
    plt.grid(False)
    plt.xticks([])
    plt.yticks([])

    plt.imshow(img, cmap=plt.cm.binary)

    predicted_label = np.argmax(predictions_array)
    if predicted_label == true_label:
        color = 'green'
    else:
        color = 'red'

    plt.xlabel("{} {:2.0f}% ({})".format(inv_breed_dict[predicted_label],
                                100*np.max(predictions_array),
                                inv_breed_dict[true_label]),
                                color=color)

def plot_value_array(i, predictions_array, true_label):
    predictions_array, true_label = predictions_array, true_label[i]
    plt.grid(False)
    plt.xticks(range(len(predictions_array)))
    plt.yticks([])
    thisplot = plt.bar(range(len(predictions_array)), predictions_array, color="#777777")
    plt.ylim([0, 1])
    predicted_label = np.argmax(predictions_array)
    if predicted_label == true_label:
        thisplot[predicted_label].set_color('green')
        thisplot[true_label].set_color('green')
    if predicted_label != true_label:
        thisplot[predicted_label].set_color('red')
        thisplot[true_label].set_color('pink')

In [4]:
def review_results(predictions, test_labels, test_images, sample_size = 10):
    
    for counter in range(sample_size):
        i = random.randint(0,len(test_labels)-1)
        print(i)
        print(inv_breed_dict)
        plt.figure(figsize=(6,3))
        plt.subplot(1,2,1)
        plot_image(i, predictions[i], test_labels, test_images)
        plt.subplot(1,2,2)
        plot_value_array(i, predictions[i],  test_labels)
        plt.show()

In [5]:
def summarize_diagnostics(history):
    fig, ax = plt.subplots(nrows=2)
    # plot loss
    ax[0].plot(history.history['loss'], color='blue', label='train')
    ax[0].plot(history.history['val_loss'], color='orange', label='test')
    ax[0].set_title('Loss')
    # plot accuracy
    ax[1].plot(history.history['acc'], color='blue', label='train')
    ax[1].plot(history.history['val_acc'], color='orange', label='test')
    ax[1].set_title('Acc')
    fig.tight_layout()
    plt.show(); 

In [6]:
def model_documentation(model_summary,
                        model, breed_list, training_sample_size, 
                        pic_resolution, pic_masked, pic_quality,
                        baseline_score, validation_score, 
                        train_score, test_score,
                        time_taken, notes):
    num_dog_breeds = len(breed_list)
    model_summary = model_summary.append({
        'model_type' : type(model),
        'model_parameters' : model.get_params(),
        'number_dog_breeds' : num_dog_breeds,
        'dog_breeds' : breed_list,
        'training_sample_size' : training_sample_size,
        'picture_resolution' : pic_resolution,
        'pictures_masked' : pic_masked,
        'picture_quality' : pic_quality,
        'baseline_score' : baseline_score,
        'validation_score' : validation_score,
        'train_score' : train_score,
        'test_score' : test_score,
        'time_taken' : time_taken,
        'notes' : notes},
        ignore_index = True)
#Display latest entry
    last_row = model_summary.shape[0]-1
    print(model_summary.iloc[last_row])
    return model_summary

In [7]:
def model_documentation_nn(model_summary,
                           model, breed_list, training_sample_size,
                           pic_resolution, pic_masked, pic_quality,
                           baseline_score, validation_score,
                           train_score, test_score,
                           time_taken, notes):
    num_dog_breeds = len(breed_list)
    stringlist = []
    model.summary(print_fn=lambda x: stringlist.append(x))
    short_model_summary = "\n".join(stringlist)
    model_summary = model_summary.append({
        'model_type' : type(model),
        'model_parameters' : short_model_summary,
        'number_dog_breeds' : num_dog_breeds,
        'dog_breeds' : breed_list,
        'training_sample_size' : training_sample_size,
        'picture_resolution' : pic_resolution,
        'pictures_masked' : pic_masked,
        'picture_quality' : pic_quality,
        'baseline_score' : baseline_score,
        'validation_score' : validation_score,
        'train_score' : train_score,
        'test_score' : test_score,
        'time_taken' : time_taken,
        'notes' : notes},
        ignore_index = True)
#Display latest entry
    last_row = model_summary.shape[0]-1
    print(model_summary.iloc[last_row])
    return model_summary

In [8]:
def modelling_process(model, train_images, train_labels, test_images, test_labels):
    start_time = time.time()
    cv_score = cross_val_score(model, train_images, train_labels, cv=5, n_jobs=-1)
    print('CV score: ', cv_score)
    mean_cv_score = cv_score.mean()
    print('Mean cv score: ', mean_cv_score)
    model.fit(train_images, train_labels)
    train_score = model.score(train_images, train_labels)
    print('Train score: ', train_score)
    test_score = model.score(test_images, test_labels)
    print('Test score: ', test_score)
    model_time = time.time() - start_time
    return model, mean_cv_score, train_score, test_score, model_time

In [9]:
def modelling_process_GS(model, train_images, train_labels, test_images, test_labels):
    start_time = time.time()
    model.fit(train_images, train_labels)
    mean_cv_score = model.cv_results_['mean_test_score'][model.best_index_]
    print('Mean cv score: ',mean_cv_score)
    train_score = model.score(train_images, train_labels)
    print('Train score: ', train_score)
    test_score = model.score(test_images, test_labels)
    print('Test score: ', test_score)
    model_time = time.time() - start_time
    return model, mean_cv_score, train_score, test_score, model_time

In [10]:
#Image feature preprocesssing
def feature_process(image_set, processor):
    image_set_prep = list()
    if processor == 'VGG16':
        for image in tqdm_notebook(image_set):
            image_resize = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
            features = model_VGG16_prep.predict(image_resize)
            image_set_prep.append(features[0])
    if processor == 'RN50':
        for image in tqdm_notebook(image_set):
            image_resize = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
            features = model_RN50_prep.predict(image_resize)
            image_set_prep.append(features[0])
    image_set_prep = np.array(image_set_prep)
    
    if processor == 'Inception':
        for image in tqdm_notebook(image_set):
            image_resize = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
            features = model_Inception_prep.predict(image_resize)
            image_set_prep.append(features[0])
    image_set_prep = np.array(image_set_prep)
    
    return image_set_prep

In [11]:
#Set up temporary model summary dataframe
master_model_summary = pd.read_pickle('model_summary.pkl')
temp_model_summary = pd.DataFrame(columns=master_model_summary.columns)

### Pre-Trained Model as Feature Extractor Preprocessor

In [12]:
model_RN50_prep = ResNet50()
model_RN50_prep.layers.pop()
model_RN50_prep = Model(inputs=model_RN50_prep.inputs, 
                         outputs=model_RN50_prep.layers[-1].output)

W1005 16:25:16.397925 4715791808 deprecation.py:506] From /anaconda3/envs/tensor_env/lib/python3.7/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [13]:
model_Inception_prep = InceptionResNetV2()
model_Inception_prep.layers.pop()
model_Inception_prep = Model(inputs=model_Inception_prep.inputs, 
                         outputs=model_Inception_prep.layers[-1].output)

In [14]:
model_SVM = SVC(kernel = 'rbf', gamma = 'scale')

### Run model for top 20 breeds

In [15]:
### First get most popular breeds:
file_list = sorted(os.listdir('dog_pics/'))
file_list = file_list[1:]

#Set up dataframe with dog breed data
file_breeds = [file[6:].split('.')[0] for file in file_list]
breed_data = pd.DataFrame(file_breeds, columns=['breed'])
breed_data.head()

Unnamed: 0,breed
0,bulldog
1,cross_breed_-_large
2,maltipoo
3,pomchi
4,chihuahua


In [16]:
breed_list = list(breed_data['breed'].value_counts()[0:23].index)
excl_breeds = ['cross_breed_-_medium','cross_breed_-_small','cross_breed_-_large']
for breed in excl_breeds:
    breed_list.remove(breed)
breed_list

['cockapoo',
 'labrador',
 'cocker_spaniel',
 'staffordshire_bull_terrier',
 'jack_russell_terrier',
 'french_bulldog',
 'border_collie',
 'shih_tzu',
 'cavapoo',
 'dachshund_(miniature)',
 'golden_retriever',
 'beagle',
 'pug',
 'english_springer_spaniel',
 'german_shepherd_dog',
 'cavalier_king_charles_spaniel',
 'chihuahua',
 'labradoodle',
 'yorkshire_terrier',
 'miniature_schnauzer']

In [17]:
breed_list = ['cockapoo','labrador', 'cocker_spaniel', 'staffordshire_bull_terrier',
              'jack_russell_terrier', 'french_bulldog', 'shih_tzu', 'border_collie',
              'cavapoo', 'dachshund_(miniature)', 'golden_retriever', 'beagle',
              'pug', 'english_springer_spaniel', 'cavalier_king_charles_spaniel', 'labradoodle',
              'german_shepherd_dog', 'chihuahua', 'yorkshire_terrier', 'miniature_schnauzer']

In [18]:
#Load images
clean = 'n'
threshold = 30
pic_size = (224, 224, 3)
train_images, test_images, train_labels, test_labels, breed_dict, inv_breed_dict, baseline =\
generate_image_set(breed_list, clean, threshold, pic_size)

Loading pictures...


HBox(children=(IntProgress(value=0, max=32459), HTML(value='')))


Setting up numpy array
Number of loaded images:  9719
Number of labels:  9719
{'cockapoo': 0, 'labrador': 1, 'cocker_spaniel': 2, 'staffordshire_bull_terrier': 3, 'jack_russell_terrier': 4, 'french_bulldog': 5, 'shih_tzu': 6, 'border_collie': 7, 'cavapoo': 8, 'dachshund_(miniature)': 9, 'golden_retriever': 10, 'beagle': 11, 'pug': 12, 'english_springer_spaniel': 13, 'cavalier_king_charles_spaniel': 14, 'labradoodle': 15, 'german_shepherd_dog': 16, 'chihuahua': 17, 'yorkshire_terrier': 18, 'miniature_schnauzer': 19}
{0: 'cockapoo', 1: 'labrador', 2: 'cocker_spaniel', 3: 'staffordshire_bull_terrier', 4: 'jack_russell_terrier', 5: 'french_bulldog', 6: 'shih_tzu', 7: 'border_collie', 8: 'cavapoo', 9: 'dachshund_(miniature)', 10: 'golden_retriever', 11: 'beagle', 12: 'pug', 13: 'english_springer_spaniel', 14: 'cavalier_king_charles_spaniel', 15: 'labradoodle', 16: 'german_shepherd_dog', 17: 'chihuahua', 18: 'yorkshire_terrier', 19: 'miniature_schnauzer'}
Setting up train/test split
cockapo

In [19]:
train_images_prep = feature_process(train_images, 'RN50')
test_images_prep = feature_process(test_images, 'RN50')

HBox(children=(IntProgress(value=0, max=7775), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1944), HTML(value='')))




In [20]:
model_SVM = SVC(kernel = 'rbf', gamma = 'scale')

In [21]:
model, mean_cv_score, train_score, test_score, model_time=\
modelling_process(model_SVM, train_images_prep, train_labels, 
                  test_images_prep, test_labels)

CV score:  [0.69487179 0.70025674 0.68272319 0.67824968 0.69534282]
Mean cv score:  0.6902888436650888
Train score:  0.7238585209003215
Test score:  0.7088477366255144


In [22]:
int_test_labels = list(set(test_labels))
int_name_test_labels = [str(label)+' '+inv_breed_dict[label] for label in int_test_labels]

In [23]:
test_preds = model.predict(test_images_prep)
conf_matrix = pd.DataFrame(confusion_matrix(test_labels, test_preds, labels = int_test_labels),
             index = int_name_test_labels, columns = int_test_labels)
conf_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0 cockapoo,257,0,15,0,5,0,5,0,2,0,1,0,1,0,0,4,0,0,0,9
1 labrador,1,178,7,3,6,0,0,0,0,4,4,2,1,0,0,1,3,0,0,0
2 cocker_spaniel,9,3,162,1,0,0,1,0,0,3,1,0,0,11,0,1,0,0,0,1
3 staffordshire_bull_terrier,1,6,2,84,10,2,1,0,0,2,0,0,1,0,0,0,4,2,0,0
4 jack_russell_terrier,6,5,2,9,56,4,1,2,0,0,0,0,2,0,0,1,3,6,1,3
5 french_bulldog,2,0,0,3,4,100,0,0,0,0,0,0,3,0,0,0,2,1,0,0
6 shih_tzu,9,0,1,0,0,0,78,0,0,0,0,0,0,1,0,0,1,1,1,0
7 border_collie,4,1,6,2,4,0,0,60,0,0,2,0,1,0,1,0,2,0,0,0
8 cavapoo,65,0,3,0,1,0,7,0,3,1,1,0,0,0,0,0,0,1,3,0
9 dachshund_(miniature),8,9,9,1,6,0,0,0,0,27,0,5,0,0,0,0,0,0,0,0


In [24]:
print(classification_report(test_labels, test_preds))

              precision    recall  f1-score   support

           0       0.59      0.86      0.70       299
           1       0.82      0.85      0.84       210
           2       0.62      0.84      0.72       193
           3       0.76      0.73      0.75       115
           4       0.51      0.55      0.53       101
           5       0.91      0.87      0.89       115
           6       0.80      0.85      0.82        92
           7       0.87      0.72      0.79        83
           8       0.60      0.04      0.07        85
           9       0.64      0.42      0.50        65
          10       0.80      0.79      0.79        66
          11       0.83      0.75      0.79        57
          12       0.86      0.85      0.86        67
          13       0.72      0.59      0.65        61
          14       0.96      0.35      0.52        62
          15       0.50      0.12      0.20        56
          16       0.72      0.85      0.78        54
          17       0.71    

### Run 20 iterations combining poodle and spaniel breeds as last breeds.

In [25]:
master_breed_list = ['labrador', 'staffordshire_bull_terrier','jack_russell_terrier', 'french_bulldog', 
                     'shih_tzu', 'border_collie', 'dachshund_(miniature)', 'golden_retriever', 
                     'beagle', 'pug', 'german_shepherd_dog', 'chihuahua', 
                     'yorkshire_terrier', 'miniature_schnauzer', 'bichon_frise', 'border_terrier',
                     'west_highland_white_terrier', 'whippet',
                     'cocker_spaniel', 'english_springer_spaniel', 'cavalier_king_charles_spaniel', 
                     'cavapoo','labradoodle',  'cockapoo']

In [26]:
clean = 'n'
threshold = 30
pic_size = (224, 224, 3)
train_images, test_images, train_labels, test_labels, breed_dict, inv_breed_dict, baseline =\
generate_image_set(master_breed_list, clean, threshold, pic_size)

Loading pictures...


HBox(children=(IntProgress(value=0, max=32459), HTML(value='')))


Setting up numpy array
Number of loaded images:  10483
Number of labels:  10483
{'labrador': 0, 'staffordshire_bull_terrier': 1, 'jack_russell_terrier': 2, 'french_bulldog': 3, 'shih_tzu': 4, 'border_collie': 5, 'dachshund_(miniature)': 6, 'golden_retriever': 7, 'beagle': 8, 'pug': 9, 'german_shepherd_dog': 10, 'chihuahua': 11, 'yorkshire_terrier': 12, 'miniature_schnauzer': 13, 'bichon_frise': 14, 'border_terrier': 15, 'west_highland_white_terrier': 16, 'whippet': 17, 'cocker_spaniel': 18, 'english_springer_spaniel': 19, 'cavalier_king_charles_spaniel': 20, 'cavapoo': 21, 'labradoodle': 22, 'cockapoo': 23}
{0: 'labrador', 1: 'staffordshire_bull_terrier', 2: 'jack_russell_terrier', 3: 'french_bulldog', 4: 'shih_tzu', 5: 'border_collie', 6: 'dachshund_(miniature)', 7: 'golden_retriever', 8: 'beagle', 9: 'pug', 10: 'german_shepherd_dog', 11: 'chihuahua', 12: 'yorkshire_terrier', 13: 'miniature_schnauzer', 14: 'bichon_frise', 15: 'border_terrier', 16: 'west_highland_white_terrier', 17: '

In [27]:
train_labels_combined = [18 if x in [19, 20] else x for x in train_labels]
train_labels_combined = [19 if x in [21, 22, 23] else x for x in train_labels_combined]

In [28]:
test_labels_combined = [18 if x in [19, 20] else x for x in test_labels]
test_labels_combined = [19 if x in [21, 22, 23] else x for x in test_labels_combined]

In [29]:
train_images_prep = feature_process(train_images, 'RN50')
test_images_prep = feature_process(test_images, 'RN50')

HBox(children=(IntProgress(value=0, max=8386), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2097), HTML(value='')))




In [30]:
model_SVM = SVC(kernel = 'rbf', gamma = 'scale')
model, mean_cv_score, train_score, test_score, model_time=\
modelling_process(model_SVM, train_images_prep, train_labels_combined, 
                  test_images_prep, test_labels_combined)

CV score:  [0.74688057 0.74776919 0.75714286 0.74089552 0.74085183]
Mean cv score:  0.746707992916779
Train score:  0.7773670403052707
Test score:  0.7491654744873629


In [31]:
int_test_labels = list(set(test_labels))
int_name_test_labels = [str(label)+' '+inv_breed_dict[label] for label in int_test_labels]

In [32]:
test_preds_combined = model.predict(test_images_prep)

In [33]:
int_test_labels_abbr = int_test_labels[0:20]
int_name_test_labels_abbr = int_name_test_labels[0:20]
int_name_test_labels_abbr[18] = '18 spaniel_crosses'
int_name_test_labels_abbr[19] = '19 poodle_crosses'

In [34]:
conf_matrix = pd.DataFrame(confusion_matrix(test_labels_combined, test_preds_combined, 
                                            labels = int_test_labels_abbr),
             index = int_name_test_labels_abbr, columns = int_test_labels_abbr)
conf_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0 labrador,172,7,2,0,0,0,8,3,3,2,1,0,0,0,0,0,0,2,9,1
1 staffordshire_bull_terrier,8,78,14,2,0,0,3,1,0,3,1,2,0,0,0,0,0,1,1,1
2 jack_russell_terrier,5,10,53,0,1,1,0,0,3,0,3,4,1,3,0,0,4,2,4,7
3 french_bulldog,2,3,1,101,0,0,0,0,0,6,1,0,0,0,0,0,0,0,1,0
4 shih_tzu,0,0,1,0,78,0,0,0,0,0,0,1,0,0,0,0,0,0,3,9
5 border_collie,1,1,4,0,0,56,0,1,0,1,6,2,0,0,0,0,0,0,9,2
6 dachshund_(miniature),8,3,1,0,0,0,26,1,6,0,0,1,0,0,0,0,0,2,10,7
7 golden_retriever,7,0,3,0,0,0,0,42,1,0,0,1,0,0,0,0,0,0,11,1
8 beagle,5,0,0,0,0,0,1,0,48,0,0,1,0,0,0,0,0,0,1,1
9 pug,1,2,1,2,0,1,0,0,0,58,0,0,1,0,0,0,0,0,1,0


In [35]:
print(classification_report(test_labels_combined, test_preds_combined))

              precision    recall  f1-score   support

           0       0.77      0.82      0.79       210
           1       0.72      0.68      0.70       115
           2       0.51      0.52      0.52       101
           3       0.95      0.88      0.91       115
           4       0.82      0.85      0.83        92
           5       0.90      0.67      0.77        83
           6       0.60      0.40      0.48        65
           7       0.75      0.64      0.69        66
           8       0.75      0.84      0.79        57
           9       0.81      0.87      0.83        67
          10       0.74      0.74      0.74        54
          11       0.67      0.65      0.66        54
          12       0.71      0.53      0.60        55
          13       0.64      0.82      0.72        55
          14       0.50      0.27      0.35        41
          15       0.86      0.42      0.56        43
          16       0.78      0.82      0.80        39
          17       0.71    