# Import Data & Packages

In [18]:
import time 
import matplotlib.pyplot as plt
import scipy
import numpy as np

from tensorflow.keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
import os, shutil
import pickle
from shutil import copyfile


import talos


from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import optimizers
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.activations import *



# Seed NumPy's global random number generator so that NumPy-based randomness
# is repeatable from one fresh run of the notebook to the next.
np.random.seed(123)

In [19]:
#import the datasets
#
# NOTE: pickle.load can execute arbitrary code while deserialising, so only
# load files that were produced by a trusted run of this project.
# NOTE(review): train_img/test_img/val_img, train_images/test_images,
# train_labels/test_labels and the *_y arrays are all reassigned further down
# in this notebook - confirm these loads are still needed.

def _load_pickle(path):
    """Return the object stored in the pickle file at `path`.

    The file is opened in a `with` block so the handle is closed as soon as
    the object has been read (the inline `open(...)` form never closed it).
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


train_y = _load_pickle("train_y.pkl")
test_y = _load_pickle("test_y.pkl")
val_y = _load_pickle("val_y.pkl")

train_img = _load_pickle("train_img.pkl")
test_img = _load_pickle("test_img.pkl")
val_img = _load_pickle("val_img.pkl")

train_images = _load_pickle("train_images.pkl")
test_images = _load_pickle("test_images.pkl")
val_images = _load_pickle("validate_images.pkl")

train_labels = _load_pickle("train_labels.pkl")
test_labels = _load_pickle("test_labels.pkl")
val_labels = _load_pickle("val_labels.pkl")

# Combining data with Stanford Dog Dataset

Given that our dataset has relatively few images per class, we'll combine the images we scraped with the [Stanford Dog Dataset](http://vision.stanford.edu/aditya86/ImageNetDogs/).

After downloading, we cleaned the folder names to align with our existing data and removed any folders containing images for breeds not included in our original dataset. Note that not all 50 of our breeds are included in the Stanford Dog Dataset, so the combined data is limited to the breeds available in both datasets.

In [20]:
# The 50 breed names used by our original (scraped) dataset, in alphabetical order.
dog_breeds = [
    'Akitas',
    'Australian Shepherds',
    'Basset Hounds',
    'Beagles',
    'Belgian Malinois',
    'Bernese Mountain',
    'Bichons Frises',
    'Bloodhounds',
    'Border Collies',
    'Boston Terriers',
    'Boxers',
    'Brittanys',
    'Bulldogs',
    'Cane Corso',
    'Cavalier King Charles Spaniels',
    'Chesapeake Bay Retrievers',
    'Chihuahuas',
    'Cocker Spaniels',
    'Dachshunds',
    'Dalmatians',
    'Doberman Pinschers',
    'English Cocker Spaniels',
    'English Springer Spaniels',
    'French Bulldogs',
    'German Shepherd',
    'German Shorthaired Pointers',
    'Golden Retrievers',
    'Great Danes',
    'Havanese',
    'Labrador Retrievers',
    'Maltese',
    'Mastiffs',
    'Miniature American Shepherds',
    'Miniature Schnauzers',
    'Newfoundlands',
    'Pembroke Welsh Corgis',
    'Pomeranians',
    'Poodles',
    'Portuguese Water Dogs',
    'Pugs',
    'Rhodesian Ridgebacks',
    'Rottweilers',
    'Shetland Sheepdogs',
    'Shiba Inu',
    'Shih Tzu',
    'Siberian Huskies',
    'Vizslas',
    'Weimaraners',
    'West Highland White Terriers',
    'Yorkshire Terriers',
]

In [30]:
# Create the train / test / validate directory tree, with one sub-folder per
# breed inside each split.
#
# os.makedirs(..., exist_ok=True) is used instead of os.mkdir so the cell can
# be re-run (Restart & Run All) without raising FileExistsError.
#
# NOTE(review): this uses whatever dog_breeds holds when the cell runs; the
# list is reassigned in a later cell, so the breed folders created here may not
# match the breeds that later receive images - confirm the intended order.
new_dir = 'split_data_with_stanford/'
train = os.path.join(new_dir, 'train')
test = os.path.join(new_dir, 'test')
validate = os.path.join(new_dir, 'validate')

for split_dir in (train, test, validate):
    # Create the split folder itself even when dog_breeds is empty.
    os.makedirs(split_dir, exist_ok=True)
    for breed in dog_breeds:
        os.makedirs(os.path.join(split_dir, breed), exist_ok=True)

In [31]:
#We will copy our Dog Image folder, create sub-folders for each breed and then combine the breed-specific folders
#
# exist_ok=True keeps this cell re-runnable: os.mkdir raised FileExistsError
# whenever the folder was already present from an earlier run.

new_dir = 'dog-images-by-breed/'
os.makedirs(new_dir, exist_ok=True)


In [32]:
#Limiting dog breeds to those available in both datasets
#
# The breed list is taken from the sub-folder names of the Stanford image
# directory. Only directories are kept (stray files such as OS metadata would
# otherwise be treated as breeds) and the result is sorted, because
# os.listdir returns entries in arbitrary order.

stanford_dog_data = './stanford-dog-Images'

dog_breeds = sorted(
    entry
    for entry in os.listdir(stanford_dog_data)
    if os.path.isdir(os.path.join(stanford_dog_data, entry))
)
print(len(dog_breeds), " breeds in common")



37  breeds in common


In [33]:
# Number of breeds in dog_breeds after it was rebuilt from the Stanford folder listing.
len(dog_breeds)

37

In [34]:
# Make a folder for each breed inside new_dir.
# exist_ok=True lets the cell be re-run without failing on folders that
# already exist.

for breed in dog_breeds:
    os.makedirs(os.path.join(new_dir, breed), exist_ok=True)

In [35]:
#Combining images into destination folders
#
# For every breed, the matching files from our scraped data and then the files
# from the breed's Stanford folder are copied into destination_dir/<breed>/ and
# renamed to <Breed_Name>_<n>.png, using one running counter per breed.
#
# Changes from the earlier version of this cell:
#   * The counter simply continues from the scraped images into the Stanford
#     images instead of being re-derived from the destination folder's
#     contents. On a first run the numbering is the same; on a re-run the same
#     names are overwritten rather than duplicates being appended.
#   * Directory listings are sorted so the numbering does not depend on the
#     arbitrary order returned by os.listdir.
#   * The scraped-data listing is read once instead of once per breed.

destination_dir = './dog-images-by-breed/'
original_data_dir = './Dog Images/'
stanford_dog_data_dir = './stanford-dog-Images'

# Loop-invariant: the scraped images all live in one flat folder.
original_files = sorted(os.listdir(original_data_dir))

for breed in dog_breeds:

    clean_breed = breed.replace(" ", "_")
    breed_dst_dir = os.path.join(destination_dir, breed)
    stanford_breed_dir = os.path.join(stanford_dog_data_dir, breed)

    #original data first, filtering only for the current breed by filename prefix
    sources = [
        os.path.join(original_data_dir, file)
        for file in original_files
        if file.startswith(clean_breed + "_")
    ]

    #then every file in the breed's stanford folder
    sources += [
        os.path.join(stanford_breed_dir, file)
        for file in sorted(os.listdir(stanford_breed_dir))
    ]

    for count, path in enumerate(sources, start=1):
        #defining destination so that the file is also re-named
        # NOTE: the bytes are copied unchanged; the new name always ends in
        # ".png" regardless of the source file's extension.
        dst = os.path.join(breed_dst_dir, clean_breed + "_" + str(count) + ".png")
        copyfile(path, dst)
          

In [36]:
# For each breed, count the files in its combined folder whose name starts
# with the breed name written with underscores instead of spaces.
image_count = []

for breed in dog_breeds:
    prefix = breed.replace(" ", "_")
    matching = [
        name
        for name in os.listdir('./dog-images-by-breed/' + breed)
        if name.startswith(prefix)
    ]
    image_count.append(len(matching))

In [37]:
# Map each breed name to its image count (both lists follow the order of dog_breeds).
breed_img_count_dict = {
    breed_name: n_images
    for breed_name, n_images in zip(dog_breeds, image_count)
}

In [38]:
#Determining the lowest number of images returned
#will limit all breeds to use this number to prevent class imbalance
#
# Note: despite its name, lowest_img_count holds the breed (dictionary key)
# with the fewest images; the count itself is looked up from the dictionary.

lowest_img_count = min(
    breed_img_count_dict,
    key=lambda breed_name: breed_img_count_dict[breed_name],
)
fewest_images = breed_img_count_dict[lowest_img_count]
print(str(fewest_images) + " images")

629 images


In [None]:
#629 is an increase from the original 478 images we were using previously

#train, test, validate sizes

#train = 377 ~60%
#test = 94 ~15%
#validate = 158 ~25%

# Splitting images into sub-folders

In [39]:
# Source folder holding the combined per-breed images, and the root of the split output.
combined_imgs_dir, destination = (
    './dog-images-by-breed/',
    './split_data_with_stanford/',
)

# One target folder per split.
train_dir, test_dir, validate_dir = (
    './split_data_with_stanford/train',
    './split_data_with_stanford/test',
    './split_data_with_stanford/validate',
)

In [40]:
# Split each breed's combined images into DISJOINT validate / test / train
# sets of 158 / 94 / 377 files (629 per breed in total).
#
# The previous slices were [:158], [159:253] and [160:537]: index 158 was never
# used and indices 160-252 were copied into both test and train, so test images
# leaked into the training data. The ranges below are contiguous and
# non-overlapping while keeping the same per-split sizes.
#
# NOTE: files left in the split folders by an earlier run are not removed;
# start from empty split folders when re-running with the corrected ranges.
N_VALIDATE = 158
N_TEST = 94
N_TRAIN = 377

split_plan = [
    (validate_dir, 0, N_VALIDATE),
    (test_dir, N_VALIDATE, N_VALIDATE + N_TEST),
    (train_dir, N_VALIDATE + N_TEST, N_VALIDATE + N_TEST + N_TRAIN),
]

for breed in dog_breeds:

    # Sorted once per breed so all three splits slice the same ordering.
    breed_files = sorted(os.listdir(combined_imgs_dir+breed))

    for split_dir, start, stop in split_plan:
        for file in breed_files[start:stop]:
            path = combined_imgs_dir+breed+'/'+file
            dst = split_dir+'/'+breed+'/'+file
            copyfile(path, dst)

## Checking counts & defining batch size

In [41]:
#train expected: 37 breeds x 377 training images per breed

37*377

13949

In [42]:
# Total number of files across all breed folders of the training split.
count = sum(len(os.listdir(train_dir+'/'+breed)) for breed in dog_breeds)

# The whole split is later requested from the generator as a single batch.
train_batch_size = count
print(count)

13949


In [43]:
#test expected: 37 breeds x 94 test images per breed

37*94

3478

In [44]:
# Total number of files across all breed folders of the test split.
count = sum(len(os.listdir(test_dir+'/'+breed)) for breed in dog_breeds)

# The whole split is later requested from the generator as a single batch.
test_batch_size = count
print(count)

3478


In [45]:
#validate expected: 37 breeds x 158 validation images per breed

37*158

5846

In [46]:
# Total number of files across all breed folders of the validation split.
count = sum(len(os.listdir(validate_dir+'/'+breed)) for breed in dog_breeds)

# The whole split is later requested from the generator as a single batch.
validate_batch_size = count
print(count)

5846


# Prep for Modelling

In [47]:
#resizing images

def _rescaled_flow(directory, batch_size):
    """Return a directory iterator over `directory`.

    Pixel values are rescaled by 1/255, images are resized to 150x150, and
    `batch_size` images are yielded per batch.
    """
    rescaler = ImageDataGenerator(rescale=1./255)
    return rescaler.flow_from_directory(
        directory,
        target_size=(150, 150),
        batch_size=batch_size,
    )


# Created in the same order as before: test, validate, train.
test_generator = _rescaled_flow(test_dir, test_batch_size)
val_generator = _rescaled_flow(validate_dir, validate_batch_size)
train_generator = _rescaled_flow(train_dir, train_batch_size)

Found 3478 images belonging to 37 classes.
Found 5846 images belonging to 37 classes.
Found 13949 images belonging to 37 classes.


In [48]:
# create the data sets
# Each next() call pulls one (images, labels) batch from its generator. The
# generators were created with batch_size equal to the file count of their
# split, so a single batch is expected to hold the whole split.
# These assignments replace the train_images / train_labels / test_images /
# test_labels values loaded from pickle at the top of the notebook.
# NOTE(review): the order of these calls may affect which shuffled order each
# generator draws - keep the order unchanged unless that is confirmed not to matter.
train_images, train_labels = next(train_generator)
test_images, test_labels = next(test_generator)
validate_images, validate_labels = next(val_generator)

In [49]:
# Explore the dataset again: sample counts per split and the array shapes.
m_train, num_px = train_images.shape[:2]
m_test = test_images.shape[0]
m_val = validate_images.shape[0]

summary_lines = [
    ("Number of training samples: ", m_train),
    ("Number of testing samples: ", m_test),
    ("Number of validation samples: ", m_val),
    ("train_images shape: ", train_images.shape),
    ("train_labels shape: ", train_labels.shape),
    ("test_images shape: ", test_images.shape),
    ("test_labels shape: ", test_labels.shape),
    ("val_images shape: ", validate_images.shape),
    ("val_labels shape: ", validate_labels.shape),
]

for prefix, value in summary_lines:
    print(prefix + str(value))

Number of training samples: 13949
Number of testing samples: 3478
Number of validation samples: 5846
train_images shape: (13949, 150, 150, 3)
train_labels shape: (13949, 37)
test_images shape: (3478, 150, 150, 3)
test_labels shape: (3478, 37)
val_images shape: (5846, 150, 150, 3)
val_labels shape: (5846, 37)


In [50]:
# Flatten every image into one row vector, keeping one row per sample.
train_img = train_images.reshape(len(train_images), -1)
test_img = test_images.reshape(len(test_images), -1)
val_img = validate_images.reshape(len(validate_images), -1)

for flattened in (train_img, test_img, val_img):
    print(flattened.shape)

(13949, 67500)
(3478, 67500)
(5846, 67500)


In [51]:
# Take the first label column of each split and shape it as a column vector.
# The row count is derived with -1 instead of the hard-coded 13949 / 3478 /
# 5846, so the cell keeps working if the split sizes change.
#
# NOTE(review): only column 0 of the label arrays is kept, i.e. a single
# indicator column out of all the label columns - confirm that is the
# intended target.
train_y = np.reshape(train_labels[:, 0], (-1, 1))
test_y = np.reshape(test_labels[:, 0], (-1, 1))
val_y = np.reshape(validate_labels[:, 0], (-1, 1))

In [52]:
#export the datasets for later use
#
# Each file is written inside a `with` block so the handle is flushed and
# closed as soon as the dump finishes (the inline `open(...)` form left every
# handle open). pickle is already imported at the top of the notebook.

exports = [
    ("train_stanford_y.pkl", train_y),
    ("test_stanford_y.pkl", test_y),
    ("val_stanford_y.pkl", val_y),

    ("train_stanford_img.pkl", train_img),
    ("test_stanford_img.pkl", test_img),
    ("val_stanford_img.pkl", val_img),

    ("train_stanford_images.pkl", train_images),
    ("test_stanford_images.pkl", test_images),
    ("validate_stanford_images.pkl", validate_images),

    ("train_stanford_labels.pkl", train_labels),
    ("test_stanford_labels.pkl", test_labels),
    ("val_stanford_labels.pkl", validate_labels),
]

for filename, dataset in exports:
    with open(filename, "wb") as handle:
        pickle.dump(dataset, handle)