In [51]:
# copied from last notebook 
# Put these at the top of every notebook, to get automatic reloading and inline plotting
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# Importing required libraries
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import keras
import sklearn
import matplotlib.pyplot as plt

from keras.applications.vgg19 import VGG19
from keras.models import Model
from keras.layers import Dense, Dropout, Flatten

from tqdm import tqdm
from sklearn.model_selection import train_test_split
from tensorflow.python.keras.preprocessing.image import load_img


In [52]:
# Locations of the Kaggle dog-breed data. Every path below is derived from
# DATA_DIR, so pointing the notebook at another copy is a one-line change.
DATA_DIR = "/home/mitchell/kaggleData/dogBreedIdentifier/data/"

# The image directories keep their trailing slash because later cells build
# file paths by plain string concatenation.
train_image_dir = DATA_DIR + "train/"
test_image_dir = DATA_DIR + "test/"

# labels.csv maps each training image id to its breed.
y_train = pd.read_csv(DATA_DIR + "labels.csv")

# sample_submission.csv supplies the ids of the test images.
df_test = pd.read_csv(DATA_DIR + "sample_submission.csv")

In [53]:
# Preview the first five rows of the labels table
y_train.head(n=5)


Unnamed: 0,id,breed
0,000bec180eb18c7604dcecc8fe0dba07,boston_bull
1,001513dfcb2ffafc82cccf4d8bbaba97,dingo
2,001cdf01b096e06d78e9e5112d419397,pekinese
3,00214f311d5d2247d5dfe4fe24b2303d,bluetick
4,0021f9ceb3235effd7fcde7f7538ed62,golden_retriever


In [54]:
# Show the column names of the labels table
y_train.columns

Index(['id', 'breed'], dtype='object')

In [55]:
# One-hot encode the labels: one indicator column per distinct breed
breed = pd.Series(y_train['breed'])
one_hot = pd.get_dummies(breed, sparse=True)

In [56]:
# Convert the one-hot table to a NumPy array and show its (rows, columns) shape
one_hot_labels = np.asarray(one_hot)
np.shape(one_hot_labels)


(10222, 120)

In [57]:
# Count the training rows per breed (aggfunc=len puts the count in the 'id'
# column) and order the breeds from most to least frequent, to inspect the
# class distribution of the training set.
y_plot = (
    y_train
    .pivot_table(index='breed', aggfunc=len)
    .sort_values(by='id', ascending=False)
)

In [58]:
# trainFilenames.csv was created in a terminal with:  'ls' > trainFilenames.csv
# It lists every training filename and has no header row.
trainFilenames = pd.read_csv(
    "/home/mitchell/kaggleData/dogBreedIdentifier/data/trainFilenames.csv",
    header=None,
)
trainFilenames.size

10222

In [59]:
# Turn the filename frame into a nested Python list: after the transpose the
# frame is a single row, so the result is one outer list wrapping one inner
# list of filenames.
# `.values` is used instead of DataFrame.as_matrix(), which was deprecated in
# pandas 0.23 and removed in pandas 1.0.
trainFilenames = trainFilenames.T.values.tolist()

In [60]:
# Flatten the list-of-lists into a single flat list of filenames
trainFilenames = [name for row in trainFilenames for name in row]

In [61]:
# Prefix every filename with the training directory to get its full path
train_img_paths = [train_image_dir + filename for filename in tqdm(trainFilenames)]

100%|██████████| 10222/10222 [00:00<00:00, 3201237.62it/s]


In [74]:
# Containers that the image-loading cells append to
x_train, train_labels, x_test = [], [], []

# Images are resized to a square of im_size x im_size pixels
im_size = 90
img_height = img_width = im_size

# Mini-batch size passed to model.fit
bs = 64

In [63]:
# Load every training image named in the labels table, resized to
# (img_height, img_width), and pair it with its one-hot label row.
#
# The lists are (re)created here so that re-running this cell rebuilds them
# instead of appending a second copy of the data set.
x_train = []
train_labels = []

# one_hot_labels was derived from y_train['breed'], so row i of one belongs to
# row i of the other; fail loudly if they ever get out of step.
assert len(one_hot_labels) == len(y_train), "labels and one-hot rows are misaligned"

# zip keeps image ids and label rows in lock-step without a manual counter.
# The breed string of each row is not needed here (the one-hot row carries it).
for (f, _), label in zip(tqdm(y_train.values), one_hot_labels):
    img = load_img(train_image_dir + '{}.jpg'.format(f), target_size=(img_height, img_width))
    # the loaded image must be converted to an array before it is stored
    x_train.append(np.array(img, dtype="int32"))
    train_labels.append(label)

100%|██████████| 10222/10222 [00:13<00:00, 757.37it/s]


In [64]:
# Load every test image named in df_test['id'], resized to the same
# (img_height, img_width) as the training images.
#
# x_test is (re)created here so the cell can be re-run safely: it neither
# appends duplicates nor fails when x_test has since been rebound to an array.
x_test = []
for f in tqdm(df_test['id'].values):
    img = load_img(test_image_dir + '{}.jpg'.format(f), target_size=(img_height, img_width))
    # the loaded image must be converted to an array before it is stored
    x_test.append(np.array(img, dtype="int32"))

100%|██████████| 10357/10357 [00:13<00:00, 779.79it/s]


In [65]:
# Stack the Python lists into NumPy arrays: pixel data as float32 divided by
# 255, labels as uint8.
# NOTE: x_test is rebound from a list to the scaled array, so running this cell
# twice without reloading the test images would divide by 255 a second time.
x_train_raw = np.array(x_train, dtype=np.float32) / 255.0
y_train_raw = np.array(train_labels, dtype=np.uint8)
x_test = np.array(x_test, dtype=np.float32) / 255.0

In [66]:
# Sanity-check the array shapes: training images, training labels, test images
print(x_train_raw.shape, y_train_raw.shape, x_test.shape, sep="\n")

(10222, 90, 90, 3)
(10222, 120)
(10357, 90, 90, 3)


In [67]:
# The label matrix has one column per breed (120 in the shapes printed above).
# Keep that count in num_class so the model's output layer can be sized from it.
num_class = np.shape(y_train_raw)[1]

In [68]:
# Hold out 30% of the training data as a validation set (fixed seed so the
# split is repeatable)
X_train, X_val, Y_train, Y_val = train_test_split(
    x_train_raw,
    y_train_raw,
    test_size=0.3,
    random_state=1,
)

In [70]:
# Pre-trained VGG19 convolutional base: ImageNet weights, no classifier head,
# input sized to the resized images. The weights file is downloaded on first use.
base_model = VGG19(weights='imagenet', include_top=False, input_shape=(im_size, im_size, 3))

# Freeze the pre-trained layers so that, at first, only the newly added
# (randomly initialised) head is trained
for layer in base_model.layers:
    layer.trainable = False

# New head: flatten the convolutional features, then a softmax over the breeds
x = Flatten()(base_model.output)
predictions = Dense(num_class, activation='softmax')(x)

# This is the model we will train
model = Model(inputs=base_model.input, outputs=predictions)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Stop early once validation accuracy has failed to improve for 3 epochs.
# NOTE(review): newer Keras releases report this metric as 'val_accuracy' --
# confirm the name against the installed version.
callbacks_list = [keras.callbacks.EarlyStopping(monitor='val_acc', patience=3, verbose=1)]

model.summary()

Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg19_weights_tf_dim_ordering_tf_kernels_notop.h5
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 90, 90, 3)         0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 90, 90, 64)        1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 90, 90, 64)        36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 45, 45, 64)        0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 45, 45, 128)       73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 45, 45, 128)       147584   

In [75]:
# Train the new head for up to 6 epochs, validating on the hold-out arrays
# (X_val, Y_val) produced by the train_test_split cell; early stopping is
# supplied through callbacks_list.
model.fit(X_train, Y_train, batch_size=bs, epochs=6, validation_data=(X_val, Y_val), verbose=1, callbacks=callbacks_list)

Train on 7155 samples, validate on 3067 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7f9a7003feb8>