In [None]:
import sys
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import cv2
import random
from tqdm import tqdm
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
import keras
from keras.applications.vgg16 import VGG16
from keras.models import Model
from keras.layers import Dense, Dropout, Flatten

# import third-party library
sys.path.append('./my_lib/')
from data_augmentation import DataAugmentation

In [None]:
# Load the two metadata tables shipped with the dataset:
# the training labels and the sample submission file.
train_csv_path = '../input/labels.csv'
test_csv_path = '../input/sample_submission.csv'

csv_train = pd.read_csv(train_csv_path)
csv_test = pd.read_csv(test_csv_path)

In [None]:
# Preview the first 10 rows of the training table loaded into csv_train
csv_train.head(10)

In [None]:
# Preview the first 10 rows of the test / sample-submission table loaded into csv_test
csv_test.head(10)

In [None]:
# Generate labels: one-hot encode the 'breed' column of the training table
targets_series = pd.Series(csv_train['breed'])
one_hot = pd.get_dummies(targets_series, sparse = True)
labels = np.asarray(one_hot)

# Spot-check: print the encoding of one randomly chosen training row
n_check = random.randint(0, len(labels)-1)
encoded_row = ''.join(map(str, labels[n_check]))
print(csv_train['breed'][n_check], 'is encoded as', encoded_row)

In [None]:
# Side length (in pixels) every image is resized to
im_size = 90

# Accumulators filled by the loading cells below:
# training images, training labels and test images.
x_train, y_train, x_test = [], [], []

In [None]:
# Load every training image listed in csv_train, resize it to
# im_size x im_size and pair it with the one-hot label of the same row.
#
# The lists are rebuilt from scratch so that re-running this cell does not
# append the whole dataset a second time.
x_train = []
y_train = []
for i, (f, breed) in enumerate(tqdm(csv_train.values)):
    img_path = '../input/train/{}.jpg'.format(f)
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread does not raise on a missing/unreadable file; it returns
        # None, which would otherwise surface as an obscure cv2.resize error.
        raise FileNotFoundError('Cannot read training image: {}'.format(img_path))
    x_train.append(cv2.resize(img, (im_size, im_size)))
    y_train.append(labels[i])

Use an external module to perform data augmentation.
The module can apply the following transformations (checked items are the ones enabled here):
- [ ] Inversion
- [ ] Sobel derivative
- [ ] Scharr derivative
- [ ] Laplacian <!--**(error not used for now)**-->
- [ ] Blur
- [ ] Gaussian blur [disable]
- [ ] Median blur
- [ ] Bilateral blur
- [x] Horizontal flips
- [x] Rotation

In [None]:
# Augment the training set with the external DataAugmentation helper.
# Only horizontal flips and rotation are switched on.
augmentation_options = {'inverse': False,
                        'sobel_derivative': False,
                        'scharr_derivative': False,
                        'laplacian': False,
                        'blur': False,
                        'gaussian_blur': False,
                        'median_blur': False,
                        'bilateral_blur': False,
                        'horizontal_flips': True,
                        'rotation': True,
                        # 'rotation_config': [(10,1.2)],
                        'shuffle_result': False}

# Collect the new samples in separate buffers instead of appending to
# x_train / y_train inside the loop: x_train is the very list handed to the
# augmenter, so growing it while the augmenter is being consumed is unsafe.
augmented_images = []
augmented_labels = []
for i, images in enumerate(tqdm(DataAugmentation(x_train, options=augmentation_options))):
    # NOTE(review): assumes the i-th item yielded by DataAugmentation holds the
    # variants of x_train[i], so that y_train[i] is their label -- confirm
    # against the helper's implementation.
    for image in images:
        if i == 4:
            # Display the variants of one sample as a visual sanity check
            plt.imshow(image, cmap = 'gray', interpolation = 'bicubic')
            plt.show()
        augmented_images.append(image)
        augmented_labels.append(y_train[i])

x_train.extend(augmented_images)
y_train.extend(augmented_labels)

print('dataset became:', len(x_train))

In [None]:
# Spot-check the training lists: print the label of one random sample
# and display the matching image.
n_check = random.randint(0, len(y_train)-1)
label_bits = ''.join(map(str, y_train[n_check]))
print('label:', label_bits)

plt.imshow(x_train[n_check], cmap = 'gray', interpolation = 'bicubic')
plt.show()

In [None]:
# Load every test image listed in the 'id' column of csv_test and resize it
# to im_size x im_size.
#
# The list is rebuilt from scratch so that re-running this cell does not
# append the whole test set a second time.
x_test = []
for f in tqdm(csv_test['id'].values):
    img_path = '../input/test/{}.jpg'.format(f)
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread does not raise on a missing/unreadable file; it returns
        # None, which would otherwise surface as an obscure cv2.resize error.
        raise FileNotFoundError('Cannot read test image: {}'.format(img_path))
    x_test.append(cv2.resize(img, (im_size, im_size)))

In [None]:
# build np array and normalise them
x_train_raw = np.array(x_train, np.float32) / 255.
y_train_raw = np.array(y_train, np.uint8)
x_test_raw  = np.array(x_test, np.float32) / 255.

In [None]:
# Report the shape of each array built above
for caption, array in (("x_train shape:", x_train_raw),
                       ("y_train shape:", y_train_raw),
                       ("x_test shape:", x_test_raw)):
    print(caption, array.shape)

In [None]:
# Number of classes = width of the label matrix
num_classes = np.shape(y_train_raw)[1]
# Every column of the test/submission table except the first one
classes = csv_test.columns[1:].values

Using the __stratify__ parameter of __train_test_split__, the split should be evenly distributed across classes.

**TODO:** Add cross validation 

In [None]:
# Hold out 20% of the samples for validation, stratified on the labels,
# with a fixed seed so the split is repeatable.
split_options = dict(test_size=0.20,
                     random_state=42,
                     stratify=y_train_raw)
X_train, X_valid, Y_train, Y_valid = train_test_split(x_train_raw, y_train_raw, **split_options)

In [None]:
# Create the base pre-trained model (ImageNet weights, without its classifier top)
base_model = VGG16(weights="imagenet", include_top=False, input_shape=(im_size, im_size, 3))

# First: train only the top layers (which are randomly initialized);
# the pre-trained convolutional base stays frozen.
for layer in base_model.layers:
    layer.trainable = False

# Add a new top: two small max-norm-constrained dense layers with dropout,
# followed by a softmax over the classes.
# The layers are chained with the functional API because `Model` (unlike
# `Sequential`) has no `.add()` method, and the network must end in a
# `num_classes`-wide softmax to match the categorical cross-entropy loss.
x = base_model.output
x = Flatten()(x)
x = Dense(60, kernel_initializer='normal', activation='relu',
          kernel_constraint=keras.constraints.max_norm(3))(x)
x = Dropout(0.2)(x)
x = Dense(30, kernel_initializer='normal', activation='relu',
          kernel_constraint=keras.constraints.max_norm(3))(x)
x = Dropout(0.2)(x)
predictions = Dense(num_classes, activation='softmax')(x)

# This is the model we will train
model = Model(inputs=base_model.input, outputs=predictions)

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Stop training once validation accuracy has not improved for 3 epochs.
# NOTE(review): the monitored key 'val_acc' matches the 'acc'/'val_acc' keys
# read from the training history later in this notebook; newer Keras releases
# name it 'val_accuracy' -- confirm against the installed version.
callbacks_list = [keras.callbacks.EarlyStopping(monitor='val_acc', patience=3, verbose=1)]
model.summary()

In [None]:
# Train on the training split and evaluate on the validation split after
# each epoch; the callbacks defined with the model are applied.
fit_options = dict(epochs=40,
                   batch_size=48,
                   validation_data=(X_valid, Y_valid),
                   callbacks=callbacks_list,
                   verbose=1)
history = model.fit(X_train, Y_train, **fit_options)

In [None]:
# List every series recorded during training
print(history.history.keys())

# Draw one figure per metric: training curve first, validation curve second.
# Each entry is (training key, validation key, figure title, y-axis label).
history_figures = (('acc', 'val_acc', 'model accuracy', 'accuracy'),
                   ('loss', 'val_loss', 'model loss', 'loss'))
for train_key, valid_key, figure_title, y_label in history_figures:
    plt.plot(history.history[train_key])
    plt.plot(history.history[valid_key])
    plt.title(figure_title)
    plt.ylabel(y_label)
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()

In [None]:
# Run the model over the whole normalised test array; the result is kept in `preds`
preds = model.predict(x_test_raw, verbose=1)

In [None]:
# check predict: display one random test image and the model's top guess for it
n_check = random.randint(0, len(x_test_raw)-1)
plt.imshow(x_test_raw[n_check], cmap = 'gray_r', interpolation = 'bicubic')
plt.show()

# Predict on a batch that contains only the selected image
pre = model.predict(np.array([x_test_raw[n_check]]))
arg_max = np.argmax(pre)

# arg_max is a class index, i.e. a column of the one-hot table the labels were
# built from. Indexing `labels` with it would return the one-hot vector of an
# unrelated training row, so look the class name up in the one-hot columns.
print(np.max(pre), arg_max, one_hot.columns[arg_max])