In [None]:
import os
import pandas as pd
import random
import numpy as np

from keras.preprocessing.image import ImageDataGenerator
from keras.layers import GlobalAveragePooling2D, Dense, BatchNormalization, Dropout
from keras.optimizers import Adam, SGD, RMSprop
from keras.models import Model, Input
from keras.applications import xception
from keras.callbacks import TensorBoard, ModelCheckpoint, Callback
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Root directory of the dataset; every other path below is derived from it.
base_folder = '/mlsteam/input'

# Image directories: `train` holds the labelled images, `test` the ones to predict.
train_folder, test_folder = (
    os.path.join(base_folder, split_name) for split_name in ('train', 'test')
)

# CSV file that is loaded into `train_label` (columns `id` and `breed` are used).
label_file = os.path.join(base_folder, 'labels.csv')

In [None]:
# Load the label table; the `id` and `breed` columns are used below.
train_label = pd.read_csv(label_file)
NUM_CLASSES = 20

# Seed the stdlib RNG so the shuffled train/validation split is reproducible.
random.seed(NUM_CLASSES)

# Breeds ordered from most to least frequent (computed once, reused twice).
breed_counts = train_label.groupby('breed').count().sort_values(by='id', ascending=False)
total_breed = list(breed_counts.index)

# Only the NUM_CLASSES most frequent breeds take part in training.
top_num_breed = list(breed_counts.head(NUM_CLASSES).index)

# Fraction of each breed's images that goes to the training set.
ratio = 0.8
print('{:<20} {:>10} {:>10} {:>10}'.format('Breed', 'Total', 'Train', 'Valid'))
print('#'*60)

# Collect the per-breed slices and concatenate once at the end: growing a
# DataFrame inside the loop is quadratic, and DataFrame.append no longer
# exists in pandas >= 2.0.
train_parts = []
valid_parts = []
for breed in top_num_breed:
    breed_rows = train_label.loc[train_label['breed'] == breed].reset_index(drop=True)
    # Use the configured ratio instead of a second hard-coded 0.8.
    train_num = int(len(breed_rows) * ratio)
    print('{:<20} {:10} {:10} {:10}'.format(breed, len(breed_rows), train_num, len(breed_rows) - train_num))

    # Shuffle row positions so the split is random within each breed
    # (stratified split: every breed keeps the same train/valid proportion).
    shuffled_positions = list(range(len(breed_rows)))
    random.shuffle(shuffled_positions)

    train_parts.append(breed_rows.iloc[shuffled_positions[:train_num]])
    valid_parts.append(breed_rows.iloc[shuffled_positions[train_num:]])

train_df = pd.concat(train_parts, ignore_index=True)
valid_df = pd.concat(valid_parts, ignore_index=True)

# The `id` column is later used as the image file name, so add the extension
# (vectorised instead of a row-by-row iterrows loop).
train_df['id'] = train_df['id'] + '.jpg'
valid_df['id'] = valid_df['id'] + '.jpg'
    

In [None]:

#print(train_df)
#print(valid_df)

In [None]:

# Augmentation settings applied on the fly to every training image.
# NOTE(review): Xception's ImageNet weights are usually paired with
# xception.preprocess_input rather than a plain 1/255 rescale — confirm this
# scaling is deliberate (the validation and test generators use the same rescale).
augmentation_options = dict(
    # samplewise_center=True,
    # samplewise_std_normalization=True,
    rescale=1./255,
    rotation_range=45,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.25,
    horizontal_flip=True,
    fill_mode='nearest',
)

train_datagen = ImageDataGenerator(**augmentation_options)

In [None]:
# Training batches: file names come from the `id` column of train_df (resolved
# against train_folder), labels from `breed`; images are resized to 299x299 and
# delivered shuffled in batches of 32.
train_flow_options = {
    "dataframe": train_df,
    "directory": train_folder,
    "x_col": "id",
    "y_col": "breed",
    "class_mode": "categorical",
    "target_size": (299, 299),
    "batch_size": 32,
    "shuffle": True,
}
train_generator = train_datagen.flow_from_dataframe(**train_flow_options)

In [None]:
# Validation images are only rescaled (no augmentation) and are read from the
# same folder as the training images, since valid_df was split off the same labels.
valid_datagen = ImageDataGenerator(rescale=1./255)

# shuffle=False keeps the batch order fixed, so predictions made from this
# generator line up with valid_generator.labels.
valid_generator = valid_datagen.flow_from_dataframe(
    dataframe=valid_df,
    directory=train_folder,
    x_col="id",
    y_col="breed",
    target_size=(299, 299),
    class_mode="categorical",
    batch_size=16,
    shuffle=False,
)

In [None]:
### MODEL - BOTTLENECK FEATURES - OPTIMIZER

# Pre-trained Xception backbone (ImageNet weights) without its classifier top,
# used as the feature extractor for transfer learning.
base_model = xception.Xception(include_top=False, weights='imagenet')

# New classification head, listed in the order it is applied to the backbone
# output: normalise, pool spatially, then a regularised fully-connected layer.
head_layers = [
    BatchNormalization(),
    GlobalAveragePooling2D(),
    Dropout(0.5),
    Dense(1024, activation='relu'),
    Dropout(0.5),
]
x = base_model.output
for head_layer in head_layers:
    x = head_layer(x)

# Softmax output with one unit per breed (NUM_CLASSES).
predictions = Dense(NUM_CLASSES, activation='softmax')(x)

# The full model that will be trained: backbone input -> new head output.
model = Model(inputs=base_model.input, outputs=predictions)

# Freeze every Xception layer so only the freshly initialised head is trained.
for layer in base_model.layers:
    layer.trainable = False

# Compile only after the trainable flags are set so that they take effect.
optimizer = RMSprop(lr=0.001, rho=0.9)
model.compile(
    loss='categorical_crossentropy',
    optimizer=optimizer,
    metrics=["accuracy"],
)
model.summary()

In [None]:
class TrainLogger(Callback):
    """Keras callback that prints one line per training batch and one per epoch.

    Batch lines report a fractional epoch (epoch + batch / steps) with the
    training loss and accuracy; epoch-end lines report the validation loss and
    accuracy. Metrics that are missing from `logs` are printed as ``nan``
    instead of raising a formatting error.
    """

    @staticmethod
    def _metric(logs, *names):
        """Return the first non-None value stored in `logs` under `names`, else NaN."""
        for name in names:
            value = (logs or {}).get(name)
            if value is not None:
                return value
        return float('nan')

    def on_epoch_begin(self, epoch, logs=None):
        # Remember the current epoch so batch lines can report fractional progress.
        self.epoch = epoch

    def on_train_batch_end(self, batch, logs=None):
        # `steps` may be absent or None; fall back to the plain epoch number
        # rather than dividing by None.
        steps = (getattr(self, 'params', None) or {}).get('steps')
        progress = self.epoch + batch / steps if steps else float(self.epoch)
        print("Train epoch={:.6f} loss={:.6f} acc={:.6f}".format(
            progress,
            self._metric(logs, 'loss'),
            # Older Keras releases report the metric as 'acc'.
            self._metric(logs, 'accuracy', 'acc')))

    def on_epoch_end(self, epoch, logs=None):
        # Validation metrics are only present when validation data was supplied.
        print("Validation epoch={:.6f} loss={:.6f} acc={:.6f}".format(
            epoch + 1.0,
            self._metric(logs, 'val_loss'),
            self._metric(logs, 'val_accuracy', 'val_acc')))
        
# TensorBoard event files are written under ./tb.
tb_callBack = TensorBoard(
    log_dir='./tb',
    histogram_freq=0,
    write_graph=True,
    write_images=True,
)

# Checkpoint to ./checkpoints, keeping only the best model by the monitored value.
# NOTE(review): this monitors the training 'loss' although validation data is
# passed to fit_generator — confirm that 'val_loss' is not what was intended.
model_checkpoint = ModelCheckpoint(
    filepath='./checkpoints',
    monitor='loss',
    verbose=0,
    save_best_only=True,
)

training_callbacks = [tb_callBack, model_checkpoint, TrainLogger()]

# Whole batches per epoch; a trailing partial batch is dropped by the floor division.
steps_per_epoch = train_generator.n // train_generator.batch_size

# verbose=0 silences the built-in progress bar; TrainLogger prints progress instead.
model.fit_generator(
    train_generator,
    steps_per_epoch=steps_per_epoch,
    epochs=10,
    validation_data=valid_generator,
    callbacks=training_callbacks,
    verbose=0,
)

In [None]:
# Class probabilities for every validation image; valid_generator was built
# with shuffle=False, so the rows follow the generator's own order.
valid_pred = model.predict_generator(
    generator=valid_generator,
    verbose=1,
)

In [None]:
# Mapping between class indices and breed names, taken from the training generator.
class_indices = train_generator.class_indices
breed_mapping = {v: k for k, v in class_indices.items()}
# Breed names ordered by class index, so row/column i of the matrix is class i
# regardless of the dictionary's iteration order.
breed_list = sorted(class_indices, key=class_indices.get)

# Passing `labels` explicitly keeps the matrix at (num_classes x num_classes)
# even when a class never occurs in the true or predicted labels; otherwise the
# DataFrame construction below fails with a shape mismatch.
cnf_matrix = confusion_matrix(
    valid_generator.labels,
    np.argmax(valid_pred, axis=1),
    labels=list(range(len(breed_list))),
)
df_cm = pd.DataFrame(cnf_matrix, index=breed_list, columns=breed_list)

fig = plt.figure(figsize=(10, 7))
try:
    heatmap = sns.heatmap(df_cm, annot=True, fmt="d")
except ValueError:
    raise ValueError("Confusion matrix values must be integers.")

heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=10)
heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right', fontsize=10)
plt.title('Confusion Matrix')
plt.ylabel('True label')
plt.xlabel('Predicted label')

# Save BEFORE showing: once plt.show() has run, the figure may already be
# closed, and a later savefig would write an empty image.
fig.savefig('confusion_matrix.png')

plt.show()

In [None]:
def get_imgs(path):
    """Recursively collect the paths of all non-directory entries under `path`.

    Entries of each directory are visited in name order, so the returned list
    is deterministic across runs and file systems (plain os.scandir order is
    arbitrary, which made any slice of the result non-reproducible).

    Args:
        path: Directory to scan.

    Returns:
        List of entry paths; a sub-directory's contents are inserted at the
        position of that sub-directory in the sorted listing.
    """
    imgs = []
    # sorted() exhausts the scandir iterator, which also releases it.
    for entry in sorted(os.scandir(path), key=lambda e: e.name):
        if entry.is_dir():
            imgs.extend(get_imgs(entry.path))
        else:
            imgs.append(entry.path)
    return imgs

# Every file found below the test folder.
test_imgs = get_imgs(test_folder)

# Only the first 64 collected paths are used for the demo predictions.
test_df = pd.DataFrame({"x": test_imgs[0:64]})

# Same rescaling as for validation; no augmentation.
test_datagen = ImageDataGenerator(rescale=1./255)

# class_mode=None yields image batches without labels; shuffle=False keeps the
# batch order aligned with the rows of test_df.
test_generator = test_datagen.flow_from_dataframe(
    test_df,
    x_col='x',
    target_size=(299, 299),
    class_mode=None,
    batch_size=32,
    shuffle=False,
)

In [None]:
# Class probabilities for the test images, in the order the unshuffled
# test_generator yields them.
pred = model.predict_generator(
    generator=test_generator,
    verbose=1,
)


In [None]:
# Rewind the iterator and pull its first batch again, so the images line up
# with the first rows of `pred`.
test_generator.reset()
first_batch = next(test_generator)
# With class_mode=None the batch holds only the images (no labels).
first_batch_imgs = first_batch
first_batch_pred = pred[0:len(first_batch_imgs)]

def get_max_index(array):
    """Return the index of the largest element of `array`.

    The comparison is made against the best element seen so far rather than a
    fixed starting threshold of 0, so sequences whose values are all zero or
    negative are handled correctly. Ties resolve to the first occurrence.

    Args:
        array: Indexable sequence of comparable values (list, 1-D array, ...).

    Returns:
        Index of the first maximum; 0 for an empty sequence.
    """
    max_index = 0
    for i in range(1, len(array)):
        if array[i] > array[max_index]:
            max_index = i
    return max_index

# Mapping from class index back to breed name.
breed_mapping = {v: k for k, v in train_generator.class_indices.items()}

# Plot a grid of test images, each titled with its predicted breed.
fig = plt.figure(figsize=(16, 16))
columns = 4
rows = 5

# Never ask for more cells than there are images in the batch; otherwise a
# short batch raises IndexError.
num_plots = min(columns * rows, len(first_batch_imgs))

for i in range(1, num_plots + 1):
    fig.add_subplot(rows, columns, i)
    # Hide axis ticks and labels: the cells show pictures, not data axes.
    plt.tick_params(
        bottom=False,
        left=False,
        labelbottom=False,
        labelleft=False
    )
    plt.title(breed_mapping[get_max_index(first_batch_pred[i-1])])
    plt.imshow(first_batch_imgs[i-1])

# Lay the grid out once, after all axes and titles exist, instead of
# recomputing the layout on every iteration.
plt.tight_layout(pad=2, h_pad=0.2, w_pad=0.2)

# Save BEFORE showing: once plt.show() has run, the figure may already be
# closed, and a later savefig would write an empty image.
fig.savefig('prediction_20.png')

plt.show()