In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed.
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this cell (by clicking run or pressing Shift+Enter) lists the entries of the input directory.

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# Name of the competition dataset (not referenced by any other cell shown in this notebook).
HW = 'humpback-whale-identification'
# TRAIN = '../input/humpback-whale-identification/train/'
# Cropped training images: path relative to ../input/, and the full path.
TRAIN_CROPPED = "whales-cropped/cropped_train/cropped_train/"
TRAIN_CROPPED_IN = '../input/' + TRAIN_CROPPED

# TEST = '../input/humpback-whale-identification/test/'
# Cropped test images: path relative to ../input/, and the full path.
TEST_CROPPED = "whales-cropped/cropped_test/cropped_test/"
TEST_CROPPED_IN = '../input/' + TEST_CROPPED

# Training labels and the sample submission file of the competition dataset.
LABELS = '../input/humpback-whale-identification/train.csv'
SAMPLE_SUB = '../input/humpback-whale-identification/sample_submission.csv'

# Load the label table; at this point it still contains the 'new_whale' rows.
train = pd.read_csv(LABELS)
print("With new_whale:")
train.head()

In [None]:
# File names used when the trained model and its weights are saved to the working directory.
MODEL_F = 'Model_Xception_flow.h5'
WEIGHTS_F = 'Weights_Xception_flow.h5'
# Locations of a previously saved model inside ../input/; in this notebook they are only
# referenced by the commented-out loading cell.
MODEL = '../input/Xception-pretrained/'+ MODEL_F
WEIGHTS = '../input/Xception-pretrained/'+ WEIGHTS_F

In [None]:
# Summary of the unfiltered label table (the 'new_whale' rows are still included here).
train.describe()

In [None]:
import random 
from IPython.display import Image
# NOTE(review): a later import cell rebinds the name `Image` to PIL's Image, and re-running
# this cell rebinds it back to IPython's class — the name depends on execution order.
print("Example whale image")

# Display one randomly chosen training image (no seed is set, so the choice varies per run).
name = random.choice(train['Image'])
print(name)
Image(filename = TRAIN_CROPPED_IN + name)

In [None]:
# Split the label table: rows labelled 'new_whale' are kept aside, all others are used for training.
train_images = train.set_index('Image')
new_whale_train = train_images[train_images.Id == "new_whale"]  # only the new_whale rows

# Take an explicit copy of the filtered rows: the `Id` column of `whales_train` is rewritten
# further down, and writing into a filtered selection of `train` triggers pandas'
# SettingWithCopyWarning (and leaves it ambiguous whether `train` is affected).
criteria = train['Id'] != 'new_whale'
whales_train = train[criteria].copy()

print("Without new_whale:")
whales_train.head()

In [None]:
# Sorted array of the distinct whale ids left after the 'new_whale' rows were filtered out.
unique_labels = np.unique(whales_train.Id.values)

In [None]:
# Summary of the training table without the 'new_whale' rows.
whales_train.describe()

In [None]:
import matplotlib.pyplot as plt
import matplotlib.image as mplimg
from matplotlib.pyplot import imshow

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from keras import layers
from keras.preprocessing import image
from keras.preprocessing.image import ImageDataGenerator

# from keras.applications.imagenet_utils import preprocess_input
from keras.applications.xception import Xception, preprocess_input

from keras.losses import binary_crossentropy

from keras.layers import Input, Dense, Activation, BatchNormalization, Flatten, Conv2D, GlobalAveragePooling2D
from keras.layers import AveragePooling2D, MaxPooling2D, Dropout
from keras.models import Model

import keras.backend as K
from keras.models import Sequential
from PIL import Image
import gc
import warnings
warnings.simplefilter("ignore", category=DeprecationWarning)

%matplotlib inline

In [None]:
# Size every image is resized to; IMAGE_SHAPE appends the 3 colour channels and is the
# input shape given to the network.
IMAGE_HEIGHT = 128
IMAGE_WIDTH = 128
IMAGE_SHAPE = (IMAGE_HEIGHT, IMAGE_WIDTH, 3)

def prepareImages(data, m, dataset):
    """Load, resize and preprocess the images listed in ``data['Image']``.

    Args:
        data: table with an 'Image' column of file names.
        m: number of rows to allocate in the output array; must be at least
            the number of file names in ``data['Image']``.
        dataset: directory of the images, relative to ``../input/``.

    Returns:
        Array of shape ``(m, IMAGE_HEIGHT, IMAGE_WIDTH, 3)`` holding, for each
        file in order, the RGB image after ``preprocess_input``.
    """
    print("Preparing images")
    X_train = np.zeros((m, IMAGE_HEIGHT, IMAGE_WIDTH, 3))

    for count, fig in enumerate(data['Image']):
        filepath = "../input/"+dataset+"/"+fig
        img = image.load_img(filepath)
        img = img.convert(mode="RGB")

        # PIL's resize expects (width, height); the resulting array is then
        # (IMAGE_HEIGHT, IMAGE_WIDTH, 3), matching the rows of X_train even
        # when the two sizes differ.
        img = img.resize((IMAGE_WIDTH, IMAGE_HEIGHT))
        x = image.img_to_array(img)
        x = preprocess_input(x)

        X_train[count] = x
        # Progress message every 500 images.
        if count % 500 == 0:
            print("Processing image: ", count+1, ", ", fig)

    return X_train

In [None]:
def remove_new_whale(labels=None):
    """Build the lookup tables between whale ids and integer class indices.

    Despite its name this function does not filter anything: it only indexes
    the labels it is given, in their existing order.

    Args:
        labels: sequence of distinct class names. Defaults to the
            notebook-level ``unique_labels`` when omitted, which keeps the
            original zero-argument call working.

    Returns:
        ``(labels_list, labels_dict)`` where ``labels_list`` is a numpy array
        of the class names and ``labels_dict`` maps each name to its position
        in that array.
    """
    if labels is None:
        labels = unique_labels

    labels_list = list(labels)
    labels_dict = {label: index for index, label in enumerate(labels_list)}

    print("Number of classes: {}".format(len(labels_list)))

    print(np.shape(labels_list))
    return np.array(labels_list), labels_dict

In [None]:
# Index the distinct whale ids: labels_list[i] is the id of class i, labels_dict is the inverse.
labels_list, labels_dict = remove_new_whale()

In [None]:
# Replace every whale id string by its integer class index.
# The result is bound to a new frame instead of assigning to `whales_train.Id` in place:
# `whales_train` was obtained by filtering `train`, and in-place assignment on such a
# selection raises pandas' SettingWithCopyWarning.
whales_train = whales_train.assign(Id=whales_train['Id'].apply(lambda x: labels_dict[x]))

In [None]:
# Show the first rows after the ids were replaced by class indices.
print(whales_train.head())

In [None]:
def prepare_labels(y):
    """One-hot encode a sequence of class labels.

    Args:
        y: array-like of labels, one per sample.

    Returns:
        ``(onehot, label_encoder)``: ``onehot`` is a dense float array of
        shape ``(n_samples, n_classes)`` with a single 1 per row, and
        ``label_encoder`` is the fitted ``LabelEncoder`` whose ``classes_``
        give the meaning of each column.
    """
    values = np.array(y)
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(values)

    # Build the dense one-hot matrix directly with numpy. The encoded values
    # are 0..n_classes-1, so column j stands for encoded class j. This avoids
    # OneHotEncoder's `sparse=` keyword, which newer scikit-learn releases
    # renamed to `sparse_output`.
    num_samples = len(integer_encoded)
    num_classes = len(label_encoder.classes_)
    onehot_encoded = np.zeros((num_samples, num_classes))
    onehot_encoded[np.arange(num_samples), integer_encoded] = 1.0

    print(onehot_encoded.shape)
    return onehot_encoded, label_encoder

In [None]:
# One-hot training targets, one row per image in whales_train.
y, label_encoder = prepare_labels(whales_train['Id'])
y.shape

In [None]:
%matplotlib inline
# Load every training image into a single array (one row per entry of whales_train).
X = prepareImages(whales_train, whales_train.shape[0], TRAIN_CROPPED)
# NOTE(review): prepareImages already applies preprocess_input to each image, so this
# division is a second scaling step. The test generator further down is configured with both
# preprocess_input and rescale=1./255, presumably to match — confirm this is intended.
X /= 255

In [None]:
# Size the output layer from the one-hot target matrix instead of a hard-coded count, so
# that the Dense layer always matches the labels the model is trained on.
CLASSES = y.shape[1]
EPOCHS = 30
BATCH_SIZE = 100

# setup model: ImageNet-pretrained Xception backbone without its classification head
base_model = Xception(weights='imagenet', include_top=False, input_shape = IMAGE_SHAPE)

# New head: global average pooling, dropout, then one softmax unit per whale id.
x = base_model.output
x = GlobalAveragePooling2D(name='avg_pool')(x)
x = Dropout(0.4)(x)
predictions = Dense(CLASSES, activation='softmax')(x)
model = Model(inputs=base_model.input, outputs=predictions)

# Fine-tuning: every backbone layer is left trainable (nothing is frozen).
for layer in base_model.layers:
    layer.trainable = True

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

model.summary()

In [None]:
# Report the shapes of the image array and of the label column before training.
print("Train set shape: "+ str(np.shape(X)))
print(np.shape(whales_train['Id']))
# Train on the whole array; no validation data is passed to fit().
history = model.fit(X, y, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=1)

In [None]:
# validate
# NOTE(review): X[1000:6000] / y[1000:6000] are slices of the very arrays the model was
# fitted on, so this score measures fit to training data, not performance on unseen images.
val_set_x = X[1000:6000]
val_set_y = y[1000:6000]
scores = model.evaluate(val_set_x, val_set_y, verbose=1)
# scores[1] is the metric at index 1 of model.metrics_names, printed as a percentage.
print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

In [None]:
# Write the model, and separately its weights, to the working directory.
# NOTE(review): model.save() presumably stores more than the architecture — confirm and
# adjust the message if so.
model.save(MODEL_F)
print("Saved model architecture to disk")
model.save_weights(WEIGHTS_F)
print("Saved model weights to disk")

In [None]:
# Run a garbage-collection pass before continuing.
gc.collect()

In [None]:
# from keras.models import load_model

# # returns a compiled model
# # identical to the previous cell
# model = load_model(MODEL)
# print("Loaded model architecture from disk")

# model.load_weights(WEIGHTS)
# print("Loaded model weights from disk")
# model.summary()

# gc.collect()

# Plot training results <a name="plot-training-results"></a>
____

In [None]:
# Training accuracy per epoch. Depending on the Keras version the metric is recorded under
# 'acc' or 'accuracy', so accept either key instead of failing with a KeyError.
acc = history.history['acc'] if 'acc' in history.history else history.history['accuracy']
# val_acc = history.history['val_acc']

l1 = plt.plot(acc, label='acc')
# l2 = plt.plot(val_acc, label='val_acc')
plt.legend(loc=2, fontsize="small")
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.show()

In [None]:
# Training loss per epoch (no validation curve, since fit() was run without validation data).
loss = history.history['loss']
# val_loss = history.history['val_loss']

l1 = plt.plot(loss, label='loss')
# plt.plot(val_loss, label='val_loss')
plt.legend(loc=2, fontsize="small")
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.show()

In [None]:
# File names of the cropped test images (os.listdir returns them in arbitrary order).
test = os.listdir(TEST_CROPPED_IN)
print("Test set length: "+str(len(test)))

In [None]:
# Wrap the test file names in a DataFrame; 'Id' is an empty placeholder column.
col = ['Image']
test_df = pd.DataFrame(test, columns=col)
test_df['Id'] = ''

In [None]:
# NOTE(review): this rebinds X (until now the training array) to the test images. The only
# consumer visible in this notebook is the commented-out model.predict call further down;
# the active prediction path reads the images through test_generator instead.
X = prepareImages(test_df, test_df.shape[0], TEST_CROPPED)
# X /= 255

# Test set prediction using generator and flow_from_dataframe <a name="flow"></a>
____

In [None]:
# Test-time generator: the same preprocess_input and 1/255 rescale as used for training.
test_datagen = ImageDataGenerator(
    preprocessing_function=preprocess_input,
    rescale=1./255,
    fill_mode='nearest')

# shuffle=False and batch_size=1 keep one prediction per file, in the order of test_df.
test_generator = test_datagen.flow_from_dataframe(
    dataframe=test_df, 
    directory=TEST_CROPPED_IN, 
    x_col="Image", 
    y_col=None,
    class_mode=None,
    shuffle=False,
    color_mode= "rgb",
    target_size=(IMAGE_HEIGHT, IMAGE_WIDTH), 
    batch_size=1)

# We need to call .reset() here, otherwise the order of the predictions
# can differ from the expected one.
test_generator.reset()
# Take the number of steps from the generator itself rather than a hard-coded count, so that
# exactly one pass over the test files is made whatever the size of the test set.
predictions = model.predict_generator(test_generator, verbose=1, steps=len(test_generator))

print("Predictions shape:")
print(np.shape(predictions))

# Test set predictions <a name="test-set-predictions"></a>
____

In [None]:
# predictions = model.predict(np.array(X), verbose=1)
# print(np.shape(predictions))


In [None]:
# Most likely class per test image; the indices refer to the columns of the model output.
predicted_class_indices = np.argmax(predictions, axis=1)

# Persist the raw outputs so the submission can be rebuilt without re-running the model.
np.save("predictions.npy", predictions)
np.save("predicted_class_indices.npy", predicted_class_indices)
np.save('test_filenames_generator.npy', test_generator.filenames)
# The generator was created with class_mode=None; such an iterator may not define
# `class_indices` in every Keras version, so fall back to an empty mapping rather than
# aborting the cell with an AttributeError.
np.save('test_class_indices.npy', getattr(test_generator, 'class_indices', {}))

print('predicted class indices:')
print(predicted_class_indices)

In [None]:
# Prepend 'new_whale' so that label index 0 lines up with the extra column that
# add_new_whale_to_predictions inserts in front of the model outputs.
print(labels_list[:7])
labels_with_new_whale = np.concatenate((['new_whale'], labels_list), axis=0)    
print(labels_with_new_whale[:7])

In [None]:
def add_new_whale_to_predictions(preds, threshold=None):
    """Prepend a constant 'new_whale' score column to a prediction matrix.

    Args:
        preds: 2-D array of shape ``(n_samples, n_classes)`` with one score
            per class.
        threshold: score given to 'new_whale' for every sample. When omitted,
            the average over all samples of each row's highest score is used
            (and printed).

    Returns:
        Float array of shape ``(n_samples, n_classes + 1)`` whose column 0 is
        the constant threshold and whose remaining columns are ``preds``.
    """
    preds = np.asarray(preds)

    if threshold is None:
        # Only the per-row maximum is needed, so take it directly instead of
        # sorting every row.
        threshold = np.average(preds.max(axis=1))
        print("Average of max probabilities column:" + str(threshold))

    # Constant column, placed first so that it corresponds to label index 0.
    column_to_add = np.full((preds.shape[0], 1), threshold, dtype=float)
    predictions_w_new_whale = np.concatenate([column_to_add, preds], axis=1)
    return predictions_w_new_whale

In [None]:
def create_results_csv(preds, labels_with_new_whale, test_file_names, output_filename):
    """Write the top-5 predicted labels per test image as a submission CSV.

    Rows follow the image order of the sample submission file (SAMPLE_SUB);
    each image's prediction is looked up by file name.

    Args:
        preds: per-image score rows, one score per entry of
            ``labels_with_new_whale``.
        labels_with_new_whale: label for each score column.
        test_file_names: file names in the same order as the rows of ``preds``.
        output_filename: path of the CSV to write.

    Returns:
        The DataFrame that was written, with columns 'Image' and 'Id'.
    """
    sample_df = pd.read_csv(SAMPLE_SUB)
    sample_images = list(sample_df.Image)

    print('Test generator file names:')
    print(test_file_names[:7])

    # For every row keep the labels of the five highest scores, best first.
    pred_list = []
    for scores in preds:
        best_five = scores.argsort()[-5:][::-1]
        pred_list.append([labels_with_new_whale[idx] for idx in best_five])

    # File name -> top-5 labels, then re-ordered to the sample submission order.
    pred_dic = dict(zip(test_file_names, pred_list))
    pred_list_for_test = []
    for image_name in sample_images:
        pred_list_for_test.append(' '.join(pred_dic[image_name]))

    print('predictions list shape')
    print(np.shape(pred_list))

    df = pd.DataFrame({'Image': sample_images, 'Id': pred_list_for_test})
    df.to_csv(output_filename, header=True, index=False)
    return df

In [None]:
# Add the 'new_whale' score column, then write the submission file.
p = add_new_whale_to_predictions(predictions)
# NOTE: test_df is rebound here to the submission DataFrame returned by create_results_csv.
test_df = create_results_csv(p, labels_with_new_whale, test_generator.filenames, "submission.csv")
print(test_df[:10])