In [1]:
import os
import operator
import numpy as np
from glob import glob
from shutil import copyfile, move
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
%pwd

'/home/patrick/Dev/Kaggle/ncfm'

In [3]:
%cd $HOME_DIR

[Errno 2] No such file or directory: '$HOME_DIR'
/home/patrick/Dev/Kaggle/ncfm


In [4]:
MODEL_NAME = "no_name"

In [5]:
USE_SAMPLE = False

In [6]:
current_dir = os.getcwd()
HOME_DIR = current_dir

if USE_SAMPLE:
    DATA_DIR = current_dir + "/data/sample"
else:
    DATA_DIR = current_dir + "/data"

In [7]:
%cd $DATA_DIR/train
for d in glob('*'):
    f = glob(d + '/*jpg')
    print(d.split('/')[0], len(f))
    
print('\nSum:', len(glob('*/*.jpg')))

/home/patrick/Dev/Kaggle/ncfm/data/train
SHARK 141
NoF 372
YFT 588
DOL 94
ALB 1376
BET 160
LAG 54
OTHER 240

Sum: 3025


In [8]:
%cd $DATA_DIR/valid
for d in glob('*'):
    f = glob(d + '/*jpg')
    print(d.split('/')[0], len(f))
    
print('\nSum:', len(glob('*/*.jpg')))

/home/patrick/Dev/Kaggle/ncfm/data/valid
SHARK 35
NoF 93
YFT 146
DOL 23
ALB 343
BET 40
LAG 13
OTHER 59

Sum: 752


# Model

In [68]:
from keras.utils.np_utils import to_categorical
from sklearn.metrics import log_loss

In [69]:
from keras.applications.vgg16 import VGG16, preprocess_input
from keras.layers import Dense, Flatten, Dropout, GlobalAveragePooling2D, Input, AveragePooling2D
from keras.models import Model
from keras.preprocessing.image import ImageDataGenerator

In [70]:
SIZE = (320,320)

input_tensor = Input(shape=(*SIZE, 3))

base_model = VGG16(include_top=False, input_tensor=input_tensor)

for layer in base_model.layers:
    layer.trainable = False

x = base_model.output
x = GlobalAveragePooling2D()(x)
#x = Flatten()(x)
#x = Dense(512, activation='relu')(x)
x = Dense(512, activation='relu')(x)
predictions = Dense(8, activation='softmax')(x)

model = Model(input=base_model.input, output=predictions)

model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_2 (InputLayer)             (None, 320, 320, 3)   0                                            
____________________________________________________________________________________________________
block1_conv1 (Convolution2D)     (None, 320, 320, 64)  1792        input_2[0][0]                    
____________________________________________________________________________________________________
block1_conv2 (Convolution2D)     (None, 320, 320, 64)  36928       block1_conv1[0][0]               
____________________________________________________________________________________________________
block1_pool (MaxPooling2D)       (None, 160, 160, 64)  0           block1_conv2[0][0]               
___________________________________________________________________________________________

In [101]:
import keras.backend as K


def multiclass_log_loss(y_true, y_pred, eps=1e-15):
    y_pred = K.clip(y_pred, eps, 1 - eps)
    y_pred /= K.sum(y_pred, axis=1)[:, np.newaxis]
    loss = -K.sum((y_true * K.log(y_pred)), axis=1)
    return {'log loss': K.mean(loss)}

In [102]:
model.compile(optimizer='adam', loss='categorical_crossentropy',
              metrics=['categorical_accuracy', 'fbeta_score', multiclass_log_loss])

In [103]:
N_EPOCH = 1
BATCH_SIZE = 32

idg = ImageDataGenerator(preprocessing_function=lambda frame: preprocess_input(np.expand_dims(frame, 0)))

train_flow = idg.flow_from_directory(DATA_DIR + '/train', shuffle=True, batch_size=BATCH_SIZE, target_size=SIZE)
valid_flow = idg.flow_from_directory(DATA_DIR + '/valid', shuffle=True, batch_size=BATCH_SIZE, target_size=SIZE)
test_flow = idg.flow_from_directory(DATA_DIR + '/test', shuffle=False, batch_size=BATCH_SIZE, target_size=SIZE)

Found 3025 images belonging to 8 classes.
Found 752 images belonging to 8 classes.
Found 1000 images belonging to 1 classes.


In [104]:
model.fit_generator(train_flow, train_flow.nb_sample, nb_epoch=1,
                    validation_data=valid_flow, nb_val_samples=valid_flow.nb_sample)

Epoch 1/1


<keras.callbacks.History at 0x7f8a50822668>

In [105]:
model.optimizer.lr=0.1
model.fit_generator(train_flow, train_flow.nb_sample, nb_epoch=4,
                    validation_data=valid_flow, nb_val_samples=valid_flow.nb_sample)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f8a507548d0>

In [110]:
model.optimizer.lr=0.01
model.fit_generator(train_flow, train_flow.nb_sample, nb_epoch=4,
                    validation_data=valid_flow, nb_val_samples=valid_flow.nb_sample)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f8a505e78d0>

In [111]:
x_valid = []
y_valid = [] 
for i in range(valid_flow.nb_sample // BATCH_SIZE):
    x, y = next(valid_flow)
    x_valid.append(x)
    y_valid.append(y)

x_valid = np.concatenate(x_valid, axis=0)
y_valid = np.concatenate(y_valid, axis=0)

In [112]:
pred_valid = model.predict(x_valid)

In [113]:
log_loss(y_valid, pred_valid)

0.32858472797930949

In [114]:
pred = model.predict_generator(test_flow, test_flow.nb_sample)
pred_clip = np.copy(pred)
pred_clip = pred_clip.clip(min=0.02, max=0.98)

In [115]:
files = [f.split('/')[1] for f in test_flow.filenames]
columns = list(zip(*sorted(train_flow.class_indices.items(), key=operator.itemgetter(1))))[0]
submission = pd.DataFrame(data=pred_clip, index=files, columns=columns)
submission.index.name = 'image'

submission.head()

Unnamed: 0_level_0,ALB,BET,DOL,LAG,NoF,OTHER,SHARK,YFT
image,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
img_04814.jpg,0.02,0.02,0.98,0.02,0.02,0.02,0.02,0.02
img_03896.jpg,0.98,0.02,0.02,0.02,0.02,0.02,0.02,0.02
img_00030.jpg,0.881836,0.02,0.02,0.02,0.111037,0.02,0.02,0.02
img_03355.jpg,0.284898,0.02,0.02,0.02,0.02,0.02,0.02,0.688395
img_03853.jpg,0.98,0.02,0.02,0.02,0.02,0.02,0.02,0.02


In [25]:
submission.to_csv(DATA_DIR + '/result/sub_%s.csv' % MODEL_NAME)