# Init

In [1]:
from data_preprocess import read_and_normalize_train_data, read_and_normalize_test_data2, save_submission
import numpy as np
import time
from vgg16_run import save_model, read_model

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
img_rows, img_cols = 224, 224
batch_size = 32
nb_epoch = 10

In [3]:
import tensorflow as tf
from keras.utils import plot_model
from keras.applications import VGG16
from keras.models import Model
from keras.layers import Dense, Flatten
from keras.preprocessing.image import ImageDataGenerator
from keras import optimizers, Sequential

In [4]:
def vgg16_model():
    base_model = VGG16(include_top=False, input_shape=(224,224,3))
    vgg16_output = base_model.output
    predictions = Dense(10, activation='softmax')(Flatten()(vgg16_output))
    model = Model(inputs=base_model.input, outputs=predictions)
    
    for layer in base_model.layers:
        layer.trainable = False
    
    rmsp_optimizer = optimizers.RMSprop(lr=0.001)
    # model.compile(optimizer=rmsp_optimizer, loss='categorical_crossentropy')
    model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

    return model

In [5]:
from sklearn.model_selection import KFold

In [6]:
model_name='resnet50'
nfolds = 10
kf = KFold(n_splits=nfolds, shuffle=True, random_state=42)

# Build Vgg16 model

In [8]:
from keras import Sequential
from keras.layers import ZeroPadding2D, Convolution2D, MaxPooling2D, Flatten,Dropout, Dense
import os, h5py
def build_my_vgg16(img_width, img_height, weights_path):
    # build the VGG16 model
    model = Sequential()
    model.add(ZeroPadding2D((1, 1), input_shape=(3, img_width, img_height)))

    model.add(Convolution2D(64, 3, 3, activation='relu', name='conv1_1'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(64, 3, 3, activation='relu', name='conv1_2'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(128, 3, 3, activation='relu', name='conv2_1'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(128, 3, 3, activation='relu', name='conv2_2'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(256, 3, 3, activation='relu', name='conv3_1'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(256, 3, 3, activation='relu', name='conv3_2'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(256, 3, 3, activation='relu', name='conv3_3'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(512, 3, 3, activation='relu', name='conv4_1'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(512, 3, 3, activation='relu', name='conv4_2'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(512, 3, 3, activation='relu', name='conv4_3'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(512, 3, 3, activation='relu', name='conv5_1'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(512, 3, 3, activation='relu', name='conv5_2'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(512, 3, 3, activation='relu', name='conv5_3'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    '''
    # load the weights of the VGG16 networks (trained on ImageNet, won the ILSVRC competition in 2014)
    # note: when there is a complete match between your model definition
    # and your weight savefile, you can simply call model.load_weights(filename)
    '''
    # load the weights for each layer
    assert os.path.exists(weights_path), 'Model weights not found (see "weights_path" variable in script).'
    f = h5py.File(weights_path)
    for k in range(f.attrs['nb_layers']):
        if k >= len(model.layers):
            # we don't look at the last (fully-connected) layers in the savefile
            break
        g = f['layer_{}'.format(k)]
        weights = [g['param_{}'.format(p)] for p in range(g.attrs['nb_params'])]
        # set the weights to layer-k
        model.layers[k].set_weights(weights)
    f.close()
    print('VGG16 model weights have been successfully loaded.')

    # build a MLP classifier model to put on top of the VGG16 model
    top_model = Sequential()
    # flateen the output of VGG16 model to 2D Numpy matrix (n*D)
    top_model.add(Flatten(input_shape=model.output_shape[1:]))
    # hidden layer of 256 neurons
    top_model.add(Dense(256, activation='relu'))
    # add dropout for the dense layer
    top_model.add(Dropout(0.5))
    # the output layer: we have 10 claases
    top_model.add(Dense(10, activation='softmax'))

    # connect the two models onto the VGG16 net
    model.add(top_model)

    # set the first 25 layers (up to the last conv block) of VGFG16 net to non-trainable (weights will not be updated)
    for layer in model.layers[:25]:
        layer.trainable=False

    # compile the model 
    model.compile(loss = 'categorical_crossentropy', optimizer='adadelta', metrics=['accuracy'])
    return model

In [10]:
train_datagen = ImageDataGenerator(rescale=1./255, data_format='channels_first')
train_generator = train_datagen.flow_from_directory('data/train', target_size=(224,224), batch_size=batch_size, class_mode='categorical')
validate_datagen = ImageDataGenerator(rescale=1./255, data_format='channels_first')
validate_generator = train_datagen.flow_from_directory('data/validate', target_size=(224,224), batch_size=batch_size, class_mode='categorical')
model_name = 'vgg16'
model = build_my_vgg16(img_height=img_rows, img_width=img_cols, weights_path='data/vgg16_weights.h5')

model.fit_generator(train_generator, steps_per_epoch=2000, epochs=5, validation_data=validate_generator, validation_steps=800)
save_model(model, 100, model_name)

Found 17943 images belonging to 10 classes.
Found 4481 images belonging to 10 classes.




ValueError: Layer weight shape (3, 3, 3, 64) not compatible with provided weight shape (64, 3, 3, 3)

# Train use ImageDataGenerator

In [7]:
train_datagen = ImageDataGenerator(rescale=1./255)
train_generator = train_datagen.flow_from_directory('data/train', target_size=(224,224), batch_size=batch_size, class_mode='categorical')

Found 17943 images belonging to 10 classes.


In [8]:
validate_datagen = ImageDataGenerator(rescale=1./255)
validate_generator = train_datagen.flow_from_directory('data/validate', target_size=(224,224), batch_size=batch_size, class_mode='categorical')

Found 4481 images belonging to 10 classes.


In [9]:
model_name = 'vgg16'
model = vgg16_model()
print(model)
model.fit_generator(train_generator, steps_per_epoch=2000, epochs=5, validation_data=validate_generator, validation_steps=800)
save_model(model, 100, model_name)

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
<keras.engine.training.Model object at 0x7fc7e88fa160>
Epoch 1/5
Epoch 2/5
Epoch 3/5

KeyboardInterrupt: 

In [14]:
test_datagen = ImageDataGenerator(rescale=1./255)
test_generator = test_datagen.flow_from_directory('data/test', target_size=(224,224), batch_size=batch_size)

Found 79726 images belonging to 1 classes.


['c0/img_10.jpg',
 'c0/img_100.jpg',
 'c0/img_1000.jpg',
 'c0/img_100000.jpg',
 'c0/img_100001.jpg',
 'c0/img_100002.jpg',
 'c0/img_100003.jpg',
 'c0/img_100004.jpg',
 'c0/img_100005.jpg']

In [15]:
test_ids = test_generator.filenames
test_ids = [d.strip('c0/') for d in test_ids]

In [17]:
model = read_model(100, model_name)
test_prediction = model.predict_generator(test_generator, verbose=1)
save_submission(test_ids, np.array(yfull_test))



In [12]:
test_paths = ['data/test/p'+str(i) for i in range(1,10)]

from vgg16_run import merge_several_folds_mean
print('Start testing............')

nfolds = 2
yfull_test = []
test_ids = []
for test_path in test_paths:
    test_data, test_id = read_and_normalize_test_data2(img_rows, img_cols, test_path, 3)
    print(test_data.shape)
    test_ids = test_ids + test_id
    
    test_data = np.transpose(test_data, (0,2,3,1))
    model = read_model(100, model_name)
    test_prediction = model.predict(test_data, batch_size=128, verbose=1)
    yfull_test = yfull_test + test_prediction.tolist()    
    

info_string = 'loss_' + model_name \
    + '_r_' + str(img_rows) \
    + '_c_' + str(img_cols) \
    + '_folds_' + str(100) \
    + '_ep_' + str(nb_epoch)

save_submission(test_ids, np.array(yfull_test))

  0%|          | 36/10381 [00:00<00:29, 352.75it/s]

Start testing............


100%|██████████| 10381/10381 [00:26<00:00, 396.94it/s]


Read test data time: 26.18 seconds
Directory doesn't exists
(10381, 3, 224, 224)


100%|██████████| 8666/8666 [00:21<00:00, 394.40it/s]


Read test data time: 22.0 seconds
Directory doesn't exists
(8666, 3, 224, 224)


100%|██████████| 8614/8614 [00:21<00:00, 404.83it/s]


Read test data time: 21.3 seconds
Directory doesn't exists
(8614, 3, 224, 224)


100%|██████████| 8669/8669 [00:21<00:00, 401.93it/s]


Read test data time: 21.6 seconds
Directory doesn't exists
(8669, 3, 224, 224)


100%|██████████| 8683/8683 [00:21<00:00, 402.34it/s]


Read test data time: 21.61 seconds
Directory doesn't exists
(8683, 3, 224, 224)


100%|██████████| 8676/8676 [00:21<00:00, 402.20it/s]


Read test data time: 21.59 seconds
Directory doesn't exists
(8676, 3, 224, 224)


100%|██████████| 8659/8659 [00:21<00:00, 400.40it/s]


Read test data time: 21.65 seconds
Directory doesn't exists
(8659, 3, 224, 224)


100%|██████████| 8670/8670 [00:21<00:00, 399.47it/s]


Read test data time: 21.73 seconds
Directory doesn't exists
(8670, 3, 224, 224)


100%|██████████| 8708/8708 [00:22<00:00, 395.30it/s]


Read test data time: 22.05 seconds
Directory doesn't exists
(8708, 3, 224, 224)


# Test

In [7]:
test_paths = ['data/test/p'+str(i) for i in range(1,10)]

In [11]:
from vgg16_run import merge_several_folds_mean
print('Start testing............')

nfolds = 2
yfull_test = []
test_ids = []
for test_path in test_paths:
    test_data, test_id = read_and_normalize_test_data2(img_rows, img_cols, test_path, 3)
    print(test_data.shape)
    test_ids = test_ids + test_id
    y_test = []
    test_data = np.transpose(test_data, (0,2,3,1))
    for index in range(1, nfolds+1):
        # Store test predictions
        model = read_model(index, model_name)
        test_prediction = model.predict(test_data, batch_size=128, verbose=1)
        y_test.append(test_prediction)
    yfull_test = yfull_test + merge_several_folds_mean(y_test, nfolds)
    print('yfull_test.len:', len(yfull_test))

info_string = 'loss_' + model_name \
    + '_r_' + str(img_rows) \
    + '_c_' + str(img_cols) \
    + '_folds_' + str(nfolds) \
    + '_ep_' + str(nb_epoch)

save_submission(test_ids, yfull_test)

  0%|          | 36/10381 [00:00<00:28, 358.63it/s]

Start testing............


100%|██████████| 10381/10381 [00:26<00:00, 395.89it/s]


Read test data time: 26.25 seconds
Directory doesn't exists
(10381, 3, 224, 224)


  0%|          | 32/8666 [00:00<00:27, 313.98it/s]

yfull_test.len: 10381


100%|██████████| 8666/8666 [00:23<00:00, 368.22it/s]


Read test data time: 23.56 seconds
Directory doesn't exists
(8666, 3, 224, 224)


  0%|          | 32/8614 [00:00<00:27, 317.66it/s]

yfull_test.len: 19047


100%|██████████| 8614/8614 [00:23<00:00, 369.49it/s]


Read test data time: 23.34 seconds
Directory doesn't exists
(8614, 3, 224, 224)


  0%|          | 0/8669 [00:00<?, ?it/s]

yfull_test.len: 27661


100%|██████████| 8669/8669 [00:23<00:00, 365.60it/s]


Read test data time: 23.74 seconds
Directory doesn't exists
(8669, 3, 224, 224)


  0%|          | 30/8683 [00:00<00:28, 298.50it/s]

yfull_test.len: 36330


100%|██████████| 8683/8683 [00:23<00:00, 366.60it/s]


Read test data time: 23.71 seconds
Directory doesn't exists
(8683, 3, 224, 224)


  0%|          | 32/8676 [00:00<00:27, 318.22it/s]

yfull_test.len: 45013


100%|██████████| 8676/8676 [00:24<00:00, 361.48it/s]


Read test data time: 24.03 seconds
Directory doesn't exists
(8676, 3, 224, 224)


  0%|          | 31/8659 [00:00<00:28, 303.65it/s]

yfull_test.len: 53689


100%|██████████| 8659/8659 [00:23<00:00, 368.47it/s]


Read test data time: 23.53 seconds
Directory doesn't exists
(8659, 3, 224, 224)


  0%|          | 29/8670 [00:00<00:29, 289.94it/s]

yfull_test.len: 62348


100%|██████████| 8670/8670 [00:23<00:00, 364.93it/s]


Read test data time: 23.78 seconds
Directory doesn't exists
(8670, 3, 224, 224)


  0%|          | 33/8708 [00:00<00:26, 323.38it/s]

yfull_test.len: 71018


100%|██████████| 8708/8708 [00:23<00:00, 365.99it/s]


Read test data time: 23.82 seconds
Directory doesn't exists
(8708, 3, 224, 224)
yfull_test.len: 79726


FileNotFoundError: [Errno 2] No such file or directory: 'result/submission_2018-05-05_22:09:05.csv'

In [14]:
yfull_test= np.array(yfull_test)
save_submission(test_ids, yfull_test)

In [1]:
import os
import numpy as np
import shutil
def split_train_data(train_path, validate_path, ratio=0.2):
    np.random.seed(42)
    for c in os.listdir(train_path):
        vpath = os.path.join(validate_path, c)
        tpath = os.path.join(train_path, c)
        os.mkdir(vpath)
        files = os.listdir(tpath)
        size = len(files)
        perm = np.random.permutation(size)
        selected = perm[:int(size*ratio)]
        for idx in selected:
            f = files[idx]
            tfile = os.path.join(tpath, f)
            shutil.move(tfile, vpath)
        print('classes {} done'.format(c))

In [3]:
split_train_data('data/train', 'data/validate')

classes c4 done
classes c0 done
classes c5 done
classes c3 done
classes c7 done
classes c1 done
classes c2 done
classes c6 done
classes c9 done
classes c8 done
