## Xception model training

### Import libraries and configure the TensorFlow session

In [1]:
import os

# CUDA reads CUDA_VISIBLE_DEVICES when the GPU runtime is first initialised,
# so the device list has to be in the environment *before* the TensorFlow
# session below is created; assigning it afterwards does not change which
# GPUs this process sees.  `setdefault` keeps a value that was already
# exported by the shell that launched the kernel.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0,1,2,3")

#-------------------------- set gpu using tf ---------------------------
import tensorflow as tf
config = tf.ConfigProto()
# Grow GPU memory usage on demand rather than reserving it all up front.
config.gpu_options.allow_growth = True
session = tf.Session(config=config)
#-------------------  start importing keras module ---------------------
import keras.backend.tensorflow_backend as K
# Make Keras use the session configured above instead of creating its own.
K.set_session(session)

from keras.models import *
from keras.layers import *
from keras.applications import *
from keras.preprocessing.image import *
from keras.utils.training_utils import multi_gpu_model
from keras import optimizers
from keras import regularizers
from multiprocessing import cpu_count

Using TensorFlow backend.


In [2]:
# ---- problem size and hardware ------------------------------------------
nb_classes = 13   # units of the softmax output layer
nb_cpus = 8       # passed as `workers` to fit_generator
nb_gpus = 4       # passed as `gpus` to multi_gpu_model

# Derive the device list from nb_gpus so the two settings cannot drift apart;
# for nb_gpus = 4 this yields the comma-separated form "0,1,2,3".
# NOTE(review): this assignment runs after the tf.Session in the first cell
# has been created; CUDA presumably reads the variable only when the GPU
# runtime is initialised, so confirm it still takes effect here.
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(gpu_id) for gpu_id in range(nb_gpus))

# ---- image geometry -----------------------------------------------------
image_size = (299, 299)
# Same spatial size plus three channels, derived rather than repeated.
input_shape = image_size + (3,)

# ---- dataset locations --------------------------------------------------
train_path = "/home/hdd0/Develop/liyu/batch6.4-608-to-299/original-hls09-rotated/train"
valid_path = "/home/hdd0/Develop/liyu/batch6.4-608-to-299/original-hls09-rotated/valid"

### Pretrain: train the new classification head on a frozen ImageNet Xception base

In [3]:
# Build the single-device template model under the CPU device scope; the
# multi-GPU wrapper (if any) is created from it further down.
with tf.device('/cpu:0'):
    # Raw images go in; the Xception pixel preprocessing is part of the graph.
    input_tensor = Input(input_shape)
    x = Lambda(xception.preprocess_input)(input_tensor)

    # Convolutional base with ImageNet weights and without its classifier.
    base_model = Xception(include_top=False, weights='imagenet', input_tensor=x)

    # Freeze every layer of the base so that only the new head is trained.
    for layer in base_model.layers:
        layer.trainable = False

    # New head: global average pooling -> dropout -> softmax over nb_classes.
    m_out = base_model.output
    p_out = GlobalAveragePooling2D()(m_out)
    p_out = Dropout(0.5)(p_out)
    predictions = Dense(nb_classes, activation='softmax', name="predictions")(p_out)

    model = Model(inputs=base_model.input, outputs=predictions)

    # To continue an earlier run of this stage, the weights file
    # "Xception_first_train.h5" could be loaded into `model` at this point.

# Replicate across GPUs only when more than one is requested; otherwise the
# template model is trained directly.
parallel_model = multi_gpu_model(model, gpus=nb_gpus) if nb_gpus > 1 else model

# An SGD optimizer (lr=0.01, momentum=0.9, decay=0.0003) was tried here
# previously; the stage currently uses Adadelta with its default settings.
parallel_model.compile(loss='categorical_crossentropy',
                       optimizer="Adadelta",
                       metrics=['accuracy'])

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 299, 299, 3)  0                                            
__________________________________________________________________________________________________
lambda_1 (Lambda)               (None, 299, 299, 3)  0           input_1[0][0]                    
__________________________________________________________________________________________________
block1_conv1 (Conv2D)           (None, 149, 149, 32) 864         lambda_1[0][0]                   
__________________________________________________________________________________________________
block1_conv1_bn (BatchNormaliza (None, 149, 149, 32) 128         block1_conv1[0][0]               
__________________________________________________________________________________________________
block1_con

In [4]:
# model.save_weights('batch6.1_finetune_010.h5')

In [5]:
# Hyper-parameters for the pretraining stage: images per batch, and the
# number of epochs to run.
batch_size, epochs = 128, 1

In [6]:
# Both splits are read from disk without augmentation; the two iterators
# differ only in the directory they walk.
flow_options = dict(target_size=image_size, shuffle=True, batch_size=batch_size)

img_gen_t = ImageDataGenerator()
train_generator = img_gen_t.flow_from_directory(train_path, **flow_options)

img_gen_v = ImageDataGenerator()
valid_generator = img_gen_v.flow_from_directory(valid_path, **flow_options)

# One epoch covers every batch each iterator reports through len().
# Batches are prepared by nb_cpus worker processes.
parallel_model.fit_generator(
    generator=train_generator,
    steps_per_epoch=len(train_generator),
    validation_data=valid_generator,
    validation_steps=len(valid_generator),
    epochs=epochs,
    verbose=1,
    workers=nb_cpus,
    use_multiprocessing=True)

# Weights are written from the template `model`, not from `parallel_model`.
model.save_weights("Xception_first_train.h5")

Found 10469560 images belonging to 13 classes.
Found 1154324 images belonging to 13 classes.
Epoch 1/1


### Fine-tune: train the full model starting from saved weights

In [5]:
# Rebuild the template model under the CPU device scope.  No layer is frozen
# in this stage, so the whole network is trainable.
with tf.device('/cpu:0'):
    input_tensor = Input(input_shape)
    x = Lambda(xception.preprocess_input)(input_tensor)

    # weights=None: only the architecture is built here; the actual weights
    # come from the file loaded below.
    base_model = Xception(input_tensor=x, weights=None, include_top=False)
    m_out = base_model.output
    p_out = GlobalAveragePooling2D()(m_out)
    p_out = Dropout(0.5)(p_out)
    predictions = Dense(nb_classes, activation='softmax', name="predictions")(p_out)

    model = Model(inputs=base_model.input, outputs=predictions)

    # Resume from the weights saved after epoch 11.
    model.load_weights("batch6.3_epoch011.h5")

# Replicate across GPUs only when more than one is requested.
if nb_gpus > 1:
    parallel_model = multi_gpu_model(model, gpus=nb_gpus)
else:
    parallel_model = model

# Fix: this SGD optimizer used to be constructed and then ignored, because
# compile() was given the string "Adadelta".  Pass the configured object so
# the learning rate, momentum and decay chosen here actually apply.
optimizer = optimizers.SGD(lr=0.002, momentum=0.9, decay=0.0003)
parallel_model.compile(optimizer=optimizer,
                       loss='categorical_crossentropy',
                       metrics=['accuracy'])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 299, 299, 3)  0                                            
__________________________________________________________________________________________________
lambda_6 (Lambda)               (None, 299, 299, 3)  0           input_2[0][0]                    
__________________________________________________________________________________________________
block1_conv1 (Conv2D)           (None, 149, 149, 32) 864         lambda_6[0][0]                   
__________________________________________________________________________________________________
block1_conv1_bn (BatchNormaliza (None, 149, 149, 32) 128         block1_conv1[0][0]               
__________________________________________________________________________________________________
block1_con

In [6]:
# parallel_model.load_weights('batch6.3_epoch011.h5')

In [None]:
# Hyper-parameters for the fine-tuning stage: images per batch, and the
# epoch number at which training stops.
batch_size, epochs = 64, 100

In [10]:
from keras.callbacks import ModelCheckpoint, LearningRateScheduler, EarlyStopping, ReduceLROnPlateau, TensorBoard


class TemplateModelCheckpoint(ModelCheckpoint):
    """ModelCheckpoint that always saves a fixed template model.

    Keras attaches the model being fitted to every callback.  When that model
    is the wrapper returned by `multi_gpu_model`, a plain ModelCheckpoint
    would save the wrapper; the Keras documentation says saving has to go
    through the template model instead.  This subclass keeps all other
    ModelCheckpoint behaviour (file name pattern, monitor, period, ...).

    Arguments:
        template_model: the model whose weights are written to disk.
        *args, **kwargs: forwarded unchanged to ModelCheckpoint.
    """

    def __init__(self, template_model, *args, **kwargs):
        super(TemplateModelCheckpoint, self).__init__(*args, **kwargs)
        self.template_model = template_model

    def set_model(self, model):
        # Ignore the model handed in by fit_generator and bind the template.
        super(TemplateModelCheckpoint, self).set_model(self.template_model)


# Training images are augmented on the fly; validation images are not.
img_gen_t = ImageDataGenerator(rotation_range=30,
                               width_shift_range=0.1,
                               height_shift_range=0.1,
                               zoom_range=0.1,
                               brightness_range=[0.8, 1.2],
                               horizontal_flip=True,
                               vertical_flip=True)
train_generator = img_gen_t.flow_from_directory(train_path,
                                                target_size=image_size, shuffle=True, batch_size=batch_size)

img_gen_v = ImageDataGenerator()
valid_generator = img_gen_v.flow_from_directory(valid_path,
                                                target_size=image_size, shuffle=True, batch_size=batch_size)

# Write weights after every epoch (save_best_only=False), taken from the
# template `model` so that the files match what `model.load_weights` expects.
# When nb_gpus <= 1, `parallel_model` is `model` and nothing changes.
checkpoint = TemplateModelCheckpoint(model, "batch6.3_{epoch:03d}_{val_loss:.4f}.hdf5", monitor='val_loss', verbose=1,
                                     save_best_only=False, save_weights_only=True, mode='min', period=1)

tensorboard = TensorBoard(log_dir="./logs", histogram_freq=0, batch_size=batch_size, write_graph=True, write_images=True)

# Halve the learning rate as soon as val_loss fails to improve for one epoch.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=1, min_lr=0.000001)

callbacks = [checkpoint, tensorboard, reduce_lr]


# initial_epoch=11 continues the epoch numbering of the loaded weights
# ("batch6.3_epoch011.h5"), so the first epoch reported is 12.
parallel_model.fit_generator(generator=train_generator,
                             steps_per_epoch=len(train_generator),
                             epochs=epochs,
                             verbose=1,
                             validation_data=valid_generator,
                             validation_steps=len(valid_generator),
                             callbacks=callbacks,
                             workers=nb_cpus,
                             use_multiprocessing=True,
                             initial_epoch=11
                             )

Found 10469560 images belonging to 13 classes.
Found 1154324 images belonging to 13 classes.
Epoch 12/100

Process ForkPoolWorker-206:
Process ForkPoolWorker-203:
Process ForkPoolWorker-202:
Process ForkPoolWorker-205:
Traceback (most recent call last):
  File "/home/unicorn/.conda/envs/algo-work/lib/python3.5/multiprocessing/process.py", line 252, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/home/unicorn/.conda/envs/algo-work/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/unicorn/.conda/envs/algo-work/lib/python3.5/multiprocessing/pool.py", line 125, in worker
    put((job, i, result))
  File "/home/unicorn/.conda/envs/algo-work/lib/python3.5/multiprocessing/process.py", line 252, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
KeyboardInterrupt
  File "/home/unicorn/.conda/envs/algo-work/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/unicorn/.conda/envs/algo-work/lib/py

KeyboardInterrupt: 

  File "/home/unicorn/.conda/envs/algo-work/lib/python3.5/site-packages/keras_preprocessing/image.py", line 1744, in _get_batches_of_transformed_samples
    batch_x[i] = x
  File "/home/unicorn/.conda/envs/algo-work/lib/python3.5/site-packages/keras_preprocessing/image.py", line 1268, in __getitem__
    return self._get_batches_of_transformed_samples(index_array)
  File "/home/unicorn/.conda/envs/algo-work/lib/python3.5/site-packages/keras_preprocessing/image.py", line 1268, in __getitem__
    return self._get_batches_of_transformed_samples(index_array)
KeyboardInterrupt
  File "/home/unicorn/.conda/envs/algo-work/lib/python3.5/site-packages/keras_preprocessing/image.py", line 1742, in _get_batches_of_transformed_samples
    x = self.image_data_generator.apply_transform(x, params)
  File "/home/unicorn/.conda/envs/algo-work/lib/python3.5/site-packages/keras_preprocessing/image.py", line 1739, in _get_batches_of_transformed_samples
    interpolation=self.interpolation)
Traceback (most r

In [11]:
# Save the weights of the template `model` (not `parallel_model`); the file
# name follows the same pattern as the "batch6.3_epoch011.h5" file that the
# fine-tuning cell loads with `model.load_weights`.
# NOTE(review): the preceding fit was stopped by a KeyboardInterrupt during
# epoch 12, so these weights presumably reflect a partial epoch - confirm
# before treating this file as a completed epoch-12 checkpoint.
model.save_weights("batch6.3_epoch012.h5")