In [1]:
import gc
import os
import warnings
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import multiprocessing

# Keras backend handle, used below to inspect the configured image layout.
from keras import backend as K
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings; consider narrowing the filter.
warnings.filterwarnings(action='ignore')

# Last expression of the cell: displays the backend's image data format
# (the recorded output for this cell is 'channels_last').
K.image_data_format()

Using TensorFlow backend.


'channels_last'

In [2]:
# For a more detailed explanation of the augmentation options, the write-up by
# Taeyoung Kim (Keras Korea) is recommended:
# https://tykimos.github.io/2017/06/10/CNN_Data_Augmentation/

params = dict(
    # ---- Generator parameters ----
    # Seed used for the train/validation split and for the training generator.
    random_state=2,
    # Maximum rotation angle.
    rotation_range=10,
    # Maximum horizontal shift, as a fraction.
    width_shift_range=0.20,
    # Maximum vertical shift, as a fraction.
    height_shift_range=0.20,
    # Maximum shear intensity.
    # NOTE(review): the original note described this value as radians; newer Keras
    # documentation describes shear_range in degrees - confirm for the installed version.
    shear_range=0.50,
    # Maximum zoom in/out fraction.
    zoom_range=0.20,
    horizontal_flip=True,
    brightness_range=(0.7, 1.5),
    # ---- Model parameters ----
    # (299, 299) is the input size commonly used with Xception.
    img_size=(299, 299),
    input_shape=(299, 299, 3),
    batch_size=16,
    # Number of epochs to train on one split / generator pair before re-creating it.
    epochs_per_generator=5,
    # Number of workers used when loading batches; this can help considerably
    # on CPUs with many cores (e.g. Ryzen). Half of the reported CPU count is used.
    nb_workers=multiprocessing.cpu_count() // 2,
)

In [3]:
# Root folder that holds the competition files (CSVs, image folders, saved weights).
# If the location changes (for example after adding another data set such as
# pretrained model weights), either edit the default below or set the
# KAKR_DATA_PATH environment variable; the hard-coded value is only a fallback so
# the notebook is not tied to a single machine's drive layout.
DATA_PATH = os.environ.get('KAKR_DATA_PATH', 'E:\\data\\2019-3rd-ml-month-with-kakr')
# Last expression of the cell: show what is available under the data folder.
os.listdir(DATA_PATH)

['class.csv',
 'cropped_test',
 'cropped_train',
 'inception_resnet_v2-0.86695',
 'sample_submission.csv',
 'submission.csv',
 'test',
 'test.csv',
 'train',
 'train.csv',
 'xception_ep001_vloss-5.2130_vacc-0.0105.h5',
 'xception_ep007_vloss-8.2031_vacc-0.0115.h5',
 'xception_ep008_vloss-4.5204_vacc-0.0329.h5',
 'xception_ep009_vloss-4.0916_vacc-0.0634.h5',
 'xception_ep010_vloss-3.6161_vacc-0.1163.h5',
 'xception_ep011_vloss-2.9675_vacc-0.2111.h5',
 'xception_ep012_vloss-2.7274_vacc-0.3089.h5',
 'xception_ep013_vloss-2.0129_vacc-0.4441.h5',
 'xception_ep014_vloss-1.9834_vacc-0.4805.h5',
 'xception_ep015_vloss-1.4812_vacc-0.5828.h5',
 'xception_ep016_vloss-0.9499_vacc-0.7101.h5',
 'xception_ep017_vloss-0.9059_vacc-0.7146.h5',
 'xception_ep018_vloss-0.9484_vacc-0.7325.h5',
 'xception_ep020_vloss-0.7427_vacc-0.7725.h5',
 'xception_ep021_vloss-0.4973_vacc-0.8568.h5',
 'xception_ep025_vloss-0.5598_vacc-0.8463.h5']

In [4]:
# Image folders used for training and for test-time prediction.
TRAIN_IMG_PATH, TEST_IMG_PATH = (
    os.path.join(DATA_PATH, folder_name)
    for folder_name in ('cropped_train', 'cropped_test')
)

# CSV tables shipped with the data set, read in the order train, test, class.
df_train, df_test, df_class = (
    pd.read_csv(os.path.join(DATA_PATH, csv_name))
    for csv_name in ('train.csv', 'test.csv', 'class.csv')
)

In [5]:
# Keep only the columns the generators read, and store the class label as a string
# (the original note says the CSV holds it as an integer).
df_train = df_train.loc[:, ['img_file', 'class']].astype({'class': 'str'})

# The test table only needs the image file name.
df_test = df_test.loc[:, ['img_file']]

In [6]:
from sklearn.model_selection import train_test_split

def train_val_split(params, df_train):
    """Randomly split ``df_train`` into a training and a validation frame (80% / 20%).

    Row positions (not index labels) are split with ``train_test_split`` using
    ``params['random_state']`` as the seed, which makes a given split repeatable
    and therefore easier to debug.

    Note: this function does not modify ``params``. A caller that wants a different
    split on the next call has to change ``params['random_state']`` itself.

    Args:
        params: Configuration dict; only ``'random_state'`` is read here.
        df_train: DataFrame holding the labelled rows to split.

    Returns:
        Tuple ``(X_train, X_val)`` of DataFrames selected from ``df_train``.
    """
    row_positions = np.arange(len(df_train))
    train_rows, val_rows = train_test_split(
        row_positions,
        train_size=0.8,
        random_state=params['random_state'],
    )
    return df_train.iloc[train_rows, :], df_train.iloc[val_rows, :]


def train_val_split_fixed(params, df_train, i, n):
    """Deterministically split ``df_train`` into training and validation frames.

    Rows are assigned to ``n`` folds by position: fold ``j`` holds the rows at
    positions ``j, j + n, j + 2n, ...``. Fold ``i % n`` becomes the validation set
    and the remaining folds, concatenated in fold order, become the training set.
    Calling this with ``i = 0 .. n - 1`` therefore validates on every row exactly
    once and trains on it ``n - 1`` times (an 8:2 style split when ``n`` is 5).

    Args:
        params: Unused; kept so the signature matches ``train_val_split``.
        df_train: DataFrame holding the labelled rows to split.
        i: Round number; wrapped into the range of folds with ``i % n``.
        n: Number of folds, must be at least 1.

    Returns:
        Tuple ``(X_train, X_val)`` of DataFrames selected from ``df_train``.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError('n must be a positive integer, got {!r}'.format(n))

    n_rows = df_train.shape[0]
    val_fold = i % n

    val_rows = list(range(val_fold, n_rows, n))
    # Build the flat list of training positions in one pass instead of repeatedly
    # concatenating per-fold lists (sum(list_of_lists, []) copies on every step).
    train_rows = [
        row
        for fold in range(n)
        if fold != val_fold
        for row in range(fold, n_rows, n)
    ]

    return df_train.iloc[train_rows, :], df_train.iloc[val_rows, :]

In [7]:
from keras.applications.xception import Xception, preprocess_input
from keras.preprocessing.image import ImageDataGenerator

def make_generator(params, X_train, X_val):
    """Build the training and validation image generators for one split.

    The training generator applies the random augmentations configured in
    ``params``; the validation generator only applies ``preprocess_input`` and
    is not shuffled. Both read image files from ``TRAIN_IMG_PATH``.

    Both generators are given the same explicit, sorted list of class names
    (the union of the labels found in ``X_train`` and ``X_val``). Without it each
    generator infers the classes from its own frame, so a split in which one side
    lacks a class would yield different label indices / one-hot widths for the two.
    Assumes the ``'class'`` column holds scalar (hashable) labels.

    Args:
        params: Configuration dict providing the augmentation settings,
            ``'img_size'``, ``'batch_size'`` and ``'random_state'``.
        X_train: DataFrame with ``'img_file'`` and ``'class'`` columns for training.
        X_val: DataFrame with ``'img_file'`` and ``'class'`` columns for validation.

    Returns:
        Tuple ``(train_generator, validation_generator)``.
    """
    # One label set for both generators keeps class_indices identical between them.
    class_names = sorted(set(X_train['class']).union(X_val['class']))

    # Generator configuration: augmentation for training, preprocessing only for validation.
    train_datagen = ImageDataGenerator(
        rotation_range=params['rotation_range'],
        width_shift_range=params['width_shift_range'],
        height_shift_range=params['height_shift_range'],
        shear_range=params['shear_range'],
        zoom_range=params['zoom_range'],
        horizontal_flip=params['horizontal_flip'],
        brightness_range=params['brightness_range'],
        preprocessing_function=preprocess_input)

    val_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

    # Arguments shared by both flows.
    shared_flow_args = dict(
        directory=TRAIN_IMG_PATH,
        x_col='img_file',
        y_col='class',
        target_size=params['img_size'],
        color_mode='rgb',
        classes=class_names,
        class_mode='categorical',
        batch_size=params['batch_size'],
    )

    train_generator = train_datagen.flow_from_dataframe(
        dataframe=X_train,
        seed=params['random_state'],
        **shared_flow_args
    )

    # Validation batches are produced in frame order.
    validation_generator = val_datagen.flow_from_dataframe(
        dataframe=X_val,
        shuffle=False,
        **shared_flow_args
    )
    return train_generator, validation_generator

In [8]:
def get_steps(num_samples, batch_size):
    """Return how many batches of ``batch_size`` are needed to cover ``num_samples``.

    A trailing partial batch counts as one extra step.
    """
    full_batches, leftover = divmod(num_samples, batch_size)
    return full_batches + 1 if leftover > 0 else full_batches

In [9]:
from keras.models import Sequential
from keras.layers import Dense, GlobalAveragePooling2D

# Xception convolutional base without its classification head; the weights
# argument is left at the library default.
cnn_model = Xception(include_top=False, input_shape=params['input_shape'])

# Classifier: convolutional base -> global average pooling -> 196-way softmax.
model = Sequential([
    cnn_model,
    GlobalAveragePooling2D(),
    Dense(196, activation='softmax', kernel_initializer='he_normal'),
])
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
xception (Model)             (None, 10, 10, 2048)      20861480  
_________________________________________________________________
global_average_pooling2d_1 ( (None, 2048)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 196)               401604    
Total params: 21,263,084
Trainable params: 21,208,556
Non-trainable params: 54,528
_________________________________________________________________


In [10]:
# History objects returned by each training call are collected in this list.
histories = list()

# Adam optimizer, categorical cross-entropy loss, accuracy reported under the name 'acc'.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

In [None]:
%%time
from keras.callbacks import ModelCheckpoint, EarlyStopping

# Filename pattern for checkpoints; the epoch number and validation metrics
# are filled in when a checkpoint is written.
filepath = os.path.join(DATA_PATH, 'xception_ep{epoch:03d}_vloss-{val_loss:.4f}_vacc-{val_acc:.4f}.h5')

# To resume training, only the model filename below has to be changed.
from keras.models import load_model
model_filename = 'xception_ep025_vloss-0.5598_vacc-0.8463.h5'
model = load_model(os.path.join(DATA_PATH, model_filename))

# Save a checkpoint only when the monitored validation accuracy improves.
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True)
callbacks = [checkpoint]

# Total number of epochs to reach.
epochs = 50

# Each round trains params['epochs_per_generator'] epochs on one split.
# The range starts at round 5, i.e. at epoch 25 with 5 epochs per round, which
# matches the 'ep025' checkpoint loaded above; adjust both together when resuming
# from a different checkpoint.
for i in range(5, epochs // params['epochs_per_generator']):
    print(params)

    # NOTE(review): train_val_split is seeded from params['random_state'], which
    # nothing in this loop changes, so every round re-creates the same split.
    # Confirm whether a new split per round was intended.
    X_train, X_val = train_val_split(params, df_train)
    train_generator, validation_generator = make_generator(params, X_train, X_val)

    params.update({
        'nb_train_samples': len(X_train),
        'nb_validation_samples': len(X_val)
    })

    histories.append(
        model.fit_generator(
            train_generator,
            steps_per_epoch=get_steps(params['nb_train_samples'], params['batch_size']),
            # epochs is the index of the last epoch of this round, not a count;
            # together with initial_epoch it yields epochs_per_generator epochs.
            epochs=params['epochs_per_generator'] * (i + 1),
            validation_data=validation_generator,
            validation_steps=get_steps(params['nb_validation_samples'], params['batch_size']),
            callbacks=callbacks,
            workers=params['nb_workers'],
            initial_epoch=params['epochs_per_generator'] * i
        )
    )

# Save the final model using the same naming scheme as the checkpoints.
# Fixed-width '.4f' formatting keeps trailing zeros (e.g. 0.5000), which
# str(round(x, 4)) would drop, so the name always matches the checkpoint pattern.
last_history = histories[-1].history
save_model_filename = 'xception_ep{:03d}_vloss-{:.4f}_vacc-{:.4f}.h5'.format(
    epochs, last_history['val_loss'][-1], last_history['val_acc'][-1])
model.save(os.path.join(DATA_PATH, save_model_filename))
gc.collect()

Instructions for updating:
Use tf.cast instead.
{'random_state': 52, 'rotation_range': 10, 'width_shift_range': 0.2, 'height_shift_range': 0.2, 'shear_range': 0.5, 'zoom_range': 0.2, 'horizontal_flip': True, 'brightness_range': (0.7, 1.5), 'img_size': (299, 299), 'input_shape': (299, 299, 3), 'batch_size': 16, 'epochs_per_generator': 5, 'nb_workers': 6}
Found 8012 images belonging to 196 classes.
Found 2004 images belonging to 196 classes.
Epoch 26/30

In [15]:
%%time
from keras.models import load_model

# Record the size of the test set so the number of prediction steps can be derived.
params['nb_test_samples'] = len(df_test)

# Checkpoint used for inference.
model_filename = 'xception_ep021_vloss-0.4973_vacc-0.8568.h5'

# Test images get the model's preprocessing only.
test_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

# No labels (y_col / class_mode are None) and no shuffling for the test flow.
test_generator = test_datagen.flow_from_dataframe(
    dataframe=df_test,
    directory=TEST_IMG_PATH,
    x_col='img_file',
    y_col=None,
    target_size=params['img_size'],
    color_mode='rgb',
    class_mode=None,
    batch_size=params['batch_size'],
    shuffle=False,
)

model = load_model(os.path.join(DATA_PATH, model_filename))

# Enough steps to cover every test image, including a final partial batch.
test_steps = get_steps(params['nb_test_samples'], params['batch_size'])
prediction = model.predict_generator(
    generator=test_generator,
    steps=test_steps,
    verbose=1,
    workers=params['nb_workers'],
)

Found 6169 images.
Wall time: 1min 33s


In [16]:
import os

# Index of the highest-scoring output for every test image.
predicted_class_indices = prediction.argmax(axis=1)

# Re-create the training generator only to read its class-name -> index mapping.
X_train, X_val = train_val_split(params, df_train)
train_generator, _ = make_generator(params, X_train, X_val)

# Invert the generator's mapping so an output index can be turned back into a class name.
labels = {index: name for name, index in train_generator.class_indices.items()}
predictions = list(map(labels.__getitem__, predicted_class_indices))

# Fill the sample submission and write it next to the data, named after the model file.
submission = pd.read_csv(os.path.join(DATA_PATH, 'sample_submission.csv'))
submission['class'] = predictions
submission_csv = os.path.join(DATA_PATH, os.path.splitext(model_filename)[0] + '.csv')
submission.to_csv(submission_csv, index=False)
submission.head()

Found 8012 images belonging to 196 classes.
Found 2004 images belonging to 196 classes.


Unnamed: 0,img_file,class
0,test_00001.jpg,124
1,test_00002.jpg,98
2,test_00003.jpg,157
3,test_00004.jpg,94
4,test_00005.jpg,16


#### **Reference:**
https://medium.com/@vijayabhaskar96/tutorial-on-keras-flow-from-dataframe-1fd4493d237c  
https://keras.io/  
http://www.arxiv.org/abs/1512.03385  
https://pillow.readthedocs.io/en/stable/  
https://www.kaggle.com/guglielmocamporese/macro-f1-score-keras