# 모델링

In [1]:
import os
from datetime import datetime
import tensorflow as tf
import numpy as np
import json
from sklearn.model_selection import train_test_split

In [2]:
DATA_IN_PATH = './data_in/'
DATA_OUT_PATH = './data_out/'
INPUT_TRAIN_DATA = 'nsmc_train_input.npy'  # numpy file
LABEL_TRAIN_DATA = 'nsmc_train_label.npy'
DATA_CONFIGS = 'data_configs.json'

input_data = np.load(open(DATA_IN_PATH + INPUT_TRAIN_DATA, 'rb'))
label_data = np.load(open(DATA_IN_PATH + LABEL_TRAIN_DATA, 'rb'))
prepro_configs = json.load(open(DATA_IN_PATH + DATA_CONFIGS, 'r'))

## 데이터셋 분류 및 하이퍼파라미터 정의

In [3]:
TEST_SPLIT = 0.1
RNG_SEED = 13371447
VOCAB_SIZE = prepro_configs['vocab_size']
EMB_SIZE = 128
BATCH_SIZE = 16
NUM_EPOCHS = 1

input_train, input_eval, label_train, label_eval = train_test_split(input_data, label_data, test_size=TEST_SPLIT, random_state=RNG_SEED)

## 데이터 입력함수

In [4]:
def mapping_fn(X, Y):
    input, label = {'x': X}, Y
    return input, label

def train_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((input_train, label_train))
    dataset = dataset.shuffle(buffer_size=len(input_train))
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.map(mapping_fn)
    dataset = dataset.repeat(count=NUM_EPOCHS)
    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()

def eval_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((input_eval, label_eval))
    dataset = dataset.shuffle(buffer_size=len(input_eval))
    dataset = dataset.batch(16)
    dataset = dataset.map(mapping_fn)
    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()

# CNN 모델함수

: 여기서는 깊게 쌓지 않았다. (빠른학습을 위해), 영어 리뷰데이터보다 짧은 길이의 문장

In [5]:
def model_fn(features, labels, mode, params):
    TRAIN = mode == tf.estimator.ModeKeys.TRAIN
    EVAL = mode == tf.estimator.ModeKeys.EVAL
    PREDICT = mode == tf.estimator.ModeKeys.PREDICT

    embedding_layer = tf.keras.layers.Embedding(VOCAB_SIZE,EMB_SIZE)(features['x'])

    dropout_emb = tf.keras.layers.Dropout(rate = 0.2)(embedding_layer)
    
    conv = tf.layers.conv1d(
           inputs=dropout_emb,
           filters=32,
           kernel_size=3,
           padding='same',
           activation=tf.nn.relu)
  
    pool = tf.keras.layers.GlobalMaxPool1D()(conv)

    hidden = tf.keras.layers.Dense(units=250, activation=tf.nn.relu)(pool)   


    dropout_hidden = tf.keras.layers.Dropout(rate=0.2)(hidden, training = TRAIN)
    logits = tf.keras.layers.Dense(units=1)(dropout_hidden)

    if labels is not None:
        labels = tf.reshape(labels, [-1, 1])
        
    if TRAIN:
        global_step = tf.train.get_global_step()
        loss = tf.losses.sigmoid_cross_entropy(labels, logits)
        train_op = tf.train.AdamOptimizer(0.001).minimize(loss, global_step)

        return tf.estimator.EstimatorSpec(mode=mode, train_op=train_op, loss = loss)
    
    elif EVAL:
        loss = tf.losses.sigmoid_cross_entropy(labels, logits)
        pred = tf.nn.sigmoid(logits)
        accuracy = tf.metrics.accuracy(labels, tf.round(pred))
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops={'acc': accuracy})
        
    elif PREDICT:
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions={
                'prob': tf.nn.sigmoid(logits),
            }
        )

## 모델 학습 및 검증 with estimator

In [6]:
est = tf.estimator.Estimator(model_fn, model_dir="data_out/checkpoint/cnn_model")

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'data_out/checkpoint/cnn_model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x00000111BB6F9C18>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [1]:
time_start = datetime.utcnow()
print("Experiment started at {}".format(time_start.strftime("%H:%M:%S")))
print(".......................................") 

est.train(train_input_fn)

time_end = datetime.utcnow()
print(".......................................")
print("Experiment finished at {}".format(time_end.strftime("%H:%M:%S")))
print("")
time_elapsed = time_end - time_start
print("Experiment elapsed time: {} seconds".format(time_elapsed.total_seconds()))

In [8]:
valid = est.evaluate(eval_input_fn)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-04-16T09:19:05Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from data_out/checkpoint/cnn_model\model.ckpt-8438
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-04-16-09:19:07
INFO:tensorflow:Saving dict for global step 8438: acc = 0.82526666, global_step = 8438, loss = 0.39044076
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 8438: data_out/checkpoint/cnn_model\model.ckpt-8438


## 모델 평가

: label이 존재함 -> 직접 평가 가능

In [9]:
INPUT_TEST_DATA = 'nsmc_test_input.npy'
LABEL_TEST_DATA = 'nsmc_test_label.npy'

test_input_data = np.load(open(DATA_IN_PATH + INPUT_TEST_DATA, 'rb'))
test_label_data = np.load(open(DATA_IN_PATH + LABEL_TEST_DATA, 'rb'))

In [10]:
def test_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((test_input_data, test_label_data))
    dataset = dataset.batch(16)
    dataset = dataset.map(mapping_fn)
    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()

In [11]:
predict = est.evaluate(test_input_fn) 

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-04-16T09:19:08Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from data_out/checkpoint/cnn_model\model.ckpt-8438
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-04-16-09:19:12
INFO:tensorflow:Saving dict for global step 8438: acc = 0.82322, global_step = 8438, loss = 0.39584005
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 8438: data_out/checkpoint/cnn_model\model.ckpt-8438


## 추가) 불용어를 제외하지 않는 경우

In [12]:
# DATA_IN_PATH = './data_in/'
# DATA_OUT_PATH = './data_out/'
INPUT_TRAIN_DATA = 'nsmc_train_input2.npy'
# LABEL_TRAIN_DATA = 'nsmc_train_label.npy'
DATA_CONFIGS = 'data_configs2.json'

input_data = np.load(open(DATA_IN_PATH + INPUT_TRAIN_DATA, 'rb'))
label_data = np.load(open(DATA_IN_PATH + LABEL_TRAIN_DATA, 'rb'))
prepro_configs = json.load(open(DATA_IN_PATH + DATA_CONFIGS, 'r'))

In [13]:
# TEST_SPLIT = 0.1
# RNG_SEED = 13371447
VOCAB_SIZE = prepro_configs['vocab_size']
# EMB_SIZE = 128
# BATCH_SIZE = 16
# NUM_EPOCHS = 1

input_train, input_eval, label_train, label_eval = train_test_split(input_data, label_data, test_size=TEST_SPLIT, random_state=RNG_SEED)

In [14]:
est = tf.estimator.Estimator(model_fn, model_dir="data_out/checkpoint/cnn_model")

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'data_out/checkpoint/cnn_model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x00000111C1004B00>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [15]:
time_start = datetime.utcnow()
print("Experiment started at {}".format(time_start.strftime("%H:%M:%S")))
print(".......................................") 

est.train(train_input_fn)

time_end = datetime.utcnow()
print(".......................................")
print("Experiment finished at {}".format(time_end.strftime("%H:%M:%S")))
print("")
time_elapsed = time_end - time_start
print("Experiment elapsed time: {} seconds".format(time_elapsed.total_seconds()))

Experiment started at 11:00:33
.......................................
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from data_out/checkpoint/cnn_model\model.ckpt-8438
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 8438 into data_out/checkpoint/cnn_model\model.ckpt.
INFO:tensorflow:loss = 0.8431592, step = 8439
INFO:tensorflow:global_step/sec: 10.5372
INFO:tensorflow:loss = 0.68922603, step = 8539 (9.496 sec)
INFO:tensorflow:global_step/sec: 12.0617
INFO:tensorflow:loss = 0.64566255, step = 8639 (8.291 sec)
INFO:tensorflow:global_step/sec: 12.0594
INFO:tensorflow:loss = 0.62867254, step = 8739 (8.289 sec)
INFO:tensorflow:global_step/sec: 12.102
INFO:tensorflow:loss = 0.61643004, step = 8839 (8.265 sec)
INFO:tensorflow:global_step/sec: 12.0974
INFO:tensorflow:loss = 0.56

INFO:tensorflow:global_step/sec: 12.0231
INFO:tensorflow:loss = 0.29763907, step = 16139 (8.318 sec)
INFO:tensorflow:global_step/sec: 12.0832
INFO:tensorflow:loss = 0.44617355, step = 16239 (8.274 sec)
INFO:tensorflow:global_step/sec: 12.0821
INFO:tensorflow:loss = 0.288944, step = 16339 (8.277 sec)
INFO:tensorflow:global_step/sec: 12.1134
INFO:tensorflow:loss = 0.5931164, step = 16439 (8.254 sec)
INFO:tensorflow:global_step/sec: 12.2116
INFO:tensorflow:loss = 0.33334494, step = 16539 (8.187 sec)
INFO:tensorflow:global_step/sec: 12.0525
INFO:tensorflow:loss = 0.3811377, step = 16639 (8.301 sec)
INFO:tensorflow:global_step/sec: 12.0229
INFO:tensorflow:loss = 0.38207665, step = 16739 (8.315 sec)
INFO:tensorflow:global_step/sec: 11.7361
INFO:tensorflow:loss = 0.33402896, step = 16839 (8.524 sec)
INFO:tensorflow:Saving checkpoints for 16876 into data_out/checkpoint/cnn_model\model.ckpt.
INFO:tensorflow:Loss for final step: 0.05900701.
.......................................
Experiment fini

In [16]:
valid = est.evaluate(eval_input_fn)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-04-16T11:12:34Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from data_out/checkpoint/cnn_model\model.ckpt-16876
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-04-16-11:12:37
INFO:tensorflow:Saving dict for global step 16876: acc = 0.81986666, global_step = 16876, loss = 0.39604995
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 16876: data_out/checkpoint/cnn_model\model.ckpt-16876


In [17]:
INPUT_TEST_DATA = 'nsmc_test_input2.npy'
LABEL_TEST_DATA = 'nsmc_test_label.npy'

test_input_data = np.load(open(DATA_IN_PATH + INPUT_TEST_DATA, 'rb'))
test_label_data = np.load(open(DATA_IN_PATH + LABEL_TEST_DATA, 'rb'))

In [18]:
predict = est.evaluate(test_input_fn) 

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-04-16T11:12:37Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from data_out/checkpoint/cnn_model\model.ckpt-16876
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-04-16-11:12:44
INFO:tensorflow:Saving dict for global step 16876: acc = 0.81646, global_step = 16876, loss = 0.40052652
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 16876: data_out/checkpoint/cnn_model\model.ckpt-16876


In [None]:
# acc = 0.82322, global_step = 8438, loss = 0.39584005  # 불용어 제거 후가 더 높은 정확도를 보임

## RNN 모델

: rnn을 이용하여 더 깊은 층을 사용한 정확도를 얻고자 한다.

In [20]:
def model_fn(features, labels, mode):
    TRAIN = mode == tf.estimator.ModeKeys.TRAIN
    EVAL = mode == tf.estimator.ModeKeys.EVAL
    PREDICT = mode == tf.estimator.ModeKeys.PREDICT
    
    embedding_layer = tf.keras.layers.Embedding(VOCAB_SIZE, WORD_EMBEDDING_DIM)(features['x'])
    
    embedding_layer = tf.keras.layers.Dropout(0.2)(embedding_layer)
    
    # 순환신경망 구현
    rnn_layers = [tf.nn.rnn_cell.LSTMCell(size) for size in [HIDDEN_STATE_DIM, HIDDEN_STATE_DIM]]  # -> keras.layers.LSTM(cell)으로 대체
    # 여러 LSTM 쌓기 (wrapping)
    multi_rnn_cell = tf.nn.rnn_cell.MultiRNNCell(rnn_layers)  # -> tf.keras.layers.StackedRNNCells

    # 네트워크와 임베딩 벡터를 연산
    outputs, state = tf.nn.dynamic_rnn(cell=multi_rnn_cell,
                                       inputs=embedding_layer,
                                       dtype=tf.float32)
    
    outputs = tf.keras.layers.Dropout(0.2)(outputs)
    hidden_layer = tf.keras.layers.Dense(DENSE_FEATURE_DIM, activation=tf.nn.tanh)(outputs[:,-1,:])
    hidden_layer = tf.keras.layers.Dropout(0.2)(hidden_layer)
    logits = tf.keras.layers.Dense(1)(hidden_layer)
    logits = tf.squeeze(logits, axis=-1)
    
    sigmoid_logits = tf.nn.sigmoid(logits)
    
    if PREDICT: # 예측구현
        predictions = {'sentiment': sigmoid_logits}  # 출력값 [0,1]
        
        return tf.estimator.EstimatorSpec(mode=mode,
                                          predictions=predictions)
    
    loss = tf.losses.sigmoid_cross_entropy(labels, logits)
    
    if EVAL:
        accuracy = tf.metrics.accuracy(labels, tf.round(sigmoid_logits))
        eval_metric_ops = {'acc': accuracy}

        return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=eval_metric_ops)
    
    if TRAIN:
        global_step = tf.train.get_global_step()
        train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss, global_step) # 아담 옵티마이저로 모델 파라미터 최적화

        return tf.estimator.EstimatorSpec(mode=mode,
                                          train_op=train_op,
                                          loss=loss)

In [21]:
if not os.path.exists(DATA_OUT_PATH):
    os.makedirs(DATA_OUT_PATH)

# estimator 객체 생성(rnn)
est = tf.estimator.Estimator(model_fn=model_fn,
                             model_dir=DATA_OUT_PATH + 'checkpoint/rnn') 

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './data_out/checkpoint/rnn', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x00000111DF0C2710>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [22]:
# DATA_IN_PATH = './data_in/'
# DATA_OUT_PATH = './data_out/'
INPUT_TRAIN_DATA = 'nsmc_train_input.npy'
# LABEL_TRAIN_DATA = 'nsmc_train_label.npy'
DATA_CONFIGS = 'data_configs.json'

input_data = np.load(open(DATA_IN_PATH + INPUT_TRAIN_DATA, 'rb'))
label_data = np.load(open(DATA_IN_PATH + LABEL_TRAIN_DATA, 'rb'))
prepro_configs = json.load(open(DATA_IN_PATH + DATA_CONFIGS, 'r'))

VOCAB_SIZE = prepro_configs['vocab_size']+1
WORD_EMBEDDING_DIM = 100
HIDDEN_STATE_DIM = 150
DENSE_FEATURE_DIM = 150

learning_rate = 0.001

input_train, input_eval, label_train, label_eval = train_test_split(input_data, label_data, test_size=TEST_SPLIT, random_state=RNG_SEED)

In [23]:
os.environ["CUDA_VISIBLE_DEVICES"]="4"

est.train(train_input_fn)

INFO:tensorflow:Calling model_fn.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./data_out/checkpoint/rnn\model.ckpt-4221
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 4221 into ./data_out/checkpoint/rnn\model.ckpt.
INFO:tensorflow:loss = 0.90629137, step = 4222
INFO:tensorflow:global_step/sec: 9.13409
INFO:tensorflow:loss = 0.6931577, step = 4322 (10.953 sec)
INFO:tensorflow:global_step/sec: 11.1253
INFO:tensorflow:loss = 0.6609422, step = 44

INFO:tensorflow:global_step/sec: 8.56418
INFO:tensorflow:loss = 0.34069985, step = 10922 (11.671 sec)
INFO:tensorflow:global_step/sec: 14.7539
INFO:tensorflow:loss = 0.7622855, step = 11022 (6.785 sec)
INFO:tensorflow:global_step/sec: 11.5125
INFO:tensorflow:loss = 0.1508047, step = 11122 (8.680 sec)
INFO:tensorflow:global_step/sec: 10.7234
INFO:tensorflow:loss = 0.5298122, step = 11222 (9.331 sec)
INFO:tensorflow:global_step/sec: 10.8844
INFO:tensorflow:loss = 0.1940091, step = 11322 (9.182 sec)
INFO:tensorflow:global_step/sec: 10.3889
INFO:tensorflow:loss = 0.38229898, step = 11422 (9.625 sec)
INFO:tensorflow:global_step/sec: 10.4466
INFO:tensorflow:loss = 0.57367134, step = 11522 (9.573 sec)
INFO:tensorflow:global_step/sec: 10.7051
INFO:tensorflow:loss = 0.4815543, step = 11622 (9.342 sec)
INFO:tensorflow:global_step/sec: 10.2962
INFO:tensorflow:loss = 0.16860868, step = 11722 (9.713 sec)
INFO:tensorflow:global_step/sec: 11.3642
INFO:tensorflow:loss = 0.42452198, step = 11822 (8.801

<tensorflow_estimator.python.estimator.estimator.Estimator at 0x111df04d7b8>

In [24]:
INPUT_TEST_DATA = 'nsmc_test_input.npy'
LABEL_TEST_DATA = 'nsmc_test_label.npy'

test_input_data = np.load(open(DATA_IN_PATH + INPUT_TEST_DATA, 'rb'))
test_label_data = np.load(open(DATA_IN_PATH + LABEL_TEST_DATA, 'rb'))

In [25]:
predict = est.evaluate(test_input_fn) 

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-04-16T11:35:13Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./data_out/checkpoint/rnn\model.ckpt-12659
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-04-16-11:35:37
INFO:tensorflow:Saving dict for global step 12659: acc = 0.8261, global_step = 12659, loss = 0.3841173
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 12659: ./data_out/checkpoint/rnn\model.ckpt-12659


In [None]:
# acc = 0.82322, global_step = 8438, loss = 0.39584005  # 불용어 제거 후 & 단층 CNN 모델이 더 낮은 정확도를 보임