## 4.1.6 순환 신경망(Recurrent Neural Network) 분류 모델

단, 입력 문장을 순차적으로 입력만 하고 마지막으로 입력한 시점에 출력 정보를 뽑아 영화 평점을 예측하고자 한다.

: activation function으로 tanh을 쓴다. 왜 hidden state에서 활성함수를 쓸까? -> 각 hidden state마다 output을 내기 때문 아닐까?

- 시퀀스 data -> one2many, many2one, many2many -> gradient vanishing 문제 -> LSTM

### LSTM

: 정보 손실을 막기 위해 메모리층(cell state)을 갖고 있음 -> backpropagation을 진행해도 gradient vanishing 문제가 완화됨

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split # -> 0.25 : 0.75로 자동 분할

import os
import json

## 학습 데이터 파일 로드

1. 딥러닝 input : 인덱스 (word2vec으로 만들 수 있음)
2. embedding layer 만들기: index to 문장 vector(벡터화된 단어를 붙여 문장 vector를 만듦)
    - by lookup_table(신경망): 학습시 자동생성 or word2vec에서 로드할 수 있음(더 빠름)
3. 딥러닝 모델

In [2]:
# Directories: one for the inputs this notebook reads, one for what it writes.
DATA_IN_PATH, DATA_OUT_PATH = './data_in/', './data_out/'

# File names resolved against DATA_IN_PATH.
# Author's note: the inputs are .npy rather than .csv because the reviews are
# kept as raw word-index sequences (e.g. 1, 7, ...) and are not vectorized
# with tf-idf or a one-hot style encoding (e.g. [0,1,0,0,0,0,0,0,0]).
DATA_CONFIGS = 'data_configs.json'
TRAIN_LABEL_DATA = 'train_label.npy'
TRAIN_INPUT_DATA = 'train_input.npy'

In [3]:
# Load the training inputs and labels. Passing the path (instead of an
# already-open file object) lets NumPy open and close the file itself, so no
# file handle is left dangling.
input_data = np.load(DATA_IN_PATH + TRAIN_INPUT_DATA)
label_data = np.load(DATA_IN_PATH + TRAIN_LABEL_DATA)

# Load the configuration dict; later cells read its 'vocab' and 'vocab_size'
# entries. JSON text is UTF-8 by specification, so do not rely on the
# platform default encoding.
with open(DATA_IN_PATH + DATA_CONFIGS, 'r', encoding='utf-8') as f:
    prepro_configs = json.load(f)

In [4]:
RANDOM_SEED = 13371447  # fixed seed so the split is reproducible
TEST_SPLIT = 0.1        # fraction of the samples held out for evaluation

# Split inputs and labels together so each row keeps its own label.
train_input, test_input, train_label, test_label = train_test_split(
    input_data,
    label_data,
    random_state=RANDOM_SEED,
    test_size=TEST_SPLIT,
)

## 데이터 입력함수

In [5]:
NUM_EPOCHS = 3   # repeat count applied to the training dataset
BATCH_SIZE = 16  # training batch size; evaluation batches use twice this

def mapping_fn(X, Y):
    """Pair the features, wrapped in a dict under the key 'x', with the label.

    Returns:
        The tuple ``({'x': X}, Y)``; ``Y`` is passed through unchanged.
    """
    return {'x': X}, Y

def train_input_fn():
    """Build the training input pipeline and return its next-element tensors.

    Slices (train_input, train_label) into examples, shuffles them with a
    50000-element buffer, groups them into batches of BATCH_SIZE, repeats the
    result NUM_EPOCHS times, and finally wraps each batch with mapping_fn so
    the features arrive as ``{'x': ...}``.
    """
    pipeline = (
        tf.data.Dataset.from_tensor_slices((train_input, train_label))
        .shuffle(buffer_size=50000)
        .batch(BATCH_SIZE)
        .repeat(count=NUM_EPOCHS)
        .map(mapping_fn)
    )
    return pipeline.make_one_shot_iterator().get_next()

def eval_input_fn():
    """Build the evaluation input pipeline and return its next-element tensors.

    Slices (test_input, test_label) into examples, wraps each one with
    mapping_fn, and batches them in groups of ``BATCH_SIZE * 2``. There is no
    shuffling and no repetition, so the data is traversed once, in order.
    """
    pipeline = (
        tf.data.Dataset.from_tensor_slices((test_input, test_label))
        .map(mapping_fn)
        .batch(BATCH_SIZE * 2)
    )
    return pipeline.make_one_shot_iterator().get_next()

## RNN 모델 정의

In [6]:
# Optimizer step size handed to AdamOptimizer in model_fn.
learning_rate = 0.001

# Layer widths used by model_fn.
WORD_EMBEDDING_DIM = 100  # columns of the embedding table
HIDDEN_STATE_DIM = 150    # units in each stacked LSTM cell
DENSE_FEATURE_DIM = 150   # units in the dense layer after the LSTM stack

# Rows of the embedding table: one more than the configured vocabulary size.
# NOTE(review): the extra row presumably covers a padding index - confirm
# against the preprocessing code.
VOCAB_SIZE = 1 + prepro_configs['vocab_size']

In [7]:
# Sanity check: show the number of entries in the 'vocab' mapping next to VOCAB_SIZE.
print(len(prepro_configs['vocab']), VOCAB_SIZE)

74065 74066


In [8]:
def model_fn(features, labels, mode):
    """Estimator model function: a two-layer LSTM binary sentiment classifier.

    Pipeline: embedding -> dropout -> 2 stacked LSTM cells -> dropout ->
    dense(tanh) on the last time step -> dropout -> dense(1) logit.

    Args:
        features: dict whose 'x' entry holds the word-index sequences
            (assumed to be shaped [batch, time] - TODO confirm against the
            preprocessing step).
        labels: binary targets; unused in PREDICT mode.
        mode: one of the ``tf.estimator.ModeKeys`` values.

    Returns:
        A ``tf.estimator.EstimatorSpec`` populated for the given mode:
        predictions for PREDICT, loss plus the 'acc' metric for EVAL, and
        loss plus train_op for TRAIN.
    """
    TRAIN = mode == tf.estimator.ModeKeys.TRAIN
    EVAL = mode == tf.estimator.ModeKeys.EVAL
    PREDICT = mode == tf.estimator.ModeKeys.PREDICT

    # Lookup table with VOCAB_SIZE rows (one per word index) and
    # WORD_EMBEDDING_DIM columns (the embedding vector of that word).
    embedding_layer = tf.keras.layers.Embedding(VOCAB_SIZE, WORD_EMBEDDING_DIM)(features['x'])

    # Keras Dropout layers must be told explicitly whether the graph is being
    # built for training. Without `training=` they fall back to the Keras
    # learning phase, which nothing sets inside an Estimator, so dropout would
    # silently be a no-op during training as well.
    embedding_layer = tf.keras.layers.Dropout(0.2)(embedding_layer, training=TRAIN)

    # Recurrent part: two LSTM cells of HIDDEN_STATE_DIM units each ...
    rnn_layers = [tf.nn.rnn_cell.LSTMCell(size) for size in [HIDDEN_STATE_DIM, HIDDEN_STATE_DIM]]
    # ... stacked into a single multi-layer cell.
    multi_rnn_cell = tf.nn.rnn_cell.MultiRNNCell(rnn_layers)

    # Unroll the stacked cell over the embedded sequence. `outputs` holds the
    # top layer's output at every time step ([batch, time, HIDDEN_STATE_DIM]);
    # `state` is the final cell state and is not used below.
    outputs, state = tf.nn.dynamic_rnn(cell=multi_rnn_cell,
                                       inputs=embedding_layer,
                                       dtype=tf.float32)

    outputs = tf.keras.layers.Dropout(0.2)(outputs, training=TRAIN)

    # Many-to-one: ignore the intermediate time steps and classify from the
    # output of the last one only.
    # NOTE(review): no sequence_length is passed to dynamic_rnn, so if the
    # inputs are right-padded the last step is computed over padding - confirm
    # how the sequences are padded.
    hidden_layer = tf.keras.layers.Dense(DENSE_FEATURE_DIM, activation=tf.nn.tanh)(outputs[:, -1, :])
    hidden_layer = tf.keras.layers.Dropout(0.2)(hidden_layer, training=TRAIN)
    logits = tf.keras.layers.Dense(1)(hidden_layer)
    logits = tf.squeeze(logits, axis=-1)  # [batch, 1] -> [batch]

    sigmoid_logits = tf.nn.sigmoid(logits)

    if PREDICT:
        # Probability-like score in [0, 1] for each input row.
        predictions = {'sentiment': sigmoid_logits}

        return tf.estimator.EstimatorSpec(mode=mode,
                                          predictions=predictions)

    loss = tf.losses.sigmoid_cross_entropy(labels, logits)

    if EVAL:
        # Round the sigmoid output to 0/1 before comparing with the labels.
        accuracy = tf.metrics.accuracy(labels, tf.round(sigmoid_logits))
        eval_metric_ops = {'acc': accuracy}

        return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=eval_metric_ops)

    if TRAIN:
        global_step = tf.train.get_global_step()
        # Optimize the model parameters with Adam.
        train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss, global_step)

        return tf.estimator.EstimatorSpec(mode=mode,
                                          train_op=train_op,
                                          loss=loss)

In [9]:
# Make sure the output directory exists. exist_ok=True replaces the separate
# exists() check, which could race with another process creating the directory.
os.makedirs(DATA_OUT_PATH, exist_ok=True)

# Create the Estimator: it builds its graph with model_fn and uses
# model_dir as its checkpoint directory.
est = tf.estimator.Estimator(model_fn=model_fn,
                             model_dir=DATA_OUT_PATH + 'checkpoint/rnn')

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './data_out/checkpoint/rnn', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001B7A20EECC0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [1]:
# Set CUDA_VISIBLE_DEVICES to "4" for this process.
# Author's note: the installed TensorFlow is not the GPU build.
os.environ["CUDA_VISIBLE_DEVICES"]="4"

# Train the estimator on the pipeline produced by train_input_fn.
est.train(train_input_fn)
# Deprecation warning logged during the recorded run:
# calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
# Instructions for updating:
# Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.

In [2]:
# Evaluate on the held-out split. Result logged by the recorded run:
# INFO:tensorflow:Saving dict for global step 4221: acc = 0.8484, global_step = 4221, loss = 0.35004097
est.evaluate(eval_input_fn)

## 캐글 평가 데이터셋 만들기

In [12]:
DATA_OUT_PATH = './data_out/'
TEST_INPUT_DATA = 'test_input.npy'
TEST_ID_DATA = 'test_id.npy'

# Pass the path itself so NumPy opens and closes the file; wrapping it in a
# bare open() left the file handle unclosed.
test_input_data = np.load(DATA_IN_PATH + TEST_INPUT_DATA)

In [13]:
# Input function feeding the test array under the feature key "x" (the key model_fn reads);
# shuffle=False keeps the predictions in the same order as the input rows.
predict_input_fn = tf.estimator.inputs.numpy_input_fn(x={"x":test_input_data}, shuffle=False)

In [3]:
# Collect the 'sentiment' value of every prediction yielded by the estimator into one array.
predictions = np.array([p['sentiment'] for p in est.predict(input_fn=predict_input_fn)])

In [15]:
# Load the test ids by path so NumPy manages (and closes) the file handle itself.
# NOTE(review): if the ids were saved as an object array, recent NumPy versions
# also need allow_pickle=True here - confirm against the preprocessing step.
test_id = np.load(DATA_IN_PATH + TEST_ID_DATA)

In [16]:
# Make sure the output directory exists; exist_ok=True avoids the racy
# check-then-create sequence.
os.makedirs(DATA_OUT_PATH, exist_ok=True)

# Pair every test id with its predicted sentiment score and write the result
# as CSV. quoting=3 is the value of csv.QUOTE_NONE (no field is quoted);
# index=False leaves the DataFrame index out of the file.
output = pd.DataFrame(data={"id": list(test_id), "sentiment": list(predictions)})
output.to_csv(DATA_OUT_PATH + 'movie_review_result_rnn.csv', index=False, quoting=3)