In [5]:
import os
import tempfile
import tensorflow as tf

from tensorflow.python.keras.datasets import imdb #imdb dataset
from tensorflow.python.keras._impl.keras.preprocessing import sequence
from tensorflow.python.keras._impl.keras import utils
from tensorboard import summary as summary_lib

import numpy as np

# Raise TensorFlow's log verbosity to INFO and print the
# TensorFlow version this notebook was run with.
tf.logging.set_verbosity(tf.logging.INFO)
print(tf.__version__)

1.8.0


In [6]:
# References:
# https://github.com/eisenjulian/nlp_estimator_tutorial/blob/master/nlp_estimators.ipynb
# https://github.com/eisenjulian/nlp_estimator_tutorial/blob/master/LICENSE
# https://github.com/aymericdamien/TensorFlow-Examples

# Reserved token ids. `index_offset` is handed to the dataset loader as
# `index_from` and is added to every word rank when the inverted index is built.
pad_id, start_id, oov_id = 0, 1, 2
index_offset = 2

# Vocabulary cap, padded sentence length, and embedding width.
vocab_size, sentence_size, embedding_size = 5000, 200, 50

# Input-pipeline settings used by the training input function.
BATCH_SIZE = 128
NUM_EPOCHS = 100

# A fresh temporary directory is created each time this cell runs.
model_dir = tempfile.mkdtemp()

def _pad_reviews(reviews):
    """Pad or truncate every review at its end so each has `sentence_size` ids."""
    return sequence.pad_sequences(
        reviews,
        maxlen=sentence_size,
        truncating='post',
        padding='post',
        value=pad_id,
    )


def _clipped_lengths(reviews):
    """Return each review's token count, capped at `sentence_size`."""
    return np.array([min(len(review), sentence_size) for review in reviews])


# Variable-length id sequences plus labels, using the reserved ids and the
# vocabulary cap configured above.
(x_train_variable, y_train), (x_test_variable, y_test) = imdb.load_data(
    num_words=vocab_size,
    start_char=start_id,
    oov_char=oov_id,
    index_from=index_offset,
)

# These two lines report the shapes of the *unpadded* arrays.
print("x_train shape", x_train_variable.shape)
print("x_test shape", x_test_variable.shape)

print("Pad sequences (samples x time)")

x_train = _pad_reviews(x_train_variable)
x_test = _pad_reviews(x_test_variable)
print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)

# Number of non-padding tokens per (possibly truncated) review.
x_len_train = _clipped_lengths(x_train_variable)
x_len_test = _clipped_lengths(x_test_variable)

x_train shape (25000,)
x_test shape (25000,)
Pad sequences (samples x time)
x_train shape: (25000, 200)
x_test shape: (25000, 200)


In [7]:
# Build the id -> word lookup, shifting every rank by `index_offset` so it
# lines up with the ids produced when the dataset was loaded.
word_index = imdb.get_word_index()
word_inverted_index = dict(
    (rank + index_offset, word) for word, rank in word_index.items()
)

# The lowest ids are reserved for markers rather than real tokens.
word_inverted_index.update({
    pad_id: '<PAD>',
    start_id: '<START>',
    oov_id: '<OOV>',
})

# Preview the first ten entries of the lookup.
for i in range(10):
    print(i, word_inverted_index[i])
    
def index_to_text(indexes):
    """Decode an iterable of word ids into one space-separated string.

    Each id is looked up in the module-level `word_inverted_index`; an id
    missing from that mapping raises `KeyError`.
    """
    words = (word_inverted_index[idx] for idx in indexes)
    return ' '.join(words)

# Sanity check: decode the first (unpadded) training review back into words.
print(index_to_text(x_train_variable[0]))

0 <PAD>
1 <START>
2 <OOV>
3 the
4 and
5 a
6 of
7 to
8 is
9 br
<START> this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert <OOV> is an amazing actor and now the same being director <OOV> father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for <OOV> and would recommend it to everyone to watch and the fly <OOV> was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also <OOV> to the two little <OOV> that played the <OOV> of norman and paul they were just brilliant children are often left out of the <OOV> list i think because the stars that play them all grown up are such a big <OOV> for the whole film bu

In [10]:
def parser(input_sent, length, label):
    """Pack one example into a `(features, label)` pair.

    Args:
        input_sent: the token ids of the sentence.
        length: the sentence length that accompanies `input_sent`.
        label: the target value, passed through unchanged.

    Returns:
        A 2-tuple: a dict with keys "input_sent" and "len", and `label`.
    """
    return {"input_sent": input_sent, "len": length}, label

# Repeating before shuffling gives better performance, while shuffling before
# repeating gives stronger ordering guarantees.

def train_input_fn():
    """Build the training input pipeline and return its next-element tensors.

    The padded training sentences, their lengths and labels are sliced per
    example, shuffled (buffer of 10000) and repeated for NUM_EPOCHS, mapped
    through `parser` and batched by BATCH_SIZE, then prefetched to the CPU.

    Returns:
        The result of `get_next()` on a one-shot iterator over that dataset.
    """
    with tf.device('/cpu:0'):
        # Transformations, listed in the order they are applied.
        shuffle_repeat = tf.contrib.data.shuffle_and_repeat(10000, NUM_EPOCHS)
        parse_and_batch = tf.contrib.data.map_and_batch(map_func=parser, batch_size=BATCH_SIZE)
        # Original author's note: keeps one element waiting on the device ahead of time.
        stage_on_cpu = tf.contrib.data.prefetch_to_device("/cpu:0")
        # Alternative left by the author:
        # dataset = dataset.prefetch(1)  # if prefetching is slow, a buffer of 10 is also an option

        dataset = tf.data.Dataset.from_tensor_slices((x_train, x_len_train, y_train))
        for transformation in (shuffle_repeat, parse_and_batch, stage_on_cpu):
            dataset = dataset.apply(transformation)

        return dataset.make_one_shot_iterator().get_next()
    
def eval_input_fn():
    """Build the evaluation input pipeline and return its next-element tensors.

    The padded test sentences, their lengths and labels are sliced per
    example, mapped through `parser` and batched by BATCH_SIZE, then
    prefetched to the CPU. There is no shuffling or repetition.

    Returns:
        The result of `get_next()` on a one-shot iterator over that dataset.
    """
    with tf.device('/cpu:0'):
        test_slices = tf.data.Dataset.from_tensor_slices((x_test, x_len_test, y_test))
        batched = test_slices.apply(
            tf.contrib.data.map_and_batch(map_func=parser, batch_size=BATCH_SIZE))
        # Original author's note: keeps one element waiting on the device ahead of time.
        staged = batched.apply(tf.contrib.data.prefetch_to_device("/cpu:0"))
        # Alternative left by the author:
        # staged = batched.prefetch(1)  # if prefetching is slow, a buffer of 10 is also an option

        one_shot = staged.make_one_shot_iterator()
        return one_shot.get_next()

In [11]:
# Shared head that turns a single logit per example into the binary
# classification EstimatorSpec; used by `layer_test` below.
head = tf.contrib.estimator.binary_classification_head()

def layer_test(features, labels, mode, params):
    """Estimator model_fn: embedding -> 1-D convolution -> max over time -> dense -> logit.

    Args:
        features: dict of input tensors; only `features['input_sent']` (the
            token ids) is read here.
        labels: label tensor, or None; when present it is reshaped to
            `[-1, 1]` so it lines up with the single-unit logits.
        mode: a `tf.estimator.ModeKeys` value, forwarded to the head.
        params: dict of hyper-parameters; `params['embed_init']` is the
            initializer for the embedding matrix.

    Returns:
        Whatever `head.create_estimator_spec` returns for the given mode.
    """
    # NOTE(review): features['len'] is never used by this model, so padded
    # positions take part in the max pooling below.
    input_layer = tf.contrib.layers.embed_sequence(
        features['input_sent'],
        vocab_size,
        embedding_size,
        initializer=params['embed_init'])

    # Author's notes (translated):
    # - `filters` is how many different windows (filters) are held;
    #   `kernel_size` sets the size of the sliding window (3 => 3-grams).
    # - With filters=100 and kernel_size=4, 100 different filters of length 4
    #   are created, i.e. 100 convolutions.
    # - conv1d takes input shaped (batch, length, channels): batch = number of
    #   sentences, length = words per sentence, channels = embedding output
    #   dimension.
    conv_layer = tf.layers.conv1d(inputs=input_layer,
                                  filters=32,
                                  kernel_size=3,
                                  padding="same",
                                  activation=tf.nn.relu)

    # Max over axis 1 (the length axis per the note above): one value per filter.
    pool_layer = tf.reduce_max(input_tensor=conv_layer, axis=1)

    dense_layer = tf.layers.dense(inputs=pool_layer, units=128, activation=tf.nn.relu)

    # Single unit: one logit per example for the binary head.
    output_layer = tf.layers.dense(inputs=dense_layer, units=1)

    if labels is not None:
        labels = tf.reshape(labels, [-1, 1])

    def _train_op(loss):
        # The optimizer is only needed when the head asks for a train op,
        # so it is constructed here rather than in every mode.
        optimizer = tf.train.AdamOptimizer()
        return optimizer.minimize(
            loss=loss,
            global_step=tf.train.get_global_step())

    return head.create_estimator_spec(
            features=features,
            labels=labels,
            mode=mode,
            logits=output_layer,
            train_op_fn=_train_op)
    
#         global_step = tf.train.get_global_step(tf.nn.sparse_softmax_cross_entropy_with_logits(
#                         logits=output_layer, labels=labels))

In [12]:
# The embedding initializer draws uniformly between -1.0 and 1.0; the model
# function reads it from params['embed_init'].
params = dict(embed_init=tf.random_uniform_initializer(minval=-1.0, maxval=1.0))

# Estimator wrapping `layer_test`, with './data/cnn' as its model directory.
cnn_classifier = tf.estimator.Estimator(
    model_fn=layer_test,
    params=params,
    model_dir='./data/cnn',
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './data/cnn', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x11f116390>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [13]:
def train_and_evaluate(classifier, steps=100):
    """Train `classifier`, then evaluate it and return the evaluation result.

    Args:
        classifier: object with Estimator-style `train(input_fn=..., steps=...)`
            and `evaluate(input_fn=...)` methods.
        steps: number of training steps to run; defaults to 100.

    Returns:
        The value returned by `classifier.evaluate(...)`. Previously this
        result was computed and then discarded.
    """
    classifier.train(input_fn=train_input_fn, steps=steps)
    return classifier.evaluate(input_fn=eval_input_fn)
    
# #     # Reset the graph to be able to reuse name scopes
# #     tf.reset_default_graph() 
# #     # Add a PR summary in addition to the summaries that the classifier writes
# #     pr = summary_lib.pr_curve('precision_recall', predictions=predictions, labels=y_test.astype(bool), num_thresholds=21)
# #     with tf.Session() as sess:
# #         writer = tf.summary.FileWriter(os.path.join(classifier.model_dir, 'eval'), sess.graph)
# #         writer.add_summary(sess.run(pr), global_step=0)
# #         writer.close()

# Train the CNN estimator and then evaluate it on the test set.
train_and_evaluate(cnn_classifier)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into ./data/cnn/model.ckpt.
INFO:tensorflow:loss = 0.73147726, step = 1
INFO:tensorflow:Saving checkpoints for 100 into ./data/cnn/model.ckpt.
INFO:tensorflow:Loss for final step: 0.5987458.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-08-25-14:51:24
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./data/cnn/model.ckpt-100
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-08-25-14:51:27
INFO:tensorflow:Saving dict for global step 100: accuracy = 0.66772, accuracy_baseline = 0.5, auc = 0.7398737, auc_precision_recall = 0.7246644, average