In [2]:
# 1. Setup

import numpy as np

import tensorflow_datasets as tfds
import tensorflow as tf

In [3]:
# 그래프 표현을 위한 matplotlib import & 함수 plot_graphs(history, metric) 생성

import matplotlib.pyplot as plt

def plot_graphs(history, metric):
  """Plot a training metric per epoch and, when recorded, its validation twin.

  Draws on the current matplotlib axes (no figure is created here), so the
  caller controls layout with plt.figure / plt.subplot.

  Args:
    history: Object exposing a `history` dict that maps metric names to
      per-epoch value sequences (e.g. the value returned by `model.fit`).
    metric: Name of the training metric to plot, such as 'accuracy' or 'loss'.
      The validation series is looked up under 'val_' + metric.

  Raises:
    KeyError: If `metric` itself is missing from `history.history`.
  """
  val_metric = 'val_' + metric
  plt.plot(history.history[metric])
  legend_labels = [metric]
  # Validation values are only recorded when fit() received validation data;
  # skip the second curve (and its legend entry) instead of raising KeyError.
  if val_metric in history.history:
    plt.plot(history.history[val_metric])
    legend_labels.append(val_metric)
  plt.xlabel("Epochs")
  plt.ylabel(metric)
  plt.legend(legend_labels)

In [4]:
# 2. Setup input pipeline

# The IMDB large movie review dataset is a binary classification dataset: every review carries either a 'positive' or a 'negative' sentiment label.
# Download the dataset with TFDS (TensorFlow Datasets).

dataset, info = tfds.load('imdb_reviews', with_info = True, as_supervised=True)
train_dataset, test_dataset = dataset['train'], dataset['test']

[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]





0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteY5RIS8/imdb_reviews-train.tfrecord


  0%|          | 0/25000 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteY5RIS8/imdb_reviews-test.tfrecord


  0%|          | 0/25000 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteY5RIS8/imdb_reviews-unsupervised.tfrecord


  0%|          | 0/50000 [00:00<?, ? examples/s]



[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [5]:
# The loaded splits are datasets of (text, label) pairs; print the first pair to inspect it

for example, label in train_dataset.take(1):
  print('text: ', example.numpy())
  print('label: ', label.numpy())

text:  b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."
label:  0


In [6]:
# Constants for preparing the training data: shuffle-buffer size and batch size

BUFFER_SIZE = 10000
BATCH_SIZE = 64

In [7]:
# Build batches of the (text, label) pairs above.
# Only the training split is shuffled; both splits are batched and prefetched.

train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [8]:
# Print part of the training dataset: the first three texts and labels of one batch

for example, label in train_dataset.take(1):
  print('texts: ', example.numpy()[:3])
  print()
  print('labels: ', label.numpy()[:3])

texts:  [b'The Sea Is Watching has been made from an original Akira Kurosawa script, and it is indeed a lush and warm film. Watching it will be a pleasure !<br /><br />Kei Kumai as director is certainly no equal to the old but everlasting master (particularly the mass scenes in the beginning of the film has some terrible acting), but the overall mood and scenery is very enjoyable. Another thing that is missed here: Kurosawa always managed to let the characters be so much more then what they are actually showing and doing.<br /><br />Probably that was his magic on set while shooting; and just maybe this script was not fully up to par yet.<br /><br />Maybe we just miss the eye of the master.<br /><br />This is one lovely and sweet film, but it is no Kurosawa. To expect that might well be very silly...'
 b'When evaluating documentaries that focus a relatively small group of Ugly ultra right wing and conservative groups like this in the USA you must consider the following. The United State

In [9]:
# 3. Create the text tokenizer

# The raw text loaded by tfds must be processed before it can be fed to the model.
# The simplest way to process text for training is the 'TextVectorization' layer.
# The layer has many capabilities, but this tutorial only uses the most basic configuration.

# Create the TextVectorization layer and pass the dataset's text to the layer's .adapt() method
# .adapt() => sets the layer's vocabulary

VOCAB_SIZE = 1000
tokenizer = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE)
# The map drops the labels so that only the review text is used to build the vocabulary
tokenizer.adapt(train_dataset.map(lambda text, label: text))

In [10]:
# First 20 tokens of the layer's vocabulary -- after the padding and unknown tokens, they are sorted by frequency.

vocab = np.array(tokenizer.get_vocabulary())
vocab[:20]

array(['', '[UNK]', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it', 'i',
       'this', 'that', 'br', 'was', 'as', 'for', 'with', 'movie', 'but'],
      dtype='<U14')

In [11]:
# Once the vocabulary is set, the layer can tokenize text into indices
# Indices are zero-padded to the longest sentence in the batch (only when output_sequence_length is not set)

# NOTE: `example` is the batch left over from the loop in the preceding preview cell
tokenized_example = tokenizer(example)[:3].numpy()
tokenized_example

array([[  2,   1,   7, ...,   0,   0,   0],
       [ 51,   1,   1, ...,   0,   0,   0],
       [842,   1,   7, ...,   0,   0,   0]])

In [12]:
# With the default settings the process is not completely reversible
# Main reasons (two are listed here):
  # 1) The default value of preprocessing.TextVectorization's standardize argument is "lower_and_strip_punctuation"
  # 2) The limited vocabulary size and the lack of a character-based fallback produce some unknown tokens

for n in range(3):
  print("Original: ", example[n].numpy())
  # Map each token index back to its vocabulary entry and join the words with spaces
  print("Round-trip: ", " ".join(vocab[tokenized_example[n]]))
  print()

Original:  b'The Sea Is Watching has been made from an original Akira Kurosawa script, and it is indeed a lush and warm film. Watching it will be a pleasure !<br /><br />Kei Kumai as director is certainly no equal to the old but everlasting master (particularly the mass scenes in the beginning of the film has some terrible acting), but the overall mood and scenery is very enjoyable. Another thing that is missed here: Kurosawa always managed to let the characters be so much more then what they are actually showing and doing.<br /><br />Probably that was his magic on set while shooting; and just maybe this script was not fully up to par yet.<br /><br />Maybe we just miss the eye of the master.<br /><br />This is one lovely and sweet film, but it is no Kurosawa. To expect that might well be very silly...'
Round-trip:  the [UNK] is watching has been made from an original [UNK] [UNK] script and it is indeed a [UNK] and [UNK] film watching it will be a [UNK] br br [UNK] [UNK] as director is 

In [13]:
# 4. Create the model

# tf.keras.Sequential : processes the sequential data through a stack of layers
# (1) encoder (+ word tokenizer) : text -> token indices
# (2) embedding : word index -> vector; the vectors are trained so that similarity is reflected in them (efficient)
# (3) the vectors are fed to the RNN (LSTM), which processes the sequence timestep by timestep, carrying the previous timestep's output into the current one
# (4) the sequence comes out compressed into a single context vector
# (5) the two Dense layers do the final processing and convert it into a single logit as the classification output

model = tf.keras.Sequential([
    tokenizer,
    tf.keras.layers.Embedding(
        input_dim=len(tokenizer.get_vocabulary()),
        output_dim=64,
        # index 0 is treated as padding and masked
        mask_zero=True),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])

In [14]:
# Masking set up by the embedding layer
  # used to handle varying sentence lengths
# Print, for each layer of the model, whether it supports masking

print([layer.supports_masking for layer in model.layers])

[False, True, True, True, True]


In [15]:
# Evaluate the same sentence twice to confirm masking works as expected
  # the two evaluation results should be identical

# 1) "without" padding
sample_text = ('The movie was cool. The animation and the graphics '
               'were out of this world. I would recommend this movie.')
predictions = model.predict(np.array([sample_text]))
print(predictions[0])

[-0.01043344]


In [16]:
# 2) "with" padding
# Batching sample_text together with a much longer text forces sample_text to be zero-padded

padding = "the " * 2000
predictions = model.predict(np.array([sample_text, padding]))
print(predictions[0])

[-0.01043343]


In [17]:
# Compile the model
# from_logits=True because the final Dense(1) layer has no activation and outputs a raw logit

model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

In [18]:
# 5. Train the model
# Validation runs on test_dataset, limited to 30 batches per epoch by validation_steps

history = model.fit(train_dataset, epochs=10,
                    validation_data=test_dataset,
                    validation_steps=30)

Epoch 1/10
Epoch 2/10
 16/391 [>.............................] - ETA: 6:10 - loss: 0.5459 - accuracy: 0.6914

KeyboardInterrupt: ignored

In [None]:
# Print the test loss & test accuracy

test_loss, test_acc = model.evaluate(test_dataset)

print('Test Loss:', test_loss)
print('Test Accuracy:', test_acc)

In [None]:
# Plot the accuracy and loss curves side by side
# Requires `history` from the training cell

plt.figure(figsize=(16, 8))
plt.subplot(1, 2, 1)
plot_graphs(history, 'accuracy')
plt.ylim(None, 1)
plt.subplot(1, 2, 2)
plot_graphs(history, 'loss')
plt.ylim(0, None)

In [None]:
# Run prediction (= inference)
  # The model outputs a logit: >= 0.0 => 'positive' / < 0.0 => 'negative'
  # The model can be fine-tuned to fit the downstream task (the task you actually want to perform)

sample_text = ('The movie was cool. The animation and the graphics '
               'were out of this world. I would recommend this movie.')
predictions = model.predict(np.array([sample_text]))
print(predictions)

# **<코드 실행 결과 보고>**

**Bidirectional LSTM과 Unidirectional LSTM 모두에서 accuracy는 유의미하게 급증하고, 그에 반해 loss는 감소했기 때문에 둘 다 성공적인 training이 이뤄졌다고 할 수 있다.**
**한편, bidirectional LSTM으로 prediction을 수행했을 때 그 결과값이 0.78051335로, 0.635655인 unidirectional LSTM보다 더 높게 나온 것으로 보아 본 튜토리얼에서는 bidirectional LSTM이 더 효과적인 language model임을 알 수 있다.**