# **LSTM을 이용한 셰익스피어 스타일의 글 생성**

In [3]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [4]:
%%javascript
IPython.OutputArea.auto_scroll_threshold = 20

<IPython.core.display.Javascript object>

In [5]:
# Fetch the Shakespeare corpus through Keras' get_file helper and keep the
# local file path it returns (the file is opened from this path afterwards).
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

In [6]:
# Read the downloaded corpus as raw bytes and decode it as UTF-8. The context
# manager closes the file handle as soon as the read finishes instead of
# leaving it open until garbage collection.
with open(path_to_file, 'rb') as f:
  text = f.read().decode(encoding='utf-8')

In [7]:
print(text[:500])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


In [8]:
print(repr(text[:200]))

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you'


In [9]:
# Total length of the corpus, in characters
len(text)

1115394

In [10]:
# The full dataset is large enough to crash the session, so keep only the
# first 500,000 characters.
text = text[:500000]

In [11]:
len(text)

500000

## 텍스트 전처리

In [12]:
# Fit a tokenizer on the corpus, passed in as a single document.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
# One more than the number of distinct words; presumably the extra slot is
# index 0, which the padding step further down fills with — confirm.
vocab_size = len(tokenizer.word_index) +1
print('단어 집합의 크기: %d' % vocab_size)

단어 집합의 크기: 8244


In [13]:
print(tokenizer.word_index)



In [14]:
# Accumulator for the training samples (lists of word indices)
sequences = []

In [15]:
# Build next-word training samples line by line: every prefix of a line's
# word indices (lengths 1..len) becomes one sample.
# NOTE(review): length-1 prefixes are included, so once the last index is
# split off as the label their input part is pure padding — confirm this is
# intended.
for line in text.split('\n'):
  encoded = tokenizer.texts_to_sequences([line])[0]
  sequences.extend(encoded[:end] for end in range(1, len(encoded) + 1))
print('학습에 사용할 샘플의 개수: %d' % len(sequences))

학습에 사용할 샘플의 개수: 91089


In [16]:
print(sequences)

[[64], [64, 142], [148], [148, 31], [148, 31, 878], [148, 31, 878, 181], [148, 31, 878, 181, 438], [148, 31, 878, 181, 438, 131], [148, 31, 878, 181, 438, 131, 17], [148, 31, 878, 181, 438, 131, 17, 113], [33], [113], [113, 113], [64], [64, 142], [7], [7, 38], [7, 38, 33], [7, 38, 33, 1379], [7, 38, 33, 1379, 329], [7, 38, 33, 1379, 329, 3], [7, 38, 33, 1379, 329, 3, 249], [7, 38, 33, 1379, 329, 3, 249, 61], [7, 38, 33, 1379, 329, 3, 249, 61, 3], [7, 38, 33, 1379, 329, 3, 249, 61, 3, 2878], [33], [1379], [1379, 1379], [64], [64, 142], [64], [64, 7], [64, 7, 105], [64, 7, 105, 606], [64, 7, 105, 606, 119], [64, 7, 105, 606, 119, 11], [64, 7, 105, 606, 119, 11, 2879], [64, 7, 105, 606, 119, 11, 2879, 380], [64, 7, 105, 606, 119, 11, 2879, 380, 3], [64, 7, 105, 606, 119, 11, 2879, 380, 3, 1], [64, 7, 105, 606, 119, 11, 2879, 380, 3, 1, 176], [33], [31], [31, 2219], [31, 2219, 31], [31, 2219, 31, 2219], [64], [64, 142], [73], [73, 67], [73, 67, 405], [73, 67, 405, 26], [73, 67, 405, 26, 2]

In [17]:
# Length of the longest sample among all samples
max_len = max(map(len, sequences))
print('샘플의 최대 길이: {}'.format(max_len))

샘플의 최대 길이: 16


In [18]:
# Pad every sample to max_len; samples shorter than max_len get zeros
# prepended (padding='pre').
sequences = pad_sequences(sequences, maxlen=max_len, padding='pre')
print(sequences)

[[  0   0   0 ...   0   0  64]
 [  0   0   0 ...   0  64 142]
 [  0   0   0 ...   0   0 148]
 ...
 [  0   0   0 ... 987   2   5]
 [  0   0   0 ...   2   5  27]
 [  0   0   0 ...   5  27 401]]


In [19]:
# padding 작업 확인
print(sequences[:5])

[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  64]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0  64 142]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 148]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0 148  31]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0 148  31 878]]


In [20]:
# Separate inputs from labels with NumPy slicing: x keeps every column except
# the last one, y keeps only the last column (the label).
sequences = np.array(sequences)
x, y = sequences[:, :-1], sequences[:, -1]
print('x: ', x, sep='\n')
print('y:', y)

x: 
[[  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0  64]
 [  0   0   0 ...   0   0   0]
 ...
 [  0   0   0 ...   6 987   2]
 [  0   0   0 ... 987   2   5]
 [  0   0   0 ...   2   5  27]]
y: [ 64 142 148 ...   5  27 401]


In [21]:
# One-hot encode the integer labels into vectors of length vocab_size.
# NOTE(review): this materializes a dense (num_samples, vocab_size) array;
# keeping integer labels and using sparse_categorical_crossentropy would
# likely avoid that memory cost — consider if the session runs out of memory.
y = to_categorical(y, num_classes=vocab_size)
print(y)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


## 모델 설계

In [22]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM

In [23]:
embedding_dim = 10
hidden_units = 128

# Embedding -> LSTM -> softmax over the vocabulary, declared as a layer list.
model = Sequential([
  Embedding(vocab_size, embedding_dim),
  LSTM(hidden_units),
  Dense(vocab_size, activation='softmax'),
])
# Categorical cross-entropy loss, optimized with Adam
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x,y, epochs=100, verbose=2)

Epoch 1/100
2847/2847 - 36s - loss: 6.8257 - accuracy: 0.0358 - 36s/epoch - 13ms/step
Epoch 2/100
2847/2847 - 17s - loss: 6.3729 - accuracy: 0.0525 - 17s/epoch - 6ms/step
Epoch 3/100
2847/2847 - 16s - loss: 6.1173 - accuracy: 0.0702 - 16s/epoch - 6ms/step
Epoch 4/100
2847/2847 - 16s - loss: 5.9009 - accuracy: 0.0810 - 16s/epoch - 6ms/step
Epoch 5/100
2847/2847 - 17s - loss: 5.7008 - accuracy: 0.0878 - 17s/epoch - 6ms/step
Epoch 6/100
2847/2847 - 16s - loss: 5.5112 - accuracy: 0.0936 - 16s/epoch - 6ms/step
Epoch 7/100
2847/2847 - 16s - loss: 5.3297 - accuracy: 0.0987 - 16s/epoch - 6ms/step
Epoch 8/100
2847/2847 - 16s - loss: 5.1622 - accuracy: 0.1058 - 16s/epoch - 6ms/step
Epoch 9/100
2847/2847 - 16s - loss: 5.0043 - accuracy: 0.1147 - 16s/epoch - 6ms/step
Epoch 10/100
2847/2847 - 16s - loss: 4.8568 - accuracy: 0.1252 - 16s/epoch - 6ms/step
Epoch 11/100
2847/2847 - 16s - loss: 4.7199 - accuracy: 0.1393 - 16s/epoch - 6ms/step
Epoch 12/100
2847/2847 - 16s - loss: 4.5924 - accuracy: 0.1522

<keras.callbacks.History at 0x7f8ce8a106d0>

In [26]:
# Text generation: repeatedly predict the next word and append it to the seed.
def sentence_generation(model, tokenizer, current_word, n):
  """Extend a seed text with up to ``n`` words predicted by ``model``.

  Args:
    model: Model whose ``predict`` returns one row of per-index scores for a
      batch of padded index sequences.
    tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and the
      ``index_word`` lookup table.
    current_word: Seed text (one or more words) that generation starts from.
    n: Maximum number of words to generate.

  Returns:
    The seed text followed by the generated words, joined by single spaces.
    Fewer than ``n`` words are produced if the model predicts an index that
    has no word attached to it.

  Note:
    Reads the module-level ``max_len`` to size the model input
    (``max_len - 1`` positions, matching the training inputs).
  """
  words = [current_word]

  for _ in range(n):
    # Re-encode everything produced so far; pad_sequences left-pads short
    # inputs to the width the model was trained on.
    encoded = tokenizer.texts_to_sequences([' '.join(words)])[0]
    encoded = pad_sequences([encoded], maxlen=max_len-1, padding='pre')
    result = model.predict(encoded, verbose=0)
    # The batch holds a single sample, so reduce the argmax to a plain int.
    predicted = int(np.argmax(result, axis=1)[0])

    # Direct index -> word lookup instead of scanning the whole vocabulary.
    word = tokenizer.index_word.get(predicted)
    if word is None:
      # No word carries this index (e.g. 0, the value used for padding).
      # Stop rather than emit an unrelated vocabulary entry.
      break
    words.append(word)

  return ' '.join(words)

## 모델의 결과 확인

In [27]:
print(sentence_generation(model, tokenizer, 'love', 10))

love with me and our we'll lies up ambitious son the


In [28]:
print(sentence_generation(model, tokenizer, 'human', 20))

human majesty make it on him i say he has as he is as he is left ' 'twas he directly


In [29]:
print(sentence_generation(model, tokenizer, 'shit', 10))

shit i do not know the king is dead they do
