# 생성을 위한 딥러닝

## LSTM으로 텍스트 생성

In [1]:
import keras
import numpy as np

In [2]:
# Fetch the Nietzsche corpus; the first argument is only the file name passed
# to get_file, the text itself comes from `origin`.
path = keras.utils.get_file('nietxsche.txt',
                            origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt')
# Read through a context manager so the file handle is closed right away, and
# decode explicitly as UTF-8 instead of relying on the platform default.
# The whole corpus is lower-cased before any further processing.
with open(path, encoding='utf-8') as corpus_file:
    text = corpus_file.read().lower()

In [3]:
# Total number of characters in the lower-cased corpus.
len(text)

600893

In [4]:
# Peek at the first character of the corpus.
text[0]

'p'

In [5]:
# Sliding-window extraction: every `step` characters, take a window of
# `maxlen` characters as one input sequence and the single character that
# immediately follows it as the prediction target.
maxlen = 60
step = 3

window_starts = range(0, len(text) - maxlen, step)
sentences = [text[start:start + maxlen] for start in window_starts]
next_chars = [text[start + maxlen] for start in window_starts]

print("시퀀스 개수:", len(sentences))

시퀀스 개수: 200278


In [6]:
# Sorted vocabulary of every distinct character that occurs in the corpus.
chars = sorted(set(text))
print("고유한 글자:", len(chars))
# Map each character to its position in `chars`. enumerate() builds the table
# in a single pass instead of searching the list once per character.
char_indices = {char: index for index, char in enumerate(chars)}

고유한 글자: 57


In [7]:
# One-hot encode the data.
#   x[i, t, c] is True when character index c sits at position t of sentence i.
#   y[i, c]    is True when character index c is the one following sentence i.
# The builtin `bool` is used as the dtype: the `np.bool` alias was deprecated
# in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)

for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = True
    y[i, char_indices[next_chars[i]]] = True

In [8]:
# Check the one-hot array shapes: (sequences, maxlen, vocabulary) for the
# inputs and (sequences, vocabulary) for the targets.
x.shape, y.shape

((200278, 60, 57), (200278, 57))

In [9]:
# Inspect the one-hot matrix (maxlen rows, one per character) of the first sequence.
x[0]

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [10]:
# Inspect the one-hot target vector (next character) of the first sequence.
y[0]

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False,  True,
       False, False, False, False, False, False, False, False, False,
       False, False, False])

In [11]:
from keras import layers, Input

# Single-layer LSTM character model: a one-hot window of shape
# (maxlen, len(chars)) goes in, a softmax distribution over the next
# character comes out.
input_tensor = Input(shape=(maxlen, len(chars)))
# The LSTM output gets its own name: binding it to `x` would overwrite the
# one-hot training array `x` that model.fit(x, y, ...) consumes later.
# No input_shape argument is passed because the Input tensor above already
# fixes the input shape.
lstm_output = layers.LSTM(128)(input_tensor)
output = layers.Dense(len(chars), activation='softmax')(lstm_output)

model = keras.models.Model(input_tensor, output)

model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 60, 57)]          0         
_________________________________________________________________
lstm (LSTM)                  (None, 128)               95232     
_________________________________________________________________
dense (Dense)                (None, 57)                7353      
Total params: 102,585
Trainable params: 102,585
Non-trainable params: 0
_________________________________________________________________


In [12]:
# Configure training: RMSprop with a learning rate of 0.01, optimising
# categorical cross-entropy between the softmax output and the one-hot targets.
optimizer = keras.optimizers.RMSprop(learning_rate=0.01)
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
)

In [13]:
def sample(preds, temperature=1.0):
    """Draw one class index from ``preds`` after temperature re-weighting.

    Each probability is raised to the power ``1 / temperature`` (done in log
    space) and the result is renormalised; one index is then drawn from that
    distribution with ``np.random.multinomial``.

    Args:
        preds: 1-D array-like of non-negative class probabilities.
        temperature: Positive scaling factor. Values below 1 sharpen the
            distribution towards its largest entry, values above 1 flatten it.

    Returns:
        The sampled class index (as returned by ``np.argmax``).

    Raises:
        ValueError: If ``temperature`` is not positive, or if ``preds`` has
            no usable positive entry (all zeros, negative values or NaN).
    """
    # `not > 0` also rejects NaN, which a plain `<= 0` test would let through.
    if not temperature > 0:
        raise ValueError(
            "temperature must be positive, got {!r}".format(temperature))

    preds = np.asarray(preds).astype('float64')
    # log(0) yields -inf, which maps back to probability 0 after exp(), so
    # the floating-point warnings are silenced; invalid inputs are rejected
    # explicitly just below.
    with np.errstate(divide='ignore', invalid='ignore'):
        scaled_logs = np.log(preds) / temperature

    peak = np.max(scaled_logs)
    if not np.isfinite(peak):
        raise ValueError(
            "preds must be non-negative with at least one positive entry")

    # Shift by the maximum so the largest term is exp(0) == 1. Without the
    # shift, small temperatures make every exp() underflow to 0 and the
    # normalisation becomes 0 / 0.
    exp_preds = np.exp(scaled_logs - peak)
    probabilities = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, probabilities, 1)
    return np.argmax(probas)

In [None]:
import random, sys

# Seed Python's RNG so the randomly chosen offset is the same on every run.
random.seed(42)
# Choose where the seed text starts; the upper bound keeps a full
# maxlen-character window inside `text`.
start_index = random.randint(0, len(text) - maxlen - 1)

for epoch in range(1, 60):
    print("에포크", epoch)
    model.fit(x, y, batch_size=128, epochs=1, verbose=2)
    
    seed_text = text[start_index: start_index + maxlen]
    print("--- seed text: '{}'".format(seed_text))
    