In [None]:
import tensorflow as tf
import tensorflow.keras as keras
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Enable on-demand ("growth") memory allocation for every GPU TensorFlow reports.
gpus = tf.config.experimental.list_physical_devices(device_type="GPU")
for gpu in gpus:
    tf.config.experimental.set_memory_growth(device=gpu, enable=True)

In [None]:
file_path = '../datasets/shakespeare/shakespeare.txt'
# Read the whole corpus inside a context manager so the file handle is closed
# as soon as the read finishes instead of lingering for the kernel's lifetime.
with open(file_path, 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# Quick sanity check: corpus size in characters and its first 200 characters.
print('text length:', len(text))
print(text[:200])

In [None]:
# Build the vocabulary: every distinct character of the corpus, in sorted order.
# A character's position in this list is its id.
vocab = list(set(text))
vocab.sort()
print(len(vocab), vocab, sep='\n')

In [None]:
# Reverse lookup for `vocab`: character -> its index in the vocabulary list.
char_to_id = dict(zip(vocab, range(len(vocab))))
print(char_to_id)

In [None]:
# Encode the whole corpus as an array of character ids.
ids_of_text = np.array(list(map(char_to_id.__getitem__, text)))
# Show the first 20 characters next to their ids.
print(text[:20], ids_of_text[:20], sep='\n')

In [None]:
def split_input_target(text_ids):
    """Split a sequence into an (input, target) pair shifted by one position.

    The input is everything except the last element and the target is
    everything except the first, e.g. [1 2 3 4 5] -> ([1 2 3 4], [2 3 4 5]).
    """
    inputs = text_ids[:-1]
    targets = text_ids[1:]
    return inputs, targets
seq_length = 100
# Number of complete seq_length-sized chunks available in the corpus.
seq_num = len(ids_of_text) // seq_length
# Drop the trailing remainder so the ids divide evenly into chunks.
usable_length = seq_num * seq_length
ids_of_text = ids_of_text[:usable_length]
print(ids_of_text.shape)

In [None]:
# Arrange the flat id array as rows of seq_length ids each.
ids_of_text = np.reshape(ids_of_text, (-1, seq_length))
print(ids_of_text.shape, ids_of_text[0], sep='\n')

In [None]:
# Turn every row into its (input, target) pair and stack the pairs into one array.
seq_ids = np.array([split_input_target(row) for row in ids_of_text])
print(seq_ids.shape, seq_ids[0], sep='\n')

In [None]:
# Each entry of seq_ids is an (input, target) pair; separate the two halves.
train_data = np.array([inputs for inputs, _ in seq_ids])
train_label = np.array([targets for _, targets in seq_ids])

In [None]:
# Create rnn model
vocab_size = len(vocab)
embedding_dim = 128
rnn_units = 1024
batch_size = 64

# Pipeline: character ids -> embeddings -> GRU (one output per time step)
# -> softmax over the vocabulary at every step.
# NOTE(review): the GRU is stateful and the batch size is pinned through
# batch_input_shape — confirm that keeping state between training batches is intended.
model = keras.models.Sequential()
model.add(keras.layers.Embedding(vocab_size, embedding_dim,
                                 batch_input_shape = [batch_size, None]))
model.add(keras.layers.GRU(units = rnn_units, stateful = True, return_sequences = True))
model.add(keras.layers.Dense(vocab_size, activation = 'softmax'))
model.summary()

In [None]:
# Training configuration: Adam optimizer, sparse categorical cross-entropy
# (targets are integer ids), with accuracy reported as a metric.
compile_options = {
    'optimizer': 'adam',
    'loss': 'sparse_categorical_crossentropy',
    'metrics': ['accuracy'],
}
model.compile(**compile_options)

In [None]:
# Keep only as many samples as fill whole batches; the model's batch size is
# fixed by batch_input_shape, so a partial final batch is dropped.
step = len(train_data) // batch_size
full_batch_samples = step * batch_size
train_data, train_label = train_data[:full_batch_samples], train_label[:full_batch_samples]

In [None]:
import os
callbacks_dir = './callbacks'
# exist_ok=True creates the directory when missing and is a no-op otherwise,
# avoiding the check-then-create race of testing os.path.exists first.
os.makedirs(callbacks_dir, exist_ok=True)
best_model_file_path = os.path.join(callbacks_dir, 'best_text_generate_model.h5')

# Resume from a previously saved checkpoint when it can be loaded; otherwise
# start from the freshly initialised weights. `Exception` is caught instead of
# using a bare `except:` so KeyboardInterrupt / SystemExit still propagate, and
# the reason for the failure is reported rather than hidden.
try:
    model.load_weights(best_model_file_path)
    print('Load weights suc! Continue to fit model.')
except Exception as load_error:
    print('Load weights failed! Start to fit new model.')
    print('Reason:', repr(load_error))

callbacks = [
    # Stop training when the monitored training loss stops improving
    # (min_delta=1e-3, patience=10).
    keras.callbacks.EarlyStopping(min_delta=1e-3, patience=10, monitor='loss'),
    # Write the checkpoint file only when the monitored training loss improves.
    keras.callbacks.ModelCheckpoint(best_model_file_path, save_best_only=True, monitor='loss')
]

In [None]:
# Train for up to 20 epochs with the callbacks defined above; the returned
# History object is kept in `history`.
history = model.fit(
    x = train_data,
    y = train_label,
    batch_size = batch_size,
    epochs = 20,
    callbacks = callbacks,
)

In [None]:
# `Sequential.predict_classes` is deprecated and missing from newer TensorFlow
# releases. Taking the argmax over the last (vocabulary) axis of the predicted
# probabilities gives the same class ids.
predictions = np.argmax(model.predict(train_data, batch_size=batch_size), axis=-1)

In [None]:
# Compare the expected ids of the first sequence with the predicted ids.
print(train_label[0], predictions[0], sep='\n')

In [None]:
# Same layer stack and hyper-parameters as `model`, but with the batch
# dimension fixed to one sequence.
single_batch_size = 1
single_batch_model = keras.models.Sequential()
single_batch_model.add(keras.layers.Embedding(vocab_size, embedding_dim,
                                              batch_input_shape = [single_batch_size, None]))
single_batch_model.add(keras.layers.GRU(units = rnn_units, stateful = True, return_sequences = True))
single_batch_model.add(keras.layers.Dense(vocab_size, activation = 'softmax'))
single_batch_model.summary()

In [None]:
# Load the checkpointed weights into the batch-size-1 model.
single_batch_model.load_weights(best_model_file_path)
# Build with the (batch, time) shape that matches the
# [single_batch_size, None] batch_input_shape declared on the Embedding layer:
# one sequence per batch, of variable length.
single_batch_model.build(input_shape=(single_batch_size, None))

In [None]:
# `Sequential.predict_classes` is deprecated and missing from newer TensorFlow
# releases. The argmax over the last (vocabulary) axis of the predicted
# probabilities gives the same class ids for the first training sequence.
pred = np.argmax(single_batch_model.predict(train_data[0:1], batch_size=1), axis=-1)

In [28]:
# Predicted ids for the first sequence, followed by the expected ids.
print(pred, train_label[0], sep='\n')

[30, 44, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 13, 43, 1, 53, 56, 43, 1, 58, 43, 1, 46, 56, 53, 60, 43, 43, 42, 10, 39, 52, 42, 1, 47, 59, 56, 58, 46, 43, 56, 8, 1, 58, 43, 39, 56, 1, 51, 43, 1, 57, 54, 43, 39, 49, 8, 0, 0, 15, 50, 50, 10, 0, 31, 54, 43, 39, 49, 1, 1, 57, 54, 43, 39, 49, 1, 0, 0, 24, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 13, 53, 59]
[47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43  1
 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43 39
 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49  6
  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0
 37 53 59]


In [29]:
# Decode ids back to characters: expected targets first, then the prediction.
# `pred` was produced from a one-sequence batch (train_data[0:1]) and may carry
# a leading batch axis, so it is flattened to scalar ids before indexing `vocab`.
print(''.join(vocab[char_id] for char_id in train_label[0]))
print('----------')
print(''.join(vocab[char_id] for char_id in np.ravel(pred)))

irst Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You
----------
Rfst Citizen:
Ae ore te hroveed:and iurther. tear me speak.

Cll:
Speak  speak 

Lirst Citizen:
Aou


In [32]:
# Text generation: produce one character per step and sample from the predicted
# distribution (always taking the most likely character gives a fixed result
# that easily falls into a loop).
def generate_text(model, start_string, num_generate = 500, temperature = 1.0):
    """Generate text by repeatedly sampling the model's next-character distribution.

    Args:
        model: Callable model exposing `reset_states()`. It is called on a
            (1, sequence_length) batch of character ids and its output for the
            last position is used as the probabilities `p` for
            `np.random.choice`.
        start_string: Non-empty seed text; each character is looked up in
            `char_to_id`.
        num_generate: Number of characters to append to `start_string`.
        temperature: Positive sampling temperature. 1.0 (the default) samples
            from the model's probabilities unchanged; other values re-weight
            them as p ** (1 / temperature), renormalised, so smaller values
            favour the most likely characters more strongly.

    Returns:
        `start_string` followed by the `num_generate` sampled characters.

    Raises:
        ValueError: If `start_string` is empty or `temperature` is not positive.
        KeyError: If `start_string` contains a character missing from `char_to_id`.
    """
    if not start_string:
        raise ValueError('start_string must contain at least one character')
    if temperature <= 0:
        raise ValueError('temperature must be positive')

    # Encode the seed and add a leading batch axis of size one.
    input_data = [char_to_id[char] for char in start_string]
    input_data = tf.expand_dims(input_data, 0)

    text_generated = []
    model.reset_states()

    for _ in range(num_generate):
        # Forward pass, then drop the batch axis and keep the last position,
        # which holds the distribution for the next character.
        predictions = model(input_data)
        predictions = tf.squeeze(predictions, 0)
        prediction = predictions[-1].numpy()

        if temperature != 1.0:
            # Re-weight in log space for numerical stability; log(0) = -inf is
            # fine here because exp(-inf) contributes a weight of exactly 0.
            with np.errstate(divide = 'ignore'):
                log_probs = np.log(prediction.astype(np.float64)) / temperature
            weights = np.exp(log_probs - log_probs.max())
            prediction = weights / weights.sum()

        # Sample the id of the next character.
        predicted_id = np.random.choice(len(prediction), p = prediction)
        text_generated.append(vocab[predicted_id])

        # Only the sampled character is fed back on the next step; the model
        # is not reset inside the loop.
        input_data = tf.expand_dims([predicted_id], 0)

    return start_string + ''.join(text_generated)

# Generate text continuing from the seed "All:" (default length) and display it.
new_text = generate_text(single_batch_model, start_string="All:")
print(new_text)

All:

GLOUCESTER:
And so was ever to be spice,
One rancour ood and that I should bound to sea:
I am that prize burntly, fair benefit
As have him sours instantly, good friar, be not sweet.

JOHN OF GAUNT:
Cousin, farewell, e got, not I.

Second Murderer:
And now I stay, they shall merry man? thy cabinit
By Aulivily, and himself; and tell him where
God you look you from af any ora,
Come homey with him, Aufidius, with his schole,
As you have made thee ach-own proceedings in the track'd
Whereof these fi
