Подключаем tensorflow и прочие необходимые библиотеки:

In [40]:
import tensorflow as tf

import numpy as np
import os
import time

Загружаем датасет, произведение Эдварда Моргана Форстера "Комната с видом":

In [41]:
# Fetch the Project Gutenberg plain-text edition of the novel; the returned
# value is used below as the local path the text is read from.
path_to_file = tf.keras.utils.get_file('a_room_with_a_view.txt', 'https://www.gutenberg.org/cache/epub/2641/pg2641.txt')

Проверяем, что все загрузилось.

In [42]:
# Read the raw bytes inside a context manager so the file handle is closed
# deterministically, then decode them as UTF-8.
# NOTE: the download starts with a UTF-8 byte-order mark; plain 'utf-8'
# decoding keeps it as the first character of `text` (visible in the preview).
with open(path_to_file, 'rb') as book_file:
    text = book_file.read().decode(encoding='utf-8')
# length of text is the number of characters in it
print(f'Length of text: {len(text)} characters')
# Preview the beginning of the book to confirm the download worked.
print(text[:250])

Length of text: 403396 characters
﻿The Project Gutenberg eBook of A Room With A View, by E. M. Forster

This eBook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give


In [43]:
# The vocabulary is the sorted list of distinct characters that occur in the text.
vocab = sorted(set(text))
print(f'{len(vocab)} unique characters')

95 unique characters


Перед обучением необходимо преобразовать строки в числовое представление.

Слой tf.keras.layers.StringLookup может преобразовывать каждый символ в числовой идентификатор. Просто сначала нужно разделить текст на токены.

In [44]:
# Two short strings of different lengths, used to demonstrate the tokenization.
example_texts = ['abcdefg', 'xyz']

# Split every string into its individual UTF-8 characters; the result is
# ragged because the strings have different lengths.
chars = tf.strings.unicode_split(example_texts, input_encoding='UTF-8')
chars

<tf.RaggedTensor [[b'a', b'b', b'c', b'd', b'e', b'f', b'g'], [b'x', b'y', b'z']]>

In [45]:
# Lookup layer that maps each character to an integer id.
# mask_token=None: no mask token is added to the vocabulary.
# NOTE: the layer's vocabulary also holds the '[UNK]' token, so it is one
# entry larger than `vocab`.
ids_from_chars = tf.keras.layers.StringLookup(
    vocabulary=list(vocab), mask_token=None)

In [46]:
ids = ids_from_chars(chars)
ids

<tf.RaggedTensor [[58, 59, 60, 61, 62, 63, 64], [81, 82, 83]]>

Инвертируем это представление и восстанавливаем из него удобочитаемые строки:

In [47]:
# Inverse lookup: invert=True maps integer ids back to characters. It reuses
# the vocabulary of `ids_from_chars` so both layers agree on the id mapping.
chars_from_ids = tf.keras.layers.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)

In [48]:
chars = chars_from_ids(ids)
chars

<tf.RaggedTensor [[b'a', b'b', b'c', b'd', b'e', b'f', b'g'], [b'x', b'y', b'z']]>

In [49]:
tf.strings.reduce_join(chars, axis=-1).numpy()

array([b'abcdefg', b'xyz'], dtype=object)

In [50]:
def text_from_ids(ids):
  """Look up the characters for `ids` and join them along the last axis."""
  looked_up_chars = chars_from_ids(ids)
  return tf.strings.reduce_join(looked_up_chars, axis=-1)

Преобразуем текстовый вектор в поток индексов символов:

In [51]:
# Encode the whole book: split it into characters and map each one to its id.
all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))
all_ids

<tf.Tensor: shape=(403396,), dtype=int64, numpy=array([95, 49, 65, ...,  1,  2,  1], dtype=int64)>

In [52]:
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)


In [53]:
for ids in ids_dataset.take(10):
    print(chars_from_ids(ids).numpy().decode('utf-8'))

﻿
T
h
e
 
P
r
o
j
e


In [54]:
# Number of input characters per training example.
seq_length = 100
# Each example consumes seq_length + 1 characters, because the target is the
# input shifted by one position.
examples_per_epoch = len(text)//(seq_length+1)

Метод batch позволяет легко преобразовать эти отдельные символы в последовательности нужного размера.

In [55]:
# Group the character-id stream into chunks of seq_length + 1 ids.
# drop_remainder=True discards a final chunk that is shorter than that.
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)

# Show the first chunk decoded back into characters.
for seq in sequences.take(1):
  print(chars_from_ids(seq))

tf.Tensor(
[b'\xef\xbb\xbf' b'T' b'h' b'e' b' ' b'P' b'r' b'o' b'j' b'e' b'c' b't'
 b' ' b'G' b'u' b't' b'e' b'n' b'b' b'e' b'r' b'g' b' ' b'e' b'B' b'o'
 b'o' b'k' b' ' b'o' b'f' b' ' b'A' b' ' b'R' b'o' b'o' b'm' b' ' b'W'
 b'i' b't' b'h' b' ' b'A' b' ' b'V' b'i' b'e' b'w' b',' b' ' b'b' b'y'
 b' ' b'E' b'.' b' ' b'M' b'.' b' ' b'F' b'o' b'r' b's' b't' b'e' b'r'
 b'\r' b'\n' b'\r' b'\n' b'T' b'h' b'i' b's' b' ' b'e' b'B' b'o' b'o' b'k'
 b' ' b'i' b's' b' ' b'f' b'o' b'r' b' ' b't' b'h' b'e' b' ' b'u' b's'
 b'e' b' ' b'o' b'f' b' '], shape=(101,), dtype=string)


Для обучения понадобится набор данных из пар (input, label), где input и label являются последовательностями. На каждом временном шаге входом является текущий символ, а меткой — следующий символ.

Вот функция, которая принимает последовательность в качестве входных данных, дублирует и сдвигает ее, чтобы выровнять ввод и метку для каждого временного шага:

In [56]:
def split_input_target(sequence):
    """Return `(input, target)`, where the target is the input shifted by one.

    The input is `sequence` without its last element and the target is
    `sequence` without its first element, so at every position the target
    holds the element that follows the corresponding input element.
    """
    return sequence[:-1], sequence[1:]

In [57]:
dataset = sequences.map(split_input_target)

In [58]:
for input_example, target_example in dataset.take(1):
    print("Input :", text_from_ids(input_example).numpy())
    print("Target:", text_from_ids(target_example).numpy())

Input : b'\xef\xbb\xbfThe Project Gutenberg eBook of A Room With A View, by E. M. Forster\r\n\r\nThis eBook is for the use of'
Target: b'The Project Gutenberg eBook of A Room With A View, by E. M. Forster\r\n\r\nThis eBook is for the use of '


Перетасовываем данные и распределяем их по пакетам:

In [59]:
# Batch size
BATCH_SIZE = 64

# Size of the shuffle buffer: elements are drawn at random from a buffer of
# this many sequences rather than from the whole dataset at once.
BUFFER_SIZE = 10000

dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    # drop_remainder=True keeps every batch at exactly BATCH_SIZE examples.
    .batch(BATCH_SIZE, drop_remainder=True)
    # `tf.data.AUTOTUNE` is the core-API spelling of the legacy
    # `tf.data.experimental.AUTOTUNE` alias.
    .prefetch(tf.data.AUTOTUNE))

dataset

<_PrefetchDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int64, name=None), TensorSpec(shape=(64, 100), dtype=tf.int64, name=None))>

In [60]:
# Length of the vocabulary in chars
# NOTE(review): the model below is instantiated with
# len(ids_from_chars.get_vocabulary()) rather than with this value.
vocab_size = len(vocab)

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

In [61]:
class MyModel(tf.keras.Model):
  """Character-level language model: embedding -> GRU -> dense logits."""

  def __init__(self, vocab_size, embedding_dim, rnn_units):
    """Create the layers.

    Args:
      vocab_size: number of distinct token ids; used both as the embedding
        input size and as the width of the output layer.
      embedding_dim: size of each embedding vector.
      rnn_units: number of units in the GRU layer.
    """
    # `super()` already binds `self`; the original `super().__init__(self)`
    # passed the instance a second time as a stray positional argument.
    super().__init__()
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    # return_sequences: emit an output for every time step.
    # return_state: also return the final state so generation can carry it on.
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    # No activation: the layer outputs raw logits over the vocabulary.
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    """Run the model on a batch of token ids.

    Args:
      inputs: batch of token-id sequences.
      states: GRU state to start from; when None the GRU's initial state is
        used.
      return_state: when True, also return the final GRU state.
      training: forwarded to every layer.

    Returns:
      The logits for every time step, or `(logits, states)` when
      `return_state` is True.
    """
    x = inputs
    x = self.embedding(x, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    return x

In [62]:
model = MyModel(
    # Be sure the vocabulary size matches the `StringLookup` layers.
    vocab_size=len(ids_from_chars.get_vocabulary()),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

In [63]:
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 96) # (batch_size, sequence_length, vocab_size)


In [64]:
# Sample one token id per time step from the logits of the first example in the batch.
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
# Drop the trailing num_samples axis and convert to a NumPy array.
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()

In [65]:
sampled_indices

array([35, 80, 27, 27, 12, 82, 80, 13, 47, 50, 74, 21, 91, 19, 35,  7, 65,
       69, 17, 87, 37, 61, 78, 45, 24, 87, 32, 64,  4, 16, 45, 70, 94, 82,
       59, 49, 67, 61, 53, 19, 65, 60, 81, 81, 13, 72,  1,  7, 94,  6, 49,
       51, 80,  3, 12, 31, 18, 13, 14, 29, 20, 15, 49, 61, 16, 34, 22, 36,
       58, 55, 16, 32, 31, 13, 25, 54, 58, 95, 27, 17, 83, 46, 18, 35, 23,
       57, 46, 54, 62, 19, 92, 20, 64, 50, 14, 68, 77, 18, 46, 83],
      dtype=int64)

In [66]:
# The model's last layer has no activation and returns logits, hence from_logits=True.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
# Loss of the still-untrained model on the example batch.
example_batch_mean_loss = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss:        ", example_batch_mean_loss)

Prediction shape:  (64, 100, 96)  # (batch_size, sequence_length, vocab_size)
Mean loss:         tf.Tensor(4.5642695, shape=(), dtype=float32)


In [67]:
# Exponential of the mean loss; for the untrained model it comes out close to
# the model's vocabulary size (96).
tf.exp(example_batch_mean_loss).numpy()

95.99245

In [68]:
model.compile(optimizer='adam', loss=loss)


In [69]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files; "{epoch}" is a placeholder left in the path
# for the callback (presumably filled with the epoch number - see the
# ModelCheckpoint documentation).
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# save_weights_only=True: store only the model weights, not the full model.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

Обучаем модель в течение 30 эпох.

In [70]:
# Number of passes over the training dataset.
EPOCHS = 30
# Train the model; the checkpoint callback is invoked by fit during training.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


Следующий класс делает одношаговый прогноз:

In [71]:
class OneStep(tf.keras.Model):
  """Wraps a trained model so that each call samples one next character."""

  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    """Store the model and lookup layers and build the "[UNK]" mask.

    Args:
      model: model called with `inputs`, `states` and `return_state=True`,
        returning `(logits, states)`.
      chars_from_ids: lookup layer mapping token ids to characters.
      ids_from_chars: lookup layer mapping characters to token ids.
      temperature: divisor applied to the logits before sampling.
    """
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Additive mask over the vocabulary: -inf at the "[UNK]" id, so that
    # "[UNK]" can never be sampled, and the default fill value elsewhere.
    unk_ids = self.ids_from_chars(['[UNK]'])[:, None]
    vocabulary_size = len(ids_from_chars.get_vocabulary())
    unk_mask = tf.SparseTensor(
        indices=unk_ids,
        values=[-float('inf')] * len(unk_ids),
        dense_shape=[vocabulary_size])
    self.prediction_mask = tf.sparse.to_dense(unk_mask)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    """Sample the next character for every string in `inputs`.

    Args:
      inputs: batch of strings to continue.
      states: model state from the previous step, or None to start fresh.

    Returns:
      `(predicted_chars, states)`: the sampled characters and the model
      state to pass to the next call.
    """
    # Strings -> characters -> token ids, as a dense tensor.
    token_ids = self.ids_from_chars(
        tf.strings.unicode_split(inputs, 'UTF-8')).to_tensor()

    # logits.shape is [batch, char, next_char_logits]
    logits, states = self.model(inputs=token_ids, states=states,
                                return_state=True)

    # Keep only the last time step, scale it by the temperature, then add
    # the mask that rules out "[UNK]".
    last_logits = logits[:, -1, :] / self.temperature + self.prediction_mask

    # Draw one token id per batch entry and drop the num_samples axis.
    sampled_ids = tf.squeeze(
        tf.random.categorical(last_logits, num_samples=1), axis=-1)

    # Map the ids back to characters and hand the state back to the caller.
    return self.chars_from_ids(sampled_ids), states

In [72]:
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)


In [None]:
Запускаем в цикле, чтобы сгенерировать текст.

In [73]:
start = time.time()
states = None
# Seed text for the generator. The original cell read `next_char` without
# ever defining it, which raises NameError on a fresh kernel.
# NOTE(review): the seed string is a placeholder - replace it with the prompt
# you want the model to continue.
next_char = tf.constant(['Lucy'])
# The seed itself is not added to `result`; only generated characters are.
result = []

# Generate 1000 characters, feeding each sampled character and the returned
# state back into the next step.
for _ in range(1000):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

# Concatenate the per-step characters into one string per batch entry.
result = tf.strings.join(result)
end = time.time()
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print('\nRun time:', end - start)

ren taken idea our partition
approached and beacens life life. To know it’s all;
have worried about it. She thought it must be near to everyone, and yet Lucy’s ngart! I have just
say he would not tell her.”

Then they start for this remark in Mr. Getens to wind he wnow what she has writing and draw out it was not
dislike a man for such an hour andied, and at the Beehive! Would you any you again, and has in ty
destrict, I am not to talk. I wanted to live and hinder than I
shall go alo.”

“That place, everyone already,” was her reply? Hurchwy danged her spoke of
his kepthbods, seements alone saint that his profeed society, and their distribute or room window. At taste nor
word. But whose exertance with that one cannot seazed Mrs. Butterworth shipped as the
cart-cool, than to tell he was only her breakfast knew with any one she gakes, we thought the
roads are safe.” But this was a student of vanop; of “dear of a moneth
he touched the strapet at the stairs.

“The elartines