In [1]:
!pip install -qq keras_nlp

In [2]:
!wget -qq --no-check-certificate "https://drive.google.com/uc?export=download&id=115avJgRM8P-rQf9ER2MnSEhxIvTlHJhS" -O book.txt

In [3]:
import warnings
warnings.filterwarnings("ignore")

import re, string

import tensorflow as tf
import keras_nlp

In [4]:
BATCH_SIZE = 16
SEQ_LEN = 16
MIN_TRAINING_SEQ_LEN = 3

EMBED_DIM = 64
FEED_FORWARD_DIM = 256
NUM_HEADS = 4
NUM_LAYERS = 2
VOCAB_SIZE = 10000

EPOCHS = 80

NUM_TOKENS_TO_GENERATE = 30

In [5]:
with open("book.txt", "r", encoding="utf-8") as inbook:
    book = inbook.read()
    book = re.sub(' +',' ', book)
    book = " ".join([w.lower() for w in book.split()])
    book = book.replace("\n", "")

In [6]:
with open("lower_book.txt", "w", encoding="utf-8") as inbook:
    lbook = book.split()
    for i in range(0, len(lbook), SEQ_LEN):
        inbook.write(" ".join(lbook[i:i+SEQ_LEN]+["\n"]))

In [22]:
train_ds = (
    tf.data.TextLineDataset("lower_book.txt")
    .filter(lambda x: tf.strings.length(x) > MIN_TRAINING_SEQ_LEN)
    .batch(1)
  #  .shuffle(buffer_size=10000)
)

In [23]:
vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(
    train_ds,
    vocabulary_size=VOCAB_SIZE,
    lowercase=True,
    reserved_tokens=["[PAD]", "[UNK]", "[BOS]"],
)

In [24]:
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=vocab,
    sequence_length=SEQ_LEN,
    lowercase=True,
)

In [25]:
start_packer = keras_nlp.layers.StartEndPacker(
    sequence_length=SEQ_LEN,
    start_value=tokenizer.token_to_id("[BOS]"),
)

In [26]:
train_ds = (
    tf.data.TextLineDataset("lower_book.txt")
    .filter(lambda x: tf.strings.length(x) > MIN_TRAINING_SEQ_LEN)
    .batch(BATCH_SIZE)
    .shuffle(buffer_size=10000)
)

In [27]:
def preprocess(inputs):
    outputs = tokenizer(inputs)
    features = start_packer(outputs)
    labels = outputs
    return features, labels

train_ds = train_ds.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE).prefetch(
    tf.data.AUTOTUNE
)

In [28]:
inputs = tf.keras.layers.Input(shape=(None,), dtype=tf.int32)

embedding_layer = keras_nlp.layers.TokenAndPositionEmbedding( 
    vocabulary_size=VOCAB_SIZE,
    sequence_length=SEQ_LEN,
    embedding_dim=EMBED_DIM,
    mask_zero=True,
)
x = embedding_layer(inputs)

for _ in range(NUM_LAYERS):
    decoder_layer = keras_nlp.layers.TransformerDecoder(
        num_heads=NUM_HEADS,
        intermediate_dim=FEED_FORWARD_DIM,
    )
    x = decoder_layer(x)  

outputs = tf.keras.layers.Dense(VOCAB_SIZE)(x)
model = tf.keras.Model(inputs=inputs, outputs=outputs)

loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
perplexity = keras_nlp.metrics.Perplexity(from_logits=True, mask_token_id=0)  

model.compile(optimizer="adam", loss=loss_fn, metrics=[perplexity])

In [29]:
model.fit(train_ds, epochs=EPOCHS)

Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80
Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80
Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80


<keras.callbacks.History at 0x7f4c42b17ca0>

In [30]:
prompt_tokens = tf.convert_to_tensor([tokenizer.token_to_id("[BOS]")])

def token_logits_fn(inputs):
    cur_len = inputs.shape[1]
    output = model(inputs)
    return output[:, cur_len - 1, :] 

In [31]:
output_tokens = keras_nlp.utils.top_p_search(
    token_logits_fn,
    prompt_tokens,
    max_length=15,
    p=0.7,
    from_logits=True,
)
txt = tokenizer.detokenize(output_tokens)
txt = re.sub("(?:[^|]*\|)?([^\]|]*)\]", '', txt.numpy().decode("utf-8"))
txt = re.sub(' +', ' ', txt).strip()
print(f"Top-P search generated text: \n{txt}\n")

Top-P search generated text: 
на язык , от коих я теперь отвык



In [39]:
output_tokens = keras_nlp.utils.top_p_search(
    token_logits_fn,
    prompt_tokens,
    max_length=15,
    p=0.7,
    from_logits=True,
)
txt = tokenizer.detokenize(output_tokens)
txt = re.sub("(?:[^|]*\|)?([^\]|]*)\]", '', txt.numpy().decode("utf-8"))
txt = re.sub(' +', ' ', txt).strip()
print(f"Top-P search generated text: \n{txt}\n")

Top-P search generated text: 
разлуку , татьяна ропщет на ручей



### Casual Vocabulary

In [7]:
train_ds = (
    tf.data.TextLineDataset("lower_book.txt")
    .filter(lambda x: tf.strings.length(x) > MIN_TRAINING_SEQ_LEN)
    .batch(1)
)

In [8]:
max_features = 10000
sequence_length = 25

vectorize_layer = tf.keras.layers.TextVectorization(
    standardize="lower",
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)

def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    text = vectorize_layer(text)
    return text, label

vectorize_layer.adapt(train_ds)

In [9]:
BATCH_SIZE = 32

train_ds = (
    tf.data.TextLineDataset("lower_book.txt")
    .batch(BATCH_SIZE)
    .shuffle(buffer_size=10000)
)

def preprocess(inputs):
    outputs = vectorize_layer(inputs)
    input_text = outputs[:, :-1]
    target_text = outputs[:, 1:]
    return input_text, target_text

train_ds = train_ds.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE).prefetch(
    tf.data.AUTOTUNE
)

In [10]:
inputs = tf.keras.layers.Input(shape=(None,), dtype=tf.int32)

embedding_layer = keras_nlp.layers.TokenAndPositionEmbedding( 
    vocabulary_size=max_features,
    sequence_length=sequence_length,
    embedding_dim=EMBED_DIM,
    mask_zero=True,
)
x = embedding_layer(inputs)

for _ in range(NUM_LAYERS):
    decoder_layer = keras_nlp.layers.TransformerDecoder(
        num_heads=NUM_HEADS,
        intermediate_dim=FEED_FORWARD_DIM,
    )
    x = decoder_layer(x)  

outputs = tf.keras.layers.Dense(max_features)(x)
model = tf.keras.Model(inputs=inputs, outputs=outputs)

loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
perplexity = keras_nlp.metrics.Perplexity(from_logits=True, mask_token_id=0)  

model.compile(optimizer="adam", loss=loss_fn, metrics=[perplexity])
callbacks = [
             tf.keras.callbacks.ReduceLROnPlateau(patience=3, monitor="loss"),
             tf.keras.callbacks.EarlyStopping(monitor='loss', patience=10, restore_best_weights=True),
            ]

In [11]:
model.fit(train_ds, epochs=EPOCHS, callbacks=callbacks)

Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80
Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80
Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80


<keras.callbacks.History at 0x7ffa2c0a1040>

In [22]:
prompt_tokens = tf.expand_dims(vectorize_layer("Иду ")[:1], axis=0)

def token_logits_fn(inputs):
    output = model(inputs)
    return output[:, -1, :] 

In [26]:
import numpy as np

vocabulary = vectorize_layer.get_vocabulary()
vocab_arr = np.asarray(vocabulary) 

In [27]:
output_tokens = keras_nlp.utils.top_p_search(
    token_logits_fn,
    prompt_tokens,
    max_length=15,
    p=0.7,
    from_logits=True,
)

" ".join(vocab_arr[output_tokens])

'[UNK] смотрит он и отвечает: [UNK] x татьяна, по совету няни сбираясь ночью ворожить, тихонько'

In [35]:
output_tokens = keras_nlp.utils.top_p_search(
    token_logits_fn,
    prompt_tokens,
    max_length=20,
    p=0.8,
    from_logits=True,
)

" ".join(vocab_arr[output_tokens])

'[UNK] вихорь шумный; чета мелькает за четой. к минуте мщенья приближаясь, онегин, втайне усмехаясь, подходит к   с [UNK]'

In [41]:
output_tokens = keras_nlp.utils.top_p_search(
    token_logits_fn,
    prompt_tokens,
    max_length=24,
    p=0.4,
    from_logits=True,
)

" ".join(vocab_arr[output_tokens])

'[UNK] мужички-то все [UNK] гребут лопатой серебро; кому поем, тому добро и слава!" но сулит утраты  как ты, [UNK] вот что  и'

### GRU with char-level

In [4]:
import numpy as np

In [5]:
with open("book.txt", "r", encoding="utf-8") as inbook:
    text = inbook.read()

text = text + text
vocab = sorted(set(text))

len(vocab)

131

In [6]:
char2idx = {u:i for i, u in enumerate(vocab)}
idx2char = np.array(vocab)

text_as_int = np.array([char2idx[c] for c in text])

In [7]:
def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text

In [8]:
seq_length = 100
BATCH_SIZE = 64
BUFFER_SIZE = 10000

examples_per_epoch = len(text)//(seq_length+1)

char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)\
                              .batch(seq_length+1, drop_remainder=True)\
                              .map(split_input_target)\
                              .shuffle(BUFFER_SIZE)\
                              .batch(BATCH_SIZE, drop_remainder=True)\
                              .prefetch(tf.data.AUTOTUNE)

In [9]:
vocab_size = len(vocab)
embedding_dim = 128
hidden = 1024

In [10]:
def build_model(vocab_size, embedding_dim, hidden):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim),                    
        tf.keras.layers.GRU(hidden*2,
                            return_sequences=True,
                            stateful=False,
                            recurrent_initializer='glorot_uniform'),
        tf.keras.layers.GRU(hidden,
                            return_sequences=True,
                            stateful=False,
                            recurrent_initializer='glorot_uniform'),
         tf.keras.layers.GRU(hidden,
                             return_sequences=True,
                             stateful=False,
                             recurrent_initializer='glorot_uniform'),                                  
        tf.keras.layers.Dense(vocab_size)
    ])
    return model

In [11]:
model = build_model(vocab_size=len(vocab), embedding_dim=embedding_dim, hidden=hidden)

loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
callbacks = [
             tf.keras.callbacks.ReduceLROnPlateau(patience=2, monitor="loss"),
             tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5, restore_best_weights=True),
            ]

model.compile(optimizer="adam", loss=loss, metrics=["accuracy"])

In [12]:
history = model.fit(char_dataset, epochs=50, callbacks=callbacks)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [14]:
def generate_text(model, start_string, temperature=0.5, num_generate=500):
    # Evaluation step (generating text using the learned model)

    # Number of characters to generate
    # num_generate = 500

    # Converting our start string to numbers (vectorizing)
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    # Empty string to store our results
    text_generated = []

    # Low temperature results in more predictable text.
    # Higher temperature results in more surprising text.
    # Experiment to find the best setting.
    # temperature = 0.5

    # Here batch size == 1
    model.reset_states()
    for i in range(num_generate):
        predictions = model(input_eval)
        predictions = tf.squeeze(predictions, 0)
        # using a categorical distribution to predict the character returned by the model
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # Pass the predicted character as the next input to the model
        # along with the previous hidden state
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return (start_string + ''.join(text_generated))

In [18]:
print(generate_text(model, start_string=u"И вот идет уже ", temperature=0.001))

И вот идет уже сто                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 
