In [None]:
!pip install -q keras-nlp

[K     |████████████████████████████████| 151 kB 5.3 MB/s 
[K     |████████████████████████████████| 5.8 MB 37.9 MB/s 
[K     |████████████████████████████████| 588.3 MB 17 kB/s 
[K     |████████████████████████████████| 5.9 MB 59.1 MB/s 
[K     |████████████████████████████████| 578.1 MB 7.2 kB/s 
[K     |████████████████████████████████| 578.1 MB 5.7 kB/s 
[K     |████████████████████████████████| 4.6 MB 55.2 MB/s 
[?25h

In [None]:
import os
import keras_nlp
import tensorflow as tf
from tensorflow import keras

# TensorFlow warns that mixed_float16 runs slowly on machines without a GPU,
# so only opt into mixed precision when a GPU is actually visible and fall
# back to plain float32 otherwise.
policy_name = "mixed_float16" if tf.config.list_physical_devices("GPU") else "float32"
policy = keras.mixed_precision.Policy(policy_name)
keras.mixed_precision.set_global_policy(policy)

The dtype policy mixed_float16 may run slowly because this machine does not have a GPU. Only Nvidia GPUs with compute capability of at least 7.0 run quickly with mixed_float16.


In [None]:
from tensorflow.python.ops.math_ops import truediv
# Remote locations of the three assets used by this notebook.
WIKITEXT_URL = "https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip"
SST2_URL = "https://dl.fbaipublicfiles.com/glue/data/SST-2.zip"
BERT_VOCAB_URL = "https://storage.googleapis.com/tensorflow/keras-nlp/examples/bert/bert_vocab_uncased.txt"

# WikiText-103 (raw): download and unpack the archive.
# NOTE(review): the directory below is hard-coded and presumably matches the
# default Keras cache location -- confirm if the cache dir has been customised.
keras.utils.get_file(origin=WIKITEXT_URL, extract=True)
wiki_dir = os.path.expanduser("~/.keras/datasets/wikitext-103-raw/")

# SST-2: download and unpack the archive (same path assumption as above).
keras.utils.get_file(origin=SST2_URL, extract=True)
sst_dir = os.path.expanduser("~/.keras/datasets/SST-2/")

# Uncased BERT vocabulary; the path returned by get_file is kept for the tokenizer.
vocab_file = keras.utils.get_file(origin=BERT_VOCAB_URL)

Downloading data from https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip
Downloading data from https://dl.fbaipublicfiles.com/glue/data/SST-2.zip
Downloading data from https://storage.googleapis.com/tensorflow/keras-nlp/examples/bert/bert_vocab_uncased.txt


In [None]:
# ---------------------------------------------------------------------------
# Data pipeline settings.
# ---------------------------------------------------------------------------
PRETRAINING_BATCH_SIZE = 128  # batch size applied to the WikiText line datasets
FINETUNING_BATCH_SIZE = 32    # batch size applied to the SST-2 datasets
SEQ_LENGTH = 128              # sequence length handed to the tokenizer and model inputs
MASK_RATE = 0.25              # selection rate handed to the mask generator
PREDICTIONS_PER_SEQ = 32      # mask slots per sequence (equals SEQ_LENGTH * MASK_RATE)

# ---------------------------------------------------------------------------
# Encoder architecture.
# ---------------------------------------------------------------------------
NUM_LAYERS = 3          # number of stacked TransformerEncoder layers
MODEL_DIM = 256         # embedding dimension
INTERMEDIATE_DIM = 512  # intermediate_dim of each encoder layer
NUM_HEADS = 4           # attention heads per encoder layer
DROPOUT = 0.1           # dropout rate used after the embedding and in each encoder layer
NORM_EPSILON = 1e-5     # epsilon for every layer normalization

# ---------------------------------------------------------------------------
# Optimisation schedule.
# ---------------------------------------------------------------------------
PRETRAINING_LEARNING_RATE = 5e-4  # Adam learning rate for the masked-LM stage
PRETRAINING_EPOCHS = 8
FINETUNING_LEARNING_RATE = 5e-5   # Adam learning rate for the SST-2 stage
FINETUNING_EPOCHS = 3

In [None]:
def _load_sst_split(filename):
  """Read one SST-2 TSV file as batches of (string, int32) column pairs.

  The header row is skipped and fields are tab-delimited; batches hold
  FINETUNING_BATCH_SIZE rows.
  """
  rows = tf.data.experimental.CsvDataset(
      sst_dir + filename, [tf.string, tf.int32], header=True, field_delim="\t"
  )
  return rows.batch(FINETUNING_BATCH_SIZE)


def _load_wiki_split(filename):
  """Read one WikiText file line by line, keeping only the longer lines.

  Lines whose `tf.strings.length` is 100 or less are dropped before batching
  into groups of PRETRAINING_BATCH_SIZE.
  """
  lines = tf.data.TextLineDataset(wiki_dir + filename)
  long_lines = lines.filter(lambda line: tf.strings.length(line) > 100)
  return long_lines.batch(PRETRAINING_BATCH_SIZE)


sst_train_ds = _load_sst_split("train.tsv")
sst_val_ds = _load_sst_split("dev.tsv")

wiki_train_ds = _load_wiki_split("wiki.train.raw")
wiki_val_ds = _load_wiki_split("wiki.valid.raw")

# Sanity check: re-batch to four rows and show the first group of the SST-2
# training split. The same expression works for the other three datasets.
print(sst_train_ds.unbatch().batch(4).take(1).get_single_element())

(<tf.Tensor: shape=(4,), dtype=string, numpy=
array([b'hide new secretions from the parental units ',
       b'contains no wit , only labored gags ',
       b'that loves its characters and communicates something rather beautiful about human nature ',
       b'remains utterly satisfied to remain the same throughout '],
      dtype=object)>, <tf.Tensor: shape=(4,), dtype=int32, numpy=array([0, 0, 1, 0], dtype=int32)>)


In [None]:
# Baseline: a single sigmoid unit on top of a multi-hot encoding of each
# sentence, trained directly on the raw SST-2 strings.

# Vectorizer limited to 4000 tokens; its vocabulary is learned from the
# training sentences only (labels are dropped before adapting).
multi_hot_layer = keras.layers.TextVectorization(
    output_mode="multi_hot",
    max_tokens=4000,
)
sst_train_sentences = sst_train_ds.map(lambda sentence, label: sentence)
multi_hot_layer.adapt(sst_train_sentences)

regression_layer = keras.layers.Dense(1, activation="sigmoid")

# Scalar string input -> multi-hot vector -> probability.
inputs = keras.Input(shape=(), dtype="string")
multi_hot_features = multi_hot_layer(inputs)
outputs = regression_layer(multi_hot_features)

baseline_model = keras.Model(inputs, outputs)
baseline_model.compile(metrics=["accuracy"], loss="binary_crossentropy")
baseline_model.fit(sst_train_ds, epochs=5, validation_data=sst_val_ds)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f63c64981c0>

In [None]:
# WordPiece tokenizer built from the downloaded vocabulary file; it is
# configured with sequence_length=SEQ_LENGTH, lowercasing and accent stripping.
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=vocab_file,
    sequence_length=SEQ_LENGTH,
    strip_accents=True,
    lowercase=True,
)

# Look up the id of the special mask token once, then hand it to the generator.
mask_token_id = tokenizer.token_to_id("[MASK]")

# Mask generator for the masked-language-model task, configured with the
# selection rate and per-sequence selection length from the config constants.
masker = keras_nlp.layers.MLMMaskGenerator(
    vocabulary_size=tokenizer.vocabulary_size(),
    mask_selection_rate=MASK_RATE,
    mask_selection_length=PREDICTIONS_PER_SEQ,
    mask_token_id=mask_token_id,
)

def preprocess(inputs):
  """Tokenize raw text and mask it for masked-language-model training.

  Args:
    inputs: text passed straight to `tokenizer`.

  Returns:
    A `(features, labels, weights)` tuple where `features` is a dict holding
    the masker's "tokens" and "mask_positions" outputs, `labels` is its
    "mask_ids" output and `weights` is its "mask_weights" output.
  """
  masked = masker(tokenizer(inputs))
  features = {key: masked[key] for key in ("tokens", "mask_positions")}
  return features, masked["mask_ids"], masked["mask_weights"]

def _as_mlm_pipeline(text_ds):
  """Map `preprocess` over a text dataset in parallel and prefetch the result."""
  masked_ds = text_ds.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
  return masked_ds.prefetch(tf.data.AUTOTUNE)


pretrains_ds = _as_mlm_pipeline(wiki_train_ds)
pretrains_val_ds = _as_mlm_pipeline(wiki_val_ds)

# Show the first preprocessed validation batch as a sanity check.
print(pretrains_val_ds.take(1).get_single_element())


({'tokens': <tf.Tensor: shape=(128, 128), dtype=int32, numpy=
array([[ 103, 7849, 2271, ..., 9673, 1012, 7570],
       [7570, 7849, 2271, ..., 1007,  421, 2023],
       [1996,  103, 3940, ...,    0,    0,    0],
       ...,
       [2076, 1996, 2307, ...,    0,    0,    0],
       [3216, 2225, 2083, ...,    0,    0,    0],
       [ 103, 2007, 1045, ...,    0,    0,    0]], dtype=int32)>, 'mask_positions': <tf.Tensor: shape=(128, 32), dtype=int64, numpy=
array([[  0,   3,   7, ..., 110, 119, 123],
       [  3,   5,   8, ..., 117, 121, 126],
       [  1,   3,   9, ...,   0,   0,   0],
       ...,
       [  4,   6,   9, ..., 117, 119,   0],
       [  5,   6,  10, ...,   0,   0,   0],
       [  0,   9,  10, ...,   0,   0,   0]])>}, <tf.Tensor: shape=(128, 32), dtype=int32, numpy=
array([[ 7570, 13091,  2004, ...,  3344,  2077, 24000],
       [13091,  2003, 19116, ...,  2170,  1006,  1012],
       [ 2034,  1997,  2007, ...,     0,     0,     0],
       ...,
       [ 1010,  1997, 23133, ..., 

In [None]:
# Encoder: token ids -> embeddings -> NUM_LAYERS transformer encoder blocks.
inputs = keras.Input(shape=(SEQ_LENGTH,), dtype=tf.int32)

# Combined token and position embedding. The layer is kept in a variable
# because the pretraining head reuses its token embedding weights.
embedding_layer = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=tokenizer.vocabulary_size(),
    sequence_length=SEQ_LENGTH,
    embedding_dim=MODEL_DIM
)
outputs = embedding_layer(inputs)

# Normalize and regularize the embeddings before the encoder stack.
outputs = keras.layers.LayerNormalization(epsilon=NORM_EPSILON)(outputs)
outputs = keras.layers.Dropout(rate=DROPOUT)(outputs)

# Stack the transformer encoder blocks, each consuming the previous output.
for i in range(NUM_LAYERS):
  outputs = keras_nlp.layers.TransformerEncoder(
      intermediate_dim=INTERMEDIATE_DIM,
      num_heads=NUM_HEADS,
      dropout=DROPOUT,
      layer_norm_epsilon=NORM_EPSILON
  )(outputs)

# Build and summarize the model once, after the whole stack is in place.
# Doing this inside the loop would create a throwaway model and print a
# summary for every partial stack.
encoder_model = keras.Model(inputs, outputs)
encoder_model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 128)]             0         
                                                                 
 token_and_position_embeddin  (None, 128, 256)         7846400   
 g (TokenAndPositionEmbeddin                                     
 g)                                                              
                                                                 
 layer_normalization (LayerN  (None, 128, 256)         512       
 ormalization)                                                   
                                                                 
 dropout (Dropout)           (None, 128, 256)          0         
                                                                 
 transformer_encoder (Transf  (None, 128, 256)         527104    
 ormerEncoder)                                             

In [None]:
# Masked-language-model pretraining: the model takes the token ids plus the
# positions that were masked, matching the feature dict built by the
# pretraining `preprocess` function.
token_ids_input = keras.Input(shape=(SEQ_LENGTH,), dtype=tf.int32)
mask_positions_input = keras.Input(shape=(PREDICTIONS_PER_SEQ,), dtype=tf.int32)
inputs = {
    "tokens": token_ids_input,
    "mask_positions": mask_positions_input,
}

encoded_tokens = encoder_model(inputs["tokens"])

# Prediction head that shares the token embedding matrix of the encoder and
# emits softmax probabilities at the requested mask positions.
mlm_head = keras_nlp.layers.MLMHead(
    embedding_weights=embedding_layer.token_embedding.embeddings,
    activation="softmax",
)
outputs = mlm_head(encoded_tokens, mask_positions=inputs["mask_positions"])

pretraining_model = keras.Model(inputs, outputs)
pretraining_model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=PRETRAINING_LEARNING_RATE),
    loss="sparse_categorical_crossentropy",
    # Weighted so the sample weights supplied by the dataset apply to the metric.
    weighted_metrics=["sparse_categorical_accuracy"],
    jit_compile=True,
)

pretraining_model.fit(
    pretrains_ds,
    epochs=PRETRAINING_EPOCHS,
    validation_data=pretrains_val_ds,
)

# Persist only the encoder; the fine-tuning stage reloads it from this path.
encoder_model.save("encoder_model")


Epoch 1/8
      2/Unknown - 2269s 1141s/step - loss: 8.8086 - sparse_categorical_accuracy: 0.0020    

In [None]:
def preprocess(sentences, labels):
  """Tokenize a batch of sentences and pass the labels through unchanged.

  Args:
    sentences: text passed straight to `tokenizer`.
    labels: labels paired with `sentences`; returned as-is.

  Returns:
    A `(token_ids, labels)` tuple.
  """
  return tokenizer(sentences), labels


# The pipelines are built at cell level. When indented under the function they
# sit after its `return`, never run, and leave `finetune_ds` and
# `finetune_val_ds` undefined.
finetune_ds = sst_train_ds.map(
    preprocess, num_parallel_calls=tf.data.AUTOTUNE
).prefetch(tf.data.AUTOTUNE)

finetune_val_ds = sst_val_ds.map(
    preprocess, num_parallel_calls=tf.data.AUTOTUNE
).prefetch(tf.data.AUTOTUNE)

# Show the first preprocessed validation batch as a sanity check.
print(finetune_val_ds.take(1).get_single_element())


In [None]:
# Reload the encoder saved at the end of pretraining. compile=False because
# only the architecture and weights are needed here.
encoder_model = keras.models.load_model("encoder_model", compile=False)

# Token-id input for the classifier (dtype comes from TensorFlow, `tf`).
inputs = keras.Input(shape=(SEQ_LENGTH,), dtype=tf.int32)

# Encode the tokens, then pool the per-token outputs into one vector per sequence.
encoded_tokens = encoder_model(inputs)
pooled_tokens = keras.layers.GlobalAveragePooling1D()(encoded_tokens)

# Single sigmoid unit producing the binary prediction.
outputs = keras.layers.Dense(1, activation="sigmoid")(pooled_tokens)

finetuning_model = keras.Model(inputs, outputs)
finetuning_model.compile(
    loss="binary_crossentropy",
    optimizer=keras.optimizers.Adam(learning_rate=FINETUNING_LEARNING_RATE),
    metrics=["accuracy"]
)

finetuning_model.fit(
    finetune_ds, validation_data=finetune_val_ds, epochs=FINETUNING_EPOCHS,
)


In [None]:
# Wrap the tokenizer and the fine-tuned classifier into one model that takes
# raw strings, then save it.
inputs = keras.Input(shape=(), dtype=tf.string)
tokens = tokenizer(inputs)
outputs = finetuning_model(tokens)
final_model = keras.Model(inputs, outputs)
final_model.save("final_model")

# Reload from disk (the loader lives in `keras.models`) and run the restored
# model on two example sentences.
restored_model = keras.models.load_model("final_model", compile=False)
inference_data = tf.constant(["Terrible, no good, trash.", "So great: I loved it!"])
print(restored_model(inference_data))