In [1]:
import os

# Backend / runtime switches. These must be in the environment before
# TensorFlow or Keras are imported, so this cell stays first.
os.environ.update({
    # Keras backend selection.
    "KERAS_BACKEND": "tensorflow",
    # Tell XLA where the CUDA toolkit data lives (machine-specific path).
    "XLA_FLAGS": "--xla_gpu_cuda_data_dir=/home/marcelo/miniconda3/envs/nlp",
    # Quiet TensorFlow's C++ logging.
    "TF_CPP_MIN_LOG_LEVEL": "3",
})

In [None]:
import tensorflow as tf
import keras
import random
import gzip
import numpy as np

from keras.utils import register_keras_serializable
from keras.layers import Layer, Embedding, Input, Dense, TextVectorization
from keras.models import Model
from keras.layers import MultiHeadAttention, LayerNormalization, Dropout
from keras.callbacks import ModelCheckpoint

from sklearn.model_selection import train_test_split

# Report the library versions this notebook is running against.
for _label, _module in (('Tensorflow', tf), ('Keras', keras)):
    print(f'{_label} Version : {_module.__version__}')

In [3]:
# Model hyperparameters
vocab_size = 40_000  # upper bound on the number of distinct tokens kept
max_len = 80         # every sequence is padded/truncated to this many tokens
dim_model = 128      # width of the token/position embeddings
num_heads = 2        # attention heads per transformer block
ff_dim = 512         # hidden width of the feed-forward sub-layer
num_blocks = 2       # how many transformer blocks are stacked
dropout = 0.1        # dropout rate used inside each block

# Training configuration
batch_size = 256

In [None]:
filename = './data/clearned_corpus.txt.gz'
# Decode explicitly instead of relying on the platform default encoding
# (assumes the corpus was written as UTF-8 -- confirm against the file).
with gzip.open(filename, 'rt', encoding='utf-8') as f:
    lines = f.readlines()

print('Lines:', len(lines))

# standardize=None keeps the text exactly as stored in the corpus
# (no lower-casing or punctuation stripping by the layer).
vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=max_len,
    standardize=None,
)
vectorization.adapt(lines)
vocab = [str(x) for x in vectorization.get_vocabulary()]
print('Vocab Size:', len(vocab))

# Build next-token pairs: the target is the input shifted left by one token,
# so input position i is trained to predict token i + 1.
text_pairs = []
for line in lines:
    _split = line.split()
    if len(_split) < 2:
        # Nothing to predict from an empty or single-token line.
        continue

    # Drop only the LAST token from the input so that input and target have
    # the same length; dropping two would align the final target with padding.
    x, y = (' '.join(_split[:-1]), ' '.join(_split[1:]))
    text_pairs.append((x, y))

random.shuffle(text_pairs)
train_pairs, test_pairs = train_test_split(text_pairs, test_size=0.10)

In [None]:
# Show one random (input, target) pair together with its token ids.
pair = random.choice(text_pairs)
for _text in pair:
    print(f'length: {len(_text.split())} - text: [{_text}] - vector:\n{vectorization(_text)}')

In [6]:
def format_dataset(x, y):
    """Turn raw input/target strings into integer token-id tensors.

    Both arguments are passed through the notebook-level `vectorization`
    layer; the result is returned as an `(inputs, targets)` tuple.
    """
    return vectorization(x), vectorization(y)


def make_dataset(pairs):
    """Build the batched, vectorized tf.data pipeline for (input, target) pairs.

    Args:
        pairs: non-empty sequence of `(input_text, target_text)` tuples.

    Returns:
        A `tf.data.Dataset` yielding `(inputs, targets)` batches of token ids.

    Raises:
        ValueError: if `pairs` is empty.
    """
    if not pairs:
        raise ValueError('make_dataset() needs at least one (input, target) pair')

    x, y = zip(*pairs)
    dataset = tf.data.Dataset.from_tensor_slices((list(x), list(y)))
    # Batch first so the vectorization layer runs once per batch.
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset, num_parallel_calls=tf.data.AUTOTUNE)
    # Order matters:
    #   cache    -> store the vectorized batches once (before shuffling, so
    #               the cached copy does not pin a single shuffle order);
    #   shuffle  -> reshuffle the batches on every epoch;
    #   prefetch -> last, so input preparation overlaps with training.
    return dataset.cache().shuffle(2048).prefetch(tf.data.AUTOTUNE)


train_ds = make_dataset(train_pairs)

In [None]:
# Peek at a single (inputs, targets) batch to sanity-check the pipeline.
for sample_batch in train_ds.take(1):
    print(sample_batch)

In [8]:
@register_keras_serializable()
class PositionalEmbedding(Layer):
    """Sum of a token embedding and a learned position embedding.

    Args:
        vocab_size: number of rows of the token embedding table.
        dim_model: embedding width (shared by tokens and positions).
        max_len: number of rows of the position embedding table.
        **kwargs: forwarded to the base `Layer` constructor.
    """

    def __init__(self, vocab_size, dim_model, max_len, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.dim_model = dim_model
        self.max_len = max_len

    def build(self, input_shape):
        # Lookup table for token ids.
        self.token_emb = Embedding(input_dim=self.vocab_size, output_dim=self.dim_model)
        # Lookup table for positions 0 .. max_len - 1.
        self.pos_emb = Embedding(input_dim=self.max_len, output_dim=self.dim_model)
        # Mark the layer as built.
        super().build(input_shape)

    def call(self, x):
        # Position indices are derived from the last axis of the input,
        # so the layer follows the actual sequence length it is given.
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        # The position embeddings broadcast over the leading (batch) axis.
        return x + positions

    def get_config(self):
        config = super().get_config()
        # Keys must match the __init__ parameter names exactly, otherwise the
        # layer cannot be re-created when a saved model is loaded.
        config.update({
            'vocab_size': self.vocab_size,
            'dim_model': self.dim_model,
            'max_len': self.max_len
        })
        return config

    @classmethod
    def from_config(cls, config):
        """Re-create the layer, accepting the legacy ``'d_model'`` config key."""
        config = dict(config)
        if 'd_model' in config:
            # Older configs stored the width under 'd_model'.
            legacy_dim = config.pop('d_model')
            config.setdefault('dim_model', legacy_dim)
        return cls(**config)

In [9]:
def transformer_block(inputs, head_size, num_heads, ff_dim, dropout=0, causal=True):
    """Apply one self-attention + feed-forward block to `inputs`.

    Args:
        inputs: symbolic tensor whose last axis is the model width.
        head_size: `key_dim` given to `MultiHeadAttention` (size per head).
        num_heads: number of attention heads.
        ff_dim: hidden width of the feed-forward sub-layer.
        dropout: dropout rate used after attention and inside the feed-forward.
        causal: when True (default) each position may only attend to itself
            and earlier positions. This is required for next-token training:
            without it position i can read token i + 1, which is its target.

    Returns:
        Tensor with the same last-axis width as `inputs`.
    """
    # Self-attention sub-layer: query and value are both `inputs`.
    x = MultiHeadAttention(key_dim=head_size, num_heads=num_heads)(
        inputs, inputs, use_causal_mask=causal
    )
    x = Dropout(dropout)(x)
    # Normalization is applied to the sub-layer output, then the residual is added.
    x = LayerNormalization(epsilon=1e-6)(x)
    res = x + inputs

    # Position-wise feed-forward sub-layer, projected back to the input width.
    x = Dense(ff_dim, activation="relu")(res)
    x = Dropout(dropout)(x)
    x = Dense(inputs.shape[-1])(x)
    x = LayerNormalization(epsilon=1e-6)(x)
    return x + res

def build_model(vocab_size, max_len, dim_model, num_heads, ff_dim, num_blocks, dropout=0):
    """Assemble the language model as a Keras functional `Model`.

    The graph is: token ids of length `max_len` -> `PositionalEmbedding` ->
    `num_blocks` calls to `transformer_block` -> per-position softmax over
    `vocab_size` entries.
    """
    token_ids = Input(shape=(max_len,))
    hidden = PositionalEmbedding(vocab_size, dim_model, max_len)(token_ids)

    # `dim_model` is passed as the per-head size of every block.
    for _block_index in range(num_blocks):
        hidden = transformer_block(hidden, dim_model, num_heads, ff_dim, dropout)

    token_probs = Dense(vocab_size, activation="softmax")(hidden)
    return Model(token_ids, token_probs)

# Instantiate the network from the hyperparameters defined above.
model = build_model(
    vocab_size=vocab_size,
    max_len=max_len,
    dim_model=dim_model,
    num_heads=num_heads,
    ff_dim=ff_dim,
    num_blocks=num_blocks,
    dropout=dropout,
)

# Targets are integer token ids, hence the sparse cross-entropy loss.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# Print the layer-by-layer overview.
model.summary()

In [None]:
# Re-create the model from scratch. Running this cell replaces `model` with a
# freshly initialised network, discarding any weights trained or loaded so far.
model = build_model(
    vocab_size=vocab_size,
    max_len=max_len,
    dim_model=dim_model,
    num_heads=num_heads,
    ff_dim=ff_dim,
    num_blocks=num_blocks,
    dropout=dropout,
)
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# Layer-by-layer overview of the new instance.
model.summary()

In [53]:
class TextGeneration(tf.keras.callbacks.Callback):
    """Callback that prints a greedy text continuation after every epoch.

    It can also be used outside of training by passing `model` explicitly and
    calling `generate_text` / `auto_generated_text` directly.

    Args:
        tokenizer: vectorization layer; called on a string it must return a
            1-D sequence of token ids, and it must expose `get_vocabulary()`.
        test_pairs: optional sequence of `(prompt, label)` string pairs from
            which prompts are drawn at random.
        max_length: maximum number of tokens to generate per call.
        temperature: stored for API compatibility. Decoding is greedy
            (argmax), which is unaffected by temperature scaling.
        model: optional model to use instead of the one Keras attaches to the
            callback during `fit`.
        start_string: optional fixed prompt, used when no prompt is given and
            no `test_pairs` are available.
    """

    def __init__(self, tokenizer, test_pairs=None, max_length=100, temperature=1.0,
                 model=None, start_string=None):
        super().__init__()
        self.tokenizer = tokenizer
        self.test_pairs = test_pairs
        self.start_string = start_string
        self.max_length = max_length
        self.temperature = temperature
        self.vocab = self.tokenizer.get_vocabulary()
        # token id -> token string
        self.index_word = dict(enumerate(self.vocab))
        self.m = model

    def on_epoch_end(self, epoch, logs=None):
        print(f'\nGenerating text after epoch: {epoch + 1}')
        prompt, label, text_generated = self.auto_generated_text()
        print(f'Prompt[{len(prompt.split())}]:\n{prompt}')
        print(f'Label[{len(label.split())}]:\n{label}')
        print(f'Generated[{len(text_generated.split())}]:\n{text_generated}')

    def auto_generated_text(self):
        """Pick a prompt and generate a continuation for it.

        Returns:
            `(prompt, label, generated_text)`. The label is the empty string
            when the prompt comes from `start_string`.

        Raises:
            ValueError: if neither `test_pairs` nor `start_string` is set.
        """
        if self.test_pairs is not None and len(self.test_pairs) > 0:
            prompt, label = random.choice(self.test_pairs)
        elif self.start_string is not None:
            prompt, label = self.start_string, ''
        else:
            raise ValueError('TextGeneration needs test_pairs or start_string to pick a prompt')
        text_generated = self.generate_text(prompt)
        return prompt, label, text_generated

    def generate_text(self, prompt=None):
        """Greedily continue `prompt` one token at a time.

        Each predicted token is written back into the input window before the
        next prediction, so later tokens are conditioned on earlier ones.
        Generation stops at `'[end]'`, after `max_length` tokens, or when the
        tokenizer's fixed-length window is full.

        Args:
            prompt: text to continue; defaults to `start_string`.

        Returns:
            The generated tokens joined by single spaces (prompt excluded).

        Raises:
            ValueError: if no prompt is given and `start_string` is unset.
        """
        if prompt is None:
            prompt = self.start_string
        if prompt is None:
            raise ValueError('generate_text() needs a prompt or a start_string')

        model = self.model if self.m is None else self.m

        # Writable copy of the token-id window produced by the tokenizer.
        tokens = np.array(self.tokenizer(prompt))
        window = tokens.shape[-1]
        # Assumes id 0 is padding (as Keras TextVectorization does in "int" mode).
        position = int(np.count_nonzero(tokens))

        text_generated = []
        if position == 0:
            # Empty prompt: there is no token to condition on.
            return ' '.join(text_generated)

        while position < window and len(text_generated) < self.max_length:
            predictions = model(tokens[np.newaxis, :])
            # The output at index position - 1 is the distribution for the
            # token that follows the last token currently in the window.
            predicted_id = int(np.argmax(predictions[0, position - 1, :]))
            word_predicted = self.index_word[predicted_id]
            if word_predicted == '[end]':
                break
            text_generated.append(word_predicted)
            # Feed the prediction back so the next step can see it.
            tokens[position] = predicted_id
            position += 1

        final_text = ' '.join(text_generated)

        return final_text

In [12]:
model_filename = './nano_gpt_by_marcelo.keras'

# If a saved model already exists on disk, it replaces the in-memory `model`;
# otherwise the model built earlier is kept untouched.
if os.path.exists(model_filename):
    model = keras.models.load_model(model_filename)
    print(f'Model loaded from {model_filename}')
    print(model)

In [None]:
# Save the full model (architecture + weights) at the end of every epoch,
# to the same file the loading cell reads from.
checkpoint_callback = ModelCheckpoint(
    filepath=model_filename,
    save_weights_only=False,
    save_freq='epoch',
)

# After each epoch, print a sample continuation for a random held-out prompt.
text_gen_callback = TextGeneration(
    vectorization,
    test_pairs=test_pairs,
    max_length=max_len,
    temperature=1,
)

# Train the model with both callbacks attached.
model.fit(train_ds, epochs=50, callbacks=[checkpoint_callback, text_gen_callback])

In [28]:
# Reverse lookup table: token id -> token string.
index_word = dict(enumerate(vocab))

In [None]:
# Stand-alone use of the generation callback with the current model.
text_gen_callback = TextGeneration(
    vectorization,
    test_pairs=test_pairs,
    max_length=max_len,
    temperature=1,
    model=model,
)
prompt, label, generated = text_gen_callback.auto_generated_text()

# Random held-out prompt, its reference continuation and the model output.
for _title, _text in (('Prompt', prompt), ('Label', label), ('Generated', generated)):
    print(f'{_title}[{len(_text.split())}]:\n{_text}')

# Same generator, hand-written prompt.
generated = text_gen_callback.generate_text('a américa do sul é o continente mais')
print(f'Generated[{len(generated.split())}]:\n{generated}')

In [None]:
# Draw one held-out example and show it with its token counts.
final_text, label = random.choice(test_pairs)

print(f'Input[{len(final_text.split())}]: {final_text}')
print(f'Label[{len(label.split())}]: {label}')

# Vectorize the input and add a leading batch axis before calling the model.
input_eval = tf.expand_dims(vectorization(final_text), 0)
predictions = model(input_eval)
print(predictions.shape)
predictions

In [None]:
# Run the model once on the (1, max_len) input prepared in the previous cell.
# The model is built with Input(shape=(max_len,)), so the input must keep its
# (batch, max_len) layout; the call does not depend on the loop index and is
# therefore made a single time.
predictions = model(input_eval)

# Most likely token id at every position of the (only) sequence in the batch.
predicted_ids = np.argmax(predictions[0], axis=-1)

for i, predicted_id in enumerate(predicted_ids):
    word = index_word[int(predicted_id)]
    print(f'[{i}]: Predicted_id: [{predicted_id}] - Word: [{word}]')

In [None]:
# Generate a continuation for a fixed, hand-written prompt.
text_gen_callback = TextGeneration(
    vectorization,
    test_pairs=test_pairs,
    max_length=max_len,
    temperature=1,
    model=model,
)
# The prompt is passed to generate_text() explicitly.
t = text_gen_callback.generate_text("a literatura latino americana inclui as")
print(f'Tokens {len(t.split())} - Length: {len(t)}\n{t}')