In [1]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
import tensorflow_datasets as tfds
import numpy as np

# Pipeline parameters
MAXLEN_DOC = 100        # output_sequence_length of the document vectorizer
MAXLEN_SUM = 30         # summary length; the summary vectorizer adds 2 for the markers
VOCAB_IN   = 20_000     # max_tokens of the document vectorizer
VOCAB_OUT  = 12_000     # max_tokens of the summary vectorizer
BATCH_ORIG = 16_384     # batch size of the vectorized dataset
AUTOTUNE   = tf.data.AUTOTUNE



2025-08-02 18:33:05.239035: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754170385.265467  251670 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754170385.273263  251670 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1754170385.294770  251670 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1754170385.294798  251670 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1754170385.294801  251670 computation_placer.cc:177] computation placer alr

In [2]:
# 1) Add <sos> and <eos> markers to the summaries

def add_sos_eos(example):
    """Wrap the summary of one example in start/end markers.

    Reads the 'document' and 'summary' entries of `example` (string tensors,
    scalar according to the original author's notes) and returns a new dict:
    'document' is passed through untouched and 'summary_sos' holds
    '<sos>', the summary and '<eos>' joined by single spaces.
    """
    return {
        'document': example['document'],
        'summary_sos': tf.strings.join(
            ['<sos>', example['summary'], '<eos>'],
            separator=' ',
        ),
    }

In [3]:
# 2) Load the Gigaword training split with TFDS (text pre-processing follows)
ds = tfds.load(
    'Gigaword',
    split='train',
    data_dir='../data',
    shuffle_files=True,
)

# Peek at a single raw example to inspect the feature layout
for example in ds.take(1):
    print(example)

I0000 00:00:1754170396.997488  251670 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 1192 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3050 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6
2025-08-02 18:33:17.203666: I tensorflow/core/kernels/data/tf_record_dataset_op.cc:387] The default buffer size is 262144, which is overridden by the user specified `buffer_size` of 8388608


{'document': <tf.Tensor: shape=(), dtype=string, numpy=b'chinese premier wen jiabao met here thursday with visiting international university sports federation -lrb- fisu -rrb- president george e. killian .'>, 'summary': <tf.Tensor: shape=(), dtype=string, numpy=b'chinese premier meets fisu president'>}


2025-08-02 18:33:19.677210: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [4]:
def _normalize_whitespace(ex):
    """Collapse every run of whitespace in both text fields to one space.

    Returns a new dict with the same 'document' and 'summary' keys.
    """
    return {
        'document': tf.strings.regex_replace(ex['document'], r"\s+", " "),
        'summary': tf.strings.regex_replace(ex['summary'], r"\s+", " "),
    }


# Run in parallel like every other map stage of this notebook; tf.data keeps
# the element order deterministic by default, so the results are unchanged.
ds = ds.map(_normalize_whitespace, num_parallel_calls=AUTOTUNE)
ds_sos = ds.map(add_sos_eos, num_parallel_calls=AUTOTUNE)

# Peek at one cleaned example
for example in ds.take(1):
    print(example)

{'document': <tf.Tensor: shape=(), dtype=string, numpy=b'russia and its pro-west neighbor georgia engaged in fierce fighting saturday in the disputed region of south ossetia , reports said , as the international community scrambled to prevent an all-out war .'>, 'summary': <tf.Tensor: shape=(), dtype=string, numpy=b'fierce fighting reported in georgia after russian troop surge'>}


2025-08-02 18:33:22.877935: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [5]:
# 3) Create the TextVectorization layers.
# `ds_sos` is intentionally not rebuilt here: the preceding cell already ends
# with the very same `ds.map(add_sos_eos, ...)` call.
#
# NOTE(review): 'lower_and_strip_punctuation' removes punctuation, so the
# '<sos>' / '<eos>' markers presumably enter the summary vocabulary as plain
# 'sos' / 'eos' and become indistinguishable from those literal words —
# confirm this is acceptable for the downstream decoder.
tv_doc = TextVectorization(
    max_tokens=VOCAB_IN,
    output_sequence_length=MAXLEN_DOC,
    standardize='lower_and_strip_punctuation'
)
tv_sum = TextVectorization(
    max_tokens=VOCAB_OUT,
    output_sequence_length=MAXLEN_SUM + 2,  # +2 for the <sos> and <eos> markers
    standardize='lower_and_strip_punctuation'
)


In [6]:
# 4) "Adapt" (fit) each vocabulary on the raw text, fed in batches of 1024

# Documents
docs_ds = ds_sos.map(lambda row: row['document'])
tv_doc.adapt(docs_ds.batch(1024))

# Summaries, including the start/end markers
sums_ds = ds_sos.map(lambda row: row['summary_sos'])
tv_sum.adapt(sums_ds.batch(1024))

2025-08-02 18:43:45.696277: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [7]:
# 5) Vectorization step expressed purely with TF ops
def vectorize_tf(ex):
    """Convert the text fields of one example into token-id sequences.

    'document' is passed through `tv_doc` and 'summary_sos' through `tv_sum`;
    the results are returned under the keys 'document' and 'summary'.
    """
    return {
        'document': tv_doc(ex['document']),
        'summary': tv_sum(ex['summary_sos']),
    }

In [8]:
# Vectorize every example in parallel ...
vectorized = ds_sos.map(vectorize_tf, num_parallel_calls=AUTOTUNE)
# ... then cache the per-example results, group them into batches and prefetch.
# NOTE(review): cache() only pays off once the dataset has been read to the
# end at least once; a partial read discards the partially filled cache.
vectorized = vectorized.cache().batch(BATCH_ORIG).prefetch(AUTOTUNE)


In [9]:
# Sanity check: print the shapes of the first batch, one per line
for example in vectorized.take(1):
    print(example['document'].shape, example['summary'].shape, sep='\n')

(16384, 100)
(16384, 32)


2025-08-02 18:43:56.256620: W tensorflow/core/kernels/data/cache_dataset_ops.cc:916] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [10]:
# 6) Save to disk in tf.data's native format (very fast, C/C++)
vectorized.save("vectorized_gigaword_ds")

In [11]:
import numpy as np

# Single definition of each export path, so save and load cannot drift apart.
VOCAB_DOC_PATH = "/home/olavo-dalberto/models/vocab_doc.npy"
VOCAB_SUM_PATH = "/home/olavo-dalberto/models/vocab_sum.npy"

# Save the vocabularies. The token lists are plain strings, which np.save
# stores as a fixed-width unicode array (no Python objects involved).
vocab_doc = tv_doc.get_vocabulary()
np.save(VOCAB_DOC_PATH, vocab_doc)

vocab_sum = tv_sum.get_vocabulary()
np.save(VOCAB_SUM_PATH, vocab_sum)

# Load the vocabularies back. The files hold string arrays, so pickle support
# is not needed; leaving allow_pickle at its default (False) prevents a
# tampered .npy file from executing code on load.
vocab_doc_loaded = np.load(VOCAB_DOC_PATH)
vocab_sum_loaded = np.load(VOCAB_SUM_PATH)

In [16]:
# Show the document vocabulary that was read back from disk
print(vocab_doc_loaded)

['' '[UNK]' 'the' ... 'ethanol' 'credits' 'banners']


In [18]:
# Position of the start marker in the summary vocabulary.
# NOTE(review): the token is looked up as 'sos', not '<sos>' — presumably the
# layer's 'lower_and_strip_punctuation' standardization dropped the angle
# brackets; confirm.
tv_sum.get_vocabulary().index('sos')

2

In [12]:
# Save each TextVectorization layer wrapped in a Keras model.
# NOTE(review): the models are saved without an explicit Input layer; the
# recorded run that later reloads tv_doc_model.keras ended in an IndexError —
# verify the save/load round trip.
model_doc = tf.keras.Sequential()
model_doc.add(tv_doc)  # document pipeline
model_doc.save("/home/olavo-dalberto/models/tv_doc_model.keras")

model_sum = tf.keras.Sequential()
model_sum.add(tv_sum)  # summary pipeline
model_sum.save("/home/olavo-dalberto/models/tv_sum_model.keras")

# To load later:
# loaded_model = tf.keras.models.load_model("/home/olavo-dalberto/models/tv_doc_model.keras")

  return saving_lib.save_model(model, filepath)


In [4]:
import tensorflow as tf

# Reload the exported document-vectorizer model and fetch its first layer.
# NOTE(review): the recorded run of this cell ended in
# "IndexError: list index out of range". The traceback is not shown, so it is
# unclear whether load_model() or get_layer() raised — TODO investigate.
loaded_model = tf.keras.models.load_model("/home/olavo-dalberto/models/tv_doc_model.keras")
loaded_model.get_layer(index=0)

IndexError: list index out of range