In [None]:
import tensorflow_datasets as tfds
import tensorflow as tf

from tensorflow.keras.layers import TextVectorization

# 1) Preprocessing constants shared by the cells below
MAXLEN_DOC = 100      # output_sequence_length used for the document vectorizer
MAXLEN_SUM = 30       # output_sequence_length used for the summary vectorizer
VOCAB_IN = 10000      # max_tokens used for the document vectorizer
VOCAB_OUT = 6000      # max_tokens used for the summary vectorizer
BATCH = 1 << 14       # 16384 examples per batch of the final dataset
# Short alias for tf.data.AUTOTUNE; passed further down as the
# `num_parallel_calls` of `map` and as the `prefetch` buffer size.
AUTOTUNE  = tf.data.AUTOTUNE


2025-07-28 21:24:40.450027: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753748680.463712  124646 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753748680.468079  124646 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1753748680.480414  124646 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1753748680.480429  124646 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1753748680.480431  124646 computation_placer.cc:177] computation placer alr

In [2]:
# 2) Load the Gigaword training split through TFDS
ds = tfds.load(
    'Gigaword',
    split='train',
    shuffle_files=True,
    data_dir='../data',
)

# Peek at a single raw record to see which features it carries.
for sample in ds.take(1):
    print(sample)

{'document': <tf.Tensor: shape=(), dtype=string, numpy=b'chinese premier wen jiabao met here thursday with visiting international university sports federation -lrb- fisu -rrb- president george e. killian .'>, 'summary': <tf.Tensor: shape=(), dtype=string, numpy=b'chinese premier meets fisu president'>}


I0000 00:00:1753748700.240815  124646 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 1548 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3050 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6
2025-07-28 21:25:00.332582: I tensorflow/core/kernels/data/tf_record_dataset_op.cc:387] The default buffer size is 262144, which is overridden by the user specified `buffer_size` of 8388608
2025-07-28 21:25:00.355820: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [3]:
# 3) Collapse every run of whitespace into a single space in both text fields.
def normalize_whitespace(ex):
    """Squeeze whitespace runs in the text fields of one dataset element.

    Args:
        ex: mapping holding string tensors under 'document' and 'summary'.

    Returns:
        A new dict with the same two keys, where every match of the regex
        ``\\s+`` has been replaced by a single space.
    """
    return {
        'document': tf.strings.regex_replace(ex['document'], r"\s+", " "),
        'summary' : tf.strings.regex_replace(ex['summary'],  r"\s+", " ")
    }


# The transformation is stateless and element-wise, so let tf.data run it in
# parallel (same setting the vectorization map uses). A parallel map still
# yields elements in input order by default.
# NOTE: `ds` is rebound here; re-running the cell stacks another (harmless,
# idempotent) normalization map on top of the previous one.
ds = ds.map(normalize_whitespace, num_parallel_calls=AUTOTUNE)

for example in ds.take(1):  # Take one example to inspect
    print(example)

{'document': <tf.Tensor: shape=(), dtype=string, numpy=b"golf 's major governing bodies have come together to push for their sport to be added to the olympic program in #### .">, 'summary': <tf.Tensor: shape=(), dtype=string, numpy=b'making a pitch for olympic golf major tours come together to push for inclusion at #### games'>}


2025-07-28 21:25:12.257196: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [4]:
# 3) Build the two TextVectorization layers (documents and summaries)
def _make_vectorizer(vocab_size, seq_len):
    """Create a TextVectorization layer with the shared standardization.

    Args:
        vocab_size: value forwarded as ``max_tokens``.
        seq_len: value forwarded as ``output_sequence_length``.

    Returns:
        A TextVectorization layer; its vocabulary is not learned here.
    """
    return TextVectorization(
        standardize='lower_and_strip_punctuation',
        output_sequence_length=seq_len,
        max_tokens=vocab_size,
    )


tv_doc = _make_vectorizer(VOCAB_IN, MAXLEN_DOC)
tv_sum = _make_vectorizer(VOCAB_OUT, MAXLEN_SUM)

In [5]:
# 4) "Adapt" (fit) each vocabulary on the raw text of its own field
ADAPT_BATCH = 1024  # batch size used only while adapting the vectorizers


def _select(field):
    """Return a map function that extracts ``field`` from a dataset element."""
    def pick(ex):
        return ex[field]
    return pick


docs_ds = ds.map(_select('document'))
sums_ds = ds.map(_select('summary'))

# Documents first, then summaries; each adapt call iterates its text dataset.
for layer, texts in ((tv_doc, docs_ds), (tv_sum, sums_ds)):
    layer.adapt(texts.batch(ADAPT_BATCH))

2025-07-28 21:28:53.254024: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [6]:
# 5) Vectorization step expressed purely with TF ops
def vectorize_tf(ex):
    """Turn the text fields of a dataset element into token-id tensors.

    Args:
        ex: mapping holding string tensors under 'document' and 'summary'.

    Returns:
        dict with the same two keys: 'document' is the output of ``tv_doc``
        and 'summary' is the output of ``tv_sum``.
    """
    doc_ids = tv_doc(ex['document'])
    sum_ids = tv_sum(ex['summary'])
    return dict(document=doc_ids, summary=sum_ids)

In [7]:
# Final input pipeline: batches of token ids for documents and summaries.
#
# The dataset is batched *before* the map so that every TextVectorization call
# processes BATCH strings at once instead of a single example ("vectorized
# mapping"); the resulting elements are the same (BATCH, MAXLEN_DOC) /
# (BATCH, MAXLEN_SUM) tensors as when batching after a per-example map.
vectorized = (
    ds
    .batch(BATCH)
    .map(vectorize_tf, num_parallel_calls=AUTOTUNE)
    # No filename is given, so this cache lives in memory (not on disk) and is
    # only kept once the dataset has been iterated to the end; a partial read
    # such as `.take(1)` discards it.
    .cache()
    .prefetch(AUTOTUNE)
)

In [10]:
# Sanity check: print the tensor shape of each field in the first batch.
for first_batch in vectorized.take(1):
    for field in ('document', 'summary'):
        print(first_batch[field].shape)

(16384, 100)
(16384, 30)


2025-07-28 21:31:27.427389: W tensorflow/core/kernels/data/cache_dataset_ops.cc:916] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [11]:
# 6) Persist the vectorized batches to disk with tf.data's own save format.
# NOTE(review): only the token-id dataset is written here; nothing in this
# cell stores the vocabularies learned by tv_doc / tv_sum -- confirm they are
# saved elsewhere if the ids must be decoded later.
vectorized.save("vectorized_gigaword_ds")

Instructions for updating:
Use `tf.data.Dataset.save(...)` instead.


Instructions for updating:
Use `tf.data.Dataset.save(...)` instead.
