In [1]:
import numpy as np

import typing
from typing import Any, Tuple

import tensorflow as tf

In [2]:
# Download the file
import pathlib

path_to_zip = tf.keras.utils.get_file(
    'spa-eng.zip', origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True)

path_to_file = pathlib.Path(path_to_zip).parent/'spa-eng/spa.txt'

In [3]:
def load_data(path):
  text = path.read_text(encoding='utf-8')

  lines = text.splitlines()
  pairs = [line.split('\t') for line in lines]

  context = np.array([context for target, context in pairs])
  target = np.array([target for target, context in pairs])

  return target, context

In [4]:
target_raw, context_raw = load_data(path_to_file)
print(context_raw[-1])

Si quieres sonar como un hablante nativo, debes estar dispuesto a practicar diciendo la misma frase una y otra vez de la misma manera en que un músico de banjo practica el mismo fraseo una y otra vez hasta que lo puedan tocar correctamente y en el tiempo esperado.


In [5]:
BUFFER_SIZE = len(context_raw)
BATCH_SIZE = 64

is_train = np.random.uniform(size=(len(target_raw),)) < 0.8

train_raw = (
    tf.data.Dataset
    .from_tensor_slices((context_raw[is_train], target_raw[is_train]))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE))
val_raw = (
    tf.data.Dataset
    .from_tensor_slices((context_raw[~is_train], target_raw[~is_train]))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE))

In [6]:
for example_context_strings, example_target_strings in train_raw.take(1):
  print(example_context_strings[:5])
  print()
  print(example_target_strings[:5])
  break

tf.Tensor(
[b'Pretendemos escalar esa monta\xc3\xb1a.'
 b'Yo no soy ning\xc3\xban rebelde.' b'Tuve que mentirle a Tom.'
 b'Tenemos que confiar en ellos.'
 b'No estoy acostumbrado a levantarme tan pronto.'], shape=(5,), dtype=string)

tf.Tensor(
[b'We plan to climb that mountain.' b"I'm no rebel."
 b'I had to lie to Tom.' b'We have to trust them.'
 b"I'm not accustomed to getting up so early."], shape=(5,), dtype=string)


In [7]:
def tf_lower_and_split_punct(text):
  # Split accented characters.
  text = tf.strings.lower(text)
  # Keep space, a to z, and select punctuation.
  text = tf.strings.regex_replace(text, '[^ a-z.?!,¿]', '')
  # Add spaces around punctuation.
  text = tf.strings.regex_replace(text, '[.?!,¿]', r' \0 ')
  # Strip whitespace.
  text = tf.strings.strip(text)

  text = tf.strings.join(['[START]', text, '[END]'], separator=' ')
  return text

In [8]:
max_vocab_size = 5000

context_text_processor = tf.keras.layers.TextVectorization(
    standardize=tf_lower_and_split_punct,
    max_tokens=max_vocab_size,
    ragged=True)

In [9]:
for example_context_strings, example_target_strings in train_raw.take(1):
  print(example_context_strings[:5])
  print()
  print(example_target_strings[:5])
  break

tf.Tensor(
[b'Se dice que Jap\xc3\xb3n es la mayor potencia econ\xc3\xb3mica del mundo.'
 b'\xc3\x89l no sabe contar.' b'Tom desenchuf\xc3\xb3 la l\xc3\xa1mpara.'
 b'La polic\xc3\xada encontr\xc3\xb3 la bicicleta de Tom.'
 b'\xc2\xbfSon japoneses?'], shape=(5,), dtype=string)

tf.Tensor(
[b'It is said that Japan is the greatest economic power in the world.'
 b"He can't count." b'Tom unplugged the lamp.'
 b"The police found Tom's bicycle." b'Are they Japanese?'], shape=(5,), dtype=string)


In [10]:
example_context_strings

<tf.Tensor: shape=(64,), dtype=string, numpy=
array([b'Se dice que Jap\xc3\xb3n es la mayor potencia econ\xc3\xb3mica del mundo.',
       b'\xc3\x89l no sabe contar.',
       b'Tom desenchuf\xc3\xb3 la l\xc3\xa1mpara.',
       b'La polic\xc3\xada encontr\xc3\xb3 la bicicleta de Tom.',
       b'\xc2\xbfSon japoneses?',
       b'Ella no hizo nada m\xc3\xa1s que llorar todo el d\xc3\xada.',
       b'Ella le escribi\xc3\xb3 para decirle que no pod\xc3\xada ir a visitarle el verano siguiente.',
       b'Tras la tormenta lleg\xc3\xb3 la calma.',
       b'Nadie lo sabe con seguridad.',
       b'Yo creo m\xc3\xa1s en la dieta que en los medicamentos.',
       b'Qu\xc3\xa9dense sentados, por favor.',
       b'\xc2\xbfMe explicar\xc3\xadas el significado exacto de la palabra, por favor?',
       b'El nuevo producto me desilucion\xc3\xb3.',
       b'Se puso muy contento de repente.',
       b'Tom golpe\xc3\xb3 su pu\xc3\xb1o sobre la mesa.',
       b'Tom anda encueroles.',
       b'Tom no quiere 

In [11]:
context_text_processor.adapt(train_raw.map(lambda context, target: context))

# Here are the first 10 words from the vocabulary:
context_text_processor.get_vocabulary()[:10]

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


['', '[UNK]', '[START]', '[END]', '.', 'de', 'que', 'a', 'no', 'tom']

In [12]:
example_tokens = context_text_processor(example_context_strings)
example_tokens[:3, :]

<tf.RaggedTensor [[2, 18, 263, 6, 256, 15, 10, 356, 1, 4935, 47, 192, 4, 3],
 [2, 20, 8, 125, 847, 4, 3], [2, 9, 1, 10, 3607, 4, 3]]>

In [13]:
context_vocab = np.array(context_text_processor.get_vocabulary())
tokens = context_vocab[example_tokens[0].numpy()]
' '.join(tokens)

'[START] se dice que japn es la mayor [UNK] econmica del mundo . [END]'

In [14]:
def process_text(context, target):
  context = context_text_processor(context).to_tensor()
  target = target_text_processor(target)
  targ_in = target[:,:-1].to_tensor()
  targ_out = target[:,1:].to_tensor()
  return (context, targ_in), targ_out


train_ds = train_raw.map(process_text, tf.data.AUTOTUNE)
val_ds = val_raw.map(process_text, tf.data.AUTOTUNE)

NameError: in user code:

    File "C:\Users\salha\AppData\Local\Temp\ipykernel_47020\2396223496.py", line 3, in process_text  *
        target = target_text_processor(target)

    NameError: name 'target_text_processor' is not defined
