In [1]:
from pathlib import Path
import pandas as pd


In [19]:
source = Path("data/en-fr.csv")

# Stream the corpus in 100,000-row pieces and write each piece to its own
# numbered CSV shard, so the full file never has to sit in memory at once.
chunk_reader = pd.read_csv(source, chunksize=100_000)
for i, chunk in enumerate(chunk_reader):
    shard_path = f"data/en-fr-chunk-{i:03}.csv"
    chunk.to_csv(shard_path, index=False)


In [2]:
# Shard paths follow the naming pattern used when the corpus was split;
# 226 shards (numbered 000 through 225) are expected to exist on disk.
n_chunks = 226
filepaths = [f"data/en-fr-chunk-{i:03}.csv" for i in range(n_chunks)]

In [3]:
# Show the first four lines of the first shard (header included).
# A context manager guarantees the file handle is closed, and pairing the
# file iterator with range(4) stops reading after four lines instead of
# loading the entire shard into memory.
with open(filepaths[0], encoding="utf-8") as shard_file:
    head_lines = [line for _, line in zip(range(4), shard_file)]
print("".join(head_lines))

en,fr
Changing Lives | Changing Society | How It Works | Technology Drives Change Home | Concepts | Teachers | Search | Overview | Credits | HHCC Web | Reference | Feedback Virtual Museum of Canada Home Page,"Il a transformé notre vie | Il a transformé la société | Son fonctionnement | La technologie, moteur du changement Accueil | Concepts | Enseignants | Recherche | Aperçu | Collaborateurs | Web HHCC | Ressources | Commentaires Musée virtuel du Canada"
Site map,Plan du site
Feedback,Rétroaction



In [4]:
import tensorflow as tf

In [266]:
def parse_csv_line(line):
    """Decode one two-column CSV record into its two string fields.

    Both columns are declared as strings whose default value is the empty
    string.

    Args:
        line: A string tensor holding a CSV record with two columns.

    Returns:
        A ``(first_field, second_field)`` tuple of string tensors; in this
        notebook the columns are ``en`` and ``fr`` respectively.
    """
    empty_string = tf.constant("", dtype=tf.string)
    sentence_en, sentence_fr = tf.io.decode_csv(
        line, record_defaults=[empty_string, empty_string]
    )
    return tf.stack(sentence_en), tf.stack(sentence_fr)


def prepare_input_and_target(sentences_en, sentences_fr):
    """Arrange a sentence pair as ``((encoder_input, decoder_input), target)``.

    Args:
        sentences_en: English text, fed unchanged to the encoder.
        sentences_fr: French text.

    Returns:
        A tuple ``((sentences_en, decoder_input), target)`` where
        ``decoder_input`` is the French text prefixed with ``"startofseq "``
        and ``target`` is the French text suffixed with ``" endofseq"``.
    """
    decoder_input = b"startofseq " + sentences_fr
    target = sentences_fr + b" endofseq"
    return (sentences_en, decoder_input), target


def csv_files_dataset(
    filepaths,
    n_readers=8,
    n_read_threads=None,
    n_parse_threads=8,
    batch_size=32,
    shuffle_buffer_size=10_000,
    seed=42,
):
    """Build a shuffled, batched translation dataset from CSV shard files.

    Args:
        filepaths: File paths (or patterns) handed to
            ``tf.data.Dataset.list_files``.
        n_readers: Interleave ``cycle_length``, i.e. how many files are read
            from concurrently.
        n_read_threads: ``num_parallel_calls`` for the interleave step.
        n_parse_threads: ``num_parallel_calls`` for both mapping steps.
        batch_size: Number of examples per batch.
        shuffle_buffer_size: Size of the shuffle buffer.
        seed: Seed used for both the file listing and the shuffle.

    Returns:
        A dataset of batches shaped as produced by
        ``prepare_input_and_target``, with one batch prefetched.
    """

    def read_data_lines(filepath):
        # Skip the first line of every file (the CSV header row).
        return tf.data.TextLineDataset(filepath).skip(1)

    file_dataset = tf.data.Dataset.list_files(filepaths, seed=seed)
    line_dataset = file_dataset.interleave(  # type: ignore
        read_data_lines,
        cycle_length=n_readers,
        num_parallel_calls=n_read_threads,
    )
    pair_dataset = line_dataset.map(
        parse_csv_line, num_parallel_calls=n_parse_threads
    )
    example_dataset = pair_dataset.map(
        prepare_input_and_target, num_parallel_calls=n_parse_threads
    )
    shuffled_dataset = example_dataset.shuffle(shuffle_buffer_size, seed=seed)
    return shuffled_dataset.batch(batch_size).prefetch(1)


In [334]:
# Work with only the first two shards so experiments stay fast.
first_two_shards = filepaths[:2]
example_set = csv_files_dataset(first_two_shards)
# To eyeball a couple of batches, run: list(example_set.take(2))

In [335]:
vocabulary_size = 1000
max_length = 50

# Both layers share the same vocabulary cap and output sequence length.
vectorization_options = dict(
    max_tokens=vocabulary_size, output_sequence_length=max_length
)
vectorization_layer_en = tf.keras.layers.TextVectorization(**vectorization_options)
# NOTE: despite the "_es" suffix, this layer is adapted on the French text.
vectorization_layer_es = tf.keras.layers.TextVectorization(**vectorization_options)

# Dataset elements are ((english, decoder_input), target); pick out the text
# each layer should learn its vocabulary from.
english_text = example_set.map(lambda inputs, _target: inputs[0])
# The decoder input already starts with "startofseq"; appending " endofseq"
# lets the layer see both marker tokens while adapting.
french_text = example_set.map(lambda inputs, _target: inputs[1] + b" endofseq")

vectorization_layer_en.adapt(english_text)
vectorization_layer_es.adapt(french_text)


In [336]:
# Display the first ten entries of the English vocabulary.
vectorization_layer_en.get_vocabulary()[:10]

['', '[UNK]', 'the', 'of', 'and', 'to', 'in', 'a', 'for', 'is']

In [337]:
# Display the first ten entries of the vocabulary held by the "_es" layer
# (adapted on the French text plus the startofseq/endofseq markers).
vectorization_layer_es.get_vocabulary()[:10]

['', '[UNK]', 'de', 'startofseq', 'endofseq', 'la', 'et', 'les', 'des', 'le']

In [311]:
# Vectorize only the first batch of targets as a sanity check.
# Pulling a single element from the iterator avoids materializing every batch
# of the dataset in a Python list just to index element 0.
first_target_batch = next(iter(example_set.map(lambda sentences, target: target)))
vectorization_layer_es(first_target_batch)

<tf.Tensor: shape=(4, 50), dtype=int64, numpy=
array([[18,  1,  1,  3,  1,  4,  1,  1, 10,  1,  1,  1,  1,  1,  1,  1,
         8,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0],
       [ 1,  1,  1, 11,  2,  1,  1, 10,  1, 11,  1,  1,  1,  1,  1,  1,
         1,  8,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0],
       [ 1,  1,  1, 19,  1,  1,  1, 11,  1, 10,  1, 28,  2,  1, 11,  1,
         1, 65,  8,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0],
       [ 1,  1,  1,  1,  1,  1,  1, 10,  1,  1,  5,  1, 11,  1,  5, 11,
         1,  4, 18,  1, 19,  1,  3,  1, 10,  1,  1,  8,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0]], dtype=int64)>

In [338]:
# Replace the string targets with their token ids, leaving the inputs as text.
# NOTE: this rebinds example_set to a transformed version of itself, so the
# cell is not idempotent -- run it exactly once after (re)creating example_set.
example_set = example_set.map(lambda sentences, target: (sentences, vectorization_layer_es(target)))

In [327]:
# Inspect just the first batch; listing the whole dataset would iterate over
# every batch and flood the cell output.
list(example_set.take(1))

[((<tf.Tensor: shape=(4,), dtype=string, numpy=
   array([b'In 1974, the Council combined its astronomy and spectroscopy units and created the Herzberg Institute of Astrophysics, where he worked until his retirement in 1995.',
          b'Organized the distribution of a survey of the Assembl\xc3\xa9e communautaire fransaskoise (ACF) to employees of this office willing to complete it.',
          b'Brochures will be distributed to the Francophone community and more elaborate electronic versions will be available on the Internet.',
          b'L AN'], dtype=object)>,
   <tf.Tensor: shape=(4,), dtype=string, numpy=
   array([b"startofseq En 1974, le Conseil fusionne les unit\xc3\xa9s d'astronomie et de spectroscopie et cr\xc3\xa9e en son honneur l'Institut Herzberg d'Astrophysique ; il y travaille jusqu'\xc3\xa0 sa retraite, en 1995.",
          b"startofseq Organisation de la distribution d'un sondage de l'Assembl\xc3\xa9e communautaire fransaskoise (ACF) aux employ\xc3\xa9s de ce bureau

In [339]:
# Encoder-decoder graph: raw strings -> token ids -> embeddings -> LSTMs ->
# per-step softmax over the vocabulary.
# NOTE: layers are created after the seed is set; keep the creation order
# stable if reproducible initial weights matter.
tf.random.set_seed(42)  # extra code – ensures reproducibility on CPU
# Each example is a single scalar string (shape=[]) for both inputs.
encoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)
decoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)

embed_size = 128
# Turn the raw text into fixed-length sequences of token ids.
# (vectorization_layer_es is the layer adapted on the French text.)
encoder_input_ids = vectorization_layer_en(encoder_inputs)
decoder_input_ids = vectorization_layer_es(decoder_inputs)
# mask_zero=True marks id 0 as padding so it is masked downstream.
encoder_embedding_layer = tf.keras.layers.Embedding(
    vocabulary_size, embed_size, mask_zero=True
)
decoder_embedding_layer = tf.keras.layers.Embedding(
    vocabulary_size, embed_size, mask_zero=True
)
encoder_embeddings = encoder_embedding_layer(encoder_input_ids)
decoder_embeddings = decoder_embedding_layer(decoder_input_ids)

# return_state=True makes the layer return its final state(s) after the
# output; the starred target collects them into a list.
encoder = tf.keras.layers.LSTM(512, return_state=True)
encoder_outputs, *encoder_state = encoder(encoder_embeddings)

# The decoder starts from the encoder's final state and emits an output at
# every time step (return_sequences=True).
decoder = tf.keras.layers.LSTM(512, return_sequences=True)
decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)

# Probability distribution over the vocabulary at each decoder step.
output_layer = tf.keras.layers.Dense(vocabulary_size, activation="softmax")
Y_proba = output_layer(decoder_outputs)


In [340]:
# Wire the two string inputs to the per-step token probabilities.
model = tf.keras.Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=[Y_proba],
)
# Targets are integer token ids, hence the sparse cross-entropy loss.
model.compile(
    optimizer="nadam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
# Single pass over the small example dataset as a smoke test.
model.fit(example_set, epochs=1)




<keras.callbacks.History at 0x206317b0cd0>