In [1]:
from pathlib import Path
from multiprocessing import cpu_count
from concurrent.futures import ThreadPoolExecutor
import subprocess
import shutil
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from keras import layers

# True when the KAGGLE_KERNEL_RUN_TYPE environment variable is set (presumably
# only inside a Kaggle kernel); later cells use it to choose between the
# mounted Kaggle input path and a local download.
ON_KAGGLE = "KAGGLE_KERNEL_RUN_TYPE" in os.environ


def download_dataset_from_kaggle(user, dataset, directory):
    """Download and unpack a Kaggle dataset unless it is already present.

    The archive ``<directory>/<dataset>.zip`` doubles as the "already done"
    marker: it is only moved into ``directory`` after a successful extraction,
    so a failed run is retried on the next call.

    Args:
        user: Kaggle account that owns the dataset.
        dataset: Dataset slug; the CLI is expected to write ``<dataset>.zip``
            into the current working directory.
        directory: Destination folder (``str`` or ``Path``) for both the
            extracted files and the archive.

    Raises:
        subprocess.CalledProcessError: If the ``kaggle`` CLI exits non-zero.
    """
    directory = Path(directory)
    archive_name = dataset + ".zip"
    filepath = directory / archive_name

    if filepath.is_file():
        return

    # Pass the arguments as a list so names are never re-split on whitespace,
    # and fail loudly instead of trying to unpack a file that was never written.
    subprocess.run(
        ["kaggle", "datasets", "download", "-d", f"{user}/{dataset}"],
        check=True,
    )

    directory.mkdir(parents=True, exist_ok=True)
    # Extract into the requested directory (not a hard-coded "data" folder) so
    # the existence check above matches where the files actually end up.
    shutil.unpack_archive(archive_name, str(directory))
    shutil.move(archive_name, str(filepath))


In [2]:
%%time

user = "dhruvildave"
dataset = "en-fr-translation-dataset"
data_dir = Path("data")

if ON_KAGGLE:
    # On Kaggle the dataset is read from the mounted input directory.
    full_data_path = Path("/kaggle/input/en-fr-translation-dataset/en-fr.csv")
else:
    # Elsewhere, fetch it through the Kaggle CLI (skipped when already present).
    download_dataset_from_kaggle(user, dataset, data_dir)
    full_data_path = data_dir / "en-fr.csv"


CPU times: total: 0 ns
Wall time: 0 ns


In [3]:
%%time

chunk_size = 100_000
data_chunks = Path("data_chunks")

# Split the full CSV into files of `chunk_size` rows each, once.
if not data_chunks.exists():
    # Write into a sibling staging directory and rename it only after the last
    # chunk is on disk. Otherwise an interrupted run would leave a half-filled
    # `data_chunks` that the guard above treats as a finished cache forever.
    staging_dir = data_chunks.with_name(data_chunks.name + ".partial")
    shutil.rmtree(staging_dir, ignore_errors=True)  # leftovers of a failed run
    staging_dir.mkdir(parents=True)

    chunks = pd.read_csv(full_data_path, chunksize=chunk_size, encoding="utf-8")
    for i, chunk in enumerate(chunks):
        chunk_path = staging_dir / f"en-fr-chunk-{i:03}.csv"
        chunk.to_csv(chunk_path, index=False, encoding="utf-8")

    staging_dir.rename(data_chunks)


CPU times: total: 0 ns
Wall time: 0 ns


In [4]:
# os.listdir returns entries in arbitrary order, so sort the names to make the
# train/validation split below pick the same chunk files on every run.
filepaths = [
    f"{data_chunks}/{chunk_file}"
    for chunk_file in sorted(os.listdir(data_chunks))
    if chunk_file.endswith(".csv")
]
train_paths = filepaths[:1]
valid_paths = filepaths[1:2]

# Preview the header plus the first rows without loading the whole chunk:
# zip() stops after five lines instead of reading every line into a list.
with open(train_paths[0], encoding="utf8") as f:
    for _, line in zip(range(5), f):
        print(line, end="")


en,fr
Changing Lives | Changing Society | How It Works | Technology Drives Change Home | Concepts | Teachers | Search | Overview | Credits | HHCC Web | Reference | Feedback Virtual Museum of Canada Home Page,"Il a transformé notre vie | Il a transformé la société | Son fonctionnement | La technologie, moteur du changement Accueil | Concepts | Enseignants | Recherche | Aperçu | Collaborateurs | Web HHCC | Ressources | Commentaires Musée virtuel du Canada"
Site map,Plan du site
Feedback,Rétroaction
Credits,Crédits


In [6]:
def parse_csv_line(line):
    """Decode one two-column CSV line into a pair of string tensors.

    Args:
        line: A string tensor holding a single CSV record. In this notebook
            the columns are ``en,fr`` (see the header printed above).

    Returns:
        A ``(first_field, second_field)`` tuple of string tensors; a missing
        field falls back to the empty string.
    """
    empty_string = tf.constant("", dtype=tf.string)
    first_field, second_field = tf.io.decode_csv(
        line, record_defaults=[empty_string, empty_string]
    )
    return tf.stack(first_field), tf.stack(second_field)


def prepare_input_and_target(sentences_en, sentences_fr):
    """Build the model inputs and training target for a sentence pair.

    Args:
        sentences_en: Source sentence(s), passed through unchanged.
        sentences_fr: Target sentence(s).

    Returns:
        ``((sentences_en, decoder_input), target)`` where ``decoder_input`` is
        the target prefixed with ``"startofseq "`` and ``target`` is the target
        suffixed with ``" endofseq"``.
    """
    decoder_input = b"startofseq " + sentences_fr
    target = sentences_fr + b" endofseq"
    return (sentences_en, decoder_input), target


def csv_files_dataset(
    filepaths,
    n_readers=12,
    n_read_threads=12,
    n_parse_threads=12,
    batch_size=32,
    shuffle=False,
    shuffle_buffer_size=10_000,
    seed=42,
):
    """Create a batched ``tf.data`` pipeline over CSV chunk files.

    Args:
        filepaths: File path(s) or pattern(s) handed to ``Dataset.list_files``.
        n_readers: ``cycle_length`` of the interleave over files.
        n_read_threads: ``num_parallel_calls`` of the interleave.
        n_parse_threads: ``num_parallel_calls`` of both ``map`` stages.
        batch_size: Number of examples per batch.
        shuffle: When true, shuffle examples before batching.
        shuffle_buffer_size: Buffer size used for that shuffle.
        seed: Seed passed to ``list_files`` and to the example shuffle.

    Returns:
        A dataset yielding batches of
        ``((sentences_en, decoder_input), target)`` string tensors.
    """

    def read_lines_without_header(filepath):
        # skip(1) drops the first line of each file (the CSV header row).
        return tf.data.TextLineDataset(filepath).skip(1)

    files = tf.data.Dataset.list_files(filepaths, seed=seed)
    lines = files.interleave(  # type: ignore
        read_lines_without_header,
        cycle_length=n_readers,
        num_parallel_calls=n_read_threads,
    )
    sentence_pairs = lines.map(parse_csv_line, num_parallel_calls=n_parse_threads)
    examples = sentence_pairs.map(
        prepare_input_and_target, num_parallel_calls=n_parse_threads
    )
    if shuffle:
        examples = examples.shuffle(shuffle_buffer_size, seed=seed)
    return examples.batch(batch_size)


In [27]:
# Training examples are shuffled; validation keeps the default shuffle=False.
train_ds = csv_files_dataset(train_paths, shuffle=True)
valid_ds = csv_files_dataset(valid_paths)


In [8]:
vocabulary_size = 5000
max_length = 50

# Both text-vectorization layers share the same vocabulary cap and output length.
vectorizer_config = dict(max_tokens=vocabulary_size, output_sequence_length=max_length)
vectorization_layer_en = layers.TextVectorization(**vectorizer_config)
# NOTE(review): despite the `_es` suffix this layer is adapted on the French
# side of the en-fr dataset; the name is kept because later cells refer to it.
vectorization_layer_es = layers.TextVectorization(**vectorizer_config)


def select_encoder_text(sentences, target):
    """Return the English sentences from a ``((en, decoder_input), target)`` batch."""
    return sentences[0]


def select_decoder_text(sentences, target):
    """Return the decoder input with " endofseq" appended, so both markers are seen by adapt()."""
    return sentences[1] + b" endofseq"


vectorization_layer_en.adapt(train_ds.map(select_encoder_text))
vectorization_layer_es.adapt(train_ds.map(select_decoder_text))


In [9]:
# Show the first ten entries of the vocabulary learned by the English layer.
vectorization_layer_en.get_vocabulary()[:10]

['', '[UNK]', 'the', 'of', 'and', 'to', 'in', 'a', 'for', 'is']

In [10]:
# Show the first ten entries of the target-side vocabulary; it should include
# the "startofseq" / "endofseq" markers the layer was adapted with.
vectorization_layer_es.get_vocabulary()[:10]

['', '[UNK]', 'de', 'startofseq', 'endofseq', 'la', 'et', 'les', 'des', 'le']

In [88]:
# Vectorize the first batch of targets as a sanity check. next(iter(...)) pulls
# exactly one batch, whereas list(...) would read and hold every batch of the
# training set in memory just to use the first one.
first_target_batch = next(iter(train_ds.map(lambda sentences, target: target)))
vectorization_layer_es(first_target_batch)

<tf.Tensor: shape=(32, 50), dtype=int64, numpy=
array([[   7,  646,    7, ...,    0,    0,    0],
       [1478,    2,  807, ...,    0,    0,    0],
       [  64,   71,    2, ...,    0,    0,    0],
       ...,
       [ 306,  181,    9, ...,    0,    0,    0],
       [   1,    1, 3777, ...,    0,    0,    0],
       [   7,   73,   24, ...,   28,   10, 4271]], dtype=int64)>

In [11]:
def vectorize_target(sentences, target):
    """Keep the string inputs as they are and turn the target text into token ids."""
    return sentences, vectorization_layer_es(target)


# NOTE: both names are rebound from themselves, so running this cell a second
# time would apply the mapping again to the already-mapped datasets.
train_ds = train_ds.map(vectorize_target).prefetch(1)

valid_ds = valid_ds.map(vectorize_target).prefetch(1)


In [23]:
# list(train_ds)

In [13]:
# Encoder-decoder graph: raw strings in, per-step probabilities over the
# target vocabulary out. The statements below are order-sensitive (layers are
# created and called after the seed is set), so keep them in this order.
tf.random.set_seed(42)  # extra code – ensures reproducibility on CPU
# Each example is a single raw string (shape=[] with a string dtype);
# tokenization happens inside the model through the vectorization layers.
encoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)
decoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)

embed_size = 128
# Strings -> token ids (padded/truncated to `max_length` by the layers).
encoder_input_ids = vectorization_layer_en(encoder_inputs)
decoder_input_ids = vectorization_layer_es(decoder_inputs)
# mask_zero=True: id 0 is the '' entry of both vocabularies (see the previews
# above). NOTE(review): per the Keras docs this marks id 0 as padding so that
# downstream layers skip those steps; confirm against the installed version.
encoder_embedding_layer = tf.keras.layers.Embedding(
    vocabulary_size, embed_size, mask_zero=True
)
decoder_embedding_layer = tf.keras.layers.Embedding(
    vocabulary_size, embed_size, mask_zero=True
)
encoder_embeddings = encoder_embedding_layer(encoder_input_ids)
decoder_embeddings = decoder_embedding_layer(decoder_input_ids)

# return_state=True makes the call return the output followed by the state
# tensors; the starred unpacking collects those states into `encoder_state`.
encoder = tf.keras.layers.LSTM(512, return_state=True)
encoder_outputs, *encoder_state = encoder(encoder_embeddings)

# The decoder starts from the encoder's state and, with return_sequences=True,
# emits one output per time step rather than only the last one.
decoder = tf.keras.layers.LSTM(512, return_sequences=True)
decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)

# Softmax over `vocabulary_size` classes, applied to every decoder step.
output_layer = tf.keras.layers.Dense(vocabulary_size, activation="softmax")
Y_proba = output_layer(decoder_outputs)


In [14]:
# Wire the two string inputs to the per-step probabilities, then train for a
# single epoch while evaluating on the validation chunk.
model = tf.keras.Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=[Y_proba],
)
model.compile(
    optimizer="nadam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
model.fit(train_ds, validation_data=valid_ds, epochs=1)




<keras.callbacks.History at 0x1b739dc7af0>

In [15]:
import numpy as np


def translate(sentence_en):
    """Greedily translate one English sentence with the trained model.

    The decoder input starts as ``"startofseq "`` and grows by one predicted
    word per step; the model is re-run on the whole prefix each time and the
    most probable word at the current position is kept.

    Args:
        sentence_en: The English sentence to translate, as a Python string.

    Returns:
        The predicted words joined by spaces. Decoding stops at the first
        ``"endofseq"`` prediction or after ``max_length`` words.
    """
    # Loop-invariant work hoisted out of the decoding loop: the vocabulary
    # lookup table and the encoder input do not change between steps.
    vocabulary = vectorization_layer_es.get_vocabulary()
    X = np.array([sentence_en])  # encoder input

    words = []
    for word_idx in range(max_length):
        X_dec = np.array(["startofseq " + " ".join(words)])  # decoder input
        # verbose=0 keeps predict() from printing a progress bar on each step.
        y_proba = model.predict((X, X_dec), verbose=0)[0, word_idx]
        predicted_word = vocabulary[np.argmax(y_proba)]
        if predicted_word == "endofseq":
            break
        words.append(predicted_word)
    return " ".join(words).strip()


In [22]:
# Try the greedy decoder on an English fragment to eyeball translation quality.
translate("Chocolate spreads have seen the largest")

'les [UNK] de la personne [UNK]'