In [16]:
from pathlib import Path
from multiprocessing import cpu_count
from concurrent.futures import ThreadPoolExecutor
import subprocess
import shutil
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from keras import layers

ON_KAGGLE = os.getenv("KAGGLE_KERNEL_RUN_TYPE") is not None


def download_dataset_from_kaggle(user, dataset, directory):
    command = "kaggle datasets download -d "
    filepath = directory / (dataset + ".zip")

    if not filepath.is_file():
        subprocess.run((command + user + "/" + dataset).split())
        filepath.parent.mkdir(parents=True, exist_ok=True)
        shutil.unpack_archive(dataset + ".zip", "data")
        shutil.move(dataset + ".zip", "data")


In [32]:
%%time

easy_dataset_user = "devicharith"
easy_dataset = "language-translation-englishfrench"

hard_dataset_user = "dhruvildave"
hard_dataset = "en-fr-translation-dataset"

data_dir = Path("data")

if not ON_KAGGLE:
    download_dataset_from_kaggle(easy_dataset_user, easy_dataset, data_dir)
    download_dataset_from_kaggle(hard_dataset_user, hard_dataset, data_dir)
    easy_dataset_path = data_dir / "eng_-french.csv"
    hard_dataset_path = data_dir / "en-fr.csv"
else:
    easy_dataset_path = Path("/kaggle/input/language-translation-englishfrench/eng_-french.csv")
    hard_dataset_path = Path("/kaggle/input/en-fr-translation-dataset/en-fr.csv")


CPU times: total: 0 ns
Wall time: 999 µs


In [40]:
easy_dataset = pd.read_csv(easy_dataset_path, encoding="utf-8", engine="pyarrow")
easy_dataset = easy_dataset.sample(len(easy_dataset), random_state=42)
easy_dataset.head()

Unnamed: 0,English words/sentences,French words/sentences
2785,Take a seat.,Prends place !
29880,I wish Tom was here.,J'aimerais que Tom soit là.
53776,How did the audition go?,Comment s'est passée l'audition ?
154386,I've no friend to talk to about my problems.,Je n'ai pas d'ami avec lequel je puisse m'entr...
149823,I really like this skirt. Can I try it on?,"J'aime beaucoup cette jupe, puis-je l'essayer ?"


In [68]:
sentences_en = easy_dataset["English words/sentences"].to_numpy()
sentences_fr = easy_dataset["French words/sentences"].to_numpy()

validation_size = 0.1
valid_len = int(validation_size * len(easy_dataset))

sentences_en_train = sentences_en[:-valid_len]
sentences_fr_train = sentences_fr[:-valid_len]

sentences_en_valid = sentences_en[-valid_len:]
sentences_fr_valid = sentences_fr[-valid_len:]

print(sentences_en_train.shape)
print(sentences_en_valid.shape)


(158059,)
(17562,)


In [69]:
def prepare_input_and_target(sentences_en, sentences_fr):
    return (sentences_en, b"startofseq " + sentences_fr), sentences_fr + b" endofseq"


def from_sentences_dataset(
    sentences_en,
    sentences_fr,
    batch_size=32,
    shuffle=False,
    shuffle_buffer_size=10_000,
    seed=None,
):
    dataset = tf.data.Dataset.from_tensor_slices((sentences_en, sentences_fr))
    dataset = dataset.map(prepare_input_and_target)
    if shuffle:
        dataset = dataset.shuffle(shuffle_buffer_size, seed=seed)
    return dataset.batch(batch_size)


In [75]:
easy_train_ds = from_sentences_dataset(sentences_en_train, sentences_fr_train)
easy_valid_ds = from_sentences_dataset(sentences_en_valid, sentences_fr_valid)

# list(easy_train_ds.take(1))


In [77]:
keras.backend.clear_session()
tf.random.set_seed(42)  # Ensure reproducibility on CPU.

easy_train_ds = from_sentences_dataset(sentences_en_train, sentences_fr_train)
easy_valid_ds = from_sentences_dataset(sentences_en_valid, sentences_fr_valid)

basic_encoder_decoder = BasicEncoderDecoder()
history = adapt_compile_and_fit(basic_encoder_decoder, easy_train_ds, easy_valid_ds, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


---

In [18]:
%%time

chunk_size = 100_000
chunks_dir = Path("data_chunks")

if not os.path.exists(chunks_dir):
    chunks_dir.mkdir(parents=True)
    chunks = pd.read_csv(full_data_path, chunksize=chunk_size, encoding="utf-8")
    for i, chunk in enumerate(chunks):
        chunk_path = chunks_dir / f"en-fr-chunk-{i:03}.csv"
        chunk.to_csv(chunk_path, index=False, encoding="utf-8")


CPU times: total: 0 ns
Wall time: 0 ns


In [19]:
filepaths = [f"{chunks_dir}/{chunk_file}" for chunk_file in os.listdir(chunks_dir)]

with open(filepaths[0], encoding="utf8") as f:
    for line in f.readlines()[:5]:
        print(line, end="")


en,fr
Changing Lives | Changing Society | How It Works | Technology Drives Change Home | Concepts | Teachers | Search | Overview | Credits | HHCC Web | Reference | Feedback Virtual Museum of Canada Home Page,"Il a transformé notre vie | Il a transformé la société | Son fonctionnement | La technologie, moteur du changement Accueil | Concepts | Enseignants | Recherche | Aperçu | Collaborateurs | Web HHCC | Ressources | Commentaires Musée virtuel du Canada"
Site map,Plan du site
Feedback,Rétroaction
Credits,Crédits


In [20]:
def parse_csv_line(line):
    defaults = 2 * [tf.constant("", dtype=tf.string)]
    fields = tf.io.decode_csv(line, record_defaults=defaults)
    return tf.stack(fields[0]), tf.stack(fields[1])


def prepare_input_and_target(sentences_en, sentences_fr):
    return (sentences_en, b"startofseq " + sentences_fr), sentences_fr + b" endofseq"


def from_csv_files_dataset(
    filepaths,
    n_readers=12,
    n_read_threads=12,
    n_parse_threads=12,
    batch_size=32,
    shuffle=False,
    shuffle_buffer_size=10_000,
    seed=42,
):
    dataset = tf.data.Dataset.list_files(filepaths, seed=seed)
    dataset = dataset.interleave(  # type: ignore
        lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
        cycle_length=n_readers,
        num_parallel_calls=n_read_threads,
    )
    dataset = dataset.map(parse_csv_line, num_parallel_calls=n_parse_threads)
    dataset = dataset.map(prepare_input_and_target, num_parallel_calls=n_parse_threads)
    if shuffle:
        dataset = dataset.shuffle(shuffle_buffer_size, seed=seed)
    return dataset.batch(batch_size)


In [21]:
class BasicEncoderDecoder(keras.Model):
    def __init__(
        self,
        vocabulary_size=5000,
        max_length=50,
        embedding_size=128,
        n_units_lstm=512,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.vectorization_layer_en = layers.TextVectorization(
            vocabulary_size, output_sequence_length=max_length
        )
        self.vectorization_layer_es = layers.TextVectorization(
            vocabulary_size, output_sequence_length=max_length
        )

        self.encoder_embedding_layer = layers.Embedding(
            vocabulary_size, embedding_size, mask_zero=True
        )
        self.decoder_embedding_layer = layers.Embedding(
            vocabulary_size, embedding_size, mask_zero=True
        )

        self.encoder = layers.LSTM(n_units_lstm, return_state=True)
        self.decoder = layers.LSTM(n_units_lstm, return_sequences=True)

        self.output_layer = layers.Dense(vocabulary_size, activation="softmax")

    def call(self, inputs):
        encoder_inputs, decoder_inputs = inputs

        encoder_input_ids = self.vectorization_layer_en(encoder_inputs)
        decoder_input_ids = self.vectorization_layer_es(decoder_inputs)

        encoder_embeddings = self.encoder_embedding_layer(encoder_input_ids)
        decoder_embeddings = self.decoder_embedding_layer(decoder_input_ids)

        encoder_output, *encoder_state = self.encoder(encoder_embeddings)
        decoder_output = self.decoder(decoder_embeddings, initial_state=encoder_state)

        return self.output_layer(decoder_output)


In [22]:
def adapt_compile_and_fit(model, train_dataset, valid_dataset, epochs=5):
    model.vectorization_layer_en.adapt(
        train_dataset.map(lambda sentences, target: sentences[0])
    )
    model.vectorization_layer_es.adapt(
        valid_dataset.map(lambda sentences, target: sentences[1] + b" endofseq")
    )

    train_dataset_prepared = train_dataset.map(
        lambda sentences, target: (sentences, model.vectorization_layer_es(target))
    ).prefetch(1)

    valid_dataset_prepared = valid_dataset.map(
        lambda sentences, target: (sentences, model.vectorization_layer_es(target))
    ).prefetch(1)

    model.compile(
        loss="sparse_categorical_crossentropy", optimizer="nadam", metrics=["accuracy"]
    )
    history = model.fit(
        train_dataset_prepared, epochs=epochs, validation_data=valid_dataset_prepared
    )

    return history


In [23]:
keras.backend.clear_session()
tf.random.set_seed(42)  # Ensure reproducibility on CPU.

train_ds = from_csv_files_dataset(filepaths[0:1], shuffle=True)
valid_ds = from_csv_files_dataset(filepaths[1:2])

basic_encoder_decoder = BasicEncoderDecoder()
history = adapt_compile_and_fit(basic_encoder_decoder, train_ds, valid_ds, epochs=5)



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [24]:
def translate(model, sentence_en):
    translation = ""
    for word_idx in range(50):
        X = np.array([sentence_en])  # encoder input
        X_dec = np.array(["startofseq " + translation])  # decoder input
        y_proba = model.predict((X, X_dec))[0, word_idx]  # last token's probas
        predicted_word_id = np.argmax(y_proba)
        predicted_word = model.vectorization_layer_es.get_vocabulary()[
            predicted_word_id
        ]
        if predicted_word == "endofseq":
            break
        translation += " " + predicted_word
    return translation.strip()


In [25]:
translate(basic_encoder_decoder, "Chocolate spreads have seen the largest")


'les [UNK] de [UNK] ont été [UNK]'