# Data Preparation for Machine Translation

Library imports for the Jupyter notebook. We are referring to this [blog](https://medium.com/@hunter-j-phillips/putting-it-all-together-the-implemented-transformer-bfb11ac1ddfe) to understand the attention network in depth.

In [1]:
!pip install -q portalocker

# importing required libraries
import math
import copy
import time
import random
import spacy
import numpy as np
import os

# torch packages
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
import torch.optim as optim

# load and build datasets
import torchtext
from torchtext.data.functional import to_map_style_dataset
from torch.nn.functional import pad
from torch.utils.data import DataLoader
from torchtext.vocab import build_vocab_from_iterator
import torchtext.datasets as datasets
import portalocker

# visualization packages
from mpl_toolkits import mplot3d
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
device

device(type='cpu')

## Reading the dataframe and converting it into an iterable for use in a PyTorch dataset

In [3]:
import pandas as pd
from tqdm import tqdm

In [4]:
# Load the training sentence pairs from the Kaggle input directory;
# head() previews the first rows.
df = pd.read_csv(
    "/kaggle/input/machine-translation-dataset-de-en/translation_train.csv"
)
df.head()

Unnamed: 0,english,german
0,"Two young, White males are outside near many b...",Zwei junge weiße Männer sind im Freien in der ...
1,Several men in hard hats are operating a giant...,Mehrere Männer mit Schutzhelmen bedienen ein A...
2,A little girl climbing into a wooden playhouse.,Ein kleines Mädchen klettert in ein Spielhaus ...
3,A man in a blue shirt is standing on a ladder ...,Ein Mann in einem blauen Hemd steht auf einer ...
4,Two men are at the stove preparing food.,Zwei Männer stehen am Herd und bereiten Essen zu.


In [5]:
df.shape

(29000, 2)

In [6]:
# Perform train - val split: sample 95% of the rows for training (seeded via
# random_state) and keep the rows whose index labels were not sampled for
# validation.
# NOTE(review): drop() removes by index label, so this assumes the index of
# df is unique — confirm.
train_df = df.sample(frac=0.95, random_state=200)
val_df = df.drop(train_df.index)

In [7]:
def create_iterable(sample_df):
    """
    Turn a dataframe of sentence pairs into a list of tuples.

    Args:
        sample_df:    dataframe with "german" and "english" columns

    Returns:
        list of (german sentence, english sentence) tuples, one per row and
        in row order; the length of the list is also printed
    """
    pairs = []
    for record in sample_df.to_dict(orient="records"):
        # German goes first: the vocabulary code reads index 0 as German
        # and index 1 as English
        pairs.append((record["german"], record["english"]))
    print(f"length of iterable: {len(pairs)}")
    return pairs

In [8]:
# Convert the train / validation dataframes into lists of
# (german, english) sentence tuples
train_iter = create_iterable(train_df)
val_iter = create_iterable(val_df)

length of iterable: 27550
length of iterable: 1450


In [9]:
# Load the test sentence pairs from the Kaggle input directory;
# head() previews the first rows.
test_df = pd.read_csv(
    "/kaggle/input/machine-translation-dataset-de-en/translation_test.csv"
)
test_df.head()

Unnamed: 0,english,german
0,A man in an orange hat starring at something.,"Ein Mann mit einem orangefarbenen Hut, der etw..."
1,A Boston Terrier is running on lush green gras...,Ein Boston Terrier läuft über saftig-grünes Gr...
2,A girl in karate uniform breaking a stick with...,Ein Mädchen in einem Karateanzug bricht einen ...
3,Five people wearing winter jackets and helmets...,Fünf Leute in Winterjacken und mit Helmen steh...
4,People are fixing the roof of a house.,Leute Reparieren das Dach eines Hauses.


In [10]:
test_iter = create_iterable(test_df)

length of iterable: 1000


## Create Vocab from data sources

In [11]:
def _load_spacy_model(name: str):
    """
    Load a spaCy pipeline, downloading it first if it is not installed.

    Args:
        name:         spaCy model package name, e.g. "de_core_news_sm"

    Returns:
        the loaded spaCy pipeline

    Raises:
        OSError:      if the model still cannot be loaded after the download
                      attempt
    """
    import subprocess
    import sys

    try:
        return spacy.load(name)
    except OSError:
        # Download with the interpreter that is running this code rather than
        # whichever "python" happens to be first on PATH, so the model lands
        # in the environment spacy.load() searches. The argument list form
        # avoids going through a shell.
        subprocess.run([sys.executable, "-m", "spacy", "download", name])
        return spacy.load(name)


def load_tokenizers():
    """
    Load the German and English tokenizers provided by spaCy.

    A model that is not installed yet is downloaded once and then loaded.

    Returns:
        spacy_de:     German tokenizer
        spacy_en:     English tokenizer
    """
    spacy_de = _load_spacy_model("de_core_news_sm")
    spacy_en = _load_spacy_model("en_core_web_sm")

    print("Loaded English and German tokenizers.")
    return spacy_de, spacy_en

In [12]:
def tokenize(text: str, tokenizer):
    """
    Break a string into lower-cased tokens.

    Args:
        text:         string to split
        tokenizer:    object whose `.tokenizer` attribute is called on the
                      text (a loaded spaCy pipeline in this notebook)

    Returns:
        list of lower-cased token strings, in order of appearance
    """
    lowered = []
    for piece in tokenizer.tokenizer(text):
        lowered.append(piece.text.lower())
    return lowered

In [13]:
def yield_tokens(data_iter, tokenizer, index: int):
    """
    Lazily tokenize one language's sentence from every tuple.

    Args:
        data_iter:    iterable of sentence tuples
        tokenizer:    callable applied to the selected sentence
        index:        index of the language in the tuple | (de=0, en=1)

    Yields:
        the tokenizer output for element `index` of each tuple
    """
    yield from (tokenizer(pair[index]) for pair in data_iter)

In [14]:
def build_vocabulary(
    spacy_de, spacy_en, train_iter, val_iter, test_iter, min_freq: int = 2
):
    """
    Build the German (source) and English (target) vocabularies.

    Both vocabularies are built from the concatenation of the train,
    validation and test pairs, start with the same special tokens, and map
    out-of-vocabulary words to "<unk>".

    Args:
        spacy_de:     German tokenizer
        spacy_en:     English tokenizer
        train_iter:   list of (German, English) training sentence pairs
        val_iter:     list of (German, English) validation sentence pairs
        test_iter:    list of (German, English) test sentence pairs
        min_freq:     minimum frequency needed to include a word in either
                      vocabulary

    Returns:
        vocab_src:    German vocabulary
        vocab_trg:    English vocabulary
    """

    def tokenize_de(text: str):
        """
        Call the German tokenizer.

        Args:
            text:         string

        Returns:
            tokenized list of strings
        """
        return tokenize(text, spacy_de)

    def tokenize_en(text: str):
        """
        Call the English tokenizer.

        Args:
            text:         string

        Returns:
            tokenized list of strings
        """
        return tokenize(text, spacy_en)

    specials = ("<bos>", "<eos>", "<pad>", "<unk>")

    # concatenate the three splits once and reuse the result for both languages
    corpus = train_iter + val_iter + test_iter

    print("Building German Vocabulary...")

    # generate source vocabulary
    vocab_src = build_vocab_from_iterator(
        yield_tokens(
            corpus, tokenize_de, index=0
        ),  # tokens for each German sentence (index 0)
        min_freq=min_freq,
        specials=list(specials),
    )

    print("Building English Vocabulary...")

    # generate target vocabulary
    vocab_trg = build_vocab_from_iterator(
        yield_tokens(
            corpus, tokenize_en, index=1
        ),  # tokens for each English sentence (index 1)
        min_freq=min_freq,  # honour the caller's threshold (was hard-coded to 2)
        specials=list(specials),
    )

    # set default token for out-of-vocabulary words (OOV)
    vocab_src.set_default_index(vocab_src["<unk>"])
    vocab_trg.set_default_index(vocab_trg["<unk>"])

    return vocab_src, vocab_trg

In [15]:
def load_vocab(spacy_de, spacy_en, train_iter, val_iter, test_iter, min_freq: int = 2):
    """
    Return the German/English vocabularies, using "vocab.pt" as a cache.

    If "vocab.pt" exists in the working directory it is loaded; otherwise the
    vocabularies are built from the given data and saved to that file. The
    vocabulary sizes are printed in both cases.

    Args:
        spacy_de:     German tokenizer
        spacy_en:     English tokenizer
        train_iter:   training sentence pairs (used only when building)
        val_iter:     validation sentence pairs (used only when building)
        test_iter:    test sentence pairs (used only when building)
        min_freq:     minimum frequency needed to include a word in the vocabulary

    Returns:
        vocab_src:    German vocabulary
        vocab_trg:    English vocabulary
    """
    cache_path = "vocab.pt"

    if os.path.exists(cache_path):
        # NOTE(review): an existing cache is reused as-is, regardless of the
        # data or min_freq passed in; delete the file to force a rebuild.
        vocab_src, vocab_trg = torch.load(cache_path)
    else:
        # nothing cached yet: build both vocabularies and store them
        vocab_src, vocab_trg = build_vocabulary(
            spacy_de, spacy_en, train_iter, val_iter, test_iter, min_freq
        )
        torch.save((vocab_src, vocab_trg), cache_path)

    print("Finished.\nVocabulary sizes:")
    print("\tSource:", len(vocab_src))
    print("\tTarget:", len(vocab_trg))
    return vocab_src, vocab_trg

In [16]:
# global variables used later in the script
spacy_de, spacy_en = load_tokenizers()

Collecting de-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.7.0/de_core_news_sm-3.7.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m82.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')
Loaded English and German tokenizers.


In [17]:
vocab_src, vocab_trg = load_vocab(spacy_de, spacy_en, train_iter, val_iter, test_iter)

Building German Vocabulary...
Building English Vocabulary...
Finished.
Vocabulary sizes:
	Source: 7983
	Target: 5979


In [18]:
# Indices of the special tokens, read from the target (English) vocabulary.
# NOTE(review): generate_batch applies these to the German sequences as well;
# that relies on both vocabularies being built with the same `specials` list
# so the indices coincide — confirm.
BOS_IDX = vocab_trg["<bos>"]
EOS_IDX = vocab_trg["<eos>"]
PAD_IDX = vocab_trg["<pad>"]

## Preprocess the data

In [19]:
def data_process(raw_data):
    """
    Convert raw sentence pairs into pairs of vocabulary-index tensors.

    Each sentence is tokenized with the spaCy tokenizer of its language,
    every token is lower-cased and looked up in the matching vocabulary
    (`vocab_src` for German, `vocab_trg` for English).

    Args:
        raw_data:     German-English sentence pairs
    Returns:
        data:         list of (German tensor, English tensor) pairs holding
                      the vocabulary indices as torch.long
    """

    def to_indices(pipeline, vocab, sentence):
        # tokenize the sentence and convert each lower-cased word to an integer
        ids = [vocab[tok.text.lower()] for tok in pipeline.tokenizer(sentence)]
        return torch.tensor(ids, dtype=torch.long)

    # one (German, English) tensor pair per sentence pair, with a progress bar
    return [
        (
            to_indices(spacy_de, vocab_src, raw_de),
            to_indices(spacy_en, vocab_trg, raw_en),
        )
        for raw_de, raw_en in tqdm(raw_data)
    ]

In [20]:
# processed data: each split becomes a list of (German, English) index-tensor
# pairs; the printed "shape" is the number of sentence pairs in the split
train_data = data_process(train_iter)
print(f"Train data shape: {len(train_data)}")
val_data = data_process(val_iter)
print(f"Val data shape: {len(val_data)}")
test_data = data_process(test_iter)
print(f"Test data shape: {len(test_data)}")

100%|██████████| 27550/27550 [00:02<00:00, 9778.26it/s]


Train data shape: 27550


100%|██████████| 1450/1450 [00:00<00:00, 10145.84it/s]


Val data shape: 1450


100%|██████████| 1000/1000 [00:00<00:00, 9048.54it/s]

Test data shape: 1000





In [21]:
len(train_data[2][0])

14

## Create collate function

In [22]:
def generate_batch(data_batch):
    """
    Process indexed-sequences by adding <bos>, <eos>, and <pad> tokens.

    Every sentence is wrapped in BOS_IDX / EOS_IDX and brought to exactly
    MAX_PADDING entries, then the sentences of each language are stacked.

    Args:
        data_batch:     German-English indexed-sentence pairs

    Returns:
        two batches:    one for German and one for English, each stacked to
                        (len(data_batch), MAX_PADDING) and placed on `device`
    """
    # build the marker tensors once per batch instead of once per sentence
    bos = torch.tensor([BOS_IDX])
    eos = torch.tensor([EOS_IDX])

    def wrap_and_pad(item):
        # add <bos> and <eos> indices before and after the sentence
        framed = torch.cat([bos, item, eos], dim=0)
        # pad on the right up to MAX_PADDING entries.
        # NOTE(review): when the framed sentence is longer than MAX_PADDING
        # the pad amount is negative, which crops the tail of the sentence
        # (including <eos>) — confirm this truncation is intended.
        return pad(framed, (0, MAX_PADDING - len(framed)), value=PAD_IDX)

    de_batch = [wrap_and_pad(de_item) for de_item, _ in data_batch]
    en_batch = [wrap_and_pad(en_item) for _, en_item in data_batch]

    # stack first, then move each batch to the device in a single transfer
    # rather than transferring every sentence separately
    return torch.stack(de_batch).to(device), torch.stack(en_batch).to(device)

## Create the data loaders for training

In [23]:
# Every sequence is padded (or cropped) to MAX_PADDING entries by
# generate_batch; BATCH_SIZE is the number of sentence pairs per batch.
MAX_PADDING = 20
BATCH_SIZE = 128

# NOTE: train_iter and test_iter previously held the raw sentence-pair lists;
# from here on they refer to DataLoaders.

# Training: reshuffle every epoch and drop the final partial batch.
train_iter = DataLoader(
    to_map_style_dataset(train_data),
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=True,
    collate_fn=generate_batch,
)

# Validation: keep a fixed order and keep the final partial batch, so every
# sample is evaluated and the metric is comparable between runs.
valid_iter = DataLoader(
    to_map_style_dataset(val_data),
    batch_size=BATCH_SIZE,
    shuffle=False,
    drop_last=False,
    collate_fn=generate_batch,
)

# Test: same reasoning as validation — evaluate every sample, in a fixed order.
test_iter = DataLoader(
    to_map_style_dataset(test_data),
    batch_size=BATCH_SIZE,
    shuffle=False,
    drop_last=False,
    collate_fn=generate_batch,
)