In [None]:
!pip install wandb

!wget https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar

!tar -xf dakshina_dataset_v1.0.tar

Collecting wandb
  Downloading wandb-0.12.16-py2.py3-none-any.whl (1.8 MB)
[?25l[K     |▏                               | 10 kB 34.6 MB/s eta 0:00:01[K     |▍                               | 20 kB 40.7 MB/s eta 0:00:01[K     |▌                               | 30 kB 33.5 MB/s eta 0:00:01[K     |▊                               | 40 kB 25.0 MB/s eta 0:00:01[K     |█                               | 51 kB 27.5 MB/s eta 0:00:01[K     |█                               | 61 kB 31.0 MB/s eta 0:00:01[K     |█▎                              | 71 kB 22.7 MB/s eta 0:00:01[K     |█▌                              | 81 kB 24.5 MB/s eta 0:00:01[K     |█▋                              | 92 kB 26.4 MB/s eta 0:00:01[K     |█▉                              | 102 kB 27.3 MB/s eta 0:00:01[K     |██                              | 112 kB 27.3 MB/s eta 0:00:01[K     |██▏                             | 122 kB 27.3 MB/s eta 0:00:01[K     |██▍                             | 133 kB 27.3 MB/s eta

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import cv2
import pathlib

DATAPATH = "./dakshina_dataset_v1.0"


def _lexicon_path(split):
    """Return the path of the Hindi sampled-lexicon TSV for `split` ("train", "dev" or "test")."""
    return os.path.join(
        DATAPATH, "hi", "lexicons", "hi" + ".translit.sampled." + split + ".tsv"
    )


def _read_lexicon(path):
    """Read a tab-separated, header-less lexicon file into a DataFrame.

    The three columns are named "tgt", "src" and "count", in file order.

    keep_default_na=False stops pandas from converting words that happen to
    spell one of its NA markers (for example "nan", "null" or "NA") into
    float NaN, so the word columns always hold the literal text from the file.
    """
    return pd.read_csv(
        path,
        sep="\t",
        names=["tgt", "src", "count"],
        keep_default_na=False,
    )


trainpath = _lexicon_path("train")
valpath = _lexicon_path("dev")
testpath = _lexicon_path("test")

train = _read_lexicon(trainpath)
val = _read_lexicon(valpath)
test = _read_lexicon(testpath)



# create train data
#train_data = preprocess(list(train["src"]), list(train["tgt"]))

In [None]:
# Pull the input ("src") and output ("tgt") words out of the training frame.
# Each entry goes through str() so that non-string values (such as NaN) become text.
source = [str(entry) for entry in train["src"]]
target = [str(entry) for entry in train["tgt"]]

# Inputs are used unchanged; every output word is framed by a leading tab
# (start marker) and a trailing newline (end marker).
source_words = list(source)
target_words = ["\t" + word + "\n" for word in target]

# Distinct characters on each side, in sorted order.
source_chars = sorted(set("".join(source_words)))
target_chars = sorted(set("".join(target_words)))

# The space goes last in both vocabularies: encode() looks up " " to fill
# the positions after the end of each word.
source_chars.append(" ")
target_chars.append(" ")

num_encoder_tokens = len(source_chars)
num_decoder_tokens = len(target_chars)
max_source_length = max(map(len, source_words))
max_target_length = max(map(len, target_words))

print("Number of samples:", len(source))
print("Source Vocab length:", num_encoder_tokens)
print("Target Vocab length:", num_decoder_tokens)
print("Max sequence length for inputs:", max_source_length)
print("Max sequence length for outputs:", max_target_length)


Number of samples: 44204
Source Vocab length: 27
Target Vocab length: 66
Max sequence length for inputs: 20
Max sequence length for outputs: 21


In [None]:
def dictionary_lookup(vocab):
    """Build forward and reverse lookup tables for a vocabulary.

    Args:
        vocab: iterable of characters; a character's index is its position
            in iteration order.

    Returns:
        A pair ``(char2int, int2char)`` of dicts. ``int2char`` is the inverse
        of ``char2int``. If a character occurs more than once in ``vocab``,
        only its last position is kept, in both tables.
    """
    forward = {}
    for position, symbol in enumerate(vocab):
        forward[symbol] = position
    backward = {position: symbol for symbol, position in forward.items()}
    return forward, backward


In [None]:
def encode(source, target, source_chars, target_chars, source_char2int=None, target_char2int=None):
    """One-hot encode paired source/target strings for a character-level seq2seq model.

    Args:
        source: sequence of input strings.
        target: sequence of output strings, paired with ``source`` by position.
            The first character of each target is treated as a start marker:
            it appears in the decoder input but not in the decoder target.
        source_chars: source vocabulary; its length is the encoder one-hot width.
        target_chars: target vocabulary; its length is the decoder one-hot width.
        source_char2int: optional character -> index mapping for the source side.
        target_char2int: optional character -> index mapping for the target side.
            Pass both mappings or neither. When neither is given, they are
            built from ``source_chars`` / ``target_chars`` with
            ``dictionary_lookup`` and returned alongside the arrays.

    Returns:
        ``(encoder_input_data, decoder_input_data, decoder_target_data)`` as
        float32 arrays of shape ``(len(source), max_len, vocab_size)``, where
        ``max_len`` is the longest string in ``source`` (encoder) or ``target``
        (decoder). ``decoder_target_data`` is ``decoder_input_data`` shifted one
        step to the left. Every position after the end of a string is set to the
        index of ``" "``, which therefore must be present in both mappings.

        When the mappings were generated here, two extra items are appended:
        ``source_vocab`` and ``target_vocab``, each a ``(char2int, int2char)`` pair.

    Raises:
        ValueError: if exactly one of the two mappings is supplied.
        KeyError: if a character (or ``" "``) is missing from a dict mapping.
    """
    if (source_char2int is None) != (target_char2int is None):
        raise ValueError(
            "source_char2int and target_char2int must be given together or not at all"
        )

    num_encoder_tokens = len(source_chars)
    num_decoder_tokens = len(target_chars)
    # default=0 lets an empty dataset produce empty arrays instead of crashing.
    max_source_length = max((len(txt) for txt in source), default=0)
    max_target_length = max((len(txt) for txt in target), default=0)

    source_vocab, target_vocab = None, None
    if source_char2int is None and target_char2int is None:
        print("Generating the dictionary lookups for character to integer mapping and back")
        source_char2int, source_int2char = dictionary_lookup(source_chars)
        target_char2int, target_int2char = dictionary_lookup(target_chars)

        source_vocab = (source_char2int, source_int2char)
        target_vocab = (target_char2int, target_int2char)

    encoder_input_data = np.zeros(
        (len(source), max_source_length, num_encoder_tokens), dtype="float32"
    )
    decoder_input_data = np.zeros(
        (len(source), max_target_length, num_decoder_tokens), dtype="float32"
    )
    decoder_target_data = np.zeros(
        (len(source), max_target_length, num_decoder_tokens), dtype="float32"
    )

    source_pad = source_char2int[" "]
    target_pad = target_char2int[" "]

    for i, (input_text, target_text) in enumerate(zip(source, target)):
        for t, char in enumerate(input_text):
            encoder_input_data[i, t, source_char2int[char]] = 1.0
        # Pad from the string length, not from the loop variable: for an empty
        # string the loop never runs and `t` would be stale or undefined.
        encoder_input_data[i, len(input_text):, source_pad] = 1.0

        for t, char in enumerate(target_text):
            char_index = target_char2int[char]
            decoder_input_data[i, t, char_index] = 1.0
            if t > 0:
                # The target is one timestep ahead of the input and omits
                # the start character.
                decoder_target_data[i, t - 1, char_index] = 1.0
        decoder_input_data[i, len(target_text):, target_pad] = 1.0
        # The shifted target holds one character fewer, so its padding starts
        # one step earlier (clamped at 0 for an empty string).
        decoder_target_data[i, max(len(target_text) - 1, 0):, target_pad] = 1.0

    if source_vocab is not None and target_vocab is not None:
        return (
            encoder_input_data,
            decoder_input_data,
            decoder_target_data,
            source_vocab,
            target_vocab,
        )
    return encoder_input_data, decoder_input_data, decoder_target_data

In [None]:
# Encode the training pairs; no lookups are passed, so encode() builds them
# and returns them after the three arrays.
train_data = encode(
    source=source_words,
    target=target_words,
    source_chars=source_chars,
    target_chars=target_chars,
)

Generating the dictionary lookups for character to integer mapping and back


In [None]:
# unpack the encoded train data and the vocabulary lookups
# First three items: the one-hot arrays. Last two: the vocabulary pairs.
train_encoder_input, train_decoder_input, train_decoder_target = train_data[:3]
source_vocab, target_vocab = train_data[3:]

# Each vocabulary pair is (char -> index, index -> char).
(source_char2int, source_int2char) = source_vocab
(target_char2int, target_int2char) = target_vocab


In [None]:
# create val data (calling encode with the training lookups suffices, as the dictionary lookup should be kept the same)
def _encode_split(frame):
    """Encode a held-out split with the lookups built from the training data.

    The words are prepared the same way as the training words: every entry
    is coerced to str, and each target is wrapped in the "\t" start marker
    and "\n" end marker. Without the markers the decoder input would begin
    with a real character and the shifted decoder target would drop it,
    which does not match what the training arrays contain.

    Args:
        frame: DataFrame with "src" and "tgt" columns.

    Returns:
        (encoder_input, decoder_input, decoder_target) as returned by encode()
        when both lookups are supplied.
    """
    src_words = [str(word) for word in frame["src"]]
    tgt_words = ["\t" + str(word) + "\n" for word in frame["tgt"]]
    return encode(
        src_words,
        tgt_words,
        list(source_char2int.keys()),
        list(target_char2int.keys()),
        source_char2int=source_char2int,
        target_char2int=target_char2int,
    )


val_data = _encode_split(val)
val_encoder_input, val_decoder_input, val_decoder_target = val_data

# create test data
test_data = _encode_split(test)
test_encoder_input, test_decoder_input, test_decoder_target = test_data
