In [1]:
import pathlib
import random
import string
import re
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization
import pandas as pd



In [2]:
train_df = pd.read_csv("Hinglish-TOP-Dataset-main/Dataset/Human-Annotated-Data/train.tsv", sep='\t')
test_df = pd.read_csv("Hinglish-TOP-Dataset-main/Dataset/Human-Annotated-Data/test.tsv", sep='\t')
val_df = pd.read_csv("Hinglish-TOP-Dataset-main/Dataset/Human-Annotated-Data/validation.tsv", sep='\t')

In [3]:
train_df.head()

Unnamed: 0,en_query,cs_query,en_parse,cs_parse,domain
0,Add a new weekly reminder for Sunday Brunch at...,9 : 30 am ko Sunday Brunch ke liye ek naya wee...,[IN:CREATE_ALARM Add a new [SL:PERIOD weekly ]...,[IN:CREATE_ALARM [SL:DATE_TIME 9 : 30 am ko ] ...,alarm
1,message danny and see if he wants to go to com...,danny ko message karo aur dekho ke he wants to...,[IN:SEND_MESSAGE message [SL:RECIPIENT danny ]...,[IN:SEND_MESSAGE [SL:RECIPIENT danny ] ko mess...,messaging
2,set alarm for 2 hours,do ghante ke liye alarm set kardo,[IN:CREATE_ALARM set alarm [SL:DATE_TIME for 2...,[IN:CREATE_ALARM [SL:DATE_TIME do ghante ke li...,alarm
3,kill the reminder for baking a cake for neil,neil ke liye cake bake karne ke reminder ko mi...,[IN:DELETE_REMINDER kill the reminder for [SL:...,[IN:DELETE_REMINDER [SL:TODO neil ke liye cake...,reminder
4,retrieve my chat requests please,Please mere chat requests ko retrieve kare,[IN:GET_MESSAGE retrieve my chat requests plea...,[IN:GET_MESSAGE Please mere chat requests ko r...,messaging


In [4]:
# Train Pairs
train_pairs = []

for _, row in train_df.iterrows():
    eng = row['en_query']
    hnd = row['cs_query']
    hnd = "[start] " + hnd + " [end]"
    train_pairs.append((eng, hnd))

In [5]:
# Test Pairs
test_pairs = []

for _, row in test_df.iterrows():
    eng = row['en_query']
    hnd = row['cs_query']
    hnd = "[start] " + hnd + " [end]"
    test_pairs.append((eng, hnd))

In [6]:
# validation Pairs
val_pairs = []

for _, row in val_df.iterrows():
    eng = row['en_query']
    hnd = row['cs_query']
    hnd = "[start] " + hnd + " [end]"
    val_pairs.append((eng, hnd))

In [7]:
for _ in range(5):
    print(random.choice(test_pairs))

('parties in san francisco during the month of december', '[start] san francisco me december ke mahine ke dauran parties [end]')
('What churches are having Christmas Eve services near me', '[start] mere paas kaunse churches mei Christmas Eve services hai [end]')
('What is the high temperature for today', '[start] aaj ke liye zyada temperature kya hai [end]')
('Delete my timer', '[start] Mere timer ko delete kare [end]')
('Will it drop below freezing tonight', '[start] Kya ye aaj raat freezing se niche gir jayega [end]')


In [8]:
# random.shuffle(text_pairs)
# num_val_samples = int(0.15 * len(text_pairs))
# num_train_samples = len(text_pairs) - 2 * num_val_samples
# train_pairs = text_pairs[:num_train_samples]
# val_pairs = text_pairs[num_train_samples : num_train_samples + num_val_samples]
# test_pairs = text_pairs[num_train_samples + num_val_samples :]

# print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")

2993 training pairs
1390 validation pairs
6513 test pairs


In [9]:
class CustomTextVector():
    def __init__(self, max_tokens, encoder='word', output_sequence_length=None, add_sequence_length=0, padding='post', standardization=None):
        from tensorflow.keras.preprocessing.text import Tokenizer
        from tensorflow.keras.preprocessing.sequence import pad_sequences
        from tokenizers import CharBPETokenizer
        self.sequence_length = output_sequence_length
        self.add_sequence_length = add_sequence_length
        
        self.encoder = encoder
        
        if encoder == 'word':
            self.tokenizer = Tokenizer(num_words=max_tokens, oov_token='<OOV>', filters='!"#$%&()*+,-./:;<=>?@\\^_`{|}~\t\n')
        elif encoder == 'CharBPE':
            self.tokenizer = CharBPETokenizer()

        self.padding = padding
        self.pad_sequences = pad_sequences
        self.standardization = standardization
  
    def adapt(self, data):
        if self.standardization != None:
            data = map(self.standardization, data)
            
        if self.encoder == 'word':
            self.tokenizer.fit_on_texts(data)
            self.word_index = self.tokenizer.word_index
            
        elif self.encoder == 'CharBPE':
            self.tokenizer.train_from_iterator(data)
            self.word_index = self.tokenizer.get_vocab()

    def __call__(self, sentence):
        if self.encoder == 'word':
            self.encoded = self.tokenizer.texts_to_sequences(sentence)
            result = self.pad_sequences(encoded, maxlen = self.sequence_length, padding=self.padding, value=0.0)
            
        elif self.encoder == 'CharBPE':
            self.encoded_batch = self.tokenizer.encode_batch(sentence)
            result = [encoded.ids for encoded in self.encoded_batch]
            result = self.pad_sequences(result, maxlen = self.sequence_length, padding=self.padding)
        
        return result

In [10]:
strip_chars = string.punctuation + "¿"
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")

vocab_size = 5000
sequence_length = 56
batch_size = 64


def custom_standardization(input_string):
    lowercase = tf.strings.lower(str(input_string))
    result = tf.strings.regex_replace(lowercase, "[%s]" % re.escape(strip_chars), "")
    result = result.numpy().decode()
    return result


# eng_vectorization = CustomTextVector(
#     max_tokens=vocab_size, encoder='CharBPE', output_sequence_length=sequence_length
# )
hin_vectorization = CustomTextVector(
    max_tokens=vocab_size,
    encoder='CharBPE', output_sequence_length=sequence_length + 1,
    standardization=custom_standardization
)
train_eng_texts = [pair[0] for pair in train_pairs]
train_hin_texts = [pair[1] for pair in train_pairs]
hin_vectorization.adapt([train_eng_texts, train_hin_texts])
# hin_vectorization.adapt(train_hin_texts)

In [11]:
hin_vectorization.tokenizer.add_tokens(['[start]', '[end]'])

2

In [12]:
hin_vectorization(['hello kese ho are you'])

array([[4083,  376,  114,  521,  252,  352,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0]])

In [13]:
hin_vectorization.tokenizer.decode([4083,  376,  114,  521,  252,  352,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0])

'hello kese ho are you'

In [14]:
a = np.array([[1,2,3,4], [5,6,7,8]])
a[:,:-1]

array([[1, 2, 3],
       [5, 6, 7]])

In [15]:
def format_dataset(eng, hin):
    return ({"encoder_inputs": eng[:,:-1], "decoder_inputs": hin[:, :-1],}, hin[:, 1:])

def sentence_to_tokens(pair):
    eng = hin_vectorization(pair[0])
    hin = hin_vectorization(pair[1])
    return (eng, hin)
   
def make_dataset(pairs):
    eng_texts, hin_texts = zip(*pairs)
    eng_texts = list(eng_texts)
    hin_texts = list(hin_texts)
    data = (eng_texts, hin_texts)
    data = sentence_to_tokens(data)
    dataset = tf.data.Dataset.from_tensor_slices((data[0], data[1]))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset)
    return dataset.shuffle(2048).prefetch(16).cache()


train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

In [16]:
for inputs, targets in train_ds.take(1):
    print(f'inputs["encoder_inputs"].shape: {inputs["encoder_inputs"].shape}')
    print(f'inputs["decoder_inputs"].shape: {inputs["decoder_inputs"].shape}')
    print(f"targets.shape: {targets.shape}")

inputs["encoder_inputs"].shape: (64, 56)
inputs["decoder_inputs"].shape: (64, 56)
targets.shape: (64, 56)


In [17]:
for inputs, targets in train_ds.take(1):
    print(f'inputs["decoder_inputs"]: {inputs["decoder_inputs"]}')

inputs["decoder_inputs"]: [[4711 2228  105 ...    0    0    0]
 [4711  242  622 ...    0    0    0]
 [4711  476  569 ...    0    0    0]
 ...
 [4711  284 2951 ...    0    0    0]
 [4711 1183  343 ...    0    0    0]
 [4711  242   54 ...    0    0    0]]


In [18]:
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.supports_masking = True

    def get_config(self):
        config = super().get_config()
        config.update({
            "dense_dim": self.dense_dim,
            "num_heads": self.num_heads,
            "embed_dim": self.embed_dim
        })
        return config
    
    def call(self, inputs, mask=None):
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, tf.newaxis, :], dtype="int32")
        
        attention_output = self.attention(
            query=inputs, value=inputs, key=inputs, attention_mask=padding_mask
        )
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)


class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.token_embeddings = layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=embed_dim
        )
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        
    def get_config(self):
        config = super().get_config()
        config.update({
            "sequence_length": self.sequence_length,
            "vocab_size": self.vocab_size,
            "embed_dim": self.embed_dim
        })
        return config

    def call(self, inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        return tf.math.not_equal(inputs, 0)


class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, latent_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(latent_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        self.supports_masking = True
        
    def get_config(self):
        config = super().get_config()
        config.update({
            "latent_dim": self.latent_dim,
            "num_heads": self.num_heads,
            "embed_dim": self.embed_dim
        })
        return config

    def call(self, inputs, encoder_outputs, mask=None):
        causal_mask = self.get_causal_attention_mask(inputs)
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
            padding_mask = tf.minimum(padding_mask, causal_mask)

        attention_output_1 = self.attention_1(
            query=inputs, value=inputs, key=inputs, attention_mask=causal_mask
        )
        out_1 = self.layernorm_1(inputs + attention_output_1)

        attention_output_2 = self.attention_2(
            query=out_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
        )
        out_2 = self.layernorm_2(out_1 + attention_output_2)

        proj_output = self.dense_proj(out_2)
        return self.layernorm_3(out_2 + proj_output)

    def get_causal_attention_mask(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],
            axis=0,
        )
        return tf.tile(mask, mult)

In [19]:
embed_dim = 256
latent_dim = 4096
num_heads = 8

encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="encoder_inputs")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, latent_dim, num_heads)(x)
encoder = keras.Model(encoder_inputs, encoder_outputs)

decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="decoder_inputs")
encoded_seq_inputs = keras.Input(shape=(None, embed_dim), name="decoder_state_inputs")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)
x = TransformerDecoder(embed_dim, latent_dim, num_heads)(x, encoded_seq_inputs)
x = layers.Dropout(0.5)(x)
decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x)
decoder = keras.Model([decoder_inputs, encoded_seq_inputs], decoder_outputs)

decoder_outputs = decoder([decoder_inputs, encoder_outputs])
transformer = keras.Model(
    [encoder_inputs, decoder_inputs], decoder_outputs, name="transformer"
)

In [20]:
hin_vectorization.tokenizer.get_vocab_size()

4713

In [21]:
hin_vectorization.tokenizer.get_vocab_size()

4713

In [22]:
epochs = 50  # This should be at least 30 for convergence

transformer.summary()
transformer.compile(
    "rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)
transformer.fit(train_ds, epochs=epochs, validation_data=val_ds)

Model: "transformer"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 positional_embedding (Position  (None, None, 256)   1294336     ['encoder_inputs[0][0]']         
 alEmbedding)                                                                                     
                                                                                                  
 decoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 transformer_encoder (Transform  (None, None, 256)   4206080     ['positional_embedding[

<keras.callbacks.History at 0x241d4f41fd0>

In [23]:
hin_vocab = hin_vectorization.word_index
hin_index_lookup = dict(zip(range(1,len(hin_vocab)), hin_vocab))

max_decoded_sentence_length = 56

hin_index_lookup[0] = ""

def decode_sequence(input_sentence):
    tokenized_input_sentence = np.array(hin_vectorization([input_sentence]))
#     print("tokenized_input_sentence: ", tokenized_input_sentence[0])
    decoded_sentence = "[start]"
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = hin_vectorization([decoded_sentence])[:, :-1]
#         print("tokenized_target_sentence: ", tokenized_target_sentence)
        predictions = transformer([tokenized_input_sentence[:,:-1], tokenized_target_sentence])
#         print(predictions)
        sampled_token_index = np.argmax(predictions[0, i, :])
#         sampled_token = spa_index_lookup[sampled_token_index]
        sampled_token = hin_vectorization.tokenizer.decode([sampled_token_index])
#         print(sampled_token)
        decoded_sentence += " " + sampled_token

        if sampled_token == "[end]":
            break
    return decoded_sentence


test_eng_texts = [pair[0] for pair in test_pairs]
for _ in range(30):
    input_sentence = random.choice(test_eng_texts)
    translated = decode_sequence(input_sentence)
    print(input_sentence)
    print(translated)
    print("                                        ")

# input_sentence = "I ran"
# translated = decode_sequence(input_sentence)
# print(input_sentence)
print(translated)

Tell me about the annual Tangle Town garage sale that 's coming up soon
[start] u mein weather ke baare me hur rist hai day s jo night house u ko ke 2019 like baare d [end]
                                        
Play all Kenny Chesney
[start] l ko play karo ki go to sleep [end]
                                        
abolish all alarms that I have made
[start] mere saare set kiye huwe alarms ko nikaal do [end]
                                        
Play Creed ' s new album
[start] ek naya album bajao [end]
                                        
What ' s the traffic report for Portland
[start] or may ke liye mausam kaisa hai [end]
                                        
remind me to cancel all monthly automatic payments that i have set up
[start] mujhe yaad dilaayen ki is set mujhe ek ghante mei kitna time mujhe mere paas pehle [end]
                                        
What is the traffic like from Pensacola to Gulf Breeze
[start] the family group a traffic kaisa hai [end]


In [29]:
hin_vocab = hin_vectorization.word_index
hin_index_lookup = dict(zip(range(1,len(hin_vocab)), hin_vocab))

max_decoded_sentence_length = 56

hin_index_lookup[0] = ""

def decode_sequence(input_sentence):
    tokenized_input_sentence = np.array(hin_vectorization([input_sentence]))
#     print("tokenized_input_sentence: ", tokenized_input_sentence[0])
    decoded_sentence = "[start]"
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = hin_vectorization([decoded_sentence])[:, :-1]
#         print("tokenized_target_sentence: ", tokenized_target_sentence)
        predictions = transformer([tokenized_input_sentence[:,:-1], tokenized_target_sentence])
#         print(predictions)
        sampled_token_index = np.argmax(predictions[0, i, :])
#         sampled_token = spa_index_lookup[sampled_token_index]
        sampled_token = hin_vectorization.tokenizer.decode([sampled_token_index])
#         print(sampled_token)
        decoded_sentence += " " + sampled_token

        if sampled_token == "[end]":
            break
    return decoded_sentence


# test_eng_texts = [pair[0] for pair in test_pairs]
for _ in range(30):
    input_sentence = random.choice(train_eng_texts)
    translated = decode_sequence(input_sentence)
    print(input_sentence)
    print(translated)
    print("                                        ")

# input_sentence = "parties in san francisco during the month of december"
# translated = decode_sequence(input_sentence)
# print(input_sentence)
# print(translated)

Remind me of my pilates class at 6 : 00 pm
[start] mujhe 6 00 pm ko meri pilates class ke baare me yaad dilaye [end]
                                        
Driving time for Phoenix to LA
[start] ho raha hu toh mai ho subah ke liye driving time ki zaroorat hai [end]
                                        
Add 2 minutes to the timer
[start] timer me 2 minutes add karen [end]
                                        
Delete the birthday party reminder .
[start] birthday party reminder ko hata do [end]
                                        
Replay the last song
[start] pichla gaana fir se the play kare [end]
                                        
Remind me to take 5 min break at 4 : 30 pm
[start] mujhe 4 30 pm ko 5 minute ka break lene ke liye yaad dilaye [end]
                                        
set alarm for 8 am
[start] subah 8 bajhe ke liye alarm set kare [end]
                                        
when is traffic lightest driving from here to florida
[start] a se a se tr