In [20]:
import pathlib
import random
import string
import re
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
train_df = pd.read_csv("Hinglish-TOP-Dataset-main/Dataset/Human-Annotated-Data/train.tsv", sep='\t')
test_df = pd.read_csv("Hinglish-TOP-Dataset-main/Dataset/Human-Annotated-Data/test.tsv", sep='\t')
val_df = pd.read_csv("Hinglish-TOP-Dataset-main/Dataset/Human-Annotated-Data/validation.tsv", sep='\t')

In [3]:
train_df.head()

Unnamed: 0,en_query,cs_query,en_parse,cs_parse,domain
0,Add a new weekly reminder for Sunday Brunch at...,9 : 30 am ko Sunday Brunch ke liye ek naya wee...,[IN:CREATE_ALARM Add a new [SL:PERIOD weekly ]...,[IN:CREATE_ALARM [SL:DATE_TIME 9 : 30 am ko ] ...,alarm
1,message danny and see if he wants to go to com...,danny ko message karo aur dekho ke he wants to...,[IN:SEND_MESSAGE message [SL:RECIPIENT danny ]...,[IN:SEND_MESSAGE [SL:RECIPIENT danny ] ko mess...,messaging
2,set alarm for 2 hours,do ghante ke liye alarm set kardo,[IN:CREATE_ALARM set alarm [SL:DATE_TIME for 2...,[IN:CREATE_ALARM [SL:DATE_TIME do ghante ke li...,alarm
3,kill the reminder for baking a cake for neil,neil ke liye cake bake karne ke reminder ko mi...,[IN:DELETE_REMINDER kill the reminder for [SL:...,[IN:DELETE_REMINDER [SL:TODO neil ke liye cake...,reminder
4,retrieve my chat requests please,Please mere chat requests ko retrieve kare,[IN:GET_MESSAGE retrieve my chat requests plea...,[IN:GET_MESSAGE Please mere chat requests ko r...,messaging


In [32]:
train_df[['en_query', 'cs_query']].head()

Unnamed: 0,en_query,cs_query
0,Add a new weekly reminder for Sunday Brunch at...,9 : 30 am ko Sunday Brunch ke liye ek naya wee...
1,message danny and see if he wants to go to com...,danny ko message karo aur dekho ke he wants to...
2,set alarm for 2 hours,do ghante ke liye alarm set kardo
3,kill the reminder for baking a cake for neil,neil ke liye cake bake karne ke reminder ko mi...
4,retrieve my chat requests please,Please mere chat requests ko retrieve kare


In [4]:
# Train Pairs
train_pairs = []

for _, row in train_df.iterrows():
    eng = row['en_query']
    hnd = row['cs_query']
    hnd = "[start] " + hnd + " [end]"
    train_pairs.append((eng, hnd))

In [5]:
# Test Pairs
test_pairs = []

for _, row in test_df.iterrows():
    eng = row['en_query']
    hnd = row['cs_query']
    hnd = "[start] " + hnd + " [end]"
    test_pairs.append((eng, hnd))

In [6]:
# validation Pairs
val_pairs = []

for _, row in val_df.iterrows():
    eng = row['en_query']
    hnd = row['cs_query']
    hnd = "[start] " + hnd + " [end]"
    val_pairs.append((eng, hnd))

In [7]:
for _ in range(5):
    print(random.choice(test_pairs))

('Delete movie reminder this weekend .', '[start] is weekend ko movie reminder delete karo [end]')
('add this song to my library', '[start] is song ko mere library me add kare [end]')
('Write an SMS', '[start] SMS likhe [end]')
('Pause my timer please', '[start] Please mere timer ko roke [end]')
('Angry face at the Star Wars group post', '[start] Star Wars group post ke liye Angry face [end]')


In [8]:
# random.shuffle(text_pairs)
# num_val_samples = int(0.15 * len(text_pairs))
# num_train_samples = len(text_pairs) - 2 * num_val_samples
# train_pairs = text_pairs[:num_train_samples]
# val_pairs = text_pairs[num_train_samples : num_train_samples + num_val_samples]
# test_pairs = text_pairs[num_train_samples + num_val_samples :]

# print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")

2993 training pairs
1390 validation pairs
6513 test pairs


In [36]:
strip_chars = string.punctuation + "¿"
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")

vocab_size = 5000
sequence_length = 28
batch_size = 64


def custom_standardization(input_string):
    lowercase = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowercase, "[%s]" % re.escape(strip_chars), "")


eng_vectorization = TextVectorization(
    max_tokens=vocab_size, output_mode="int", output_sequence_length=sequence_length,
)
hnd_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length + 1,
    standardize=custom_standardization,
)
train_eng_texts = [pair[0] for pair in train_pairs]
train_hnd_texts = [pair[1] for pair in train_pairs]
eng_vectorization.adapt(train_eng_texts)
hnd_vectorization.adapt(train_hnd_texts)
hnd_vectorization.get_vocabulary()

['',
 '[UNK]',
 '[start]',
 '[end]',
 'ke',
 'ko',
 'liye',
 'hai',
 'kya',
 'kare',
 'me',
 'alarm',
 'mujhe',
 'set',
 'mere',
 'se',
 'ki',
 'ka',
 'timer',
 'par',
 'aaj',
 'karo',
 'he',
 'ek',
 'is',
 'kal',
 'traffic',
 'reminder',
 'aur',
 'please',
 'minutes',
 'karen',
 'yaad',
 'mai',
 'karne',
 'baje',
 'message',
 'kitna',
 'tak',
 'mei',
 'pm',
 'hone',
 'alarms',
 'to',
 'time',
 'kaisa',
 'am',
 'raat',
 'the',
 'muje',
 'aj',
 'song',
 'bajhe',
 'subah',
 'chahiye',
 'weather',
 '5',
 'koi',
 '10',
 '6',
 'cancel',
 'weekend',
 'kitni',
 'mausam',
 'kardo',
 'play',
 'abhi',
 'wala',
 'temperature',
 'music',
 'mera',
 'kab',
 '7',
 'shuru',
 'hu',
 'dilaye',
 'band',
 'har',
 'delete',
 'hoga',
 'meri',
 '8',
 'for',
 'do',
 'reminders',
 'paas',
 'kariye',
 'mein',
 'kitne',
 'bajao',
 '30',
 'new',
 'forecast',
 'baarish',
 'agar',
 'pehle',
 'i',
 'friday',
 'add',
 'aap',
 '9',
 'kijiye',
 'hoon',
 'hogi',
 '3',
 '2',
 'at',
 'wali',
 'samay',
 'minute',
 'ho',
 '

In [35]:
['', '[UNK]', '[start]', '[end]', 'ke', 'ko', 'liye', 'hai', 'kya', 'kare', ... ]

['',
 '[UNK]',
 '[start]',
 '[end]',
 'ke',
 'ko',
 'liye',
 'hai',
 'kya',
 'kare',
 Ellipsis]

In [10]:
def format_dataset(eng, hnd):
    eng = eng_vectorization(eng)
    hnd = hnd_vectorization(hnd)
    return ({"encoder_inputs": eng, "decoder_inputs": hnd[:, :-1],}, hnd[:, 1:])


def make_dataset(pairs):
    eng_texts, hnd_texts = zip(*pairs)
    eng_texts = list(eng_texts)
    hnd_texts = list(hnd_texts)
    dataset = tf.data.Dataset.from_tensor_slices((eng_texts, hnd_texts))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset)
    return dataset.shuffle(2048).prefetch(16).cache()


train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

In [11]:
for inputs, targets in train_ds.take(1):
    print(f'inputs["encoder_inputs"].shape: {inputs["encoder_inputs"].shape}')
    print(f'inputs["decoder_inputs"].shape: {inputs["decoder_inputs"].shape}')
    print(f"targets.shape: {targets.shape}")

inputs["encoder_inputs"].shape: (64, 28)
inputs["decoder_inputs"].shape: (64, 28)
targets.shape: (64, 28)


In [12]:
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.supports_masking = True

    def get_config(self):
        config = super().get_config()
        config.update({
            "dense_dim": self.dense_dim,
            "num_heads": self.num_heads,
            "embed_dim": self.embed_dim
        })
        return config
    
    def call(self, inputs, mask=None):
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, tf.newaxis, :], dtype="int32")
        
        attention_output = self.attention(
            query=inputs, value=inputs, key=inputs, attention_mask=padding_mask
        )
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)


class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.token_embeddings = layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=embed_dim
        )
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        
    def get_config(self):
        config = super().get_config()
        config.update({
            "sequence_length": self.sequence_length,
            "vocab_size": self.vocab_size,
            "embed_dim": self.embed_dim
        })
        return config

    def call(self, inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        return tf.math.not_equal(inputs, 0)


class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, latent_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(latent_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        self.supports_masking = True
        
    def get_config(self):
        config = super().get_config()
        config.update({
            "latent_dim": self.latent_dim,
            "num_heads": self.num_heads,
            "embed_dim": self.embed_dim
        })
        return config

    def call(self, inputs, encoder_outputs, mask=None):
        causal_mask = self.get_causal_attention_mask(inputs)
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
            padding_mask = tf.minimum(padding_mask, causal_mask)

        attention_output_1 = self.attention_1(
            query=inputs, value=inputs, key=inputs, attention_mask=causal_mask
        )
        out_1 = self.layernorm_1(inputs + attention_output_1)

        attention_output_2 = self.attention_2(
            query=out_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
        )
        out_2 = self.layernorm_2(out_1 + attention_output_2)

        proj_output = self.dense_proj(out_2)
        return self.layernorm_3(out_2 + proj_output)

    def get_causal_attention_mask(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],
            axis=0,
        )
        return tf.tile(mask, mult)

In [13]:
embed_dim = 256
latent_dim = 2048
num_heads = 8

encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="encoder_inputs")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, latent_dim, num_heads)(x)
encoder = keras.Model(encoder_inputs, encoder_outputs)

decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="decoder_inputs")
encoded_seq_inputs = keras.Input(shape=(None, embed_dim), name="decoder_state_inputs")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)
x = TransformerDecoder(embed_dim, latent_dim, num_heads)(x, encoded_seq_inputs)
x = layers.Dropout(0.5)(x)
decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x)
decoder = keras.Model([decoder_inputs, encoded_seq_inputs], decoder_outputs)

decoder_outputs = decoder([decoder_inputs, encoder_outputs])
transformer = keras.Model(
    [encoder_inputs, decoder_inputs], decoder_outputs, name="transformer"
)

In [14]:
epochs = 50  # This should be at least 30 for convergence

transformer.summary()
transformer.compile(
    "rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)
history = transformer.fit(train_ds, epochs=epochs, validation_data=val_ds)

Model: "transformer"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 positional_embedding (Position  (None, None, 256)   1287168     ['encoder_inputs[0][0]']         
 alEmbedding)                                                                                     
                                                                                                  
 decoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 transformer_encoder (Transform  (None, None, 256)   3155456     ['positional_embedding[

In [54]:
hnd_vocab = hnd_vectorization.get_vocabulary()
hnd_index_lookup = dict(zip(range(len(hnd_vocab)), hnd_vocab))
max_decoded_sentence_length = 20

def decode_sequence(input_sentence):
    tokenized_input_sentence = eng_vectorization([input_sentence])
    decoded_sentence = "[start]"
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = hnd_vectorization([decoded_sentence])[:, :-1]
        predictions = transformer([tokenized_input_sentence, tokenized_target_sentence])

        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = hnd_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token

        if sampled_token == "[end]":
            break
    return decoded_sentence


test_eng_texts = [pair[0] for pair in test_pairs]
for _ in range(10):
    input_sentence = random.choice(test_eng_texts)
    translated = decode_sequence(input_sentence)
    print(input_sentence)
    print(translated)
    print("\n")

Are there any wine tasting events in St . George
[start] kya dallas me koi sabse tez rasta kaunsa hai [end]


Is the timer almost done ?
[start] kya timer abhi se start kare [end]


Can you delete my reminder to take out the trash today .
[start] kya aap mere aaj raat ko delete karsakte hai [end]


Should I bring a poncho this morning
[start] kya aaj subah 8 bajhe dog ka remind karaye [end]


Remove all reminders regarding the schedule of games for the Chicago Bears ' 2018 season .
[start] chicago mei sabhi reminders ko hata do ko hata den jo 5 baje ka driving directions remove kar do [end]


Switch off alarm
[start] alarm band karo [end]


Stop all alarms
[start] sabhi alarms ko stop kare [end]


Message mom do you want a new bag ?
[start] mom ko message karo ke n naya thanda rahega [end]


Is it warm outside ?
[start] kya bahar garm hai [end]


Scrap tomorrow ' s alarm .
[start] kal ke alarm ko snooze kare [end]




In [None]:
transformer.save('hinglish.h5')