<a href="https://colab.research.google.com/github/mithunkumarsr/NLPNov21/blob/main/SMT_English_to_Hindi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Statistical Machine Translation System
# English to Hindi

# IBM Model 1 for Word Translation Task
# Word Alignment based on Relative Positions
# Bi-gram Language Modelling with Laplace Smoothing and Backoff

In [None]:
import pickle

In [None]:
# Mount Google Drive at /content/drive so the corpus files under
# "My Drive/NLP_Translation" can be opened by the loading cell below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# One empty sentence list per "<language>_<split>" key; the loading cell fills them in.
tokenized_stores = {
    language + '_' + split: []
    for language in ('en', 'hi')
    for split in ('train', 'dev', 'test')
}

In [None]:
# Load data files into your Google Drive in a directory named "NLP_Translation"
# Alternatively, provide location to the folder 'data_file'

for key in tokenized_stores:
    # A key has the form "<lang>_<split>" (e.g. "hi_train"); its corpus file is
    # named "<split>.<lang>" (e.g. "train.hi").
    file_name = "/content/drive/My Drive/NLP_Translation/" + str(key)[3:] + "." + str(key)[0:2]

    # `with` closes the handle even if reading fails; the encoding is explicit
    # so the Devanagari text does not depend on the platform default.
    with open(file_name, encoding="utf-8") as load:
        sentences = load.read().split('\n')

    # One token list per line, split on single spaces. Assigning (rather than
    # appending) keeps the cell idempotent when it is re-run.
    tokenized_stores[key] = [sentence.split(' ') for sentence in sentences]

In [None]:
# Sanity check: show the token list of the third Hindi training sentence.
print(tokenized_stores['hi_train'][2])

['ऑपरेशन', 'के', 'दौरान', 'लैन्स', 'प्रत्यारोपण', 'आँख', 'के', 'अगले', 'भाग', ',', 'आइरिस', 'के', 'आगे', 'किया', 'जाता', 'है', '।']


In [None]:
# Number of sentences in each split, measured on the English side.
train_size, dev_size, test_size = (
    len(tokenized_stores['en_' + split]) for split in ('train', 'dev', 'test')
)

In [None]:
# making the vocabulary

en_words = {}
hi_words = {}

for key, corpus in tokenized_stores.items():
    # keys starting with 'e' hold English sentences; every other key is Hindi
    frequency = en_words if str(key)[0] == 'e' else hi_words
    for sentence in corpus:
        for word in sentence:
            frequency[word] = frequency.get(word, 0) + 1

# vocabulary sizes = number of distinct tokens over all splits
en_vocab = len(en_words)
hi_vocab = len(hi_words)
print("Number of Unique Words:")
print("> English:", str(en_vocab))
print("> Hindi:", str(hi_vocab))

Number of Unique Words:
> English: 36879
> Hindi: 43921


In [None]:
# creating the 't'
# Translation table, filled in by the training loop.
t = {}
# usage: t[('EN_word', 'HI_word')] = probability of EN_Word given HI_word
# Starting value given to a word pair the first time the training loop meets it.
uniform = 1 / (en_vocab * hi_vocab)

In [None]:
# EM training of the word-translation table `t`.
# Stops after `max_iters` iterations, or earlier once no re-estimated
# probability moved by more than the convergence threshold.
n_iters = 0
max_iters = 25

# fine_tune selects which word pairs the M-step re-estimates:
#   0 -> every (English, Hindi) pair that co-occurs in a training sentence pair
#   1 -> only pairs built from the `max_words` most frequent English and Hindi words
fine_tune = 1
has_converged = False

# Frequency rankings used when fine_tune == 1. en_words / hi_words are not
# modified by the training loop, so the rankings are computed once here
# instead of being re-sorted inside every iteration.
max_words = 1000
top_hi_words = [
    word
    for word, _ in sorted(hi_words.items(), key=lambda k: (k[1], k[0]), reverse=True)[:max_words]
]
top_en_words = [
    word
    for word, _ in sorted(en_words.items(), key=lambda k: (k[1], k[0]), reverse=True)[:max_words]
]

while n_iters < max_iters and not has_converged:
    has_converged = True
    max_change = -1

    n_iters += 1
    count = {}  # expected count of each (en_word, hi_word) pair
    total = {}  # expected count accumulated per hi_word
    for index in range(train_size):
        en_sentence = tokenized_stores['en_train'][index]
        hi_sentence = tokenized_stores['hi_train'][index]

        # per-sentence normaliser: sum of t over the Hindi words for each English word
        s_total = {}
        for en_word in en_sentence:
            s_total[en_word] = 0
            for hi_word in hi_sentence:
                pair = (en_word, hi_word)
                if pair not in t:
                    t[pair] = uniform
                s_total[en_word] += t[pair]

        # collect the normalised (fractional) counts
        for en_word in en_sentence:
            for hi_word in hi_sentence:
                pair = (en_word, hi_word)
                delta = t[pair] / s_total[en_word]
                count[pair] = count.get(pair, 0) + delta
                total[hi_word] = total.get(hi_word, 0) + delta

    # estimating the probabilities

    if fine_tune == 0:
        # `count` holds exactly the pairs that co-occur in training, each once,
        # so it can be iterated directly instead of re-scanning the corpus.
        threshold = 0.01
        pairs_to_update = count.keys()
    elif fine_tune == 1:
        # train it only for 1000 most frequent words in English and Hindi;
        # pairs that never co-occurred keep their current value in t (if any)
        threshold = 0.005
        pairs_to_update = (
            (en_word, hi_word)
            for hi_word in top_hi_words
            for en_word in top_en_words
            if (en_word, hi_word) in count
        )
    else:
        # unknown mode: nothing is re-estimated, so the loop ends after this pass
        threshold = 0
        pairs_to_update = ()

    for pair in pairs_to_update:
        # every pair in `count` also added to total[hi_word] in the loop above
        new_value = count[pair] / total[pair[1]]
        change = abs(t[pair] - new_value)
        if change > threshold:
            has_converged = False
            # max_change only tracks changes that exceeded the threshold
            max_change = max(max_change, change)
        t[pair] = new_value

    print("Iteration " + str(n_iters) + " Completed, Maximum Change: " + str(max_change))


Iteration 1 Completed, Maximum Change: 0.12702983945877983
Iteration 2 Completed, Maximum Change: 0.37839562630629314
Iteration 3 Completed, Maximum Change: 0.217407035860872
Iteration 4 Completed, Maximum Change: 0.13005997455980178
Iteration 5 Completed, Maximum Change: 0.08057651269471866
Iteration 6 Completed, Maximum Change: 0.04856997020110787
Iteration 7 Completed, Maximum Change: 0.03555362770400777
Iteration 8 Completed, Maximum Change: 0.029406614861381575
Iteration 9 Completed, Maximum Change: 0.02457095418019195
Iteration 10 Completed, Maximum Change: 0.020788408537065484
Iteration 11 Completed, Maximum Change: 0.018719192726659395
Iteration 12 Completed, Maximum Change: 0.01647236630228205
Iteration 13 Completed, Maximum Change: 0.014208922903773125
Iteration 14 Completed, Maximum Change: 0.012183247164818
Iteration 15 Completed, Maximum Change: 0.01070533230776105
Iteration 16 Completed, Maximum Change: 0.009634828804602424
Iteration 17 Completed, Maximum Change: 0.008727

In [None]:
# displaying the most confident translation pairs
limit = 40
# rank by probability, ties broken by the (en_word, hi_word) key, highest first
top_pairs = sorted(t.items(), key=lambda entry: (entry[1], entry[0]), reverse=True)[:limit]
for element in top_pairs:
    print(element)
# leave `limit` holding the unused part of the display budget
limit -= len(top_pairs)

(('or', 'या'), 0.7428089454650831)
(('and', 'तथा'), 0.7336960929966874)
(('and', 'और'), 0.7189563724564298)
(('and', 'एवं'), 0.7183124504420993)
(('and', 'व'), 0.7170500220588348)
((',', ','), 0.6962109618976713)
(('oil', 'तेल'), 0.6884574558949705)
(('30', '30'), 0.6773895694061998)
(('body', 'शरीर'), 0.6755868397968229)
(('Shimla', 'शिमला'), 0.6750139366454307)
(('water', 'पानी'), 0.6602254786077301)
(('chest', 'छाती'), 0.6591551695558174)
(('Delhi', 'दिल्ली'), 0.6563478755134947)
(('12', '12'), 0.6516055421315546)
(('skin', 'त्वचा'), 0.6493576943448509)
(('this', 'इस'), 0.6492213540524291)
(('20', '20'), 0.6399123466762006)
(('children', 'बच्चों'), 0.6399055603891741)
(('people', 'लोगों'), 0.6384099457254119)
(('milk', 'दूध'), 0.6271768274872059)
(('heart', 'हृदय'), 0.6271654259958892)
(('fever', 'बुखार'), 0.6269890380285558)
(('other', 'अन्य'), 0.6256219920564763)
(('stomach', 'पेट'), 0.6170447717986453)
(('disease', 'बीमारी'), 0.6152551231622909)
(('patients', 'रोगियों'), 0.610676

In [None]:
# saving the translation model
# `with` guarantees the file is flushed and closed even if pickling fails
with open("translation_model.pkl", "wb") as file:
    pickle.dump(t, file)

In [None]:
# using the model trained until convergence
# to use a saved model
model_name = "translation_model.pkl"
# NOTE: pickle.load can execute arbitrary code; only load model files you created yourself.
# `with` closes the handle once the table has been read.
with open(model_name, "rb") as pickle_in:
    t = pickle.load(pickle_in)

In [None]:
# I[english sentence length][hi_id - en_id] = [sum of t's, count]
# accumulated over every (English position, Hindi position) pair in the training data
I = {}
for index in range(train_size):
    en_sentence = tokenized_stores['en_train'][index]
    hi_sentence = tokenized_stores['hi_train'][index]
    length = len(en_sentence)
    for en_id, en_word in enumerate(en_sentence):
        by_offset = I.setdefault(length, {})
        for hi_id, hi_word in enumerate(hi_sentence):
            offset = hi_id - en_id
            pair_prob = t[(en_word, hi_word)]
            if offset in by_offset:
                by_offset[offset][0] += pair_prob
                by_offset[offset][1] += 1
            else:
                by_offset[offset] = [pair_prob, 1]

In [None]:
# viewing the available sentence lengths encountered during training
# (the keys of I are already distinct, so sorting them is all that is needed)
sentence_lengths = sorted(I.keys())
print(sentence_lengths)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 79, 80, 83, 93, 96, 100, 107]


In [None]:
# computing the alignment probabilities
# p[I][hi_id - en_id] = p(i | i', I)

p = {}
for sentence_length, offsets in I.items():
    # average t per positional difference, accumulated in dictionary order
    mean_t = {}
    normaliser = 0
    for diff, (t_sum, n_pairs) in offsets.items():
        mean_t[diff] = t_sum / n_pairs
        normaliser += mean_t[diff]
    # rescale so the values for one sentence length sum to 1
    p[sentence_length] = {diff: value / normaliser for diff, value in mean_t.items()}

In [None]:
# report training pairs whose English side is a single token while the Hindi
# side is more than 10 tokens longer
for index in range(train_size):
    length_en = len(tokenized_stores['en_train'][index])
    length_hi = len(tokenized_stores['hi_train'][index])
    if length_en != 1 or length_hi - length_en <= 10:
        continue
    print("Length of English Sentence:", str(length_en))
    print("Length of Hindi Sentence:", str(length_hi))

# there exists an English sentence with one token s.t the Hindi translation contains 19 tokens

Length of English Sentence: 1
Length of Hindi Sentence: 19


In [None]:
# computing initial transitions
# init[length] = the positional difference with the highest alignment probability
# (the first one wins on ties; 0 when a length has no entries)
init = {
    length: max(jumps, key=jumps.get, default=0)
    for length, jumps in p.items()
}

In [None]:
!pip install nltk



In [None]:
# computing the transition probabilities for Hindi
# bigrams[(previous_token, token)] -> occurrence count; the first token of a
# sentence is paired with '' as its previous token
bigrams = {}
# unigrams[token] -> occurrence count
unigrams = {}

# training on the train_set
def model(dataset_size, dataset_name):
    """Add the n-gram counts of one data split to the global count tables.

    Reads the first `dataset_size` sentences of tokenized_stores[dataset_name]
    and increments `unigrams[token]` and `bigrams[(previous, token)]` for every
    token. The first token of each sentence is paired with '' as its
    predecessor.
    """
    global bigrams, unigrams
    for index in range(dataset_size):
        previous = ''
        for current in tokenized_stores[dataset_name][index]:
            unigrams[current] = unigrams.get(current, 0) + 1

            pair = (previous, current)
            bigrams[pair] = bigrams.get(pair, 0) + 1
            previous = current

# the language model counts are collected from both the train and the dev split
for split_size, split_name in ((train_size, 'hi_train'), (dev_size, 'hi_dev')):
    model(split_size, split_name)

# number of distinct n-grams seen
bigram_count, unigram_count = len(bigrams), len(unigrams)
print("Number of Unique Bigrams:", bigram_count)
print("Number of Unique Unigrams:", unigram_count)

Number of Unique Bigrams: 317170
Number of Unique Unigrams: 43851


In [None]:
from itertools import permutations
import nltk

# [english tokens, reference tokens, translation, BLEU scores] per evaluated sentence
computed_sentences = []
# running BLEU totals, keyed by the number of the NLTK smoothing method
total_BLEU = dict.fromkeys((1, 2, 3, 4, 5, 7), 0)
# evaluated sentences whose method-1 BLEU score is exactly 0
null_BLEU_count = 0

# translation table entries, most probable first (ties broken by the word pair)
sorted_t = sorted(t.items(), key=lambda entry: (entry[1], entry[0]), reverse=True)

def find_translation(en_token, ranked_pairs=None):
    """Return the Hindi word of the best-ranked entry for `en_token`.

    Parameters:
        en_token: English token to translate.
        ranked_pairs: optional iterable of ((en_word, hi_word), probability)
            entries ordered best-first. Defaults to the global `sorted_t`.

    Returns:
        The Hindi word of the first entry whose English word equals `en_token`
        ignoring case, or "" when there is no such entry.

    Both sides of the comparison are lower-cased. Previously only the table
    word was, so a token containing an upper-case letter could never match.
    """
    if ranked_pairs is None:
        ranked_pairs = sorted_t
    wanted = en_token.lower()  # hoisted: constant for the whole scan
    for (en_word, hi_word), _ in ranked_pairs:
        if en_word.lower() == wanted:
            return hi_word
    return ""

def get_prob(seq):
    """Score a sequence of Hindi tokens with the bigram language model.

    bigram language model with laplace smoothing and backoff:
    - sequences shorter than two tokens score 1;
    - a seen bigram adds (bigram count + 1) / (count of previous token + unigram_count);
    - an unseen bigram whose token is known adds unigrams[token] / unigram_count;
    - a token that is not in `unigrams` adds nothing and does not become the
      "previous" token for the next step.
    The per-token terms are summed; '' stands in as the token before the first one.

    NOTE(review): unigram_count is the number of distinct tokens, so the
    back-off term can exceed 1 for frequent words -- confirm this is intended.
    """
    if len(seq) < 2:
        return 1

    score = 0
    previous = ''
    for current in seq:
        pair = (previous, current)
        if pair in bigrams:
            context_count = unigrams.get(previous, 0)
            score += (bigrams[pair] + 1) / (context_count + unigram_count)
        elif current in unigrams:
            score += unigrams[current] / unigram_count
        else:
            continue
        previous = current
    return score

# Translate and score every test sentence that has between 2 and 8 tokens.
count = 0
for index in range(test_size):
    en_sentence = tokenized_stores['en_test'][index]
    if not 2 <= len(en_sentence) <= 8:
        continue

    # word-by-word translation; tokens without a translation are dropped
    translated_words = [
        hi_word for hi_word in map(find_translation, en_sentence) if hi_word != ""
    ]

    # choose the ordering of the translated words that the language model scores highest
    best_seq = translated_words
    best_prob = -1
    for seq in permutations(translated_words):
        prob = get_prob(seq)
        if prob > best_prob:
            best_prob, best_seq = prob, seq

    # Collecting BLEU_scores with various kinds of Smoothing
    reference = tokenized_stores['hi_test'][index]
    BLEU_scores = [
        nltk.translate.bleu_score.sentence_bleu(
            [reference],
            best_seq,
            smoothing_function=getattr(
                nltk.translate.bleu_score.SmoothingFunction(), "method" + str(method)
            ),
        )
        for method in (1, 2, 3, 4, 5, 7)
    ]

    # the score of method 7 sits at position 5; methods 1-5 sit at positions 0-4
    for key in total_BLEU:
        total_BLEU[key] += BLEU_scores[5 if key == 7 else key - 1]

    if BLEU_scores[0] == 0:
        null_BLEU_count += 1

    count += 1
    print("Sentence Index: ", str(count))
    print("English Sentence:", str(en_sentence))
    print("Reference Hindi Sentence:", str(reference))
    print("Translated Sentence:", str(best_seq))
    print("Translation BLEU Scores", str(BLEU_scores))
    print()

    computed_sentences.append([en_sentence, reference, best_seq, BLEU_scores])

# number of sentences that were actually translated and scored
tested = count

Sentence Index:  1
English Sentence: ['Your', 'self-confidence', 'also', 'increases', 'with', 'teeth', '.']
Reference Hindi Sentence: ['दाँतों', 'से', 'आपका', 'आत्मविश्\u200dवास', 'भी', 'बढ़ता', 'है', '।']
Translated Sentence: ('।', 'बढ़', 'साथ', 'दाँतों', '।', 'भी')
Translation BLEU Scores [0.03849815007763549, 0.18822631894109965, 0.07654112967106118, 0.18815926093992244, 0.09193101000946054, 0.2362891668915472]

Sentence Index:  2
English Sentence: ['Bacteria', 'stay', 'between', 'our', 'gums', 'and', 'teeth', '.']
Reference Hindi Sentence: ['हमारे', 'मसूढ़ों', 'और', 'दाँतों', 'के', 'बीच', 'बैक्टीरिया', 'मौजूद', 'होते', 'हैं', '।']
Translated Sentence: ('।', 'ठहरने', 'बीच', 'हमारे', '’', 'दाँतों', 'तथा')
Translation BLEU Scores [0.02638012815011716, 0.13190064075058583, 0.05244835934727967, 0.15903757829743673, 0.07762860946605708, 0.19734686260161405]

Sentence Index:  3
English Sentence: ['They', 'make', 'teeth', 'dirty', 'and', 'breath', 'stinky', '.']
Reference Hindi Sentence: ['

In [None]:
# Results:
import statistics
print("Number of Samples Tested Upon: " + str(tested))
print()

# Guard both averages: `tested` is 0 when no test sentence passed the length
# filter, and `scored` is 0 when every evaluated sentence had a zero method-1
# BLEU score. In those cases "nan" is reported instead of raising ZeroDivisionError.
scored = tested - null_BLEU_count

print("Average BLEU Score using Various Smoothing Functions (considering all test samples)")
for key in total_BLEU:
    average = total_BLEU[key] / tested if tested else float("nan")
    print("Method " + str(key) + ": " + str(average))
print()
print("Average BLEU Score using Various Smoothing Functions (considering test samples with at-least one word overlap)")
for key in total_BLEU:
    average = total_BLEU[key] / scored if scored else float("nan")
    print("Method " + str(key) + ": " + str(average))

Number of Samples Tested Upon: 50

Average BLEU Score using Various Smoothing Functions (considering all test samples)
Method 1: 0.04045334190563911
Method 2: 0.18790274652264405
Method 3: 0.08042839674590069
Method 4: 0.15545775044775517
Method 5: 0.07580429833529106
Method 7: 0.2014522086114808

Average BLEU Score using Various Smoothing Functions (considering test samples with at-least one word overlap)
Method 1: 0.04127892031187664
Method 2: 0.1917374964516776
Method 3: 0.08206979259785784
Method 4: 0.15863035759975017
Method 5: 0.07735132483192965
Method 7: 0.2055634781749804


In [None]:
# ^_^ Thank You