<a href="https://colab.research.google.com/github/mr-cri-spy/NLP-Projects/blob/main/Word_Alignment_(EM_Training)_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import random
from collections import defaultdict

# IBM Model 1 style word alignment trained with EM.
#
# Learns lexical translation probabilities t(f|e) -- the probability that the
# English word e generates the foreign word f -- from sentence pairs alone.

# Sample parallel corpus: each entry pairs a tokenized English sentence (e)
# with its tokenized foreign-language translation (f).
parallel_corpus = [
    (["mango", "is", "sweet"], ["hannu", "sihi"]),
    (["akki", "eats", "mango"], ["akki", "thinnu", "hannu"]),
    (["gopala", "likes", "fruit"], ["gopala", "ishta", "hannu"]),
]

# Translation table, indexed as t[f][e] for t(f|e).
# Every entry starts at the same constant (1.0). The values are not normalized
# yet, but the E-step divides by the per-sentence sum Z, so only the ratios
# between entries matter on the first pass.
t = defaultdict(lambda: defaultdict(lambda: 1.0))

# Collect all unique words on each side of the corpus.
eng_vocab = {e for e_sentence, _ in parallel_corpus for e in e_sentence}
for_vocab = {f for _, f_sentence in parallel_corpus for f in f_sentence}

# EM algorithm
num_iterations = 10
for iteration in range(num_iterations):
    # Expected counts are rebuilt from scratch on every iteration.
    count = defaultdict(lambda: defaultdict(float))  # count[f][e]
    total = defaultdict(float)                       # total[e] = sum_f count[f][e]

    # E-step: for each foreign word, spread one unit of count over the English
    # words of the same sentence, in proportion to the current t(f|e).
    for e_sentence, f_sentence in parallel_corpus:
        for f in f_sentence:
            # Normalizer: total probability of f under any word of e_sentence.
            Z = sum(t[f][e] for e in e_sentence)
            for e in e_sentence:
                c = t[f][e] / Z
                # Fixed: the original added the undefined name `c_` here,
                # which raised NameError on the very first E-step.
                count[f][e] += c
                total[e] += c

    # M-step: re-estimate t(f|e) as the share of e's expected count that went
    # to f. Pairs that never co-occur in a sentence end up with probability 0.
    for f in for_vocab:
        for e in eng_vocab:
            if total[e] > 0:
                t[f][e] = count[f][e] / total[e]

# Show results (pairs whose rounded probability is at most 0.01 are omitted).
print("\nLearned Translation Probabilities t(f|e):\n")
for f in for_vocab:
    for e in eng_vocab:
        prob = round(t[f][e], 3)
        if prob > 0.01:
            print(f"t({f}|{e}) = {prob}")



Learned Translation Probabilities t(f|e):

t(gopala|likes) = 0.333
t(gopala|gopala) = 0.333
t(gopala|fruit) = 0.333
t(akki|eats) = 0.463
t(akki|akki) = 0.463
t(akki|mango) = 0.019
t(sihi|sweet) = 0.729
t(sihi|is) = 0.729
t(ishta|likes) = 0.333
t(ishta|gopala) = 0.333
t(ishta|fruit) = 0.333
t(thinnu|eats) = 0.463
t(thinnu|akki) = 0.463
t(thinnu|mango) = 0.019
t(hannu|likes) = 0.333
t(hannu|eats) = 0.074
t(hannu|akki) = 0.074
t(hannu|gopala) = 0.333
t(hannu|sweet) = 0.271
t(hannu|mango) = 0.962
t(hannu|is) = 0.271
t(hannu|fruit) = 0.333
