# Assignment 5 - Natural Language Processing

- Student 1 - Luca Modica
- Student 2 - Hugo Alves Henriques E Silva

---

In [73]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import math

sns.set_style()
%matplotlib inline

## Reading data

In [74]:
from collections import Counter
import re

# Paths to the files
de_file_path = 'dat410_europarl/europarl-v7.de-en.lc.de'
en_de_file_path = 'dat410_europarl/europarl-v7.de-en.lc.en'
fr_file_path = 'dat410_europarl/europarl-v7.fr-en.lc.fr'
en_fr_file_path = 'dat410_europarl/europarl-v7.fr-en.lc.en'
sv_file_path = 'dat410_europarl/europarl-v7.sv-en.lc.sv'
en_sv_file_path = 'dat410_europarl/europarl-v7.sv-en.lc.en'

## Warmup

In [75]:
# Function to read a file and return word frequencies
def get_word_frequencies(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read().lower()  # Ensure all text is lowercase
        words = re.findall(r'\b\w+\b', text)  # Extract words
        word_freq = Counter(words)  # Count word frequencies
    return word_freq


In [76]:
# Get word frequencies for German-English pair
de_word_freq = get_word_frequencies(de_file_path)
en_de_word_freq = get_word_frequencies(en_de_file_path)

# Print the 10 most common words in German and English (German-English pair)
de_common_words = de_word_freq.most_common(10)
en_de_common_words = en_de_word_freq.most_common(10)

# Get word frequencies for French-English pair
fr_word_freq = get_word_frequencies(fr_file_path)
en_fr_word_freq = get_word_frequencies(en_fr_file_path)

# Get word frequencies for Swedish-English pair
sv_word_freq = get_word_frequencies(sv_file_path)
en_sv_word_freq = get_word_frequencies(en_sv_file_path)

# Print the 10 most common words in French, English (French-English pair), Swedish, and English (Swedish-English pair)
fr_common_words = fr_word_freq.most_common(10)
en_fr_common_words = en_fr_word_freq.most_common(10)
sv_common_words = sv_word_freq.most_common(10)
en_sv_common_words = en_sv_word_freq.most_common(10)

print("Most common words in German:", de_common_words)
print("Most common words in English (German-English pair):", en_de_common_words)
print("Most common words in French:", fr_common_words)
print("Most common words in English (French-English pair):", en_fr_common_words)
print("Most common words in Swedish:", sv_common_words)
print("Most common words in English (Swedish-English pair):", en_sv_common_words)

Most common words in German: [('die', 10521), ('der', 9374), ('und', 7028), ('in', 4175), ('zu', 3169), ('den', 2976), ('wir', 2863), ('daß', 2738), ('ich', 2670), ('das', 2669)]
Most common words in English (German-English pair): [('the', 19853), ('of', 9633), ('to', 9069), ('and', 7307), ('in', 6278), ('is', 4478), ('that', 4441), ('a', 4438), ('we', 3372), ('this', 3362)]
Most common words in French: [('apos', 16729), ('de', 14528), ('la', 9746), ('et', 6620), ('l', 6536), ('le', 6177), ('à', 5588), ('les', 5587), ('des', 5232), ('que', 4797)]
Most common words in English (French-English pair): [('the', 19627), ('of', 9534), ('to', 8992), ('and', 7214), ('in', 6197), ('is', 4453), ('that', 4421), ('a', 4388), ('we', 3341), ('this', 3332)]
Most common words in Swedish: [('att', 9181), ('och', 7038), ('i', 5954), ('det', 5687), ('som', 5028), ('för', 4959), ('av', 4013), ('är', 3840), ('en', 3724), ('vi', 3211)]
Most common words in English (Swedish-English pair): [('the', 19327), ('o

In [77]:
# Calculate the total word counts and the counts for 'speaker' and 'zebra' across all English files
total_words = sum(en_de_word_freq.values()) + sum(en_fr_word_freq.values()) + sum(en_sv_word_freq.values())
speaker_count = en_de_word_freq['speaker'] + en_fr_word_freq['speaker'] + en_sv_word_freq['speaker']
zebra_count = en_de_word_freq['zebra'] + en_fr_word_freq['zebra'] + en_sv_word_freq['zebra']

# Calculate probabilities
prob_speaker = speaker_count / total_words
prob_zebra = zebra_count / total_words

print("Total words:", total_words)
print("Speaker count:", speaker_count)
print("Zebra count:", zebra_count)
print("Probability of 'speaker':", prob_speaker)
print("Probability of 'zebra':", prob_zebra)


Total words: 784458
Speaker count: 33
Zebra count: 0
Probability of 'speaker': 4.206726172720528e-05
Probability of 'zebra': 0.0


## Language modeling

In [78]:
from nltk.tokenize import word_tokenize
from nltk.util import bigrams

# Function to tokenize corpus into bigrams with start and end tokens
def create_bigrams(text):
    sentences = text.split('\n')
    bigram_list = []
    for sentence in sentences:
        tokens = ['<START>'] + word_tokenize(sentence)
        bigram_list.extend(list(bigrams(tokens)))
    return bigram_list

# Read the English text files from all three pairs to create a single corpus
corpus_de_en = open(en_de_file_path, 'r', encoding='utf-8').read()
corpus_fr_en = open(en_fr_file_path, 'r', encoding='utf-8').read()
corpus_sv_en = open(en_sv_file_path, 'r', encoding='utf-8').read()

# Combine the corpora
combined_corpus = '\n'.join([corpus_de_en, corpus_fr_en, corpus_sv_en])

# Create bigrams from the combined corpus
bigram_list = create_bigrams(combined_corpus)

# Calculate bigram and unigram counts
unigram_counts = Counter([unigram for bigram in bigram_list for unigram in bigram])
bigram_counts = Counter(bigram_list)

# Function to calculate bigram probabilities using MLE
def calculate_bigram_prob(bigram):
    return bigram_counts[bigram] / unigram_counts[bigram[0]]

# Test the function with an example bigram
example_bigram = ('<START>', 'the')
print("Probability of", example_bigram, ":", calculate_bigram_prob(example_bigram))
example_bigram = ('the', 'zebra')
print("Probability of", example_bigram, ":", calculate_bigram_prob(example_bigram))

Probability of ('<START>', 'the') : 0.11426666666666667
Probability of ('the', 'zebra') : 0.0


In [79]:
def calculate_sentence_prob(sentence):
    sentence_bigram_list = create_bigrams(sentence)
    probability = 1
    for bigram in sentence_bigram_list:
        probability *= calculate_bigram_prob(bigram)
    return probability


print(
    f'Probability of "why are no-smoking areas not enforced ?": {calculate_sentence_prob("why are no-smoking areas not enforced ?")}')
print(
    f'Probability of "the door is green": {calculate_sentence_prob("the door is green")}')
print(
    f'Probability of "we pass": {calculate_sentence_prob("we pass")}')

Probability of "why are no-smoking areas not enforced ?": 7.157743716417319e-18
Probability of "the door is green": 0.0
Probability of "we pass": 2.0531400966183576e-05


When we encounter a word that did not appear in the training texts, this will result in a probability of zero for any bigram containing this word, making the probability of the entire sentence zero. This is a common issue in language modeling known as the zero-probability problem, and it can be handled using techniques like Laplace (add-one) smoothing.

If the sentence is very long, the probability of the sentence will tend to be very small due to the multiplication of probabilities, which can lead to underflow problems in computers. One way to handle this is by working with the log probabilities instead of the raw probabilities.

In [80]:
# Calculate the vocabulary size
vocabulary_size = len(unigram_counts)

# Function to calculate bigram probabilities using Laplace smoothing
def calculate_bigram_log_prob_with_laplace(bigram):
    numerator = bigram_counts[bigram] + 1  # Add one to the count for Laplace smoothing
    denominator = unigram_counts[bigram[0]] + vocabulary_size  # Add vocabulary size for Laplace smoothing
    return math.log(numerator) - math.log(denominator)


#calculate probability of a sentence
def calculate_sentence_prob_improved(sentence):
    tokens = ['<START>'] + word_tokenize(sentence.lower())
    probability = 0
    for i in range(len(tokens) - 1):
        bigram = (tokens[i], tokens[i + 1])
        probability += calculate_bigram_log_prob_with_laplace(bigram)
    return probability


print(
    f'Log probability of "why are no-smoking areas not enforced ?": {calculate_sentence_prob_improved("why are no-smoking areas not enforced ?")}')
print(
    f'Log probability of "the door is open": {calculate_sentence_prob_improved("the door is open")}')
print(
    f'Log probability of "the door is green": {calculate_sentence_prob_improved("the door is green")}')
print(
    f'Log probability of "we pass": {calculate_sentence_prob_improved("we pass")}')


Log probability of "why are no-smoking areas not enforced ?": -57.44415862256026
Log probability of "the door is open": -29.796067530994655
Log probability of "the door is green": -31.741977680049967
Log probability of "we pass": -11.416178508839266


The more negative a log probability is, the less likely the sentence is.

## Translation modeling

In [81]:
import string

def tokenize_corpus(corpus, add_null=False):
    """Tokenize the input corpus (a list of sentences) into a list of lists of tokens.
    Optionally add a NULL token at the beginning of each sentence."""
    clean_corpus = [sentence.translate(str.maketrans(
        '', '', string.punctuation)) for sentence in corpus]
    tokenized_corpus = [sentence.lower().split() for sentence in clean_corpus]
    if add_null:
        for sentence in tokenized_corpus:
            sentence.insert(0, "<NULL>")
    return tokenized_corpus


def initialize_translation_prob(corpus_english, corpus_foreign):
    """Initialize translation probabilities with a lower probability for NULL."""

    word_correspondence = {}

    for sentence_e, sentence_f in zip(corpus_english, corpus_foreign):
        for word_e in sentence_e:
            if word_e not in word_correspondence:
                word_correspondence[word_e] = []
            for word_f in sentence_f:
                if word_f not in word_correspondence[word_e]:
                    word_correspondence[word_e] += [word_f]


    translation_prob = {}
    null_prob = 0.00001

    # print(corpus_foreign)


    for word_e in word_correspondence:
        for word_f in word_correspondence[word_e]:
            if word_f == "<NULL>":
               translation_prob[(word_e, word_f)] = null_prob
            else:
                translation_prob[(word_e, word_f)] = (1 - null_prob) / (len(word_correspondence[word_e]) - 1)
            #translation_prob[(word_e, word_f)] = 1 / len(word_correspondence[word_e])

    return translation_prob




print(tokenize_corpus(["The dog runs", "The cat sleeps"]))
print(initialize_translation_prob(tokenize_corpus(["The dog runs", "The cat sleeps", "I am"]), tokenize_corpus(["Le chien court", "Le chat dort", "Je suis"], add_null=True)))

[['the', 'dog', 'runs'], ['the', 'cat', 'sleeps']]
{('the', '<NULL>'): 1e-05, ('the', 'le'): 0.199998, ('the', 'chien'): 0.199998, ('the', 'court'): 0.199998, ('the', 'chat'): 0.199998, ('the', 'dort'): 0.199998, ('dog', '<NULL>'): 1e-05, ('dog', 'le'): 0.33333, ('dog', 'chien'): 0.33333, ('dog', 'court'): 0.33333, ('runs', '<NULL>'): 1e-05, ('runs', 'le'): 0.33333, ('runs', 'chien'): 0.33333, ('runs', 'court'): 0.33333, ('cat', '<NULL>'): 1e-05, ('cat', 'le'): 0.33333, ('cat', 'chat'): 0.33333, ('cat', 'dort'): 0.33333, ('sleeps', '<NULL>'): 1e-05, ('sleeps', 'le'): 0.33333, ('sleeps', 'chat'): 0.33333, ('sleeps', 'dort'): 0.33333, ('i', '<NULL>'): 1e-05, ('i', 'je'): 0.499995, ('i', 'suis'): 0.499995, ('am', '<NULL>'): 1e-05, ('am', 'je'): 0.499995, ('am', 'suis'): 0.499995}


In [82]:
from collections import defaultdict

def ibm_model_1(corpus_english, corpus_foreign, iterations=10):
    # Assuming tokenize_corpus adds a "null" token to the beginning of each English sentence
    # and splits sentences into lists of words.
    corpus_foreign_tokens = tokenize_corpus(corpus_foreign, add_null=True)  # foreign language corpus
    corpus_english_tokens = tokenize_corpus(corpus_english)  # English corpus, with null word

    # Initialize translation probabilities uniformly
    translation_prob = initialize_translation_prob(corpus_english_tokens, corpus_foreign_tokens)

    for iteration in range(iterations):
        count_ef = defaultdict(float)
        total_e = defaultdict(float)
        
        # E-step: Expectation
        for sentence_e, sentence_f in zip(corpus_english_tokens, corpus_foreign_tokens):
            # For each word in the english sentence
            for word_f in sentence_f:
                # Compute normalization factor for the word2
                s_total_word_e = sum(translation_prob[(word_e, word_f)] for word_e in sentence_e)
                # For each word in the foreign sentence
                for word_e in sentence_e:
                    # Calculate delta, which is the proportion of the alignment probability of the word2 to the word1
                    delta = translation_prob[(word_e, word_f)] / s_total_word_e
                    # Update counts
                    count_ef[(word_e, word_f)] += delta
                    total_e[word_e] += delta
        
        # M-step: Maximization
        for (word_e, word_f), count in count_ef.items():
            translation_prob[(word_e, word_f)] = count / total_e[word_e]


        # normalize probabilities
        new_dict = {}
        for key, value in translation_prob.items():
            if key[0] not in new_dict:
                new_dict[key[0]] = value
            else:
                new_dict[key[0]] += value

        for key, value in translation_prob.items():
            translation_prob[key] = value / new_dict[key[0]]

    return translation_prob



In [83]:
# Example usage (using dummy data):
corpus1 = ["the house", "the book", "a big house"]
corpus2 = ["das haus", "das buch", "ein großes haus"]  # Assuming German for demonstration

print(initialize_translation_prob(tokenize_corpus(corpus1), tokenize_corpus(corpus2, add_null=True)))

# Estimate translation probabilities
translation_prob = ibm_model_1(corpus1, corpus2, iterations=100)

# Find translations for a specific word (e.g., "house")
translations_for_word = {pair[1]: prob for pair, prob in translation_prob.items() if pair[0] == "house"}
# Sort translations by probability
sorted_translations = sorted(translations_for_word.items(), key=lambda item: item[1], reverse=True)

# Print top N translations
print("Top translations for 'house':")
summm = 0
for foreign_word, prob in sorted_translations[:10]:
    print(f"{foreign_word}: {prob}")
    summm += prob
print(summm)



{('the', '<NULL>'): 1e-05, ('the', 'das'): 0.33333, ('the', 'haus'): 0.33333, ('the', 'buch'): 0.33333, ('house', '<NULL>'): 1e-05, ('house', 'das'): 0.2499975, ('house', 'haus'): 0.2499975, ('house', 'ein'): 0.2499975, ('house', 'großes'): 0.2499975, ('book', '<NULL>'): 1e-05, ('book', 'das'): 0.499995, ('book', 'buch'): 0.499995, ('a', '<NULL>'): 1e-05, ('a', 'ein'): 0.33333, ('a', 'großes'): 0.33333, ('a', 'haus'): 0.33333, ('big', '<NULL>'): 1e-05, ('big', 'ein'): 0.33333, ('big', 'großes'): 0.33333, ('big', 'haus'): 0.33333}
Top translations for 'house':
haus: 0.6693175783779879
<NULL>: 0.33068242162201217
das: 3.535814090616524e-27
ein: 2.6127305810792512e-33
großes: 2.6127305810792512e-33
1.0


Need to be very careful with the NULL probability. It needs to be way lower than the other probabilities, otherwise the model will always choose the NULL translation, because NULL will be present in every sentence. Increasing the number of iterations will eventually make null the most probable translation for every word.

In [84]:
#Write code that implements the estimation algorithm for IBM model 1.
# Then print, for either Swedish, German, or French, the 10 words that 
#the English word european is most likely to be translated into, according 
#to your estimate. It can be interesting to look at this list of 10 words and
#see how it changes during the EM iterations.

#reduce the size of corpus_de_en and corpus_sv_en
sv_en = open(en_sv_file_path, 'r', encoding='utf-8').read()
#separate the sentences
sv_en = sv_en.split("\n")
corpus_en = sv_en[:]

sv = open(sv_file_path, 'r', encoding='utf-8').read()
#separate the sentences
sv = sv.split("\n")
corpus_sv = sv[:]

#print(corpus_en)
#print(corpus_sv)

# Estimate translation probabilities
translation_prob = ibm_model_1(corpus_en, corpus_sv, iterations=30)


In [92]:
# Find translations for a specific word (e.g., "european")
translations_for_word = {pair[1]: prob for pair, prob in translation_prob.items() if pair[0] == "prosecutor"}
# Sort the top 10 translations by probability

suma = 0

for i in translations_for_word:
    suma = suma + translations_for_word[i]
swedish_translations = sorted(translations_for_word.items(), key=lambda item: item[1], reverse=True)[:10]

print("Top translations for 'european' in Swedish:")

for foreign_word, prob in swedish_translations:
    print(f"{foreign_word}: {prob}")

print(suma)

Top translations for 'european' in Swedish:
europeisk: 0.3629289715804933
åklagare: 0.3338594041812997
åklagarmyndighet: 0.16503145376272627
allmän: 0.10058253866137612
åklagarmyndigheten: 0.03614154802863032
en: 0.0012240205256121347
skulle: 0.00021618874044551938
vara: 1.577447072652295e-05
inrättas: 5.525971338434354e-08
ha: 1.0693275441837174e-08
1.0000000000000002


In [112]:
def get_top_n_word_translations(foreign_sentence, translation_prob, n):

    word_translations = {}

    for word in foreign_sentence:
        #get top 5 translations for each word
        translations_for_word = {pair[0]: prob for pair, prob in translation_prob.items() if pair[1] == word}
        # Sort translations by probability
        sorted_translations = sorted(translations_for_word.items(), key=lambda item: item[1], reverse=True)[:n]
        #print(f"Top translations for '{word}':")
        #for foreign_word, prob in sorted_translations[:n]:
         #   print(f"{foreign_word}: {prob}")
        #print("\n")

        word_translations[word] = sorted_translations

    return word_translations

""" for word in top_n_word_translations:

        # get 

        for i in top_n_word_translations[word]:
    
            sentences.append(i[0])

        translations.append(sentence)

    print("Translations:", translations)  """  


def translate_sentence(foreign_sentence, top_n_word_translations):

    translations = []

    #make a combination of every single possibility, maintaining the order of the words

    for word in foreign_sentence:

        if word in top_n_word_translations:

            if len(translations) == 0:

                for translation in top_n_word_translations[word]:

                    translations.append([translation])

            else:

                temp_translations = []

                for sentence in translations:

                    for translation in top_n_word_translations[word]:

                        temp_translations.append(sentence + [translation])

                translations = temp_translations


    #print("Translations:", translations)
    #raise ValueError("")

    #for translation in translations:
        #print(translation)

    highest_prob_sentence = []
    highest_prob = -10000000

    for sentence in translations:

        sentence_probability = calculate_sentence_prob_improved(" ".join([word[0] for word in sentence]))

        if sentence_probability > highest_prob:
            highest_prob = sentence_probability
            highest_prob_sentence = sentence

    #print("Highest probability sentence:", highest_prob_sentence)

    return " ".join([word[0] for word in highest_prob_sentence])





sentence = "jag är europeiska"

swedish_sentence = sentence.split()

top_5_word_translations = get_top_n_word_translations(swedish_sentence, translation_prob, 5)

#print(top_5_word_translations)

translated_sentence = translate_sentence(swedish_sentence, top_5_word_translations)

print("Translated sentence:", translated_sentence)


#print the frist 10 elements of translation_prob
#for i in range(10):
    #print(list(translation_prob.items())[i])

#jag förklarar europaparlamentets session återupptagen efter avbrottet den 17 december . jag vill på nytt önska er ett gott nytt år och jag hoppas att ni haft en trevlig semester .


Translated sentence: i am european


## Decoding