In [None]:
import numpy as np
import random 
from operator import itemgetter 
from collections import Counter, defaultdict
import sys

# Parallel corpus files, keyed by language code
# (the "lc" in the file names presumably stands for lower-cased -- TODO confirm).
dataset = dict(
    sv="europarl-v7.sv-en.lc.sv",
    en="europarl-v7.sv-en.lc.en",
)

# Smallest positive normalised float; used further down as a stand-in for
# zero so that logarithms and divisions stay finite.
MIN_PROB = sys.float_info.min

In [None]:
class Corpus:
  """Whitespace-tokenised text corpus with one document per line of a file.

  Attributes:
    documents: list of token lists, one per line of the input file.
    word_counts: Counter mapping each token to its number of occurrences.
  """

  def __init__(self, file_name, encoding):
    # Read every line first, then derive the token lists and the counts.
    with open(file_name, encoding=encoding) as handle:
      raw_lines = list(handle)

    self.documents = self.__get_documents(raw_lines)
    self.word_counts = self.__get_counts(self.documents)

  def __get_documents(self, lines: list) -> list:
    """Split every line on whitespace and return one token list per line."""
    return [line.split() for line in lines]

  def __get_counts(self, documents: list) -> Counter:
    """Count how often each token occurs across all documents."""
    return Counter(token for doc in documents for token in doc)

  def get_vocab(self) -> list:
    """Return the distinct tokens, i.e. the keys of ``word_counts``.

    The returned object is the counter's key view, not a list copy.
    """
    return self.word_counts.keys()

In [None]:
# Build one Corpus per language, then show the first two sentence pairs
corp_sv = Corpus(dataset['sv'], "UTF-8")
corp_en = Corpus(dataset['en'], "UTF-8")
for doc_sv, doc_en in zip(corp_sv.documents[:2], corp_en.documents[:2]):
  # One call prints both token lists followed by an empty separator line
  print(doc_sv, doc_en, "", sep="\n")

['jag', 'förklarar', 'europaparlamentets', 'session', 'återupptagen', 'efter', 'avbrottet', 'den', '17', 'december', '.', 'jag', 'vill', 'på', 'nytt', 'önska', 'er', 'ett', 'gott', 'nytt', 'år', 'och', 'jag', 'hoppas', 'att', 'ni', 'haft', 'en', 'trevlig', 'semester', '.']
['i', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'european', 'parliament', 'adjourned', 'on', 'friday', '17', 'december', '1999', ',', 'and', 'i', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period', '.']

['som', 'ni', 'kunnat', 'konstatera', 'ägde', '&quot;', 'den', 'stora', 'år', '2000-buggen', '&quot;', 'aldrig', 'rum', '.', 'däremot', 'har', 'invånarna', 'i', 'ett', 'antal', 'av', 'våra', 'medlemsländer', 'drabbats', 'av', 'naturkatastrofer', 'som', 'verkligen', 'varit', 'förskräckliga', '.']
['although', ',', 'as', 'you', 'will', 'have', 'seen', ',', 'the', 'dreaded', '&apos;', 'millenni

### a) Warmup


In [None]:
# Ten most frequent tokens per language. Passing the limit to most_common()
# avoids sorting the whole vocabulary just to keep its first ten entries;
# the selection and tie order are the same as slicing the full ranking.
print([word for word, _ in corp_sv.word_counts.most_common(10)])
print([word for word, _ in corp_en.word_counts.most_common(10)])

['.', 'att', ',', 'och', 'i', 'det', 'som', 'för', 'av', 'är']
['the', ',', '.', 'of', 'to', 'and', 'in', 'is', 'that', 'a']


In [None]:
# Unigram count and relative frequency of 'speaker' and 'zebra' in the English corpus
n_words_en = sum(corp_en.word_counts.values())

# A Counter returns 0 for tokens it has never seen, so 'zebra' needs no special case
speaker_count = corp_en.word_counts['speaker']
zebra_count = corp_en.word_counts['zebra']

speaker_prob = speaker_count/n_words_en
zebra_prob = zebra_count/n_words_en

print(f"'speaker' - count: {speaker_count}, prob: {speaker_prob:.7f}")
print(f"'zebra' - count: {zebra_count},  prob: {zebra_prob}")

'speaker' - count: 10, prob: 0.0000355
'zebra' - count: 0,  prob: 0.0


### b) Language modeling
-  *Implement a bigram language model as described in the lecture, and use it to compute the probability of a short sentence.*
-  *What happens if you try to compute the probability of a sentence that contains a word that did not appear in the training texts? And what happens if your sentence is very long (e.g. 100 words or more)? Optionally, change your code so that it can handle these challenges.*

If we have not seen a pair of words that pair will have a probability of 0. Since we are multiplying the probabilities the full sentence in that case will get a probability of 0 as well. The same is true if there is an unseen word in the sentence.

For long sequences the probability goes to zero.
Therefore, in order to prevent underflow, we add together the log probabilities instead of multiplying the probabilities.

We handle unseen words by using defaultdicts that return a tiny positive probability (`MIN_PROB`, the smallest positive float) instead of 0 for unseen bigrams, so that the logarithm stays finite.


In [None]:
# Bigram language model
class Bigram:
  """Maximum-likelihood bigram model estimated from a tokenised corpus.

  Attributes:
    unseen_prob: probability assigned to any bigram that was never observed.
    bi_probs: nested mapping where ``bi_probs[w1][w2]`` is the relative
      frequency of ``w2`` among the words observed directly after ``w1``.
      Both levels are defaultdicts, so *indexing* an unseen pair returns
      ``unseen_prob`` and also stores that entry in the mapping.
  """

  def __init__(self, corpus: "Corpus", unseen_prob: float = None):
    """Estimate the bigram probabilities from ``corpus``.

    Args:
      corpus: object whose ``documents`` attribute is a list of token lists.
      unseen_prob: probability used for unseen bigrams; when omitted the
        notebook-level ``MIN_PROB`` is used.
    """
    self.unseen_prob = MIN_PROB if unseen_prob is None else unseen_prob
    self.bi_probs = self.__calculate_probs(corpus)

  def __calculate_probs(self, corpus: "Corpus"):
    """Calculates probabilities of bigrams."""
    # Count occurrences of words coming after a word
    bigram_counter = defaultdict(Counter)
    for words in corpus.documents:
      for w1, w2 in zip(words, words[1:]):
        bigram_counter[w1][w2] += 1

    # Convert counts to probabilities; unseen pairs default to unseen_prob
    unseen_prob = self.unseen_prob
    bigram_probs = defaultdict(lambda: defaultdict(lambda: unseen_prob))
    for w1, counter in bigram_counter.items():
      w1_count = sum(counter.values())
      for w2, w2_count in counter.items():
        bigram_probs[w1][w2] = w2_count/w1_count

    return bigram_probs

  def language_modeling(self, words: list):
    """Calculate log probability of given sentence (list of words).

    Sums the log probability of every adjacent word pair; a sentence with
    fewer than two words therefore scores 0.0. Unseen pairs contribute
    ``log(unseen_prob)``.
    """
    log_probs = 0.0
    for w1, w2 in zip(words, words[1:]):
      # Use .get() rather than indexing: indexing a defaultdict would insert
      # every unseen bigram into the model as a side effect of scoring.
      followers = self.bi_probs.get(w1)
      if followers is None:
        prob = self.unseen_prob
      else:
        prob = followers.get(w2, self.unseen_prob)
      log_probs += np.log(prob)
    return log_probs

# Fit the model on the English corpus and inspect a few entries
bigram = Bigram(corp_en)

# Everything that was observed directly after 'speaker'
speaker_followers = bigram.bi_probs["speaker"]
print(speaker_followers.items())
print(speaker_followers["said"])

# Unseen second word, then unseen first word: both fall back to the default
print(speaker_followers["nonWord"])
print(bigram.bi_probs["nonWord"]["speaker"])

dict_items([('in', 0.1), ('has', 0.1), (',', 0.2), ('already', 0.1), ('off', 0.3), ('had', 0.1), ('said', 0.1)])
0.1
2.2250738585072014e-308
2.2250738585072014e-308


In [None]:
# Score a three-word sentence and the same phrase repeated 50 times (150 words)
sentence_short = "i would like".split()
sentence_long = sentence_short * 50

log_prob_short = bigram.language_modeling(sentence_short)
log_prob_long = bigram.language_modeling(sentence_long)

# Convert both scores back from log space to plain probabilities
prob_short, prob_long = np.exp(log_prob_short), np.exp(log_prob_long)

print(f"Short sentence log prob: {log_prob_short}, prob: {prob_short}\n")
print(f"Long sentence log prob: {log_prob_long}, prob: {prob_long}\n")


Short sentence log prob: -2.956367550031714, prob: 0.052007489078427296

Long sentence log prob: -34859.24288558249, prob: 0.0



### c) Translation modeling

Let's denote the target language we wish to translate to as $t$ and the source language we wish to translate from as $s$.

-  *If our goal is to translate from some language (s) into English (t), why does our conditional probability seem to be written backwards? Why don't we estimate $p(t|s)$ instead?*

From Bayes rule we know that 
\begin{equation}
p(t|s) \propto_t p(s|t)p(t)
\end{equation}
so our objective can be written as
\begin{equation}
t^* = \arg \max_{t \in T}  p(t|s) = \arg \max_{t \in T} p(s|t)p(t)
\end{equation}

which is suitable since we can use a language model $p(t)$ trained on arbitrary target-language text to account for grammar and fluency, while $p(s|t)$ handles the translation probabilities.


In [None]:
from numpy.lib.function_base import kaiser
class TranslationModel:
  """Word translation table learned with EM from two sentence-paired corpora.

  Sentences are paired by position (``src_sentences[i]`` with
  ``trg_sentences[i]``). After training, ``trans_probs[s][t]`` scores the
  pairing of source word ``s`` with target word ``t``; the pseudo-word
  "NULL" is prepended to every target sentence.

  NOTE(review): the re-estimation step divides by the soft count of the
  *source* word, so for a fixed ``s`` the scores sum to roughly one over
  ``t``. The notebook text calls this table t(s|t) -- confirm that the
  conditioning direction is the intended one.
  """

  def __init__(self, src_corpus: "Corpus", trg_corpus: "Corpus", min_prob: float = None):
    """Store the sentences and vocabularies of both corpora.

    Args:
      src_corpus: source-language corpus (``documents`` and ``get_vocab()``).
      trg_corpus: target-language corpus (``documents`` and ``get_vocab()``).
      min_prob: small positive floor used to seed soft counts and
        normalisers so divisions never hit zero; when omitted the
        notebook-level ``MIN_PROB`` is used.
    """
    self.src_sentences = src_corpus.documents
    self.src_vocab = src_corpus.get_vocab()
    self.trg_sentences = trg_corpus.documents
    self.trg_vocab = trg_corpus.get_vocab()
    self.min_prob = MIN_PROB if min_prob is None else min_prob


  def calculate_translation_probs(self, trg_word_test=None, n_iters=5):
    """Calculates translation probs with the EM algorithm.

    Args:
      trg_word_test: optional target word; when given, its 10 highest
        scoring source words are printed after every iteration.
      n_iters: number of EM iterations to run.
    """
    # Nested dicts indexed as trans_probs[s][t], uniformly initialised.
    # An empty source vocabulary falls back to the floor instead of
    # dividing by zero.
    n_src_words = len(self.src_vocab)
    initial_prob = 1/n_src_words if n_src_words else self.min_prob
    default_prob = max(initial_prob, self.min_prob)
    self.default_prob = default_prob
    self.trans_probs = defaultdict(lambda: defaultdict(lambda: default_prob))

    for i in range(1, n_iters+1):
      print(f"[{i}/{n_iters}]")
      self.__em_iteration(trg_word_test)


  def __em_iteration(self, trg_word_test):
    """Performs one iteration of the EM algorithm and updates 'trans_probs'."""
    floor = self.min_prob

    # Reset counts
    t_softcount = defaultdict(lambda: defaultdict(lambda: floor))
    s_softcount = defaultdict(lambda: floor)

    for src_sentence, trg_sentence in zip(self.src_sentences, self.trg_sentences): # For each sentence pair
      for t in ["NULL"] + trg_sentence: # For each target token, NULL word included
        # The normaliser is computed per target *token*. Accumulating it per
        # word type would inflate it whenever a word is repeated in the
        # sentence and under-weight that word's alignments.
        scores = [self.trans_probs[s][t] for s in src_sentence]
        total = sum(scores, floor)

        for s, score in zip(src_sentence, scores): # For each source language word
          # Compute alignment prob
          align_prob = score/total

          # Update pseudocount
          t_softcount[s][t] += align_prob
          s_softcount[s] += align_prob

    # Re-estimate probabilities
    for s, s_dict in t_softcount.items():
      s_total = s_softcount[s]
      row = self.trans_probs[s]
      for t, count in s_dict.items():
        row[t] = count/s_total

    # Print 10 most likely word for a target word
    if trg_word_test:
      self.print_10_most_likely_src_words(trg_word=trg_word_test)
      print("-"*30 + "\n")


  def __pair_prob(self, src_word, trg_word):
    """Read trans_probs[src_word][trg_word] without inserting missing entries."""
    row = self.trans_probs.get(src_word)
    if row is None:
      return self.default_prob
    return row.get(trg_word, self.default_prob)


  def get_top_k_likely_trg_words(self, src_word, k=10) -> list:
    """Get the top-k most likely target words given a source word.

    Returns a list of ``(target_word, score)`` tuples, best first; the list
    is empty when ``src_word`` has no entries in the table.
    """
    # .get() keeps an unknown source word from being added to the table
    row = self.trans_probs.get(src_word, {})
    trg_probs = sorted(row.items(), key=lambda item: item[1])
    trg_probs.reverse()
    return trg_probs[:k]


  def get_top_k_likely_src_words(self, trg_word, k=10):
    """Get the top-k most likely source words given a target word.

    Returns a list of ``(source_word, score)`` tuples, best first. Pairs
    that have no entry in the table are scored with the initial default
    value, and looking them up leaves the table unchanged.
    """
    src_probs = {s: self.__pair_prob(s, trg_word) for s in self.src_vocab}
    src_probs_sorted = sorted(src_probs.items(), key=lambda item: item[1])
    src_probs_sorted.reverse()
    return src_probs_sorted[:k]


  def print_10_most_likely_src_words(self, trg_word):
    """Finds and prints the 10 most likely source words given a target word."""
    print(f"Most likely words for '{trg_word}'")
    top_10_src = self.get_top_k_likely_src_words(trg_word,k=10)
    for word, prob in top_10_src:
      print(f"'{word}' : {prob:.3f}")
    

In [None]:
# Swedish corpus as source, English corpus as target; after every EM iteration
# the ten best source words for 'european' are printed
trans_model = TranslationModel(src_corpus=corp_sv, trg_corpus=corp_en)
trans_model.calculate_translation_probs("european")

[1/5]
Most likely words for 'european'
'lugnas' : 0.125
'flygsäkerheten' : 0.100
'sovjetunionen' : 0.100
'hörd' : 0.100
'skadat' : 0.091
'enhetsakten' : 0.091
'kärnpunkt' : 0.091
'attraktivt' : 0.083
'skikt' : 0.077
'europagrupp' : 0.077
------------------------------

[2/5]
Most likely words for 'european'
'europeisk' : 0.225
'europeiska' : 0.211
'europaparlamentet' : 0.175
'lugnas' : 0.151
'europeiskt' : 0.118
'europaparlamentets' : 0.115
'sovjetunionen' : 0.114
'hörd' : 0.107
'csu' : 0.105
'unionen' : 0.100
------------------------------

[3/5]
Most likely words for 'european'
'europeisk' : 0.526
'europeiska' : 0.473
'europaparlamentet' : 0.326
'europeiskt' : 0.311
'europaparlamentets' : 0.244
'lugnas' : 0.205
'valdeltagandet' : 0.147
'unionen' : 0.135
'csu' : 0.127
'sa' : 0.120
------------------------------

[4/5]
Most likely words for 'european'
'europeisk' : 0.712
'europeiska' : 0.655
'europeiskt' : 0.504
'europaparlamentet' : 0.409
'europaparlamentets' : 0.334
'lugnas' : 0.248


### Decoding

Given a source-language sentence S (foreign language), find target sentence T (english) that has the highest probability according to our model.

\begin{equation}
T^* =  \arg \max_{T \in \mathbf{T}} p(S|T)p(T)
\end{equation}

The problem here is that we would have to iterate over all possible target sentences $T$ and choose the one which maximizes this objective, which is a combinatorial problem.



Steps:

1) Generate $N$ likely example translations $T_i$.  

2) Calculate probability of the example translations with language model $P(T_i)$

3) Pick highest probability sentence $P(S|T_i)P(T_i)$

Hard part is step 1.

Assumptions:
- The sentences can only include words already in the corpora
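- Markov property: the next word depends only on the current word. In addition, each source word $s_i$ is assumed to depend only on the target word $t_i$ at the same position. Then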
\begin{equation}
 P(S|T) \approx P(s_1|t_1) \cdot P(s_2|t_2) \cdot ... \cdot P(s_n|t_n)
\end{equation}
- $P(T)$ can be calculated using bigram modeling (Markov and independence?)

In [None]:
# Swedish sentence to decode; tokens are separated by single spaces
source_sentence = (
    "den socialistiska gruppen har begärt ett uttalande från kommissionen "
    "om dess strategiska mål för de fem kommande åren ."
)

In [None]:
# Generate an example translation T and score it with
# log P(S|T) + log P(T), which is proportional to (not equal to) P(T|S)

#1) Translate word by word, keeping the single best target word each time
log_p_st = 0
translation=[]
for src_word in source_sentence.split():
  # The assertion message has to be a string; print(...) evaluates to None,
  # which leaves the AssertionError itself without any text.
  assert src_word in trans_model.src_vocab, f"Error: '{src_word}' out of vocabulary"
  # Find most likely translated word for this source word
  top_k = trans_model.get_top_k_likely_trg_words(src_word, k=1)
  t, prob = top_k[0]
  log_p_st += np.log(prob)
  translation.append(t)

#2) Calculate P(T) 
log_p_t = bigram.language_modeling(translation)

#3) Total prob of translated sentence
log_p = log_p_st + log_p_t

# Get translated string
translated_sentence = " ".join(translation)

print(f"Translated\n '{source_sentence}'\n into \n '{translated_sentence}'")
print(f"Total log probability: {log_p}")

Translated
 'den socialistiska gruppen har begärt ett uttalande från kommissionen om dess strategiska mål för de fem kommande åren .'
 into 
 'the socialist group have requested a statement from commission on its strategic objective for the five next years .'
Total log probability: -2206.352595402137
