In [2]:
import math
from collections import defaultdict


In [3]:
def load_corpus(file_path, max_sentences=50000):
    """
    Loads sentences from a Europarl file
    max_sentences limits size for faster execution
    """
    sentences = []
    with open(file_path, encoding="utf-8") as f:
        for i, line in enumerate(f):
            if i >= max_sentences:
                break
            line = line.strip()
            if line:
                sentences.append(line)
    return sentences


In [4]:
def preprocess_sentence(sentence):
    sentence = sentence.lower().strip()
    tokens = sentence.split()
    return ['<s>'] + tokens + ['</s>']


In [5]:
def build_vocabulary(corpus):
    vocab = set()
    for sent in corpus:
        vocab.update(preprocess_sentence(sent))
    return vocab


In [6]:
def build_ngram_counts(corpus, n):
    ngram_counts = defaultdict(int)
    context_counts = defaultdict(int)

    for sentence in corpus:
        tokens = preprocess_sentence(sentence)
        for i in range(len(tokens) - n + 1):
            ngram = tuple(tokens[i:i+n])
            context = tuple(tokens[i:i+n-1])
            ngram_counts[ngram] += 1
            context_counts[context] += 1

    return ngram_counts, context_counts


In [7]:
def add_one_prob(ngram, ngram_counts, context_counts, vocab_size):
    context = ngram[:-1]
    return (ngram_counts[ngram] + 1) / (context_counts[context] + vocab_size)


In [8]:
def sentence_probability(sentence, n, ngram_counts, context_counts, vocab_size):
    tokens = preprocess_sentence(sentence)
    prob = 1.0

    for i in range(len(tokens) - n + 1):
        ngram = tuple(tokens[i:i+n])
        prob *= add_one_prob(ngram, ngram_counts, context_counts, vocab_size)

    return prob


In [9]:
def perplexity(sentence, n, ngram_counts, context_counts, vocab_size):
    tokens = preprocess_sentence(sentence)
    log_prob = 0.0
    N = len(tokens)

    for i in range(len(tokens) - n + 1):
        ngram = tuple(tokens[i:i+n])
        p = add_one_prob(ngram, ngram_counts, context_counts, vocab_size)
        log_prob += math.log(p)

    return math.exp(-log_prob / N)


In [10]:
# CHANGE THESE PATHS IF NEEDED
swedish_file = "europarl-v6.sv-en.sv"
english_file = "europarl-v6.sv-en.en"

swedish_corpus = load_corpus(swedish_file)
english_corpus = load_corpus(english_file)

print("Total Swedish sentences:", len(swedish_corpus))


Total Swedish sentences: 49860


In [11]:
vocab = build_vocabulary(swedish_corpus)
V = len(vocab)

# Unigram
uni_counts, uni_context = build_ngram_counts(swedish_corpus, 1)

# Bigram
bi_counts, bi_context = build_ngram_counts(swedish_corpus, 2)

# Trigram
tri_counts, tri_context = build_ngram_counts(swedish_corpus, 3)

print("Vocabulary size:", V)


Vocabulary size: 69457


In [12]:
translations = {
    "T1": "detta är ett viktigt beslut",
    "T2": "detta är viktig ett beslut",
    "T3": "detta beslut är viktigt"
}

models = {
    "UNIGRAM": (1, uni_counts, uni_context),
    "BIGRAM": (2, bi_counts, bi_context),
    "TRIGRAM": (3, tri_counts, tri_context)
}

for model_name, (n, counts, context) in models.items():
    print(f"\n{model_name} MODEL RESULTS\n")

    for key, sent in translations.items():
        prob = sentence_probability(sent, n, counts, context, V)
        pp = perplexity(sent, n, counts, context, V)

        print(f"{key}")
        print("Sentence:", sent)
        print("Probability:", prob)
        print("Perplexity:", pp)
        print("-" * 40)



UNIGRAM MODEL RESULTS

T1
Sentence: detta är ett viktigt beslut
Probability: 1.9498497114898908e-16
Perplexity: 175.50340696413983
----------------------------------------
T2
Sentence: detta är viktig ett beslut
Probability: 9.894039377609593e-17
Perplexity: 193.36381015176033
----------------------------------------
T3
Sentence: detta beslut är viktigt
Probability: 2.4475446644876488e-14
Perplexity: 185.58586510379428
----------------------------------------

BIGRAM MODEL RESULTS

T1
Sentence: detta är ett viktigt beslut
Probability: 8.201568187875333e-19
Perplexity: 383.46789926125626
----------------------------------------
T2
Sentence: detta är viktig ett beslut
Probability: 3.1554171066748876e-21
Perplexity: 848.6061501021892
----------------------------------------
T3
Sentence: detta beslut är viktigt
Probability: 6.670289053861446e-18
Perplexity: 728.8573834547007
----------------------------------------

TRIGRAM MODEL RESULTS

T1
Sentence: detta är ett viktigt beslut
Probabili