# Language Generation Evaluation

### Analysis of the Europarl Dataset

In [17]:
import nltk
import csv
import numpy as np
import Levenshtein
from collections import Counter
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
import pandas
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\oskar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [18]:
# Load the dataset
def load_data(danish_file, english_file):
    with open(danish_file, "r", encoding="utf-8") as f_da, open(english_file, "r", encoding="utf-8") as f_en:
        danish_sentences = [line.strip() for line in f_da.readlines()]
        english_sentences = [line.strip() for line in f_en.readlines()]
    return danish_sentences, english_sentences

danish_sentences, english_sentences = load_data("../da.da", "../en.en")

# Tokenization
danish_tokens = [word_tokenize(sent.lower()) for sent in danish_sentences]
english_tokens = [word_tokenize(sent.lower()) for sent in english_sentences]

# Flatten lists for word statistics
danish_words = [word for sent in danish_tokens for word in sent]
english_words = [word for sent in english_tokens for word in sent]



In [19]:
num_sentences_da = len(danish_sentences)
num_sentences_en = len(english_sentences)

num_words_da = len(danish_words)
num_words_en = len(english_words)

unique_words_da = len(set(danish_words))
unique_words_en = len(set(english_words))

ttr_da = unique_words_da / num_words_da * 100
ttr_en = unique_words_en / num_words_en * 100

avg_length_da = np.mean([len(sent) for sent in danish_tokens])
avg_length_en = np.mean([len(sent) for sent in english_tokens])

std_length_da = np.std([len(sent) for sent in danish_tokens])
std_length_en = np.std([len(sent) for sent in english_tokens])

min_length_da = np.min([len(sent) for sent in danish_tokens])
min_length_en = np.min([len(sent) for sent in english_tokens])

max_length_da = np.max([len(sent) for sent in danish_tokens])
max_length_en = np.max([len(sent) for sent in english_tokens])

sentence_length_ratio = avg_length_da / avg_length_en 

print(f"Total Sentences: Danish = {num_sentences_da}, English = {num_sentences_en}")
print(f"Total Words: Danish = {num_words_da}, English = {num_words_en}")
print(f"Unique Words: Danish = {unique_words_da}, English = {unique_words_en}")
print(f"Type-Token Ratio: Danish = {ttr_da:.2f}%, English = {ttr_en:.2f}%")
print(f"Avg. Sentence Length: Danish = {avg_length_da:.2f}, English = {avg_length_en:.2f}")
print(f"Std Dev Sentence Length: Danish = {std_length_da:.2f}, English = {std_length_en:.2f}")
print(f"Sentence Length Ratio (DA/EN): {sentence_length_ratio:.2f}")
print(f"Min sentence length (DA/EN): Danish = {min_length_da}, English = {min_length_en}")
print(f"Max sentence length (DA/EN): Danish = {max_length_da}, English = {max_length_en}")

Total Sentences: Danish = 1991647, English = 1991647
Total Words: Danish = 52013315, English = 55093067
Unique Words: Danish = 370508, English = 136004
Type-Token Ratio: Danish = 0.71%, English = 0.25%
Avg. Sentence Length: Danish = 26.12, English = 27.66
Std Dev Sentence Length: Danish = 15.67, English = 16.42
Sentence Length Ratio (DA/EN): 0.94
Min sentence length (DA/EN): Danish = 1, English = 1
Max sentence length (DA/EN): Danish = 494, English = 776


# Create dataframes

In [20]:
# name = "English"

# with open("../en.en", "r", encoding="utf-8") as f:
#     for i in range(200):
#         lines = [next(f).strip() for _ in range(10000)]
#         df = pandas.DataFrame({"Text": lines})
#         output_file = f"dataframes/en/{name}{i+1}.csv"
#         df.to_csv(output_file, index=False, encoding="utf-8", quoting=csv.QUOTE_ALL)

# name = "Danish"

# with open("../da.da", "r", encoding="utf-8") as f:
#     for i in range(200):
#         lines = [next(f).strip() for _ in range(10000)]
#         df = pandas.DataFrame({"Text": lines})
#         output_file = f"dataframes/da/{name}{i+1}.csv"
#         df.to_csv(output_file, index=False, encoding="utf-8", quoting=csv.QUOTE_ALL)


# danishData = pandas.read_fwf("test.txt", header=None, names=["test_sentences"])
# englishData = pandas.read_fwf("test.txt", header=None, names=["test_sentences"])

### Load dependencies (First, look at README)

In [21]:
from rouge import Rouge
from nltk.translate.bleu_score import sentence_bleu
from nltk import word_tokenize
from nltk.translate import meteor
import nltk
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\oskar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\oskar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

### Define functions

In [22]:
rouge = Rouge()

def calculate_rouge(candidate, reference):
    scores = rouge.get_scores(candidate, reference)
    return scores

def calculate_bleu(candidate, reference):
    reference_p = [word_tokenize(reference)]
    candidate_p = word_tokenize(candidate)
    score = sentence_bleu(reference_p, candidate_p)
    return score

def calculate_meteor(candidate, reference):
  reference = word_tokenize(reference)
  candidate = word_tokenize(candidate)
  meteor_score = round(meteor([candidate],reference), 4)
  return meteor_score

### Load translation input

In [23]:
# Load the professional translation of sentence 1-27
with open("europarl-v7.da-en.da", "r", encoding="utf-8") as file:
    proTrans = "".join([next(file) for _ in range(27)])

# Load the ChatGPT translation of sentence 1-27
with open("ChatgptEnglishToDanish.txt", "r", encoding="utf-8") as file:
    chatGptTrans = "".join([next(file) for _ in range(27)])

FileNotFoundError: [Errno 2] No such file or directory: 'europarl-v7.da-en.da'

### Perform Rouge test

In [None]:
print(calculate_rouge(chatGptTrans, proTrans))

### Perform BLEU test

In [None]:
print(calculate_bleu(chatGptTrans, proTrans))

### Perform METEOR test

In [None]:
print(calculate_meteor(chatGptTrans, proTrans))