# Analysis of the Europarl Dataset

In [None]:
import nltk
import csv
import random
import numpy as np
import time
from google import genai
import Levenshtein
from random_word import RandomWords
from collections import Counter
from nltk.corpus import wordnet
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
import pandas
# Fetch NLTK's "punkt" resource at import time; presumably this is the
# tokenizer model that word_tokenize relies on in the cells below.
# NOTE(review): newer NLTK releases may look up "punkt_tab" instead — confirm
# against the installed NLTK version.
nltk.download("punkt")

In [None]:
# Load the dataset
def load_data(danish_file, english_file):
    """Read two UTF-8 text files and return their lines, whitespace-stripped.

    Both files are opened together, so a failure to open either one raises
    before any content is read.

    Args:
        danish_file: Path to the Danish text file.
        english_file: Path to the English text file.

    Returns:
        A ``(danish_sentences, english_sentences)`` tuple. Each element is a
        list with one string per line of the corresponding file, with leading
        and trailing whitespace (including the newline) removed. Blank lines
        are kept as empty strings.
    """
    with open(danish_file, "r", encoding="utf-8") as da_handle, \
            open(english_file, "r", encoding="utf-8") as en_handle:
        # Iterating a text file yields its lines one at a time.
        danish_lines = list(map(str.strip, da_handle))
        english_lines = list(map(str.strip, en_handle))
    return danish_lines, english_lines

# Read the two corpus files into lists of whitespace-stripped lines.
danish_sentences, english_sentences = load_data("../da.da", "../en.en")

# Tokenization
# word_tokenize first sentence-splits its input with the Punkt model selected
# by `language`, which defaults to "english". Pass the language explicitly so
# the Danish text is split with the Danish model rather than the English one.
danish_tokens = [word_tokenize(sent.lower(), language="danish") for sent in danish_sentences]
english_tokens = [word_tokenize(sent.lower(), language="english") for sent in english_sentences]

# Flatten lists for word statistics
danish_words = [word for sent in danish_tokens for word in sent]
english_words = [word for sent in english_tokens for word in sent]

In [None]:
# Corpus size: number of lines read and number of tokens, per language.
num_sentences_da = len(danish_sentences)
num_sentences_en = len(english_sentences)

num_words_da = len(danish_words)
num_words_en = len(english_words)

# Vocabulary size: number of distinct tokens.
unique_words_da = len(set(danish_words))
unique_words_en = len(set(english_words))

# Type-token ratio (distinct tokens / total tokens), expressed as a percentage.
ttr_da = unique_words_da / num_words_da * 100
ttr_en = unique_words_en / num_words_en * 100

# Tokens per sentence. Computed once per language and reused by every length
# statistic below, instead of rebuilding the same list for each NumPy call.
sentence_lengths_da = np.array([len(sent) for sent in danish_tokens])
sentence_lengths_en = np.array([len(sent) for sent in english_tokens])

avg_length_da = np.mean(sentence_lengths_da)
avg_length_en = np.mean(sentence_lengths_en)

std_length_da = np.std(sentence_lengths_da)
std_length_en = np.std(sentence_lengths_en)

min_length_da = np.min(sentence_lengths_da)
min_length_en = np.min(sentence_lengths_en)

max_length_da = np.max(sentence_lengths_da)
max_length_en = np.max(sentence_lengths_en)

# Ratio of the average Danish sentence length to the average English one.
sentence_length_ratio = avg_length_da / avg_length_en

print(f"Total Sentences: Danish = {num_sentences_da}, English = {num_sentences_en}")
print(f"Total Words: Danish = {num_words_da}, English = {num_words_en}")
print(f"Unique Words: Danish = {unique_words_da}, English = {unique_words_en}")
print(f"Type-Token Ratio: Danish = {ttr_da:.2f}%, English = {ttr_en:.2f}%")
print(f"Avg. Sentence Length: Danish = {avg_length_da:.2f}, English = {avg_length_en:.2f}")
print(f"Std Dev Sentence Length: Danish = {std_length_da:.2f}, English = {std_length_en:.2f}")
print(f"Sentence Length Ratio (DA/EN): {sentence_length_ratio:.2f}")
print(f"Min sentence length (DA/EN): Danish = {min_length_da}, English = {min_length_en}")
print(f"Max sentence length (DA/EN): Danish = {max_length_da}, English = {max_length_en}")