# Language Generation Evaluation

### Load dependencies (see the README first)

In [None]:
import nltk
import csv
import random
import numpy as np
import time
import import_ipynb
import tampering_strategies as ts
from google import genai
import Levenshtein
from random_word import RandomWords
from collections import Counter
from nltk.corpus import wordnet
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
import pandas
from rouge import Rouge
from nltk.translate.bleu_score import sentence_bleu
from nltk import word_tokenize
from nltk.translate import meteor
nltk.download("punkt")
nltk.download('wordnet')
nltk.download('punkt_tab')

# Create dataframes

In [None]:
# name = "English"

# with open("../en.en", "r", encoding="utf-8") as f:
#     for i in range(200):
#         lines = [next(f).strip() for _ in range(10000)]
#         df = pandas.DataFrame({"Text": lines})
#         output_file = f"dataframes/en/{name}{i+1}.csv"
#         df.to_csv(output_file, index=False, encoding="utf-8", quoting=csv.QUOTE_ALL)

# name = "Danish"

# with open("../da.da", "r", encoding="utf-8") as f:
#     for i in range(200):
#         lines = [next(f).strip() for _ in range(10000)]
#         df = pandas.DataFrame({"Text": lines})
#         output_file = f"dataframes/da/{name}{i+1}.csv"
#         df.to_csv(output_file, index=False, encoding="utf-8", quoting=csv.QUOTE_ALL)


# danishData = pandas.read_fwf("test.txt", header=None, names=["test_sentences"])
# englishData = pandas.read_fwf("test.txt", header=None, names=["test_sentences"])

### Define functions

In [None]:
# Shared ROUGE scorer; calculate_rouge calls its get_scores method.
rouge = Rouge()
# RandomWords instance. NOTE(review): not referenced by any other cell visible
# in this notebook — presumably a leftover; confirm before removing.
r = RandomWords()

def calculate_rouge(candidate, reference):
    """Score ``candidate`` against ``reference`` with the shared ROUGE scorer.

    Args:
        candidate: Sentence forwarded as the first argument of
            ``rouge.get_scores``.
        reference: Sentence forwarded as the second argument of
            ``rouge.get_scores``.

    Returns:
        Whatever ``rouge.get_scores`` returns, unmodified. Elsewhere in this
        notebook it is indexed as ``scores[0]["rouge-1"]["r"]``.
    """
    return rouge.get_scores(candidate, reference)

def calculate_bleu(candidate, reference):
    """Return the sentence-level BLEU score of ``candidate`` against ``reference``.

    Both strings are split with ``word_tokenize``. The reference tokens are
    wrapped in a one-element list because ``sentence_bleu`` takes a list of
    reference token lists as its first argument.

    Args:
        candidate: Sentence being scored.
        reference: Sentence it is compared against.

    Returns:
        The value returned by ``sentence_bleu``.
    """
    hypothesis_tokens = word_tokenize(candidate)
    reference_token_lists = [word_tokenize(reference)]
    return sentence_bleu(reference_token_lists, hypothesis_tokens)

def calculate_meteor(candidate, reference):
    """Return the METEOR score for a sentence pair, rounded to four decimals.

    Both strings are split with ``word_tokenize`` before scoring.

    NOTE(review): the tokens of ``candidate`` are passed to ``meteor`` inside
    a one-element list as its first argument, and the tokens of ``reference``
    as its second. nltk documents that function as
    ``meteor_score(references, hypothesis)``, so the roles are reversed
    relative to the parameter names used here. Check every call site before
    changing the order.

    Args:
        candidate: Sentence whose tokens become the first ``meteor`` argument.
        reference: Sentence whose tokens become the second ``meteor`` argument.

    Returns:
        The score rounded with ``round(..., 4)``.
    """
    first_tokens = word_tokenize(candidate)
    second_tokens = word_tokenize(reference)
    return round(meteor([first_tokens], second_tokens), 4)

### Perform Rouge test

In [None]:
# Two sample sentences that differ only in word order ("are you" vs
# "you are"). The BLEU and METEOR cells reuse sen1 and sen2.
sen1 = "Hello, what are you doing?"
sen2 = "Hello, what you are doing?"
# Score sen1 (passed as candidate) against sen2 (passed as reference).
res = calculate_rouge(sen1, sen2)
print(res)

### Perform BLEU test

In [None]:
# BLEU for the sample pair defined in the Rouge cell (sen1 as candidate).
print(calculate_bleu(sen1, sen2))

### Perform METEOR test

In [None]:
# METEOR for the sample pair defined in the Rouge cell.
print(calculate_meteor(sen1, sen2))

# MAKE/RESET OUTPUT FILE

In [None]:
def resetTests():
    """Create or truncate ``output.csv`` so it holds only the header row.

    The header names the sentence columns, the tampering type, the LLM score
    and the BLEU / METEOR / ROUGE metric columns that the test run fills in.
    """
    header = [
        "originalSentence", "danishSentence", "tamperingType",
        "tamperedSentence", "LLMScore", "BLEU", "METEOR",
        "Rouge1 r", "Rouge1 p", "Rouge1 f",
        "Rouge2 r", "Rouge2 p", "Rouge2 f",
        "Rougel r", "Rougel p", "Rougel f",
    ]

    # Mode "w" discards any previous results before the header is written.
    with open("output.csv", mode="w", newline="") as handle:
        csv.writer(handle).writerow(header)

# Start from a fresh output.csv that contains only the header row.
resetTests()

# RUN TESTS

In [None]:
def Test(enFile, daFile, llm):
    """Tamper with every English sentence, score the result and log it.

    For each row of ``enFile`` every tampering strategy from
    ``tampering_strategies`` is applied. Each successful tampering is scored
    with BLEU, METEOR and ROUGE and appended as one row to ``output.csv``.
    When ``llm`` is truthy, the finished file is re-read and the ``LLMScore``
    column is filled with Gemini's similarity judgement.

    Args:
        enFile: Path to a CSV with a ``Text`` column of English sentences.
        daFile: Path to a CSV with a ``Text`` column holding the Danish
            counterpart of each English row, paired by row position.
        llm: If truthy, query the LLM for every logged row afterwards.

    Raises:
        RuntimeError: ``llm`` is truthy but neither ``GEMINI_API_KEY`` nor
            ``GOOGLE_API_KEY`` is set in the environment.
        IndexError: ``daFile`` has fewer rows than ``enFile``.
    """
    import os  # local import: only needed to look up the API key

    outputName = "output.csv"
    columns = [
        "originalSentence", "danishSentence", "tamperingType",
        "tamperedSentence", "LLMScore", "BLEU", "METEOR",
        "Rouge1 r", "Rouge1 p", "Rouge1 f",
        "Rouge2 r", "Rouge2 p", "Rouge2 f",
        "Rougel r", "Rougel p", "Rougel f",
    ]

    # The credential comes from the environment so that no secret is stored
    # in the notebook. It is checked before the long tampering run so that a
    # missing key fails fast.
    apiKey = None
    if llm:
        apiKey = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
        if not apiKey:
            raise RuntimeError(
                "Set the GEMINI_API_KEY (or GOOGLE_API_KEY) environment "
                "variable to enable LLM scoring."
            )

    def runTest(sen, tamp, tampType, da, writer):
        """Apply one tampering strategy to ``sen`` and log its metric row.

        ``tamp(sen)`` is indexed as ``[0]`` (success flag) and ``[1]``
        (tampered sentence). Nothing is written when the flag is falsy.
        """
        tampResult = tamp(sen)
        if not tampResult[0]:
            return
        tampered = tampResult[1]

        # The tampered sentence is the candidate and the original sentence is
        # the reference.
        rougeScores = calculate_rouge(tampered, sen)[0]
        row = {
            "originalSentence": sen,
            "danishSentence": da,
            "tamperingType": tampType,
            "tamperedSentence": tampered,
            "LLMScore": -1,  # placeholder until addLLMScore fills it in
            "BLEU": calculate_bleu(tampered, sen),
            # calculate_meteor is called with (original, tampered) because
            # that helper forwards its first argument to nltk's meteor as the
            # reference list and its second as the hypothesis.
            "METEOR": calculate_meteor(sen, tampered),
        }
        for label, key in (("Rouge1", "rouge-1"), ("Rouge2", "rouge-2"), ("Rougel", "rouge-l")):
            for part in ("r", "p", "f"):
                row[f"{label} {part}"] = rougeScores[key][part]
        writer.writerow(row)

    def testAllTamps(sen, da, writer):
        """Run every tampering strategy once on ``sen``."""
        strategies = (
            (ts.replaceWithRandomWord, "Replace Token with Random Word"),
            (ts.swapWords, "Swap words"),
            (ts.removeToken, "Remove Token"),
            (ts.addRandomWord, "Add random word"),
            (ts.duplicateToken, "Duplicate Token"),
            (ts.replaceCharacter, "Replace Character"),
            (ts.duplicateCharacter, "Duplicate character"),
            (ts.removeCharacter, "Remove Character"),
            (ts.swapCharacters, "Swap characters"),
            (ts.negation, "Negation"),
        )
        for tamp, tampType in strategies:
            runTest(sen, tamp, tampType, da, writer)

    def testDataSets(en, da):
        """Pair English and Danish rows by position and test each pair."""
        enSens = pandas.read_csv(en, encoding='utf-8')
        daSens = pandas.read_csv(da, encoding='utf-8')
        if len(daSens) < len(enSens):
            # Fail before writing anything instead of part-way through the run.
            raise IndexError(
                f"{da} has {len(daSens)} rows but {en} has {len(enSens)}; "
                "every English sentence needs a Danish counterpart."
            )
        # Open the output once for the whole run instead of once per row.
        with open(outputName, mode="a", newline="", encoding='utf-8') as file:
            writer = csv.DictWriter(file, fieldnames=columns)
            for original, transRef in zip(enSens["Text"], daSens["Text"]):
                testAllTamps(original, transRef, writer)

    def makePrompt(dan, en, tamp):
        """Build the similarity-rating prompt sent to the LLM."""
        return f"""Here are two sentences perfectly translated from Danish into English:

        Sentence 1: {dan}
        Sentence 2: {en}

    Now I will give you a third sentence, which is another translation in English. Compare the similarity between sentence 2 and 3 on a scale from 0 (totally different) to 1 (totally identical). Return ONLY that number.

        Sentence 3: {tamp}"""

    def addLLMScore(res):
        """Fill the ``LLMScore`` column of the CSV at ``res`` in place."""
        client = genai.Client(api_key=apiKey)
        data = pandas.read_csv(res, encoding='utf-8')
        # The column is read back as integers (-1 placeholders). Cast it so
        # the textual replies can be stored without a dtype conflict.
        data["LLMScore"] = data["LLMScore"].astype(object)
        for index, row in data.iterrows():
            response = client.models.generate_content(
                model="gemini-2.0-flash",
                contents=makePrompt(
                    row["danishSentence"],
                    row["originalSentence"],
                    row["tamperedSentence"],
                ),
            )
            llmRes = response.text
            # Keep the -1 placeholder when the response carries no text.
            if llmRes is not None:
                data.at[index, "LLMScore"] = llmRes.replace("\n", "").strip()
            # Pause between requests — presumably to stay under the API rate limit.
            time.sleep(4)
        # Write back to the file that was read, not to a hard-coded name.
        data.to_csv(res, index=False, encoding='utf-8')

    testDataSets(enFile, daFile)
    if llm:
        addLLMScore(outputName)

# Alternative run on the small test files with LLM scoring enabled:
#Test("en_test_data.csv", "da_test_data.csv", True)
# Run on the first English/Danish dataframe pair with LLM scoring disabled.
Test("dataframes/en/English1.csv", "dataframes/da/Danish1.csv", False)

### Run tests for negation strategy

In [None]:
# Test("English1_negation.csv", "dataframes/da/Danish1.csv", False)

### ANALYZE THE RESULTS

In [None]:
# Load the rows logged by the test run for analysis.
file_path = "output.csv"
df = pandas.read_csv(file_path, encoding='utf-8')

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would add a stray "None" line to the output.
df.info()

In [None]:
# Preview the first rows of the results table.
print(df.head())

In [None]:
# Metric columns summarised below (LLMScore is deliberately not included).
columns_of_interest = [
    'BLEU', 'METEOR',
    'Rouge1 r', 'Rouge1 p', 'Rouge1 f',
    'Rouge2 r', 'Rouge2 p', 'Rouge2 f',
    'Rougel r', 'Rougel p', 'Rougel f',
]

# Per-strategy statistics: group once, then derive each aggregate from it.
metrics_per_strategy = df.groupby("tamperingType")[columns_of_interest]
mean_by_strategy = metrics_per_strategy.mean()
median_by_strategy = metrics_per_strategy.median()
std_by_strategy = metrics_per_strategy.std()

print("Mean by Tampering Strategy:\n", mean_by_strategy)
print("\nMedian by Tampering Strategy:\n", median_by_strategy)
print("\nStandard Deviation by Tampering Strategy:\n", std_by_strategy)

# Overall statistics across all tampering strategies.
all_metrics = df[columns_of_interest]
mean_values = all_metrics.mean()
median_values = all_metrics.median()
std_values = all_metrics.std()

print("\nOverall Mean:\n", mean_values)
print("\nOverall Median:\n", median_values)
print("\nOverall Standard Deviation:\n", std_values)