# Language Generation Evaluation

### Load dependencies (read the README first)

In [None]:
import nltk
import csv
import random
import numpy as np
import time
import import_ipynb
import tampering_strategies as ts
import data_sampling as ds
from google import genai
import Levenshtein
from random_word import RandomWords
from collections import Counter
from nltk.corpus import wordnet
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
import pandas
from rouge import Rouge
from nltk.translate.bleu_score import sentence_bleu
from nltk import word_tokenize
from nltk.translate import meteor
nltk.download("punkt")
nltk.download('wordnet')
nltk.download('punkt_tab')

# Create dataframes

In [None]:
# name = "English"

# with open("../en.en", "r", encoding="utf-8") as f:
#     for i in range(200):
#         lines = [next(f).strip() for _ in range(10000)]
#         df = pandas.DataFrame({"Text": lines})
#         output_file = f"dataframes/en/{name}{i+1}.csv"
#         df.to_csv(output_file, index=False, encoding="utf-8", quoting=csv.QUOTE_ALL)

# name = "Danish"

# with open("../da.da", "r", encoding="utf-8") as f:
#     for i in range(200):
#         lines = [next(f).strip() for _ in range(10000)]
#         df = pandas.DataFrame({"Text": lines})
#         output_file = f"dataframes/da/{name}{i+1}.csv"
#         df.to_csv(output_file, index=False, encoding="utf-8", quoting=csv.QUOTE_ALL)


# danishData = pandas.read_fwf("test.txt", header=None, names=["test_sentences"])
# englishData = pandas.read_fwf("test.txt", header=None, names=["test_sentences"])

### Define functions

In [None]:
# Module-level ROUGE scorer; calculate_rouge below calls its get_scores method.
rouge = Rouge()
# RandomWords instance; nothing in the visible part of this notebook uses it.
r = RandomWords()

def calculate_rouge(candidate, reference):
    """Score ``candidate`` against ``reference`` with the module-level ROUGE scorer.

    Both arguments are forwarded, in this order, to ``rouge.get_scores`` and
    its result is returned unchanged.
    """
    return rouge.get_scores(candidate, reference)

def calculate_bleu(candidate, reference):
    """Return the sentence-level BLEU score of ``candidate`` against ``reference``.

    Both strings are split with ``word_tokenize``; the tokenised reference is
    wrapped in a list because ``sentence_bleu`` takes a list of references
    followed by a single hypothesis.
    """
    return sentence_bleu([word_tokenize(reference)], word_tokenize(candidate))

def calculate_meteor(candidate, reference):
    """Return the METEOR score of ``candidate`` against ``reference``.

    Both strings are split with ``word_tokenize`` and the score is rounded to
    four decimal places.

    Args:
        candidate: The sentence being evaluated (the hypothesis).
        reference: The sentence it is compared against.

    Returns:
        The METEOR score as a float rounded to 4 decimals.
    """
    reference_tokens = word_tokenize(reference)
    candidate_tokens = word_tokenize(candidate)
    # nltk's meteor takes (references, hypothesis): the list of tokenised
    # references comes first, then the single tokenised hypothesis. This is
    # the same orientation calculate_bleu uses for sentence_bleu.
    return round(meteor([reference_tokens], candidate_tokens), 4)

### Perform ROUGE test

In [None]:
# Two sample sentences that differ only in the order of "of" and "the".
sen1 = "resumption of the session is now"
sen2 = "resumption the of session is now"
# sen1 is passed as the candidate and sen2 as the reference.
res = calculate_rouge(sen1, sen2)
print(res)

In [None]:
# Inspect the type of the object returned by calculate_rouge.
print(type(res))

### Perform BLEU test

In [None]:
# BLEU for the same sentence pair (sen1 as candidate, sen2 as reference).
print(calculate_bleu(sen1, sen2))

### Perform METEOR test

In [None]:
# METEOR for the same sentence pair (sen1 as candidate, sen2 as reference).
print(calculate_meteor(sen1, sen2))

# MAKE/RESET OUTPUT FILE

In [None]:
def resetTests():
    """Create (or truncate) ``output.csv`` so that it holds only the header row.

    The file is written to the current working directory. Any existing
    content is discarded because the file is opened in ``"w"`` mode.
    """
    name = "output.csv"
    cols = [
        "originalSentence", "danishSentence", "tamperingType", "tamperedSentence",
        "LLMScore", "Confidence_Lower", "Confidence_Upper",
        "BLEU", "METEOR",
        "Rouge1 r", "Rouge1 p", "Rouge1 f",
        "Rouge2 r", "Rouge2 p", "Rouge2 f",
        "Rougel r", "Rougel p", "Rougel f",
    ]

    # Explicit utf-8 so the header uses the same encoding as the rows that
    # are later appended to and read back from this file.
    with open(name, mode="w", newline="", encoding="utf-8") as file:
        csv.DictWriter(file, fieldnames=cols).writeheader()


resetTests()

# RUN TESTS

In [None]:
import re

def Test(enFile, daFile, outputSize, llm):
    """Tamper with sampled sentences, score them, and append the results to ``output.csv``.

    For each of the ten tampering strategies, ``int(outputSize / 10)`` row
    indices are drawn with ``ds.getXRandomIndeces``. Every row of ``enFile``
    whose index was drawn for a strategy is lower-cased, tampered with, scored
    with ROUGE, BLEU and METEOR, and appended to ``output.csv`` together with
    the lower-cased row of ``daFile`` at the same position.

    Args:
        enFile: Path to a CSV file with a ``Text`` column holding the English
            sentences.
        daFile: Path to a CSV file with a ``Text`` column holding the Danish
            sentences; rows are matched to ``enFile`` by position.
        outputSize: Target number of output rows across all strategies.
        llm: When truthy, every row of ``output.csv`` is afterwards sent to
            Gemini and the LLM score columns are filled in.

    Raises:
        RuntimeError: If ``llm`` is truthy and neither ``GEMINI_API_KEY`` nor
            ``GOOGLE_API_KEY`` is set in the environment.
    """
    import os  # local import: only needed to read the API key from the environment

    outputName = "output.csv"
    columns = [
        "originalSentence", "danishSentence", "tamperingType", "tamperedSentence",
        "LLMScore", "Confidence_Lower", "Confidence_Upper",
        "BLEU", "METEOR",
        "Rouge1 r", "Rouge1 p", "Rouge1 f",
        "Rouge2 r", "Rouge2 p", "Rouge2 f",
        "Rougel r", "Rougel p", "Rougel f",
    ]
    # Key used in the ROUGE result -> column-name prefix used in the CSV.
    rougeColumns = (("rouge-1", "Rouge1"), ("rouge-2", "Rouge2"), ("rouge-l", "Rougel"))

    propLenIndeces = ds.properLengthIndeces(enFile)
    negIndeces = ds.negationIndeces(enFile)

    # (tampering function, label written to the CSV, pool the sample is drawn from)
    strategies = [
        (ts.replaceWithRandomWord, "Replace Token with Random Word", propLenIndeces),
        (ts.swapWords, "Swap words", propLenIndeces),
        (ts.removeToken, "Remove Token", propLenIndeces),
        (ts.addRandomWord, "Add random word", propLenIndeces),
        (ts.duplicateToken, "Duplicate Token", propLenIndeces),
        (ts.replaceCharacter, "Replace Character", propLenIndeces),
        (ts.duplicateCharacter, "Duplicate character", propLenIndeces),
        (ts.removeCharacter, "Remove Character", propLenIndeces),
        (ts.swapCharacters, "Swap characters", propLenIndeces),
        (ts.negation, "Negation", negIndeces),
    ]
    sensPerTamp = int(outputSize / len(strategies))
    # One independent draw per strategy, in the order listed above.
    sampledStrategies = [
        (tamp, tampType, ds.getXRandomIndeces(sensPerTamp, pool))
        for tamp, tampType, pool in strategies
    ]

    def runTest(sen, tamp, tampType, da):
        """Apply ``tamp`` to ``sen`` and, if it reports success, append one scored row."""
        tampResult = tamp(sen)
        # Element 0 is the success flag, element 1 the tampered sentence.
        if not tampResult[0]:
            return
        tampered = tampResult[1]

        # NOTE(review): the untampered sentence is passed in the `candidate`
        # slot and the tampered one in the `reference` slot of all three
        # metric helpers -- confirm this orientation is intended.
        rougeScores = calculate_rouge(sen, tampered)[0]
        row = {
            "originalSentence": sen,
            "danishSentence": da,
            "tamperingType": tampType,
            "tamperedSentence": tampered,
            # -1 marks "not scored by the LLM (yet)".
            "LLMScore": -1,
            "Confidence_Lower": -1,
            "Confidence_Upper": -1,
            "BLEU": calculate_bleu(sen, tampered),
            "METEOR": calculate_meteor(sen, tampered),
        }
        for rougeKey, prefix in rougeColumns:
            for part in ("r", "p", "f"):
                row[f"{prefix} {part}"] = rougeScores[rougeKey][part]

        with open(outputName, mode="a", newline="", encoding='utf-8') as file:
            csv.DictWriter(file, fieldnames=columns).writerow(row)

    def testAllTamps(sen, da, indx):
        """Run every strategy whose sample contains ``indx`` on the lower-cased pair."""
        for tamp, tampType, indices in sampledStrategies:
            if indx in indices:
                # Both sentences are lower-cased for every strategy.
                runTest(sen.lower(), tamp, tampType, da.lower())

    def testDataSets(en, da):
        """Walk the English CSV and pair each row with the Danish row at the same position."""
        enSens = pandas.read_csv(en, encoding='utf-8')
        daSens = pandas.read_csv(da, encoding='utf-8')
        for index, row in enSens.iterrows():
            testAllTamps(row["Text"], daSens.iloc[index]["Text"], index)

    def makePrompt(dan, en, tamp):
        """Build the Gemini prompt for one (Danish, English, tampered English) triple."""
        return f"""Here are two sentences perfectly translated from Danish into English:

            Sentence 1 (Danish): {dan}
            Sentence 2 (English): {en}

            Now, I will give you a third sentence, which is a tampered English translation: Sentence 3: {tamp}. 

            Compare the semantic similarity between Sentence 2 and Sentence 3 on a scale from 0 (totally different semantically) to 1 (totally identical semantically). 
            Provide a similarity score and confidence intervals as three values, each with two decimal places, in this exact plain text format:

            Similarity Score: X.XX
            Confidence Lower Bound: X.XX
            Confidence Upper Bound: X.XX

            All three values are required.
            """

    def addLLMScore(res):
        """Fill the three LLM score columns of the CSV at ``res`` and write it back."""
        # SECURITY: the key must not live in source control. Any key that was
        # previously committed in this notebook should be revoked.
        apiKey = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
        if not apiKey:
            raise RuntimeError(
                "Set the GEMINI_API_KEY (or GOOGLE_API_KEY) environment variable "
                "to enable LLM scoring."
            )
        client = genai.Client(api_key=apiKey)

        patterns = {
            "LLMScore": re.compile(r"Similarity Score: (\d\.\d{2})"),
            "Confidence_Lower": re.compile(r"Confidence Lower Bound: (\d\.\d{2})"),
            "Confidence_Upper": re.compile(r"Confidence Upper Bound: (\d\.\d{2})"),
        }

        data = pandas.read_csv(res, encoding='utf-8')
        # The placeholder -1 is read back as an integer column; cast up front
        # so fractional scores can be stored without a dtype conflict.
        scoreColumns = list(patterns)
        data[scoreColumns] = data[scoreColumns].astype(float)

        for index, row in data.iterrows():
            response = client.models.generate_content(
                model="gemini-2.0-flash",
                contents=makePrompt(row["danishSentence"], row["originalSentence"], row["tamperedSentence"]),
            )
            print(f"Response at index {index}: {response.text}")

            # The response may carry no text at all; treat that as "nothing matched".
            text = response.text or ""
            for column, pattern in patterns.items():
                match = pattern.search(text)
                # Keep the -1 placeholder when the value is missing, rather
                # than reusing the previous row's score or failing on an
                # unbound variable.
                data.loc[index, column] = float(match.group(1)) if match else -1.0

            # Pause between requests (presumably to stay under the API rate limit).
            time.sleep(4)

        data.to_csv(res, index=False, encoding='utf-8')

    testDataSets(enFile, daFile)
    if llm:
        addLLMScore(outputName)

# Earlier invocations on the small test files, with and without LLM scoring:
# Test("en_test_data.csv", "da_test_data.csv", 20, True)
# Test("en_test_data.csv", "da_test_data.csv", 20, False)
# Current run: first English/Danish dataframe pair, outputSize 10, LLM scoring enabled.
Test("dataframes/en/English1.csv", "dataframes/da/Danish1.csv", 10, True)