In [None]:
from collections import Counter
import numpy as np
import re 

def normalize_answer(s):
    """Normalize an answer string for token-level comparison.

    Steps, in order:
      1. lower-case the text;
      2. replace the standalone articles "a", "an" and "the" with a space;
      3. replace every character that is not a letter or digit with a space
         (punctuation, symbols, underscores);
      4. collapse runs of whitespace and trim both ends.

    Args:
        s: The raw answer text (a ``str``).

    Returns:
        The normalized text, tokens separated by single spaces. Empty string
        if nothing but articles/punctuation/whitespace was present.
    """
    lowered = s.lower()
    # Articles are removed before punctuation so that "\b" still sees the
    # original word boundaries.
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', lowered)
    # [\W_] is the Unicode-aware form of "not a letter or digit". The previous
    # ASCII-only class [^A-Za-z0-9] also erased accented and non-Latin letters
    # ("café" -> "caf"), which is not punctuation removal.
    without_punctuation = re.sub(r'[\W_]', ' ', without_articles)
    return ' '.join(without_punctuation.split())

def metrics(annotation_answers, rag_answers):
    """Compute mean token-level F1, exact match and recall over answer pairs.

    The two sequences are paired positionally with ``zip`` (extra items in the
    longer one are ignored). Both answers of a pair are normalized with
    ``normalize_answer`` and split into tokens; the overlap is the multiset
    intersection of the two token lists.

    Items of ``annotation_answers`` play the role of the prediction and items
    of ``rag_answers`` play the role of the ground truth, so recall is
    ``overlap / len(tokens of the rag_answers item)``.
    NOTE(review): the parameter names suggest the opposite roles - confirm
    the intended argument order with callers.
    NOTE(review): a later cell defines ``metrics`` again and shadows this one.

    Args:
        annotation_answers: Sequence of answer strings (first of each pair).
        rag_answers: Sequence of answer strings (second of each pair).

    Returns:
        Tuple ``(mean_f1, mean_exact_match, mean_recall)`` computed with
        ``np.mean``.
    """
    f1_scores, exact_matches, recall_scores = [], [], []

    for pred, truth in zip(annotation_answers, rag_answers):
        pred_tokens, truth_tokens = normalize_answer(pred).split(), normalize_answer(truth).split()

        if pred_tokens and truth_tokens:
            common_tokens = Counter(pred_tokens) & Counter(truth_tokens)
            num_common = sum(common_tokens.values())
            prec = num_common / len(pred_tokens)
            rec = num_common / len(truth_tokens)
        else:
            # An answer that normalizes to no tokens used to raise
            # ZeroDivisionError here; score the pair as zero instead.
            prec = rec = 0

        f1_scores.append(0 if prec + rec == 0 else (2 * (prec * rec) / (prec + rec)))
        exact_matches.append(int(pred_tokens == truth_tokens))
        recall_scores.append(rec)

    return np.mean(f1_scores), np.mean(exact_matches), np.mean(recall_scores)

In [None]:
def metrics(annotation_answers, rag_answers):
    """Return mean token-level F1, exact match and recall over answer pairs.

    Answers are paired positionally with ``zip``; each answer is normalized
    with ``normalize_answer`` and split on whitespace. Token overlap is the
    multiset intersection of the two token lists.

    The item taken from ``annotation_answers`` is treated as the prediction
    and the item taken from ``rag_answers`` as the ground truth, so recall is
    measured against the ``rag_answers`` item.
    NOTE(review): the parameter names suggest the opposite roles - confirm
    the intended argument order with callers.

    A pair in which either side has no tokens scores 0 for precision, recall
    and F1. Exact match still compares the token lists directly, so two empty
    answers count as an exact match while their F1 is 0.

    Args:
        annotation_answers: Sequence of answer strings (first of each pair).
        rag_answers: Sequence of answer strings (second of each pair).

    Returns:
        Tuple ``(mean_f1, mean_exact_match, mean_recall)`` from ``np.mean``.
    """
    f1_scores = []
    exact_matches = []
    recall_scores = []

    for first_answer, second_answer in zip(annotation_answers, rag_answers):
        pred_tokens = normalize_answer(first_answer).split()
        truth_tokens = normalize_answer(second_answer).split()

        # Default to zero; only compute ratios when both sides have tokens,
        # which keeps the divisions below safe.
        prec = 0
        rec = 0
        if pred_tokens and truth_tokens:
            overlap = Counter(pred_tokens) & Counter(truth_tokens)
            shared = sum(overlap.values())
            prec = shared / len(pred_tokens)
            rec = shared / len(truth_tokens)

        if prec + rec == 0:
            f1 = 0
        else:
            f1 = 2 * (prec * rec) / (prec + rec)

        f1_scores.append(f1)
        exact_matches.append(int(pred_tokens == truth_tokens))
        # rec is already 0 whenever truth_tokens is empty.
        recall_scores.append(rec)

    return np.mean(f1_scores), np.mean(exact_matches), np.mean(recall_scores)

In [None]:
def getFileData(filename):
    """Read a text file and return its lines with surrounding whitespace removed.

    Args:
        filename: Path of the file to read (opened in text mode, "r").

    Returns:
        A list with one string per line of the file, each passed through
        ``str.strip()``. Blank lines are kept as empty strings.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even if reading raises.
    with open(filename, "r") as f:
        return [line.strip() for line in f]


In [None]:
import os
# Sub-directories of Data/ that are scanned for an annotation.txt file.
directories = ["Web Scholar PDFs", "About Scottie", "Buggy News", "academic_calendars", "history_of_cmu", "history_of_scs", "Kiltie Band", "lti_faculty", "lti_programs", "program_handbooks", "Tartan Facts", "courses"]


# Maps each annotated question (text of line 1 of a record) to the text of
# line 4 of the same record, which later cells use as the question's type.
questions = {}
for i in directories:
    if "annotation.txt" in os.listdir("Data/"+i):
        # Context manager closes the handle even if reading raises.
        with open("Data/"+i + "/annotation.txt") as f:
            lines = f.readlines()

        # Records are read as 5 consecutive lines: index 0 is the question and
        # index 3 the type. Presumably the remaining lines hold the answer and
        # a separator - TODO confirm against the annotation format.
        q = lines[0::5]
        t = lines[3::5]

        for count, j in enumerate(q):
            # [3:] drops the first three characters of each line (presumably
            # a "X: "-style field prefix - verify against the data).
            questions[j[3:].strip()] = (t[count][3:].strip())

In [None]:
# Test questions, one per line, stripped of surrounding whitespace.
# The context manager closes the handle even if reading raises.
with open("SubmissionData/test/questions.txt", "r") as f:
    ref_qs = [i.strip() for i in f.readlines()]

def getPairsByType(ref_qs, annotation_answers, rag_answers, question_types=None):
    """Group RAG and annotation answers by the type of their question.

    Questions that are not present in the type mapping are skipped.

    Args:
        ref_qs: Sequence of question strings; position ``k`` corresponds to
            ``annotation_answers[k]`` and ``rag_answers[k]``.
        annotation_answers: Sequence of annotated answers, indexed like
            ``ref_qs``.
        rag_answers: Sequence of RAG answers, indexed like ``ref_qs``.
        question_types: Optional mapping ``question -> type``. When omitted,
            the notebook-level ``questions`` dict is used, as before.

    Returns:
        Dict mapping each type to ``[rag_list, annotation_list]``: two
        parallel lists holding, in ``ref_qs`` order, the RAG answers and the
        annotated answers of the questions of that type.

    Raises:
        IndexError: If either answer sequence is shorter than ``ref_qs`` at
            the position of a question that has a known type.
    """
    if question_types is None:
        # Fall back to the global built by an earlier notebook cell.
        question_types = questions

    pairs = {}
    for count, question in enumerate(ref_qs):
        if question in question_types:
            # setdefault creates the [rag, annotation] bucket on first sight.
            bucket = pairs.setdefault(question_types[question], [[], []])
            bucket[0].append(rag_answers[count])
            bucket[1].append(annotation_answers[count])

    return pairs


def getRecallByTypes(s, questions):
    """Compute a per-type recall figure, ordered by numeric type label.

    Args:
        s: Mapping ``type label -> [first_list, second_list]``. The two lists
            are passed to ``metrics`` in that order (presumably the output of
            ``getPairsByType`` - verify against the caller).
        questions: Mapping ``question -> type label``; used only to count how
            many questions carry each label.

    Returns:
        List of ``(label, value)`` tuples sorted by ``int(label)``, where
        ``value`` is the mean recall returned by ``metrics`` for that label
        divided by the number of entries in ``questions`` with that label.

    Raises:
        KeyError: If a label in ``s`` never occurs as a value of ``questions``.
        ValueError: If a label cannot be converted with ``int``.
    """
    # Number of annotated questions per type label.
    keyFreq = Counter(questions.values())

    res = {}
    for label, (first_list, second_list) in s.items():
        _, _, recall = metrics(first_list, second_list)
        # NOTE(review): ``recall`` is already a mean over the pairs of this
        # label; dividing it again by the label frequency looks unintended.
        # Behavior kept as-is - confirm the desired statistic.
        # keyFreq[label] == 0 means the label is absent from ``questions``;
        # Counter would return 0 silently, so raise KeyError explicitly to
        # keep the documented contract.
        if label not in keyFreq:
            raise KeyError(label)
        res[label] = recall / keyFreq[label]

    # Sort the original keys by numeric value. Converting to int and back to
    # str used to raise KeyError for labels such as "01".
    return [(str(label), res[label]) for label in sorted(res, key=int)]

In [None]:
# Reference answers: the whitespace-stripped lines of annotations/answers_1.txt.
# NOTE(review): the name is misspelled ("refrence"); it is left unchanged here
# because cells outside this view may reference it.
refrence_answers = getFileData("annotations/answers_1.txt")