Importing Libraries Needed

In [41]:
# Standard Library Imports
import os
import warnings
warnings.filterwarnings("ignore", message="This sequence already has </s>.")

import math
from collections import Counter

# Third-Party Libraries
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
import nltk
from nltk.corpus import wordnet
from nltk.util import ngrams
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    AutoTokenizer,
    PreTrainedTokenizerFast,
    AutoModelForQuestionAnswering
)
from sentence_transformers import SentenceTransformer

from sklearn.metrics.pairwise import cosine_similarity

# Download necessary NLTK data
nltk.download('wordnet')
nltk.download('punkt')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\justi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\justi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

File Paths

In [42]:
# Models

T5QG_MODEL_DIR = os.path.join('models', 'qg_model')
T5QG_TOKENIZER_DIR = os.path.join('models', 'qg_tokenizer')
T5AG_MODEL_DIR = os.path.join('models', 't5_model')
T5AG_TOKENIZER_DIR = os.path.join('models', 't5_tokenizer')

# dataset

dataset_path = os.path.join('datasets', 'generated_qa.csv')

Importing Models

In [43]:
# Answer Generation Models
agmodel1 = T5ForConditionalGeneration.from_pretrained(T5AG_MODEL_DIR) # valhalla/t5-base-qg-hl (finetuned)
agmodel1_tokenizer = T5Tokenizer.from_pretrained(T5AG_TOKENIZER_DIR)

agmodel2 = AutoModelForQuestionAnswering.from_pretrained('deepset/roberta-base-squad2')
agmodel2_tokenizer = AutoTokenizer.from_pretrained('deepset/roberta-base-squad2')

agmodel3 = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
agmodel3_tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
    
""" =============================================================== """

# Question Generation Models
qgmodel1 = T5ForConditionalGeneration.from_pretrained(T5QG_MODEL_DIR) # t5-large (default)
qgmodel1_tokenizer = T5Tokenizer.from_pretrained(T5QG_TOKENIZER_DIR)

qgmodel2 = T5ForConditionalGeneration.from_pretrained("iarfmoose/t5-base-question-generator")
qgmodel2_tokenizer = T5Tokenizer.from_pretrained("iarfmoose/t5-base-question-generator")

qgmodel3 = T5ForConditionalGeneration.from_pretrained('Sehong/t5-large-QuestionGeneration')
qgmodel3_tokenizer = PreTrainedTokenizerFast.from_pretrained('Sehong/t5-large-QuestionGeneration')

""" =============================================================== """

# Sentence Transformer Models
stmodel1 = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

stmodel2 = SentenceTransformer("sentence-transformers/msmarco-distilbert-base-v2")

stmodel3 = SentenceTransformer('sentence-transformers/LaBSE') # sentence-transformers/LaBSE

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated

Data Preparation

In [44]:
df = pd.read_csv(dataset_path)

nlength = 100 # 1000 (higher much better)

df_sample = df.sample(n = nlength, random_state=42)

ref_contexts = df_sample['context'].tolist()
ref_answers = df_sample['answer'].tolist()
bleu_ref_answers = [[ref] for ref in ref_answers]
rouge_ref = [[ref] for ref in ref_answers]
meteor_ref = [[ref] for ref in ref_answers]
ref_questions = df_sample['question'].tolist()
bleu_ref_questions = [[ref] for ref in ref_questions]

In [45]:
def encode_answer_generation_model1(context, question):
    input_text = f"question: {question} context: {context}"
    input_ids = agmodel1_tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
    
    with torch.no_grad():
        output = agmodel1.generate(input_ids, max_length=512, num_return_sequences=1)

    return agmodel1_tokenizer.decode(output[0], skip_special_tokens=True)


In [46]:
answer_generation_model1_hyp = []

with tqdm(total=len(ref_answers), desc="Generating Predictions for Answer Generation Model 1") as pbar:
    for answer, question, context in zip(ref_answers, ref_questions, ref_contexts):
        outputs = encode_answer_generation_model1(context, question)
        answer_generation_model1_hyp.append(outputs)
        pbar.update(1)

Generating Predictions for Answer Generation Model 1: 100%|██████████| 100/100 [01:28<00:00,  1.13it/s]


In [47]:
def encode_answer_generation_model2(context, question):
    input_text = question + agmodel2_tokenizer.sep_token + context

    input_text = input_text[:agmodel2_tokenizer.model_max_length]

    encoding = agmodel2_tokenizer(input_text, return_tensors="pt")

    outputs = agmodel2(**encoding)

    predicted_start_idx = outputs.start_logits.argmax(-1).item()
    predicted_end_idx = outputs.end_logits.argmax(-1).item()
    predicted_answer = agmodel2_tokenizer.decode(encoding.input_ids.squeeze()[predicted_start_idx : predicted_end_idx + 1])

    return predicted_answer

In [48]:
answer_generation_model2_hyp = []

for context, question in tqdm(zip(ref_contexts, ref_questions), desc = "Generating Predictions for Answer Generation Model 2", total=len(df_sample)):
    answer = encode_answer_generation_model2(context, question)
    answer_generation_model2_hyp.append(answer)

Generating Predictions for Answer Generation Model 2: 100%|██████████| 100/100 [00:19<00:00,  5.26it/s]


In [49]:
def encode_answer_generation_model3(context, question):
    inputs = agmodel3_tokenizer(question, context, truncation=True, max_length=512, return_tensors='pt')
    
    inputs = {k: v.to(agmodel3.device) for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = agmodel3(**inputs)
    
    start_scores = outputs.start_logits
    end_scores = outputs.end_logits
    start_index = torch.argmax(start_scores)
    end_index = torch.argmax(end_scores)
    
    answer = agmodel3_tokenizer.convert_tokens_to_string(agmodel3_tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][start_index:end_index+1]))
    return answer

In [50]:
answer_generation_model3_hyp = []

for context, question in tqdm(zip(ref_contexts, ref_questions), desc="Generating Predictions for Answer Generation Model 3", total=len(df_sample)):
    answer = encode_answer_generation_model3(context, question)
    answer_generation_model3_hyp.append(answer)

Generating Predictions for Answer Generation Model 3: 100%|██████████| 100/100 [01:10<00:00,  1.41it/s]


In [51]:
def encode_question_generation_model1(context, answer):
    answer_span = context.replace(answer, f"<hl>{answer}<hl>") + "</s>"
    inputs = qgmodel1_tokenizer(answer_span, return_tensors="pt")
    question = qgmodel1.generate(input_ids=inputs.input_ids, max_length=50)[0]

    return qgmodel1_tokenizer.decode(question, skip_special_tokens=True)

In [52]:
question_generation_model1_hyp = []

for context, answer in tqdm(zip(ref_contexts, ref_answers), desc="Generating Predictions for Question Generation Model 1", total=len(df_sample)):
    question = encode_question_generation_model1(context, answer)
    question_generation_model1_hyp.append(question)

Generating Predictions for Question Generation Model 1:  85%|████████▌ | 85/100 [01:21<00:13,  1.14it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (970 > 512). Running this sequence through the model will result in indexing errors
Generating Predictions for Question Generation Model 1: 100%|██████████| 100/100 [01:40<00:00,  1.01s/it]


In [53]:
def encode_question_generation_model2(context):
    input_text = f"Generate a question from the context: {context} Answer: {answer}"
        
    inputs = qgmodel3_tokenizer.encode(input_text, return_tensors="pt")

    outputs = qgmodel3.generate(inputs, max_length=512, num_beams= 5, early_stopping=True)
        
    question = qgmodel3_tokenizer.decode(outputs[0], skip_special_tokens=True)
        
    return question


In [54]:
question_generation_model2_hyp = []

for context in tqdm(zip(ref_contexts), desc="Generating Predictions for Question Generation Model 2", total=len(df_sample)):
    question = encode_question_generation_model2(context)
    question_generation_model2_hyp.append(question)

Generating Predictions for Question Generation Model 2: 100%|██████████| 100/100 [08:25<00:00,  5.06s/it]


In [55]:
def encode_question_generation_model3(context, answer):
    input_text = f"question: context: {context} answer: {answer}"
    inputs = qgmodel3_tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
    outputs = qgmodel3.generate(
        inputs["input_ids"],
        max_length=50,
        num_beams=5,
        num_return_sequences=1
    )
    question = qgmodel3_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return question

In [56]:
question_generation_model3_hyp = []

for context, answer in tqdm(zip(ref_contexts, ref_answers), desc="Generating Predictions for Question Generation Model 3", total=len(df_sample)):
    question = encode_question_generation_model3(context, answer)
    question_generation_model3_hyp.append(question)

Generating Predictions for Question Generation Model 3: 100%|██████████| 100/100 [10:21<00:00,  6.22s/it]


In [57]:
def evaluate_sentence_transformers(ref_contexts):
    embeddings1 = stmodel1.encode(ref_contexts)
    embeddings2 = stmodel2.encode(ref_contexts)
    embeddings3 = stmodel3.encode(ref_contexts)

    similarity_matrix1 = cosine_similarity(embeddings1)
    similarity_matrix2 = cosine_similarity(embeddings2)
    similarity_matrix3 = cosine_similarity(embeddings3)

    return {
        "Model1 (all-MiniLM-L6-v2)": similarity_matrix1,
        "Model2 (msmarco-distilbert-base-v2)": similarity_matrix2,
        "Model3 (LaBSE)": similarity_matrix3
    }

def average_cosine_similarity(cos_sim_matrix):
    avg_cos_sim = np.mean(cos_sim_matrix)
    avg_cos_sim_percentage = avg_cos_sim * 100
    return avg_cos_sim_percentage

Metrics Formula

In [58]:
# BLEU Metric

def compute_bleu_score(reference, hypothesis, n=4):
    def _ngram_counts(tokens, n):
        return Counter(ngrams(tokens, n))

    def _precision(hyp_counts, ref_counts):
        return sum((hyp_counts & ref_counts).values()) / sum(hyp_counts.values()) if sum(hyp_counts.values()) > 0 else 0.0

    def _brevity_penalty(ref_len, hyp_len):
        if hyp_len > ref_len:
            return 1.0
        return math.exp(1 - ref_len / hyp_len) if hyp_len > 0 else 0.0

    def _normalize(tokens):
        return [token.lower() for token in tokens]

    hyp_tokens = _normalize(hypothesis)
    ref_tokens_list = [_normalize(ref) for ref in reference]
    
    precisions = []
    for i in range(1, n+1):
        hyp_ngrams = _ngram_counts(hyp_tokens, i)
        ref_ngrams_list = [ _ngram_counts(ref, i) for ref in ref_tokens_list ]
        
        ref_ngrams = Counter()
        for ref_ngram in ref_ngrams_list:
            ref_ngrams |= ref_ngram

        precision = _precision(hyp_ngrams, ref_ngrams)
        precisions.append(precision)

    if any(p == 0 for p in precisions):
        return 0.0

    bleu_score = math.exp(sum(math.log(p) for p in precisions) / n)
    bp = _brevity_penalty(len(ref_tokens_list[0]), len(hyp_tokens))
    return bp * bleu_score

In [59]:
# ROUGE Metrics

def count_ngrams(text, n):
    ngrams = Counter()
    words = text.split()
    for i in range(len(words) - n + 1):
        ngram = tuple(words[i:i+n])
        ngrams[ngram] += 1
    return ngrams

def rouge_n(candidate, reference, n):
    candidate_ngrams = count_ngrams(candidate, n)
    reference_ngrams = count_ngrams(reference, n)
    
    match_count = 0
    for ngram in reference_ngrams:
        if ngram in candidate_ngrams:
            match_count += min(reference_ngrams[ngram], candidate_ngrams[ngram])
    
    if sum(reference_ngrams.values()) == 0:
        return 0.0
    
    return match_count / sum(reference_ngrams.values())

def lcs(X, Y):
    m, n = len(X), len(Y)
    table = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m):
        for j in range(n):
            if X[i] == Y[j]:
                table[i + 1][j + 1] = table[i][j] + 1
            else:
                table[i + 1][j + 1] = max(table[i + 1][j], table[i][j + 1])
    return table[m][n]

def rouge_l(candidate, reference):
    candidate_words = candidate.split()
    reference_words = reference.split()
    lcs_length = lcs(candidate_words, reference_words)
    
    if len(candidate_words) == 0 or len(reference_words) == 0:
        return 0.0
    
    precision = lcs_length / len(candidate_words)
    recall = lcs_length / len(reference_words)
    
    if precision + recall == 0:
        return 0.0
    
    f1_score = (2 * precision * recall) / (precision + recall)
    return f1_score

def calculate_rouge_scores(candidates, references):
    assert len(candidates) == len(references), "Candidates and references must be of the same length"
    
    rouge_1_scores = []
    rouge_2_scores = []
    for candidate, reference in zip(candidates, references):
        rouge_1_scores.append(rouge_n(candidate, reference, 1))
        rouge_2_scores.append(rouge_n(candidate, reference, 2))
    
    rouge_l_scores = []
    for candidate, reference in zip(candidates, references):
        rouge_l_scores.append(rouge_l(candidate, reference))
    
    avg_rouge_1 = sum(rouge_1_scores) / len(rouge_1_scores)
    avg_rouge_2 = sum(rouge_2_scores) / len(rouge_2_scores)
    avg_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores)
    
    avg_rouge_1 *= 100
    avg_rouge_2 *= 100
    avg_rouge_l *= 100
    
    return avg_rouge_1, avg_rouge_2, avg_rouge_l


In [60]:
# METEOR Metric

def tokenize(text):
    return nltk.word_tokenize(text.lower())

def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name())
    return synonyms

def precision_recall(candidate_tokens, reference_tokens):
    candidate_counter = Counter(candidate_tokens)
    reference_counter = Counter(reference_tokens)

    matches = sum(min(candidate_counter[token], reference_counter[token]) for token in candidate_counter)
    precision = matches / len(candidate_tokens) if candidate_tokens else 0.0
    
    recall = matches / len(reference_tokens) if reference_tokens else 0.0
    
    return precision, recall

def meteor_score(candidate, references):
    candidate_tokens = tokenize(candidate)
    reference_tokens = [tokenize(ref) for ref in references]
    
    best_f1 = 0.0
    
    for ref_tokens in reference_tokens:
        precision, recall = precision_recall(candidate_tokens, ref_tokens)
        
        if precision + recall > 0:
            f1_score = (2 * precision * recall) / (precision + recall)
            best_f1 = max(best_f1, f1_score)
    
    return best_f1

def calculate_meteor_scores(candidates, references):
    assert len(candidates) == len(references), "Candidates and references must be of the same length"
    
    meteor_scores = []
    for candidate, reference_list in zip(candidates, references):
        meteor_scores.append(meteor_score(candidate, reference_list))
    
    return sum(meteor_scores) / len(meteor_scores)


BLEU Evaluation

In [61]:
"""Answer Generation Models"""

total_score_for_agm1 = 0.0

for i, (hypothesis, references) in tqdm(enumerate(zip(answer_generation_model1_hyp, bleu_ref_answers)), desc="Computing BLEU Score of Answer Generation Model 1", total=len(df_sample)):
    score = compute_bleu_score(references, hypothesis)
    total_score_for_agm1 += score

percentage_bleu_score_for_agm1  = ( total_score_for_agm1 / len(df_sample) ) * 100

total_score_for_agm2 = 0.0

for i, (hypothesis, references) in tqdm(enumerate(zip(answer_generation_model2_hyp, bleu_ref_answers)), desc="Computing BLEU Score of Answer Generation Model 2", total=len(df_sample)):
    score = compute_bleu_score(references, hypothesis)
    total_score_for_agm2 += score

percentage_bleu_score_for_agm2  = ( total_score_for_agm2 / len(df_sample) ) * 100


total_score_for_agm3 = 0.0

for i, (hypothesis, references) in tqdm(enumerate(zip(answer_generation_model3_hyp, bleu_ref_answers)), desc="Computing BLEU Score of Answer Generation Model 3", total=len(df_sample)):
    score = compute_bleu_score(references, hypothesis)
    total_score_for_agm3 += score

percentage_bleu_score_for_agm3  = ( total_score_for_agm3 / len(df_sample) ) * 100


""" =============================================================== """

"""Question Generation Models"""

total_score_for_qgm1 = 0.0

for i, (hypothesis, references) in tqdm(enumerate(zip(question_generation_model1_hyp, bleu_ref_questions)), desc="Computing BLEU Score of Question Generation Model 1", total=len(df_sample)):
    score = compute_bleu_score(references, hypothesis)
    total_score_for_qgm1 += score

percentage_bleu_score_for_qgm1  = ( total_score_for_qgm1 / len(df_sample) ) * 100


total_score_for_qgm2 = 0.0

for i, (hypothesis, references) in tqdm(enumerate(zip(question_generation_model2_hyp, bleu_ref_questions)), desc="Computing BLEU Score of Question Generation Model 2", total=len(df_sample)):
    score = compute_bleu_score(references, hypothesis)
    total_score_for_qgm2 += score

percentage_bleu_score_for_qgm2  = ( total_score_for_qgm2 / len(df_sample) ) * 100


total_score_for_qgm3 = 0.0

for i, (hypothesis, references) in tqdm(enumerate(zip(question_generation_model3_hyp, bleu_ref_questions)), desc="Computing BLEU Score of Question Generation Model 3", total=len(df_sample)):
    score = compute_bleu_score(references, hypothesis)
    total_score_for_qgm3 += score

percentage_bleu_score_for_qgm3  = ( total_score_for_qgm3 / len(df_sample) ) * 100


Computing BLEU Score of Answer Generation Model 1: 100%|██████████| 100/100 [00:00<00:00, 4083.28it/s]
Computing BLEU Score of Answer Generation Model 2: 100%|██████████| 100/100 [00:00<00:00, 12534.53it/s]
Computing BLEU Score of Answer Generation Model 3: 100%|██████████| 100/100 [00:00<00:00, 3097.00it/s]
Computing BLEU Score of Question Generation Model 1: 100%|██████████| 100/100 [00:00<00:00, 5268.10it/s]
Computing BLEU Score of Question Generation Model 2: 100%|██████████| 100/100 [00:00<00:00, 5888.23it/s]
Computing BLEU Score of Question Generation Model 3: 100%|██████████| 100/100 [00:00<00:00, 4767.61it/s]


ROUGE Evaluation

In [62]:
"""Answer Generation Models"""

avg_rouge_1_for_agm1, avg_rouge_2_for_agm1, avg_rouge_l_for_agm1 = calculate_rouge_scores(answer_generation_model1_hyp, ref_answers)

avg_rouge_1_for_agm2, avg_rouge_2_for_agm2, avg_rouge_l_for_agm2 = calculate_rouge_scores(answer_generation_model2_hyp, ref_answers)

avg_rouge_1_for_agm3, avg_rouge_2_for_agm3, avg_rouge_l_for_agm3 = calculate_rouge_scores(answer_generation_model3_hyp, ref_answers)

""" =============================================================== """

"""Question Generation Models"""

avg_rouge_1_for_qgm1, avg_rouge_2_for_qgm1, avg_rouge_l_for_qgm1 = calculate_rouge_scores(question_generation_model1_hyp, ref_questions)

avg_rouge_1_for_qgm2, avg_rouge_2_for_qgm2, avg_rouge_l_for_qgm2 = calculate_rouge_scores(question_generation_model2_hyp, ref_questions)

avg_rouge_1_for_qgm3, avg_rouge_2_for_qgm3, avg_rouge_l_for_qgm3 = calculate_rouge_scores(question_generation_model3_hyp, ref_questions)


METEOR Evaluation

In [63]:
"""Answer Generation Models"""

average_meteor_score_for_agm1 = calculate_meteor_scores(answer_generation_model1_hyp, meteor_ref)
percentage_meteor_score_for_agm1 = average_meteor_score_for_agm1 * 100

average_meteor_score_for_agm2 = calculate_meteor_scores(answer_generation_model2_hyp, meteor_ref)
percentage_meteor_score_for_agm2 = average_meteor_score_for_agm2 * 100

average_meteor_score_for_agm3 = calculate_meteor_scores(answer_generation_model3_hyp, meteor_ref)
percentage_meteor_score_for_agm3 = average_meteor_score_for_agm3 * 100

""" =============================================================== """

"""Question Generation Models"""

average_meteor_score_for_qgm1 = calculate_meteor_scores(question_generation_model1_hyp, meteor_ref)
percentage_meteor_score_for_qgm1 = average_meteor_score_for_agm1 * 100

average_meteor_score_for_qgm2 = calculate_meteor_scores(question_generation_model2_hyp, meteor_ref)
percentage_meteor_score_for_qgm2 = average_meteor_score_for_agm2 * 100

average_meteor_score_for_qgm3 = calculate_meteor_scores(question_generation_model3_hyp, meteor_ref)
percentage_meteor_score_for_qgm3 = average_meteor_score_for_agm3 * 100


Cosine Similarity for Sentence Transformers Model

In [64]:
results = evaluate_sentence_transformers(ref_contexts)
avg_cos_sim1 = average_cosine_similarity(results["Model1 (all-MiniLM-L6-v2)"])
avg_cos_sim2 = average_cosine_similarity(results["Model2 (msmarco-distilbert-base-v2)"])
avg_cos_sim3 = average_cosine_similarity(results["Model3 (LaBSE)"])

Evaluation Report

In [65]:
print(f"""
      
=================== BLEU Scores for Answer Generation Models =====================================
      
      
Total BLEU Score for Answer Generation Model 1: {percentage_bleu_score_for_agm1:.2f}%")

Total BLEU Score for Answer Generation Model 2: {percentage_bleu_score_for_agm2:.2f}%")

Total BLEU Score for Answer Generation Model 3: {percentage_bleu_score_for_agm3:.2f}%")


================== ROUGE Scores for Answer Generation Models =====================================


Average ROUGE-1 Score of Answer Generation Model 1 : {avg_rouge_1_for_agm1:.2f}%
Average ROUGE-2 Score of Answer Generation Model 1 : {avg_rouge_2_for_agm1:.2f}%
Average ROUGE-L Score of Answer Generation Model 1 : {avg_rouge_l_for_agm1:.2f}%

Average ROUGE-1 Score of Answer Generation Model 2 : {avg_rouge_1_for_agm2:.2f}%
Average ROUGE-2 Score of Answer Generation Model 2 : {avg_rouge_2_for_agm2:.2f}%
Average ROUGE-L Score of Answer Generation Model 2 : {avg_rouge_l_for_agm2:.2f}%

Average ROUGE-1 Score of Answer Generation Model 3 : {avg_rouge_1_for_agm3:.2f}%
Average ROUGE-2 Score of Answer Generation Model 3 : {avg_rouge_2_for_agm3:.2f}%
Average ROUGE-L Score of Answer Generation Model 3 : {avg_rouge_l_for_agm3:.2f}%


================= METEOR Scores for Answer Generation Models =====================================


Total METEOR Score for Answer Generation Model 1: {percentage_meteor_score_for_agm1:.2f}%")

Total METEOR Score for Answer Generation Model 2: {percentage_meteor_score_for_agm2:.2f}%")

Total METEOR Score for Answer Generation Model 3: {percentage_meteor_score_for_agm3:.2f}%")


================= BLEU Scores for Question Generation Models =====================================


Total BLEU Score for Question Generation Model 1: {percentage_bleu_score_for_qgm1:.2f}%")

Total BLEU Score for Question Generation Model 2: {percentage_bleu_score_for_qgm2:.2f}%")

Total BLEU Score for Question Generation Model 3: {percentage_bleu_score_for_qgm3:.2f}%")


======================== ROUGE Scores for Question Generation Models =============================

Average ROUGE-1 Score of Question Generation Model 1 : {avg_rouge_1_for_qgm1:.2f}%
Average ROUGE-2 Score of Question Generation Model 1 : {avg_rouge_2_for_qgm1:.2f}%
Average ROUGE-L Score of Question Generation Model 1 : {avg_rouge_l_for_qgm1:.2f}%

Average ROUGE-1 Score of Question Generation Model 2 : {avg_rouge_1_for_qgm2:.2f}%
Average ROUGE-2 Score of Question Generation Model 2 : {avg_rouge_2_for_qgm2:.2f}%
Average ROUGE-L Score of Question Generation Model 2 : {avg_rouge_l_for_qgm2:.2f}%

Average ROUGE-1 Score of Question Generation Model 3 : {avg_rouge_1_for_qgm3:.2f}%
Average ROUGE-2 Score of Question Generation Model 3 : {avg_rouge_2_for_qgm3:.2f}%
Average ROUGE-L Score of Question Generation Model 3 : {avg_rouge_l_for_qgm3:.2f}%


================= METEOR Scores for Question Generation Models =====================================


Total METEOR Score for Question Generation Model 1: {percentage_meteor_score_for_qgm1:.2f}%")

Total METEOR Score for Question Generation Model 2: {percentage_meteor_score_for_qgm2:.2f}%")

Total METEOR Score for Question Generation Model 3: {percentage_meteor_score_for_qgm3:.2f}%")


================================ Sentence Transformer Models =====================================

Total Cosine Similarity of Sentence Transformer Model 1 : {avg_cos_sim1:.2f}%

Total Cosine Similarity of Sentence Transformer Model 2 : {avg_cos_sim2:.2f}%

Total Cosine Similarity of Sentence Transformer Model 3 : {avg_cos_sim3:.2f}%

""")





      
      
      
Total BLEU Score for Answer Generation Model 1: 53.17%")

Total BLEU Score for Answer Generation Model 2: 34.19%")

Total BLEU Score for Answer Generation Model 3: 53.44%")




Average ROUGE-1 Score of Answer Generation Model 1 : 59.60%
Average ROUGE-2 Score of Answer Generation Model 1 : 37.39%
Average ROUGE-L Score of Answer Generation Model 1 : 56.79%

Average ROUGE-1 Score of Answer Generation Model 2 : 41.32%
Average ROUGE-2 Score of Answer Generation Model 2 : 24.68%
Average ROUGE-L Score of Answer Generation Model 2 : 40.94%

Average ROUGE-1 Score of Answer Generation Model 3 : 37.07%
Average ROUGE-2 Score of Answer Generation Model 3 : 18.47%
Average ROUGE-L Score of Answer Generation Model 3 : 34.77%




Total METEOR Score for Answer Generation Model 1: 58.68%")

Total METEOR Score for Answer Generation Model 2: 40.77%")

Total METEOR Score for Answer Generation Model 3: 54.57%")




Total BLEU Score for Question Generation Model 1: 42.14%")

Total BLEU S

In [67]:
# summary of model selection and insights

print(f"""
Summary of Model Selection and Insights

Answer Generation Models:
----------------------------------------
Model 1 has been selected due to its consistent and high performance across multiple metrics:
  - BLEU Score: {percentage_bleu_score_for_agm1:.2f}%, indicating strong alignment with reference texts.
  - ROUGE Scores: High ROUGE-1 ({avg_rouge_1_for_agm1:.2f}%) and ROUGE-L scores ({avg_rouge_l_for_agm1:.2f}%) suggest good overlap in individual words and longer phrases. However, ROUGE-2 score of {avg_rouge_2_for_agm1:.2f}% indicates a lack of bigram overlap.
  - METEOR Score: {percentage_meteor_score_for_agm1:.2f}%, supports effectiveness in capturing meaning and semantic similarity.

Comparison with Other Models:
  - Model 2 shows lower performance across BLEU ({percentage_bleu_score_for_agm2:.2f}%), ROUGE (for ROUGE-1 ({avg_rouge_1_for_agm2:.2f}%) and ROUGE-L ({avg_rouge_l_for_agm2:.2f}%)), and METEOR ({average_meteor_score_for_agm2:.2f}%). It seems less effective in generating matching answers and capturing meaning.
  - Model 3 has similar BLEU and METEOR scores as Model 1 but lower ROUGE scores, indicating good fluency but less detailed content overlap.

Question Generation Models:
----------------------------------------
Model 1 is chosen for its balanced performance:
  - BLEU Score: {percentage_bleu_score_for_qgm1:.2f}%, indicative of its ability to generate coherent questions that align with reference structures.
  - ROUGE Scores: ROUGE-1 and ROUGE-L scores ({avg_rouge_1_for_qgm1:.2f}% and {avg_rouge_l_for_qgm1:.2f}% respectively) show reasonable match with reference questions, though ROUGE-2 ({avg_rouge_2_for_qgm1:.2f}%) suggests room for improvement in capturing bigram matches.
  - METEOR Score: {percentage_meteor_score_for_qgm1:.2f}%, reflects good semantic alignment and content coverage.

Comparison with Other Models:
  - Model 2 has lower BLEU ({percentage_bleu_score_for_qgm2:.2f}%) and METEOR scores ({percentage_meteor_score_for_qgm2:.2f}%), suggesting it may not capture the intended question structures as effectively.
  - Model 3 performs slightly better in BLEU ({percentage_bleu_score_for_qgm3:.2f}%) but has lower ROUGE and METEOR scores, indicating higher syntactic similarity but less semantic relevance.

Sentence Transformers Models:
----------------------------------------
Model 3 is preferred due to its highest cosine similarity score ({avg_cos_sim3:.2f}%), indicating the best semantic similarity and coherence among the embeddings.

Comparison with Other Models:
  - Model 1 and Model 2 have lower cosine similarity scores ({avg_cos_sim1:.2f}% and {avg_cos_sim2:.2f}%, respectively), indicating they might not capture semantic relationships as effectively as Model 3.

Overall Insight:
----------------------------------------
Choosing Model 1 for answer and question generation will likely result in improved performance due to high BLEU, ROUGE, and METEOR scores, ensuring high-quality and contextually relevant outputs.
sentence transformer Model 3 is the top choice for sentence embeddings, providing superior semantic similarity for effective text understanding and comparison.
""")



Summary of Model Selection and Insights

Answer Generation Models:
----------------------------------------
Model 1 has been selected due to its consistent and high performance across multiple metrics:
  - BLEU Score: 53.17%, indicating strong alignment with reference texts.
  - ROUGE Scores: High ROUGE-1 (59.60%) and ROUGE-L scores (56.79%) suggest good overlap in individual words and longer phrases. However, ROUGE-2 score of 37.39% indicates a lack of bigram overlap.
  - METEOR Score: 58.68%, supports effectiveness in capturing meaning and semantic similarity.

Comparison with Other Models:
  - Model 2 shows lower performance across BLEU (34.19%), ROUGE (for ROUGE-1 (41.32%) and ROUGE-L (40.94%)), and METEOR (0.41%). It seems less effective in generating matching answers and capturing meaning.
  - Model 3 has similar BLEU and METEOR scores as Model 1 but lower ROUGE scores, indicating good fluency but less detailed content overlap.

Question Generation Models:
----------------------