Import Needed Libraries

In [46]:
import os
import warnings
warnings.filterwarnings("ignore", message="This sequence already has </s>.")

import pandas as pd
import json
from tqdm import tqdm
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    PreTrainedTokenizerFast,
)

from sklearn.feature_extraction.text import TfidfVectorizer


Initialize Path

In [4]:
# Models

T5QG_MODEL_DIR = os.path.join('..', 'src', 'models', 't5_base_questiongeneration_model')
T5QG_TOKENIZER_DIR = os.path.join('..', 'src', 'models', 't5_base_questiongeneration_tokenizer')

# Datasets

dataset_path = os.path.join('..', 'datasets', 'processed', 'generated_qa.csv')

Import Models

In [45]:
# Question Generation Models
qgmodel1 = T5ForConditionalGeneration.from_pretrained(T5QG_MODEL_DIR) # t5-base (default)
qgmodel1_tokenizer = T5Tokenizer.from_pretrained(T5QG_TOKENIZER_DIR)

qgmodel2 = T5ForConditionalGeneration.from_pretrained("iarfmoose/t5-base-question-generator")
qgmodel2_tokenizer = T5Tokenizer.from_pretrained("iarfmoose/t5-base-question-generator")

qgmodel3 = T5ForConditionalGeneration.from_pretrained('Sehong/t5-large-QuestionGeneration')
qgmodel3_tokenizer = PreTrainedTokenizerFast.from_pretrained('Sehong/t5-large-QuestionGeneration')


Process Dataset for Passage

In [26]:
df = pd.read_csv(dataset_path)

In [53]:
def word_count(text):
    return len(text.split())

# Filter rows where the 'context' column has more than 300 words
filtered_df = df[df['context'].apply(word_count) > 300]

# Drop duplicate contexts to ensure uniqueness
unique_contexts = filtered_df.drop_duplicates(subset='context')

# Select 20 different items
selected_items = unique_contexts['context'].head(20)

# Initialize a dictionary to store contexts and their keywords
context_keywords = {}

In [58]:
def get_keywords(passage, num_keywords=5):
    """Extract keywords using TF-IDF and return exactly num_keywords."""
    try:
        vectorizer = TfidfVectorizer(stop_words='english')
        tfidf_matrix = vectorizer.fit_transform([passage])
        feature_names = vectorizer.get_feature_names_out()
        tfidf_scores = tfidf_matrix.A.flatten()  # type: ignore
        word_scores = dict(zip(feature_names, tfidf_scores))
        sorted_words = sorted(word_scores.items(), key=lambda x: x[1], reverse=True)
        # Get the top num_keywords
        keywords = [word for word, _ in sorted_words[:num_keywords]]
        return keywords
    except Exception as e:
        print(f"Error extracting keywords: {e}")
        return []

In [60]:
for i, item in enumerate(selected_items, 1):
    keywords = get_keywords(item)
    context_keywords[item] = keywords

print("Collected Contexts and Keywords:")
for context, keywords in context_keywords.items():
    print(f"Context: {context[:200]}...")
    print(f"Keywords: {', '.join(keywords)}\n")

Collected Contexts and Keywords:
Context: On April 4, 2008, Beyoncé married Jay Z. She publicly revealed their marriage in a video montage at the listening party for her third studio album, I Am... Sasha Fierce, in Manhattan's Sony Club on Oc...
Keywords: video, awards, beyoncé, 2009, album

Context: Forbes magazine began reporting on Beyoncé's earnings in 2008, calculating that the $80 million earned between June 2007 to June 2008, for her music, tour, films and clothing line made her the world's...
Keywords: million, beyoncé, year, celebrity, list

Context: In the spring of 1834, Chopin attended the Lower Rhenish Music Festival in Aix-la-Chapelle with Hiller, and it was there that Chopin met Felix Mendelssohn. After the festival, the three visited Düssel...
Keywords: chopin, maria, mendelssohn, met, leipzig

Context: Van Praag states that the Ming court established diplomatic delegations with Tibet merely to secure urgently needed horses. Wang and Nyima argue that these were not dipl

Question Generation Model 1 (t5 - base)

In [63]:
def encode_question_generation_model1(context, answer):
    """Generate questions for a given context and answer using model1 with beam search."""
    answer_span = context.replace(answer, f"<hl>{answer}<hl>") + "</s>"
    inputs = qgmodel1_tokenizer(answer_span, return_tensors="pt")

    # Generate questions using beam search
    questions = qgmodel1.generate(
        input_ids=inputs.input_ids, 
        max_length=70, 
    )

    # Decode and return questions
    return [qgmodel1_tokenizer.decode(question, skip_special_tokens=True) for question in questions]

# Define the list to store all generated questions
question_generation_model1_hyp = []

# Loop through each context and its associated keywords
for context, keywords in context_keywords.items():
    print(f"Context: {context[:200]}...")  # Display a snippet of the context
    print("Generated Questions:")
    
    # Store questions generated for each context
    generated_questions = []

    # Generate questions for each keyword
    for keyword in keywords:
        try:
            # Generate 5 questions per keyword
            questions = encode_question_generation_model1(context, keyword)
            generated_questions.extend(questions)
            
            # Print and append questions for each keyword
            for i, question in enumerate(questions, 1):
                print(f"Q{i} for '{keyword}': {question}")
                # Append each question to the main list
                question_generation_model1_hyp.append(question)
                
        except Exception as e:
            print(f"Error generating questions for keyword '{keyword}': {e}")

    # Optional: Print or store all generated questions for this context
    print("\nAll Questions for this Context:")
    for i, question in enumerate(generated_questions, 1):
        print(f"{i}. {question}")

    print("\n" + "-" * 50 + "\n")  # Separator between contexts

# Display the collected questions
print("Collected Questions:")
for i, question in enumerate(question_generation_model1_hyp, 1):
    print(f"{i}. {question}")



Context: On April 4, 2008, Beyoncé married Jay Z. She publicly revealed their marriage in a video montage at the listening party for her third studio album, I Am... Sasha Fierce, in Manhattan's Sony Club on Oc...
Generated Questions:
Q1 for 'video': What type of video was the song "Single Ladies" montage based on?
Q1 for 'awards': What award did the video win at the 2009 MTV Video Music Awards?
Q1 for 'beyoncé': Beyonce's alter ego Sasha Fierce was released in November of 2008 in the US. The video for "Single Ladies" won several awards, including Video of the Year at the MTV Video Music Awards, and Video of the Year at the 2009 MTV Video Music Awards.
Q1 for '2009': In what year did Beyonce win Best Video at the MTV Video Music Awards?
Q1 for 'album': What was the name of Beyonce's third studio album?

All Questions for this Context:
1. What type of video was the song "Single Ladies" montage based on?
2. What award did the video win at the 2009 MTV Video Music Awards?
3. Beyonce's alte

Question Generation Model 2 (iarfmoose/t5-base-question-generator)

In [67]:
def encode_question_generation_model2(context, answer):
    """Generate multiple questions for a given context and answer using model2 with beam search."""
    input_text = f"Generate a question from the context: {context} Answer: {answer}"
        
    inputs = qgmodel3_tokenizer.encode(input_text, return_tensors="pt")

    # Generate questions using beam search with the specified number of return sequences
    outputs = qgmodel3.generate(
        inputs, 
        max_length=512,
        early_stopping=True
    )
    
    # Decode and return all generated questions
    questions = [qgmodel3_tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    return questions

# Define the list to store all generated questions for model2
question_generation_model2_hyp = []

# Loop through each context and its associated keywords
for context, keywords in context_keywords.items():
    print(f"Context: {context[:200]}...")  # Display a snippet of the context
    print("Generated Questions from Model2:")
    
    # Store questions generated for each context
    generated_questions = []

    # Generate questions for each keyword
    for keyword in keywords:
        try:
            # Generate 5 questions per keyword using Model 2
            questions = encode_question_generation_model2(context, keyword)
            generated_questions.extend(questions)
            
            # Print and append questions for each keyword
            for i, question in enumerate(questions, 1):
                print(f"Q{i} for '{keyword}': {question}")
                # Append each question to the model2 list
                question_generation_model2_hyp.append(question)
                
        except Exception as e:
            print(f"Error generating questions for keyword '{keyword}': {e}")

    # Optional: Print or store all generated questions for this context
    print("\nAll Questions for this Context using Model2:")
    for i, question in enumerate(generated_questions, 1):
        print(f"{i}. {question}")

    print("\n" + "-" * 50 + "\n")  # Separator between contexts

# Display the collected questions for model2
print("Collected Questions from Model2:")
for i, question in enumerate(question_generation_model2_hyp, 1):
    print(f"{i}. {question}")



Context: On April 4, 2008, Beyoncé married Jay Z. She publicly revealed their marriage in a video montage at the listening party for her third studio album, I Am... Sasha Fierce, in Manhattan's Sony Club on Oc...
Generated Questions from Model2:
Q1 for 'video': What was the title of Beyoncé's I Am... World Tour montage?
Q1 for 'awards': What did Beyoncé win on the I Am... World Tour?
Q1 for 'beyoncé': Who was the first female to win the Grammy for Best Female Video at the 2009 MTV Video Music Awards?
Q1 for '2009': When did Beyoncé begin her second world tour?
Q1 for 'album': What was the title of Beyoncé's 2009 tour?

All Questions for this Context using Model2:
1. What was the title of Beyoncé's I Am... World Tour montage?
2. What did Beyoncé win on the I Am... World Tour?
3. Who was the first female to win the Grammy for Best Female Video at the 2009 MTV Video Music Awards?
4. When did Beyoncé begin her second world tour?
5. What was the title of Beyoncé's 2009 tour?

--------------

Question Generation Model 3 (Sehong/t5-large-QuestionGeneration)

In [68]:
def encode_question_generation_model3(context, answer):
    """Generate multiple questions for a given context and answer using model3 with beam search."""
    input_text = f"question: context: {context} answer: {answer}"
    inputs = qgmodel3_tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)

    # Generate multiple questions using beam search
    outputs = qgmodel3.generate(
        inputs["input_ids"],
        max_length=50,
        early_stopping=True
    )

    # Decode and return all generated questions
    questions = [qgmodel3_tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    return questions

# Define the list to store all generated questions for model3
question_generation_model3_hyp = []

# Loop through each context and its associated keywords
for context, keywords in context_keywords.items():
    print(f"Context: {context[:200]}...")  # Display a snippet of the context
    print("Generated Questions from Model3:")
    
    # Store questions generated for each context
    generated_questions = []

    # Generate questions for each keyword
    for keyword in keywords:
        try:
            # Generate 5 questions per keyword using Model 3
            questions = encode_question_generation_model3(context, keyword)
            generated_questions.extend(questions)
            
            # Print and append questions for each keyword
            for i, question in enumerate(questions, 1):
                print(f"Q{i} for '{keyword}': {question}")
                # Append each question to the model3 list
                question_generation_model3_hyp.append(question)
                
        except Exception as e:
            print(f"Error generating questions for keyword '{keyword}': {e}")

    # Optional: Print or store all generated questions for this context
    print("\nAll Questions for this Context using Model3:")
    for i, question in enumerate(generated_questions, 1):
        print(f"{i}. {question}")

    print("\n" + "-" * 50 + "\n")  # Separator between contexts

# Display the collected questions for model3
print("Collected Questions from Model3:")
for i, question in enumerate(question_generation_model3_hyp, 1):
    print(f"{i}. {question}")



Context: On April 4, 2008, Beyoncé married Jay Z. She publicly revealed their marriage in a video montage at the listening party for her third studio album, I Am... Sasha Fierce, in Manhattan's Sony Club on Oc...
Generated Questions from Model3:
Q1 for 'video': question: How much did the I Am... World Tour gross?
Q1 for 'awards': question: How much did the I Am... World Tour gross?
Q1 for 'beyoncé': question: How much did the I Am... World Tour gross?
Q1 for '2009': question: How much did the I Am... World Tour gross?
Q1 for 'album': question: How much did the I Am... World Tour gross?

All Questions for this Context using Model3:
1. question: How much did the I Am... World Tour gross?
2. question: How much did the I Am... World Tour gross?
3. question: How much did the I Am... World Tour gross?
4. question: How much did the I Am... World Tour gross?
5. question: How much did the I Am... World Tour gross?

--------------------------------------------------

Context: Forbes magazine beg

In [71]:
print(question_generation_model1_hyp)
print(question_generation_model2_hyp)
print(question_generation_model3_hyp)

['What type of video was the song "Single Ladies" montage based on?', 'What award did the video win at the 2009 MTV Video Music Awards?', 'Beyonce\'s alter ego Sasha Fierce was released in November of 2008 in the US. The video for "Single Ladies" won several awards, including Video of the Year at the MTV Video Music Awards, and Video of the Year at the 2009 MTV Video Music Awards.', 'In what year did Beyonce win Best Video at the MTV Video Music Awards?', "What was the name of Beyonce's third studio album?", 'How much did Beyonce earn in the past year for her clothing line?', "Beyonce earned an estimated $115 million in June 2014 - June 2014, the highest earnings to date. Beyonce's net worth is estimated to be $250 million as of May 2015.", 'In what year did Beyonce earn the highest amount of money?', 'Beyonce was published as the fourth most-powerful what in the Forbes rankings?', 'What list did Beyonce rank at in 2010?', 'What was the name of the family Chopin proposed to in 1836?', 

Output Prediction of the Models

In [73]:
data = {
    "model1_output": question_generation_model1_hyp,
    "model2_output": question_generation_model2_hyp,
    "model3_output": question_generation_model3_hyp
}

# Specify the desired path components
path = ("..", "outputs", "predictions", "questiongenerationmodelsoutput.json")

# Extract the directory path (excluding the file name)
directory_path = os.path.join(*path[:-1])  # This creates the path up to the directory, excluding the file

# Create the directory if it doesn't exist
os.makedirs(directory_path, exist_ok=True)

# Complete file path to the JSON file
file_path = os.path.join(*path)

# Check if the file exists
if not os.path.exists(file_path):
    # Create the JSON file if it doesn't exist
    with open(file_path, "w") as f:
        json.dump(data, f, indent=4)
else:
    # Load existing data and update it
    with open(file_path, "r") as f:
        existing_data = json.load(f)

    # Update the existing data with the new data
    for key in data:
        existing_data[key] = data[key]

    # Save the updated data
    with open(file_path, "w") as f:
        json.dump(existing_data, f, indent=4)
