# Out-of-Scope Testing Dataset Generator

## Webscraper
    uses bs4 to extract qa pairs from the swinburne online faq page.

In [5]:
#Webscraper
#import libraries
import requests
from bs4 import BeautifulSoup

#get faq q a pairs from swinburne online faq
def get_faqs():
    #sends http request to swinburne online faq webpage to create Beautiful Soup object
    response = requests.get('https://www.swinburneonline.edu.au/faqs/')
    soup = BeautifulSoup(response.content, 'html.parser')
    #grab all elements of class faqs-group and card and assign them to faq cards.
    faqs_cards = soup.select('.faqs-group .card')
    result = []
    #gets questions and answers from faq cards and places them into result list
    for faq in faqs_cards:
        question = faq.select_one('.card-header h5 > div:nth-child(2)')
        answer = faq.select_one('.card-body .content')
        # add to result if question and answer exist
        if question and answer:
            question = question.get_text(strip=True)
            answer = answer.get_text(strip=True)
            result.append((question, answer))
    return result

## Paraphrase Generator, CSV formatter, and SQUAD adder
    adds webscraped data to faq_data.csv. uses humarin gpt-T5 model to generate 4 unique iterations of each qa pair and adds them to the file. combines this with a set of around 120 qa pairs from the squad validation dataset.

    

In [9]:
import csv
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import random

#get the faq data
faq_data = get_faqs()

# Load the human model for paraphrasing
model_name = "humarin/chatgpt_paraphraser_on_T5_base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

def get_paraphrased_sentences(model, tokenizer, sentence, num_return_sequences=1, num_beams=10):
    # tokenize the text to be form of a list of token IDs
    inputs = tokenizer([sentence], truncation=True, padding="longest", return_tensors="pt")
    # generate the paraphrased sentences
    outputs = model.generate(
        **inputs,
        max_length=150,
        num_beams=num_beams,
        num_return_sequences=num_return_sequences,
        temperature = 1.4
    )

    # decode the generated sentences using the tokenizer to get them back to text
    paraphrased_sentences = [s.strip() for s in tokenizer.batch_decode(outputs, skip_special_tokens=True)]

    # remove duplicate sentences
    unique_sentences = list(set(paraphrased_sentences))

    return unique_sentences


#define the csv file path and name
csv_file_path = "faq_data.csv"


#open the csv file for writing
with open(csv_file_path, "w", newline="", encoding="utf-8") as csv_file:
    #create a csv writer object
    csv_writer = csv.writer(csv_file)

    #write the header row
    csv_writer.writerow(["Question", "Answer", "Label"])


    #write each faq as a row in the csv file and add label
    for i, faq in enumerate(faq_data):
        question, answer = faq

        #label of 0 corresponds to in scope
        label = "0"
        csv_writer.writerow([question, answer, label])

        # Generate 4 unique paraphrased versions of the question and answer
        paraphrased_questions = set()
        paraphrased_answers = set()
        num_paraphrased = 0
        while num_paraphrased < 4:
            # Generate a paraphrased version of the question
            paraphrased_question = get_paraphrased_sentences(model, tokenizer, question, num_beams=5, num_return_sequences=1)[0]
            if paraphrased_question not in paraphrased_questions:
                paraphrased_questions.add(paraphrased_question)
                num_paraphrased += 1
                print(num_paraphrased)
            
            # Generate a paraphrased version of the answer
            paraphrased_answer = get_paraphrased_sentences(model, tokenizer, answer, num_beams=5, num_return_sequences=1)[0]
            if paraphrased_answer not in paraphrased_answers:
                paraphrased_answers.add(paraphrased_answer)
                num_paraphrased += 1
                print(num_paraphrased)

            # Write the paraphrased versions to the CSV file
            csv_writer.writerow([paraphrased_question, paraphrased_answer, label])
            print(f"Added question: {paraphrased_question}, answer: {paraphrased_answer}")
    
    
    #copies 80 random qa pairs from squad
    validation_squad_path = "validation-squad.csv"
    with open(validation_squad_path, newline='') as squad:
        csv_reader = csv.reader(squad)
        next(csv_reader)  # skip the header row
        # Randomly select 80 rows from the dataset
        selected_rows = random.sample(list(csv_reader), 120)
        
        # Write the question-answer pair to the existing CSV file with a label of "1"
        for selected_row in selected_rows:
            csv_writer.writerow([selected_row[2], selected_row[5], "1"])

    
    csv_file.close()




1
2
Added question: What kind of support can I expect?, answer: Swinburne Online students can count on Student Advisors to provide 24-hour support, 24-hour support, and access to online tutors for each unit, research advice, and technology support. Learn more about Swinburne Online.
Added question: What kind of support can I expect?, answer: Swinburne Online students can count on Student Advisors to provide 24-hour support, 24-hour support, and access to online tutors for each unit, research advice, and technology support. Learn more about Swinburne Online.
Added question: What kind of support can I expect?, answer: Swinburne Online students can count on Student Advisors to provide 24-hour support, 24-hour support, and access to online tutors for each unit, research advice, and technology support. Learn more about Swinburne Online.
Added question: What kind of support can I expect?, answer: Swinburne Online students can count on Student Advisors to provide 24-hour support, 24-hour supp

KeyboardInterrupt: 