# Out-of-Scope Testing Dataset Generator

## Webscraper
    Uses bs4 (BeautifulSoup) to extract question/answer pairs from the Swinburne Online FAQ page.

In [2]:
#Webscraper
#import libraries
import requests
from bs4 import BeautifulSoup

#get faq q a pairs from swinburne online faq
def get_faqs(url='https://www.swinburneonline.edu.au/faqs/', timeout=30):
    """Scrape question/answer pairs from the Swinburne Online FAQ page.

    Args:
        url: Address of the FAQ page to scrape. Defaults to the Swinburne
            Online FAQ page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ``(question, answer)`` tuples of whitespace-stripped text,
        one for every FAQ card that contains both a question element and an
        answer element.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without a timeout, requests waits forever on an unresponsive server.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx rather than parsing an error page into an
    # empty (and silently useless) result list.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    result = []
    # Every FAQ entry is a `.card` element nested inside a `.faqs-group`.
    for card in soup.select('.faqs-group .card'):
        question = card.select_one('.card-header h5 > div:nth-child(2)')
        answer = card.select_one('.card-body .content')
        # Skip cards that are missing either half of the pair.
        if question is None or answer is None:
            continue
        result.append((question.get_text(strip=True), answer.get_text(strip=True)))
    return result

## Paraphrase Generator, CSV formatter, and SQUAD adder
    Adds the web-scraped data to faq_data.csv. Uses BART and T5 models to generate 3 paraphrased versions of each question/answer pair and adds them to the file. Combines this with a set of around 400 question/answer pairs from the SQuAD validation dataset.

    

In [7]:
import csv
from transformers import BartForConditionalGeneration, BartTokenizer, AutoTokenizer, AutoModelForSeq2SeqLM


# Scrape the FAQ question/answer pairs that the rest of this cell works on.
faq_data = get_faqs()

# BART paraphraser: https://huggingface.co/eugenesiow/bart-paraphrase
bart_checkpoint = "eugenesiow/bart-paraphrase"
barttokenizer = BartTokenizer.from_pretrained(bart_checkpoint)
bartmodel = BartForConditionalGeneration.from_pretrained(bart_checkpoint)

# T5 paraphraser: https://huggingface.co/humarin/chatgpt_paraphraser_on_T5_base
# This model is explicitly placed on the CPU.
device = "cpu"
t5_checkpoint = "humarin/chatgpt_paraphraser_on_T5_base"
t5tokenizer = AutoTokenizer.from_pretrained(t5_checkpoint)
t5model = AutoModelForSeq2SeqLM.from_pretrained(t5_checkpoint)
t5model = t5model.to(device)

# "ram" paraphraser:
# https://huggingface.co/ramsrigouthamg/t5-large-paraphraser-diverse-high-quality
ram_checkpoint = "ramsrigouthamg/t5-large-paraphraser-diverse-high-quality"
ramtokenizer = AutoTokenizer.from_pretrained(ram_checkpoint)
rammodel = AutoModelForSeq2SeqLM.from_pretrained(ram_checkpoint)

def get_paraphrased_sentences(model, tokenizer, sentence, num_return_sequences=1, num_beams=10):
    """Paraphrase ``sentence`` with beam search and return the decoded text.

    Args:
        model: Object exposing ``generate(**kwargs)``.
        tokenizer: Callable that encodes a list of strings into keyword
            arguments for ``model.generate`` and that exposes
            ``batch_decode``.
        sentence: The text to paraphrase.
        num_return_sequences: How many paraphrases to request.
        num_beams: Beam width passed to ``model.generate``.

    Returns:
        A list of decoded, whitespace-stripped paraphrases.
    """
    # Encode the sentence as a one-element batch.
    encoded = tokenizer(
        [sentence],
        truncation=True,
        padding="longest",
        return_tensors="pt",
    )

    # Generated sequences are constrained to between 10 and 150 tokens.
    generation_settings = dict(
        min_length=10,
        max_length=150,
        num_beams=num_beams,
        num_return_sequences=num_return_sequences,
    )
    generated = model.generate(**encoded, **generation_settings)

    # Progress trace: show the size of what was just generated.
    print(generated.size())

    # Turn the generated token IDs back into text.
    decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)
    return list(map(str.strip, decoded))


#define the csv file path and name
csv_file_path = "faq_data.csv"

# Paraphrasers applied to every FAQ pair, in output order:
# (progress label, model, tokenizer, label prefix to strip from the output).
# Only the "ram" model's output has a label string removed before writing.
paraphrasers = [
    ("bart", bartmodel, barttokenizer, None),
    ("t5", t5model, t5tokenizer, None),
    ("ram", rammodel, ramtokenizer, "paraphrasedoutput: "),
]

#open the csv file for writing; the with-block closes it on exit, even on error
with open(csv_file_path, "w", newline="", encoding="utf-8") as csv_file:
    #create a csv writer object
    csv_writer = csv.writer(csv_file)

    #write the header row
    csv_writer.writerow(["Question", "Answer", "Label"])

    #write each scraped faq unchanged; label of 0 corresponds to in scope
    for question, answer in faq_data:
        csv_writer.writerow([question, answer, "0"])

    #running count of generated strings, printed as a progress indicator
    gencount = 0
    #generates and writes one paraphrased row per model for every faq
    for question, answer in faq_data:
        for name, model, tokenizer, output_prefix in paraphrasers:
            paraphrased_question = get_paraphrased_sentences(model, tokenizer, question, num_beams=10, num_return_sequences=1)[0]
            paraphrased_answer = get_paraphrased_sentences(model, tokenizer, answer, num_beams=10, num_return_sequences=1)[0]

            if output_prefix is not None:
                paraphrased_question = paraphrased_question.replace(output_prefix, "")
                paraphrased_answer = paraphrased_answer.replace(output_prefix, "")

            gencount += 2
            print(name)
            print(gencount)
            csv_writer.writerow([paraphrased_question, paraphrased_answer, "0"])

    #copies qa pairs from squad
    validation_squad_path = "validation-squad.csv"
    # newline="" lets the csv module handle newlines embedded in quoted fields.
    with open(validation_squad_path, "r", newline="", encoding="utf-8") as squad:
        csv_reader = csv.reader(squad)
        # skip the header row (the default avoids StopIteration on an empty file)
        next(csv_reader, None)

        for row in csv_reader:
            # Blank or truncated rows have no column 6; skip them instead of
            # crashing with IndexError after all the paraphrases were generated.
            if len(row) < 6:
                continue
            # Question is column 3 and answer is column 6 (0-based 2 and 5);
            # these rows are written with a label of "1".
            csv_writer.writerow([row[2], row[5], "1"])




torch.Size([1, 11])
torch.Size([1, 58])
bart
2
torch.Size([1, 11])
torch.Size([1, 51])
t5
4
torch.Size([1, 14])
torch.Size([1, 66])
ram
6
torch.Size([1, 13])
torch.Size([1, 64])
bart
8
torch.Size([1, 18])
torch.Size([1, 70])
t5
10
torch.Size([1, 21])
torch.Size([1, 51])
ram
12
torch.Size([1, 18])
torch.Size([1, 50])
bart
14
torch.Size([1, 21])
torch.Size([1, 46])
t5
16
torch.Size([1, 24])
torch.Size([1, 43])
ram
18
torch.Size([1, 15])
torch.Size([1, 65])
bart
20
torch.Size([1, 12])
torch.Size([1, 68])
t5
22
torch.Size([1, 18])
torch.Size([1, 64])
ram
24
torch.Size([1, 15])
torch.Size([1, 57])
bart
26
torch.Size([1, 23])
torch.Size([1, 52])
t5
28
torch.Size([1, 21])
torch.Size([1, 55])
ram
30
torch.Size([1, 11])
torch.Size([1, 54])
bart
32
torch.Size([1, 16])
torch.Size([1, 95])
t5
34
torch.Size([1, 22])
torch.Size([1, 74])
ram
36
torch.Size([1, 25])
torch.Size([1, 52])
bart
38
torch.Size([1, 31])
torch.Size([1, 54])
t5
40
torch.Size([1, 31])
torch.Size([1, 36])
ram
42
torch.Size([1, 12

## Dataset with Jokes as Out-of-Scope Examples

In [3]:
import csv
from transformers import BartForConditionalGeneration, BartTokenizer, AutoTokenizer, AutoModelForSeq2SeqLM


# Scrape the FAQ question/answer pairs used to build this dataset.
faq_data = get_faqs()

# Hugging Face checkpoints of the three paraphrasing models.
# https://huggingface.co/eugenesiow/bart-paraphrase
bart_checkpoint = "eugenesiow/bart-paraphrase"
# https://huggingface.co/humarin/chatgpt_paraphraser_on_T5_base
t5_checkpoint = "humarin/chatgpt_paraphraser_on_T5_base"
# https://huggingface.co/ramsrigouthamg/t5-large-paraphraser-diverse-high-quality
ram_checkpoint = "ramsrigouthamg/t5-large-paraphraser-diverse-high-quality"

# BART model and tokenizer.
barttokenizer = BartTokenizer.from_pretrained(bart_checkpoint)
bartmodel = BartForConditionalGeneration.from_pretrained(bart_checkpoint)

# T5 model and tokenizer; the model is explicitly placed on the CPU.
device = "cpu"
t5tokenizer = AutoTokenizer.from_pretrained(t5_checkpoint)
t5model = AutoModelForSeq2SeqLM.from_pretrained(t5_checkpoint).to(device)

# "ram" model and tokenizer.
ramtokenizer = AutoTokenizer.from_pretrained(ram_checkpoint)
rammodel = AutoModelForSeq2SeqLM.from_pretrained(ram_checkpoint)

def get_paraphrased_sentences(model, tokenizer, sentence, num_return_sequences=1, num_beams=10):
    """Generate paraphrases of ``sentence`` and return them as plain text.

    Args:
        model: Object exposing ``generate(**kwargs)``.
        tokenizer: Callable that encodes a list of strings into keyword
            arguments for ``model.generate`` and that exposes
            ``batch_decode``.
        sentence: The text to paraphrase.
        num_return_sequences: How many paraphrases to request.
        num_beams: Beam width passed to ``model.generate``.

    Returns:
        A list of decoded paraphrases with surrounding whitespace removed.
    """
    # The tokenizer receives a one-element batch holding the sentence.
    batch = tokenizer([sentence], truncation=True, padding="longest", return_tensors="pt")

    # Generated sequences are constrained to between 10 and 100 tokens.
    token_ids = model.generate(
        **batch,
        min_length=10,
        max_length=100,
        num_beams=num_beams,
        num_return_sequences=num_return_sequences,
    )

    # Progress trace: show the size of what was just generated.
    print(token_ids.size())

    # Decode the token IDs back to text and trim each result.
    paraphrases = []
    for text in tokenizer.batch_decode(token_ids, skip_special_tokens=True):
        paraphrases.append(text.strip())
    return paraphrases


#define the csv file path and name
# NOTE(review): this is the same output path the SQuAD cell writes to, so
# running this cell overwrites that file - confirm this is intended.
csv_file_path = "faq_data.csv"

# Paraphrasers applied to every FAQ pair, in output order:
# (progress label, model, tokenizer, label prefix to strip from the output).
# Only the "ram" model's output has a label string removed before writing.
paraphrasers = [
    ("bart", bartmodel, barttokenizer, None),
    ("t5", t5model, t5tokenizer, None),
    ("ram", rammodel, ramtokenizer, "paraphrasedoutput: "),
]

#open the csv file for writing; the with-block closes it on exit, even on error
with open(csv_file_path, "w", newline="", encoding="utf-8") as csv_file:
    #create a csv writer object
    csv_writer = csv.writer(csv_file)

    #write the header row
    csv_writer.writerow(["Question", "Answer", "Label"])

    #write each scraped faq unchanged; label of 0 corresponds to in scope
    for question, answer in faq_data:
        csv_writer.writerow([question, answer, "0"])

    #running count of generated strings, printed as a progress indicator
    gencount = 0
    #generates and writes one paraphrased row per model for every faq
    for question, answer in faq_data:
        for name, model, tokenizer, output_prefix in paraphrasers:
            paraphrased_question = get_paraphrased_sentences(model, tokenizer, question, num_beams=10, num_return_sequences=1)[0]
            paraphrased_answer = get_paraphrased_sentences(model, tokenizer, answer, num_beams=10, num_return_sequences=1)[0]

            if output_prefix is not None:
                paraphrased_question = paraphrased_question.replace(output_prefix, "")
                paraphrased_answer = paraphrased_answer.replace(output_prefix, "")

            gencount += 2
            print(name)
            print(gencount)
            csv_writer.writerow([paraphrased_question, paraphrased_answer, "0"])

    #copies question/answer pairs from the jokes dataset
    jokes_path = "scope_training_datasets/jokes.csv"
    # newline="" lets the csv module handle newlines embedded in quoted fields.
    with open(jokes_path, "r", newline="", encoding="utf-8") as jokes_file:
        csv_reader = csv.reader(jokes_file)
        # skip the header row (the default avoids StopIteration on an empty file)
        next(csv_reader, None)

        for row in csv_reader:
            # Blank or truncated rows have no second column; skip them instead
            # of crashing with IndexError after all the paraphrases were generated.
            if len(row) < 2:
                continue
            # Question is the first column and answer the second; these rows
            # are written with a label of "1".
            csv_writer.writerow([row[0], row[1], "1"])



torch.Size([1, 11])
torch.Size([1, 58])
bart
2
torch.Size([1, 11])
torch.Size([1, 51])
t5
4
torch.Size([1, 14])
torch.Size([1, 66])
ram
6
torch.Size([1, 13])
torch.Size([1, 64])
bart
8
torch.Size([1, 18])
torch.Size([1, 70])
t5
10
torch.Size([1, 21])
torch.Size([1, 51])
ram
12
torch.Size([1, 18])
torch.Size([1, 50])
bart
14
torch.Size([1, 21])
torch.Size([1, 46])
t5
16
torch.Size([1, 24])
torch.Size([1, 43])
ram
18
torch.Size([1, 15])
torch.Size([1, 65])
bart
20
torch.Size([1, 12])
torch.Size([1, 68])
t5
22
torch.Size([1, 18])
torch.Size([1, 64])
ram
24
torch.Size([1, 15])
torch.Size([1, 57])
bart
26
torch.Size([1, 23])
torch.Size([1, 52])
t5
28
torch.Size([1, 21])
torch.Size([1, 55])
ram
30
torch.Size([1, 11])
torch.Size([1, 54])
bart
32
torch.Size([1, 16])
torch.Size([1, 100])
t5
34
torch.Size([1, 22])
torch.Size([1, 74])
ram
36
torch.Size([1, 25])
torch.Size([1, 52])
bart
38
torch.Size([1, 31])
torch.Size([1, 54])
t5
40
torch.Size([1, 31])
torch.Size([1, 36])
ram
42
torch.Size([1, 1