In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Checkpoint identifier shared by the tokenizer and the model loaded below.
MODEL_NAME = "humarin/chatgpt_paraphraser_on_T5_base"

# Use CUDA when torch reports it as available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the tokenizer and the seq2seq model from the same checkpoint, then move
# the model onto the selected device.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)

def paraphrase(
    question,
    num_beams=5,
    num_beam_groups=5,
    num_return_sequences=5,
    repetition_penalty=10.0,
    diversity_penalty=3.0,
    no_repeat_ngram_size=2,
    temperature=0.7,
    max_length=128
):
    """Generate paraphrases of ``question`` with the module-level model.

    The text is prefixed with ``paraphrase: ``, tokenized with the module-level
    ``tokenizer``, and passed to ``model.generate``; the generated sequences
    are decoded back to strings with special tokens stripped.

    Args:
        question: Text to paraphrase; it is interpolated into an f-string.
        num_beams: Forwarded to ``model.generate``.
        num_beam_groups: Forwarded to ``model.generate``.
        num_return_sequences: Forwarded to ``model.generate``.
        repetition_penalty: Forwarded to ``model.generate``.
        diversity_penalty: Forwarded to ``model.generate``.
        no_repeat_ngram_size: Forwarded to ``model.generate``.
        temperature: Forwarded to ``model.generate``.
        max_length: Used twice: as the truncation limit when tokenizing the
            prompt and as ``max_length`` for generation.

    Returns:
        The result of ``tokenizer.batch_decode`` on the generated sequences.
    """
    # NOTE(review): no sampling flag is passed to generate(), so `temperature`
    # may have no effect with beam search -- confirm against the installed
    # transformers version.
    prompt = f'paraphrase: {question}'
    encoded = tokenizer(
        prompt,
        return_tensors="pt",
        padding="longest",
        max_length=max_length,
        truncation=True,
    )
    # Only the token ids are handed to generate(); they must live on the same
    # device as the model.
    input_ids = encoded.to(device).input_ids

    generation_options = dict(
        temperature=temperature,
        repetition_penalty=repetition_penalty,
        num_return_sequences=num_return_sequences,
        no_repeat_ngram_size=no_repeat_ngram_size,
        num_beams=num_beams,
        num_beam_groups=num_beam_groups,
        max_length=max_length,
        diversity_penalty=diversity_penalty,
    )
    generated = model.generate(input_ids, **generation_options)

    return tokenizer.batch_decode(generated, skip_special_tokens=True)


Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [17]:
# text = 'What kind of support do you offer to your online students?'
# print(paraphrase(text))

In [3]:
from utils import swinburne_utils

# Fetch the scraped FAQs and reshape each (question, answer) pair into a dict
# so that later cells can attach extra keys to every entry.
faq_data = swinburne_utils.get_faqs()
faq_dataset = [
    dict(question=q, answer=a)
    for q, a in faq_data
]

In [5]:

# Horizontal rule printed between sections of the report.
separator = '-' * 100

# Attach paraphrased questions and answers to every FAQ entry (in place) and
# print the original pair followed by its paraphrased variants.
for entry in faq_dataset:
    entry["questions_set"] = paraphrase(entry["question"])
    entry["answers_set"] = paraphrase(entry["answer"])

    print(separator)
    print('Question:', entry["question"])
    print('Answer:', entry["answer"])
    print(separator)

    # Paraphrases are paired by position; zip stops at the shorter list.
    paired_paraphrases = zip(entry["questions_set"], entry["answers_set"])
    for alt_question, alt_answer in paired_paraphrases:
        print('Paraphrased Q:', alt_question)
        print('Paraphrased A:', alt_answer)
        print(separator)

# Show the first augmented entry as a quick sanity check.
print(faq_dataset[0])

----------------------------------------------------------------------------------------------------
Question: What support can I expect?
Answer: As a Swinburne Online student, you’ll have support for extended hours, seven days a week, with Student Advisors available to help with anything from tech support to research advice and dedicated online tutors in each of your units. Learn more aboutyour support.
----------------------------------------------------------------------------------------------------
Paraphrased Q: What aid can I receive?
Paraphrased A: Swinburne Online students have access to Student Advisors who are available for extended hours, providing all the necessary support they need from tech support to research advice and dedicated online tutors in each unit. Learn more about this.
----------------------------------------------------------------------------------------------------
Paraphrased Q: Which assistance will be given to me?
Paraphrased A: Whether you are a Swinbu

# Generate CSV from the resulting dataset
Run the cell above to see a sample of the generated data.
The next cell writes the paraphrased dataset to a CSV file named `swinburne_paraphrased_faq.csv`.
The `base_pair` column marks each original question-answer pair with `X`; it is left empty for paraphrased rows.


In [9]:
import csv

# Export the augmented dataset: each original FAQ pair is written first,
# immediately followed by its paraphrased variants.
#
# The encoding is set explicitly to UTF-8. Without it, open() falls back to the
# platform's locale encoding, and the FAQ text contains non-ASCII characters
# (e.g. typographic apostrophes) that may fail to encode or be mis-decoded by
# readers that expect UTF-8.
with open('swinburne_paraphrased_faq.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['question', 'answer', 'base_pair']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    for question_answer in faq_dataset:
        # 'X' in base_pair flags the original (non-paraphrased) pair.
        writer.writerow({
            'question': question_answer["question"],
            'answer': question_answer["answer"],
            'base_pair': 'X'
        })
        # Paraphrased questions and answers are paired by position; zip stops
        # at the shorter of the two lists. base_pair stays empty for these rows.
        for q, a in zip(question_answer["questions_set"], question_answer["answers_set"]):
            writer.writerow({
                'question': q,
                'answer': a,
                'base_pair': ''
            })

