In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Run on the GPU when torch reports one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Single source of truth for the checkpoint id, so the tokenizer and the
# model are always loaded from the same checkpoint.
MODEL_NAME = "humarin/chatgpt_paraphraser_on_T5_base"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# The model is moved to `device`; inputs must be moved there too before generation.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)

def paraphrase(
    question,
    num_beams=5,
    num_beam_groups=5,
    num_return_sequences=5,
    repetition_penalty=10.0,
    diversity_penalty=3.0,
    no_repeat_ngram_size=2,
    temperature=0.7,
    max_length=128
):
    """Generate paraphrases of ``question`` with the module-level model.

    The text is prefixed with ``paraphrase: ``, tokenized, moved to
    ``device`` and handed to ``model.generate``; the generated sequences are
    decoded back to text with special tokens removed.

    Args:
        question: Text to paraphrase.
        num_beams: Forwarded unchanged to ``model.generate``.
        num_beam_groups: Forwarded unchanged to ``model.generate``.
        num_return_sequences: Forwarded unchanged to ``model.generate``.
        repetition_penalty: Forwarded unchanged to ``model.generate``.
        diversity_penalty: Forwarded unchanged to ``model.generate``.
        no_repeat_ngram_size: Forwarded unchanged to ``model.generate``.
        temperature: Forwarded unchanged to ``model.generate``.
        max_length: Used twice: as the tokenizer's truncation limit for the
            prompt and as ``max_length`` for ``model.generate``.

    Returns:
        The value of ``tokenizer.batch_decode`` applied to the generated
        sequences.
    """
    prompt = f'paraphrase: {question}'

    # Tokenize first, then move the whole encoding to the model's device.
    encoded = tokenizer(
        prompt,
        return_tensors="pt",
        padding="longest",
        max_length=max_length,
        truncation=True,
    )
    prompt_ids = encoded.to(device).input_ids

    generation_options = dict(
        temperature=temperature,
        repetition_penalty=repetition_penalty,
        num_return_sequences=num_return_sequences,
        no_repeat_ngram_size=no_repeat_ngram_size,
        num_beams=num_beams,
        num_beam_groups=num_beam_groups,
        max_length=max_length,
        diversity_penalty=diversity_penalty,
    )
    generated = model.generate(prompt_ids, **generation_options)

    return tokenizer.batch_decode(generated, skip_special_tokens=True)


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]



Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Original mask tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
Subword indices of matching word tensor([[1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]])
Merged mask tensor([[1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]])


In [2]:
# text = 'What kind of support do you offer to your online students?'
# print(paraphrase(text))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [1]:
from utils import faqs

# Load the FAQ pairs. Only the Swinburne FAQs are active; the commented-out
# line below is the variant that also pulls in Monash and RMIT.
# faq_data = faqs.get_swinburne_faqs() + faqs.get_monash_faqs() + faqs.get_rmit_faqs()
faq_data = faqs.get_swinburne_faqs()
# Optional filter (disabled): drop pairs whose question or answer mentions Monash or RMIT.
# faq_data = [(q, a) for q, a in faq_data if "monash" not in q.lower() and "rmit" not in q.lower() and "monash" not in a.lower() and "rmit" not in a.lower()]
print(len(faq_data))
# Turn each (question, answer) pair into a record keyed by field name.
faq_dataset = [dict(question=q, answer=a) for q, a in faq_data]

171


In [5]:
# For every record, store the paraphrases of the question and of the answer
# under their own keys (question first, then answer).
for question_answer in faq_dataset:
    for source_key, target_key in (("question", "questions_set"), ("answer", "answers_set")):
        question_answer[target_key] = paraphrase(question_answer[source_key])

213


In [8]:
# Spot check: number of records, then the first record in full.
dataset_size = len(faq_dataset)
print(dataset_size)
first_record = faq_dataset[0]
print(first_record)

213
{'question': 'What support can I expect?', 'answer': 'As a Swinburne Online student, you’ll have support for extended hours, seven days a week, with Student Advisors available to help with anything from tech support to research advice and dedicated online tutors in each of your units. Learn more about your support .', 'questions_set': ['What aid can I receive?', 'Which assistance will be given to me?', 'What help I can get?', 'What kind of assistance can I receive?', 'what support I will be given?'], 'answers_set': ['Swinburne Online students have access to Student Advisors who are available for extended hours, seven days a week, offering various services from tech support to research advice and dedicated online tutors in each unit. Learn more about this service.', 'Swinburne Online students can count on Student Advisors to provide 24-hour support, which includes online tutoring in every unit and technical guidance for students. Learn more about this service.', 'Swinburne Online st

# Generate CSV from the resulting dataset
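You can run the cell above to see a sample of the generated data.
The next cell saves the question-answer pairs to a CSV file. Note that the code below writes `questions_answers_swinburne_only.csv` with the columns `prompt` and `completion`; it does not produce the `swinburne_paraphrased_faq.csv` file that this note originally described.
The `base_pair` column, which was meant to indicate the original question-answer pair, is likewise not written by the code below.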


In [4]:
import csv

# Export the original question/answer pairs as a two-column
# prompt/completion CSV.
# The encoding is pinned to UTF-8 because the FAQ text contains non-ASCII
# characters (for example the typographic apostrophe in "you’ll"); without it
# the file would be written in the platform's default encoding.
with open('questions_answers_swinburne_only.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['prompt', 'completion']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    # One output row per record: question -> prompt, answer -> completion.
    writer.writerows(
        {'prompt': record["question"], 'completion': record["answer"]}
        for record in faq_dataset
    )



In [4]:
import csv
# Copy the input CSV into a two-column prompt/completion file, dropping every
# row whose third column is marked 'X'.
# newline='' is passed on both files so the csv module handles line endings itself.
# NOTE(review): the encoding is left at the platform default because the
# input file's encoding is not visible from here — confirm and pin it.
with open('questions_answers_swinburne_monash.csv', 'r', newline='') as read_obj, \
        open('questions_answers_swinburne_only_monash_new_for_openai.csv', 'w', newline='') as write_obj:
    csv_reader = csv.reader(read_obj)
    csv_writer = csv.writer(write_obj)
    csv_writer.writerow(['prompt', 'completion'])
    # Skip the input header row; the default avoids StopIteration on an empty file.
    next(csv_reader, None)
    for row in csv_reader:
        # Blank lines are returned as empty lists; there is nothing to copy.
        if not row:
            continue
        # The third column is the exclusion marker. Rows without a third
        # column carry no marker and are kept.
        if len(row) > 2 and row[2] == 'X':
            continue
        # Write only the first two columns so each row matches the
        # two-column header (the marker column is not copied).
        csv_writer.writerow(row[:2])
