# Collect Data

In [None]:
from utils import swinburne_utils

# Fetch the scraped FAQ pairs and reshape each pair into a question/answer record
faq_data = swinburne_utils.get_faqs()
faq_dataset = [
    {"question": question, "answer": answer}
    for question, answer in faq_data
]

In [None]:
# Debug aid: print the loaded FAQ records for a quick visual inspection
print(faq_dataset)

In [None]:
from transformers import Trainer, TrainingArguments, DistilBertForMaskedLM, \
    DataCollatorForLanguageModeling, AutoTokenizer
from datasets import Dataset

# Tokenizer for the 'distilbert-base-uncased' checkpoint
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# Flatten each FAQ record into a single "Question: ... Answer: ..." string,
# then wrap the strings in a one-column ("text") Hugging Face Dataset
faq_texts = [
    f"Question: {record['question']} Answer: {record['answer']}"
    for record in faq_dataset
]
faq_dataset = Dataset.from_dict({"text": faq_texts})


def tokenize_faq(data):
    """Tokenize the "text" column of a batch.

    When the module-level tokenizer is a fast tokenizer, a "word_ids" entry is
    added holding the word-id list of every example in the batch.
    """
    encoded = tokenizer(data["text"])
    if not tokenizer.is_fast:
        return encoded

    example_count = len(encoded["input_ids"])
    encoded["word_ids"] = [encoded.word_ids(idx) for idx in range(example_count)]
    return encoded


# Tokenize every row in batches, then drop the raw "text" column
tokenized_faq_dataset = faq_dataset.map(tokenize_faq, batched=True)
tokenized_faq_dataset = tokenized_faq_dataset.remove_columns(["text"])
print(tokenized_faq_dataset)

# Number of tokens in every chunk produced by group_texts
chunk_size = 128


def group_texts(data):
    """Concatenate a batch of tokenized examples and re-split it into fixed-size chunks.

    Args:
        data: Mapping of column name to a list of per-example lists (the batch
            handed over by ``Dataset.map(..., batched=True)``).

    Returns:
        A dict with the same columns, where each value is a list of
        ``chunk_size``-long slices of the flattened column, plus a ``"labels"``
        column that is a shallow copy of ``"input_ids"``. Any trailing
        remainder shorter than ``chunk_size`` is dropped.
    """
    # Function-scope import so the cell does not depend on a separate import cell.
    from itertools import chain

    # Flatten each column in linear time; sum(lists, []) re-copies the
    # accumulated list for every example and is quadratic in the batch size.
    concatenated_examples = {
        k: list(chain.from_iterable(data[k])) for k in data.keys()
    }
    # The length of the first column drives the chunking of all columns.
    total_length = len(next(iter(concatenated_examples.values()), []))
    # Round down to a whole number of chunks.
    total_length = (total_length // chunk_size) * chunk_size
    result = {
        k: [t[i: i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result


# Pack the tokenized FAQs into fixed-length chunks (with a "labels" column)
lm_datasets = tokenized_faq_dataset.map(group_texts, batched=True)
# Split the chunked dataset rather than the raw tokenized one; otherwise the
# grouping above is computed and then never used for training or evaluation.
split_dataset = lm_datasets.train_test_split(test_size=0.1)
print(split_dataset)

# Start from the pretrained "distilbert-base-uncased" masked-LM weights
distilbert_model = DistilBertForMaskedLM.from_pretrained("distilbert-base-uncased")

# Language-modeling collator built on the shared tokenizer, mlm_probability 0.15
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=0.15,
)

# Training hyperparameters, grouped by concern
training_args = TrainingArguments(
    # Where checkpoints are written, and how often
    output_dir="./faq_distilbert",
    save_steps=50,
    # Length of the run and batch sizes
    num_train_epochs=20,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Optimisation settings
    learning_rate=2e-5,
    warmup_steps=500,
    weight_decay=0.01,
    # Logging and evaluation cadence
    logging_dir="./faq_distilbert_logs",
    logging_steps=10,
    evaluation_strategy="epoch",
)

# Wire the model, data splits and collator into a single Trainer
trainer = Trainer(
    args=training_args,
    model=distilbert_model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
)

In [None]:

# Fine-tuning: run the training loop on the `trainer` object
trainer.train()


In [1]:
import csv
from datasets import Dataset

# Build a one-column ("text") dataset from the question/answer CSV.
# newline="" hands line-ending handling to the csv module, so quoted fields
# that contain embedded newlines are parsed as a single value.
with open('./questions_answers_swinburne_monash.csv', "r", encoding="utf-8", newline="") as csv_file:
    dataset_dict = {
        "text": [
            f"Question: {row['question']} Answer: {row['answer']}"
            for row in csv.DictReader(csv_file)
        ]
    }

faq_dataset = Dataset.from_dict(dataset_dict)

print(faq_dataset)


Dataset({
    features: ['text'],
    num_rows: 1278
})


In [2]:
# Show the first row to check the "Question: ... Answer: ..." formatting
print(faq_dataset[0])


{'text': 'Question: What support can I expect? Answer: As a Swinburne Online student, you’ll have support for extended hours, seven days a week, with Student Advisors available to help with anything from tech support to research advice and dedicated online tutors in each of your units. Learn more about your support .'}
