In [2]:
import requests
from bs4 import BeautifulSoup


def get_faqs():
    response = requests.get('https://www.swinburneonline.edu.au/faqs/')
    soup = BeautifulSoup(response.content, 'html.parser')
    # get faq cards
    faqs_cards = soup.select('.faqs-group .card')
    result = []
    # loop through and get questions and answers
    for faq in faqs_cards:
        question_el = faq.select_one('.card-header h5 > div:nth-child(2)')
        answer_el = faq.select_one('.card-body .content')
        # add to result if question and answer exist
        if question_el and answer_el:
            question = question_el.get_text(strip=True)
            answer = answer_el.get_text(strip=True)
            result.append((question, answer))
    return result

In [27]:
from transformers import Trainer, TrainingArguments, DistilBertForMaskedLM, \
    DataCollatorForLanguageModeling, AutoTokenizer
from datasets import Dataset

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# Load and preprocess the scraped FAQs
faq_data = get_faqs()
faq_dataset = [{"question": q, "answer": a} for q, a in faq_data]

# Convert to Hugging Face Dataset
faq_dataset = Dataset.from_dict({"text": [f"{d['question']} {d['answer']}" for d in faq_dataset]})
print(faq_dataset)

def tokenize_faq(data):
    result = tokenizer(data["text"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


# Tokenize the dataset
tokenized_faq_dataset = faq_dataset.map(tokenize_faq, batched=True).remove_columns(["text"])
print(tokenized_faq_dataset)

chunk_size = 128


def group_texts(data):
    concatenated_examples = {k: sum(data[k], []) for k in data.keys()}
    total_length = len(concatenated_examples[list(data.keys())[0]])
    total_length = (total_length // chunk_size) * chunk_size
    result = {
        k: [t[i: i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result


lm_datasets = tokenized_faq_dataset.map(group_texts, batched=True)
split_dataset = tokenized_faq_dataset.train_test_split(test_size=0.1)
print(split_dataset)

distilbert_model = DistilBertForMaskedLM.from_pretrained("distilbert-base-uncased")

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

# Training hyperparameters
training_args = TrainingArguments(
    output_dir="./faq_distilbert",
    num_train_epochs=20,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./faq_distilbert_logs",
    logging_steps=10,
    save_steps=50,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
)

trainer = Trainer(
    model=distilbert_model,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

Dataset({
    features: ['text'],
    num_rows: 171
})


Map:   0%|          | 0/171 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'word_ids'],
    num_rows: 171
})


Map:   0%|          | 0/171 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 153
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 18
    })
})


In [None]:

# Fine-tuning
trainer.train()


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
