In [1]:
from datasets import load_dataset
import csv

# Build 'out-of-scope-dataset.csv': one statement per row with a binary label
# (0 = in-scope, 1 = out-of-scope).
squad_dataset = load_dataset('squad_v2')

# Collect contexts and questions from both the train and validation splits of SQuAD v2.
squad_contexts = [x['context'] for x in squad_dataset['train']] + [x['context'] for x in squad_dataset['validation']]
squad_questions = [x['question'] for x in squad_dataset['train']] + [x['question'] for x in squad_dataset['validation']]

# The csv module expects files opened with newline='' so that quoted fields containing
# line breaks are parsed/written correctly; the encoding is pinned so that the result
# does not depend on the platform's default locale.
with open('questions_answers_swinburne_monash.csv', newline='', encoding='utf-8') as csv_file, \
        open('out-of-scope-dataset.csv', 'w', newline='', encoding='utf-8') as new_csv_file:
    fieldnames = ['statement', 'label']
    writer = csv.DictWriter(new_csv_file, fieldnames=fieldnames)
    writer.writeheader()

    reader = csv.DictReader(csv_file)
    for row in reader:
        # FAQ questions are the in-scope examples.
        writer.writerow({'statement': row['question'], 'label': 0})
        # NOTE(review): FAQ answers are written as out-of-scope (1) — confirm this is intended.
        writer.writerow({'statement': row['answer'], 'label': 1})

    # Every SQuAD question is an out-of-scope example.
    writer.writerows({'statement': question, 'label': 1} for question in squad_questions)



Found cached dataset squad_v2 (/home/phung/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d)


  0%|          | 0/2 [00:00<?, ?it/s]

In [2]:

import torch
from datasets import Dataset
from transformers import AutoTokenizer

# Tokenizer matching the DistilBERT checkpoint that is fine-tuned further down;
# downloaded files are stored under the given cache_dir.
tokenizer = AutoTokenizer.from_pretrained(
    'distilbert-base-uncased',
    cache_dir="/mnt/external-ssd/cache_dir",
)

# Load the statement/label CSV as a Hugging Face Dataset.
faq_dataset = Dataset.from_csv(
    'out-of-scope-dataset.csv'
)

Found cached dataset csv (/home/phung/.cache/huggingface/datasets/csv/default-0ba080e82ee130e3/0.0.0)


In [3]:

def tokenize_faq(data):
    """Tokenize a batch of statements and hide the word "Swinburne" from attention.

    The attention mask of every sub-word token that belongs to an occurrence of
    "Swinburne" is set to 0. Special and padding tokens keep the mask produced by
    the tokenizer.

    The word to mask is located by character position and mapped to the
    tokenizer's own word index with ``char_to_word``. A whitespace split of the
    sentence must not be used for this: the tokenizer's word indices (as returned
    by ``word_ids``) also count punctuation as separate words, so the two index
    spaces drift apart as soon as punctuation precedes or touches the word.

    Args:
        data: Batch mapping with a "statement" entry holding a list of strings.

    Returns:
        The tokenizer's batch encoding (PyTorch tensors, padded to the longest
        statement in the batch, truncated to 512 tokens) with the adjusted
        "attention_mask" and an extra "word_ids" entry (one list per statement).

    Note:
        Relies on ``word_ids``/``char_to_word``, which are only provided by fast
        tokenizers.
    """
    word = "Swinburne"
    sentences = data["statement"]
    encoded = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Word index of every token, per statement (None for special/padding tokens).
    word_ids_per_sentence = [encoded.word_ids(batch_idx) for batch_idx in range(len(sentences))]

    # For each statement, collect the tokenizer word indices of all occurrences of the word.
    masked_words_per_sentence = []
    for batch_idx, sent in enumerate(sentences):
        masked_words = set()
        start = sent.find(word)
        while start != -1:
            word_id = encoded.char_to_word(batch_idx, start)
            # None means the occurrence has no token (e.g. it was truncated away).
            if word_id is not None:
                masked_words.add(word_id)
            start = sent.find(word, start + len(word))
        masked_words_per_sentence.append(masked_words)

    # 0 for tokens of the masked word, 1 everywhere else (including None word ids).
    custom_mask = torch.LongTensor([
        [wid not in masked_words for wid in wids]
        for wids, masked_words in zip(word_ids_per_sentence, masked_words_per_sentence)
    ])

    # Where the custom mask is 0 use it; otherwise keep the tokenizer's mask so
    # padding stays masked and special tokens stay visible.
    encoded["attention_mask"] = torch.where(custom_mask == 0, custom_mask, encoded["attention_mask"])

    encoded["word_ids"] = word_ids_per_sentence
    return encoded


# Run tokenize_faq over the whole dataset in batches, then derive a copy
# without the raw "statement" text column.
tokenized_faq_dataset = faq_dataset.map(
    function=tokenize_faq,
    batched=True,
)
tokenized_faq_dataset_without_statements = tokenized_faq_dataset.remove_columns(
    column_names="statement"
)

Map:   0%|          | 0/144748 [00:00<?, ? examples/s]

In [4]:
# Hold out 10% of the tokenized examples as the test split. The seed is fixed so
# that re-running the notebook reproduces the same train/test partition.
split_dataset = tokenized_faq_dataset_without_statements.train_test_split(test_size=0.1, seed=42)
print(split_dataset)
# Spot-check the first two tokenized rows (these still include the raw statement).
print(tokenized_faq_dataset[0])
print(tokenized_faq_dataset[1])


DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask', 'word_ids'],
        num_rows: 130273
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask', 'word_ids'],
        num_rows: 14475
    })
})
{'statement': 'What support can I expect?', 'label': 0, 'input_ids': [101, 2054, 2490, 2064, 1045, 5987, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask

In [5]:
# Class index -> human-readable name, matching the labels written to the CSV
# (0 for in-scope, 1 for out-of-scope).
id2label = {
    0: "in-scope",
    1: "out-of-scope",
}

# Reverse lookup: name -> class index.
label2id = {name: index for index, name in id2label.items()}

In [13]:
from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer

# Chunk length used by the commented-out group_texts code below; no active
# statement in this cell reads it.
chunk_size = 128


# def group_texts(data):
#     concatenated_examples = {k: sum(data[k], []) for k in data.keys()}
#     total_length = len(concatenated_examples[list(data.keys())[0]])
#     total_length = (total_length // chunk_size) * chunk_size
#     result = {
#         k: [t[i: i + chunk_size] for i in range(0, total_length, chunk_size)]
#         for k, t in concatenated_examples.items()
#     }
#     result["labels"] = result["input_ids"].copy()
#     return result
#
#
# lm_datasets = tokenized_faq_dataset.map(group_texts, batched=True)
# split_dataset = tokenized_faq_dataset.train_test_split(test_size=0.1)
# print(split_dataset)

# DistilBERT with a 2-way sequence-classification head (in-scope vs out-of-scope).
# The model is deliberately NOT moved with .to('cuda'): Trainer places the model on
# the device selected by TrainingArguments, so the cell also runs on machines
# without CUDA.
distilbert_model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
    id2label=id2label,
    label2id=label2id
)

# Pads each batch to the length of its longest example at collation time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training hyperparameters
training_args = TrainingArguments(
    output_dir="/mnt/external-ssd/Projects/models/faq_distilbert",
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    evaluation_strategy="epoch",  # evaluate on eval_dataset once per epoch
    save_total_limit=10,          # keep at most 10 checkpoints in output_dir
    learning_rate=2e-5,
)

# Trains on the 90% split and evaluates on the held-out 10% split.
trainer = Trainer(
    model=distilbert_model,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)


In [0]:
# Fine-tune the classifier: trains on split_dataset["train"] and evaluates on
# split_dataset["test"] once per epoch, as configured in training_args.

trainer.train()

In [14]:
import evaluate

# Score the fine-tuned model on the held-out split with the GLUE/MRPC metric.
metric = evaluate.load("glue", "mrpc", cache_dir="/mnt/external-ssd/cache_dir")

test_output = trainer.predict(split_dataset["test"])
# trainer.predict returns one row of logits per example; the metric expects
# class ids, so take the highest-scoring class.
predicted_labels = test_output.predictions.argmax(axis=-1)
# Use the label ids returned with the predictions: the dataset column is named
# "label", so indexing the dataset with "labels" would fail.
metric.compute(predictions=predicted_labels, references=test_output.label_ids)

In [4]:
from transformers import AutoModelForSequenceClassification, pipeline
# my_model = AutoModelForSequenceClassification.from_pretrained('/Volumes/PortableSSD/Projects/models/faqs_distilbert_classifier/checkpoint-161000', num_labels=2)

# inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
# outputs = my_model(**inputs)

# tokenizer.decode(outputs)
# Text-classification pipeline backed by a saved fine-tuning checkpoint and the
# notebook's tokenizer, followed by a one-word smoke test.
my_pipline = pipeline(
    task='text-classification',
    model='/mnt/external-ssd/Projects/models/faq_distilbert/checkpoint-40500',
    tokenizer=tokenizer,
)
print(my_pipline('Swinburne'))


[{'label': 'in-scope', 'score': 0.999913215637207}]
