In [2]:
from transformers import Trainer, TrainingArguments, DistilBertForMaskedLM, \
    DataCollatorForLanguageModeling, AutoTokenizer
from datasets import Dataset

# Tokenizer for the pretrained checkpoint named below.
PRETRAINED_CHECKPOINT = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)

# Read the CSV file into a Hugging Face Dataset.
FAQ_CSV_PATH = 'out-of-scope-dataset.csv'
faq_dataset = Dataset.from_csv(FAQ_CSV_PATH)


def tokenize_faq(data):
    """Tokenize one batch of FAQ questions (for ``Dataset.map(..., batched=True)``).

    Args:
        data: Batch mapping; its ``"question"`` entry is passed to the
            module-level ``tokenizer`` with ``truncation=True``.

    Returns:
        The tokenizer's output for the batch. When ``tokenizer.is_fast`` is
        true, a ``"word_ids"`` entry is added holding ``word_ids(i)`` for each
        example ``i`` in the batch.
    """
    encoded = tokenizer(data["question"], truncation=True)
    if not tokenizer.is_fast:
        # word_ids() is only queried when the tokenizer reports is_fast.
        return encoded
    batch_size = len(encoded["input_ids"])
    encoded["word_ids"] = list(map(encoded.word_ids, range(batch_size)))
    return encoded


# Run tokenize_faq over the dataset in batches, then discard the raw
# "question" column from the mapped result.
tokenized_faq_dataset = (
    faq_dataset
    .map(tokenize_faq, batched=True)
    .remove_columns(["question"])
)

Found cached dataset csv (/Users/phung/.cache/huggingface/datasets/csv/default-78f0e4fe8e0f0d99/0.0.0)
Loading cached processed dataset at /Users/phung/.cache/huggingface/datasets/csv/default-78f0e4fe8e0f0d99/0.0.0/cache-81de64e2750f8f61.arrow


In [21]:
# Hold out 10% of the examples for evaluation. The split is seeded so the
# same rows land in train/test on every run; without a seed each kernel
# restart reshuffles the partition, which makes evaluation numbers
# incomparable between runs and lets former test rows slip into training.
SPLIT_SEED = 42
split_dataset = tokenized_faq_dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)
print(split_dataset)
print(split_dataset['train'][0])

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask', 'word_ids'],
        num_rows: 129123
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask', 'word_ids'],
        num_rows: 14347
    })
})
{'label': 1, 'input_ids': [101, 2043, 2106, 1996, 8549, 24997, 12868, 5851, 1996, 2157, 2000, 2219, 2455, 1999, 1996, 5797, 3111, 1029, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'word_ids': [None, 0, 1, 2, 3, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 13, None]}


In [22]:
# Class index -> readable class name, plus the inverse lookup derived from it
# so the two mappings cannot drift apart.
id2label = dict(enumerate(("in-scope", "out-of-scope")))

label2id = {name: index for index, name in id2label.items()}

In [29]:
from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding

# NOTE(review): chunk_size is read only by the commented-out group_texts
# experiment below; nothing else visible in this notebook uses it.
chunk_size = 128


# Disabled experiment (kept for reference): concatenates every column of a
# batch, cuts the result into chunk_size-long pieces and copies input_ids into
# "labels". Nothing in the visible notebook calls it — TODO confirm it can be
# deleted together with chunk_size.
# def group_texts(data):
#     concatenated_examples = {k: sum(data[k], []) for k in data.keys()}
#     total_length = len(concatenated_examples[list(data.keys())[0]])
#     total_length = (total_length // chunk_size) * chunk_size
#     result = {
#         k: [t[i: i + chunk_size] for i in range(0, total_length, chunk_size)]
#         for k, t in concatenated_examples.items()
#     }
#     result["labels"] = result["input_ids"].copy()
#     return result
#
#
# lm_datasets = tokenized_faq_dataset.map(group_texts, batched=True)
# split_dataset = tokenized_faq_dataset.train_test_split(test_size=0.1)
# print(split_dataset)

def compute_metrics(eval_pred):
    """Report classification accuracy for one evaluation pass.

    Without this hook the Trainer only reports the validation loss, which
    does not show how many questions are actually classified correctly.

    Args:
        eval_pred: The ``(logits, labels)`` pair the Trainer passes to
            ``compute_metrics`` after running the evaluation set.

    Returns:
        ``{"accuracy": fraction of examples whose arg-max logit equals the label}``.
    """
    import numpy as np  # local import keeps this cell self-contained

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float(np.mean(predictions == labels))}


# Two-class sequence classifier initialised from the pretrained checkpoint;
# id2label / label2id are stored in the model config.
distilbert_model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
    id2label=id2label,
    label2id=label2id
)

# Collator built from the tokenizer; used by the Trainer to assemble batches.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training hyperparameters
training_args = TrainingArguments(
    output_dir="/Volumes/PortableSSD/Projects/models/faqs_distilbert_classifier",
    num_train_epochs=20,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="/Volumes/PortableSSD/Projects/models/faq_distilbert_logs",
    logging_steps=10,
    save_steps=500,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    use_mps_device=True
)

trainer = Trainer(
    model=distilbert_model,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    data_collator=data_collator,
    # Adds accuracy next to the validation loss in each evaluation.
    compute_metrics=compute_metrics,
    # tokenizer=tokenizer,
)


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classi

In [30]:
# Run the fine-tuning loop configured on `trainer`. training_args sets
# use_mps_device=True, so training is expected to run on torch's MPS device.

trainer.train()




Epoch,Training Loss,Validation Loss


In [16]:
from transformers import AutoModelForSequenceClassification, pipeline
# Inference smoke test: load a saved checkpoint through the
# text-classification pipeline, reusing the tokenizer created earlier in the
# notebook, and classify one sample question.
#
# Manual alternative (disabled): load the checkpoint with
# AutoModelForSequenceClassification.from_pretrained(<checkpoint>, num_labels=2)
# and call it on tokenizer("Hello, my dog is cute", return_tensors="pt").
checkpoint_dir = '/Volumes/PortableSSD/Projects/models/faqs_distilbert_classifier/checkpoint-161000'
my_pipline = pipeline(
    'text-classification',
    model=checkpoint_dir,
    tokenizer=tokenizer,
)
sample_prediction = my_pipline('What is the student fee?')
print(sample_prediction)


[{'label': 'out-of-scope', 'score': 1.0}]
