In [None]:
%pip install datasets transformers evaluate
%pip install torch

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.

In [2]:
!pip install accelerate



In [3]:
import pandas as pd
from pathlib import Path
from datasets import load_dataset, ClassLabel, DatasetDict
import torch
from transformers import AutoTokenizer, DataCollatorWithPadding, TrainingArguments, AutoModelForSequenceClassification, Trainer
import numpy as np
import evaluate


In [23]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"

QA_PATH = "datasets/truthfulqa_multiple_choice.csv"
MNLI_TRAIN_PATH = "datasets/mnli_train.csv"
MNLI_VAL_PATH = "datasets/mnli_validation_matched.csv"
MNLI_TEST_PATH = "datasets/mnli_test_matched.csv"

QA_CHECKPOINT = "distilbert/distilbert-base-uncased"
MNLI_CHECKPOINT = "distilbert/distilbert-base-uncased"
NUM_CLASSES = 3

In [None]:
# df = pd.read_csv(Path(DATA_PATH))

qa_ds = load_dataset("csv", data_files=QA_PATH)
# ds = ds.rename_columns({"body": "text", "subreddit": "label"})
# ds = ds.class_encode_column("label")

# split into .9 train, .1 test/dev
train_testdev = qa_ds["train"].train_test_split(seed=42, test_size=0.2)
# split .1 test/valid into .05 test, .05 valid
test_dev = train_testdev["test"].train_test_split(seed=42, test_size=0.5)

qa_ds = DatasetDict({
    "train": train_testdev["train"],
    "validation": test_dev["train"],
    "test": test_dev["test"]
})
print(qa_ds)


mnli_train_ds = load_dataset("csv", data_files=MNLI_TRAIN_PATH)
mnli_val_ds = load_dataset("csv", data_files=MNLI_VAL_PATH)
mnli_test_ds = load_dataset("csv", data_files=MNLI_TEST_PATH)

mnli_ds = DatasetDict({
    "train": mnli_train_ds["train"],
    "validation": mnli_val_ds["train"],
    "test": mnli_test_ds["train"]
})

print(mnli_ds)

DatasetDict({
    train: Dataset({
        features: ['question', 'mc1_targets', 'mc2_targets'],
        num_rows: 653
    })
    validation: Dataset({
        features: ['question', 'mc1_targets', 'mc2_targets'],
        num_rows: 82
    })
    test: Dataset({
        features: ['question', 'mc1_targets', 'mc2_targets'],
        num_rows: 82
    })
})
DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 392702
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9815
    })
    test: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9796
    })
})


In [24]:
current_vals = {
    "checkpoint": MNLI_CHECKPOINT,
    "ds": mnli_ds,
    
}

In [26]:
tokenizer = AutoTokenizer.from_pretrained(MNLI_CHECKPOINT)

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True)


def compute_metrics(eval_preds):
    metric = evaluate.combine([
        evaluate.load("accuracy"),
        evaluate.load("recall"),
        evaluate.load("precision"),
        evaluate.load("f1"),
    ])
    # metric = evaluate.load("f1")
    # metric = evaluate.load("accuracy", average="weighted")

    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels, average="weighted")


# print(total_dataset)
# print(list(set([title for title in total_dataset["train"]["label"]])))
# print(list(set([title for title in total_dataset["test"]["label"]])))
# print(list(set([title for title in total_dataset["dev"]["label"]])))

# idx2sub = {idx: total_dataset["train"].features["label"].int2str(idx) for idx in range(0,39)}
# sub2idx = {sub: idx for idx, sub in idx2sub.items()}
# print(idx2sub)
# print(sub2idx)



In [16]:
def preprocess_mnli(examples):
    # need to put the two sentences together into a text column
    examples["text"] = [f"{examples['premise'][i]} {examples['hypothesis'][i]}" for i in range(len(examples["premise"]))]
    # Tokenize the texts
    result = tokenize_function(examples)
    # Map the labels to their unique IDs
    # result["label"] = [0 if label == "entailment" else 1 if label == "neutral" else 2 for label in examples["label"]]
    return result

In [20]:
train = current_vals["ds"]["train"]
val = current_vals["ds"]["validation"]
test = current_vals["ds"]["test"]

tokenized_train = train.map(preprocess_mnli, batched=True)
tokenized_train.set_format(
    "pt", columns=["input_ids", "attention_mask"], output_all_columns=True
)
tokenized_val = val.map(preprocess_mnli, batched=True)
tokenized_val.set_format(
    "pt", columns=["input_ids", "attention_mask"], output_all_columns=True
)
tokenized_test = test.map(preprocess_mnli, batched=True)
tokenized_test.set_format(
    "pt", columns=["input_ids", "attention_mask"], output_all_columns=True
)


Map: 100%|██████████| 392702/392702 [01:33<00:00, 4179.64 examples/s]
Map: 100%|██████████| 9815/9815 [00:02<00:00, 4111.36 examples/s]
Map: 100%|██████████| 9796/9796 [00:02<00:00, 4250.70 examples/s]


In [21]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [25]:

mnli_model = AutoModelForSequenceClassification.from_pretrained(
    current_vals["checkpoint"], num_labels=NUM_CLASSES
).to(device)


Some weights of the model checkpoint at distilbert/distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifie

In [30]:
# TODO: Hyperparameter finetuning
training_args = TrainingArguments(
    output_dir="mnli_models",
    report_to="none",
    # eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    # metric_for_best_model="accuracy",
    # load_best_model_at_end=True,
    learning_rate=2e-5,
    # fp16=True,
)

In [None]:
trainer = Trainer(
    mnli_model,
    training_args,
    train_dataset=tokenized_train, #.shuffle(seed=42).select(range(1000)),
    eval_dataset=tokenized_val, #.shuffle(seed=42).select(range(1000)),
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

TypeError: Trainer.__init__() got an unexpected keyword argument 'processing_class'

: 

In [10]:
!nvidia-smi


Tue Dec 10 00:07:23 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   31C    P0              48W / 400W |    891MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [68]:
torch.cuda.empty_cache()
model.to("cpu")
torch.cuda.empty_cache()
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [69]:
trainer.train()

Epoch,Training Loss,Validation Loss,Recall,Precision,F1
1,0.9941,2.124421,0.49686,0.503007,0.497751
2,1.4547,1.812721,0.52194,0.519596,0.518553
3,1.2879,1.859589,0.52256,0.521503,0.520926


TrainOutput(global_step=42189, training_loss=1.2512588661922808, metrics={'train_runtime': 7470.4943, 'train_samples_per_second': 361.422, 'train_steps_per_second': 5.647, 'total_flos': 5.024751329093299e+17, 'train_loss': 1.2512588661922808, 'epoch': 3.0})

In [70]:
trainer.evaluate() # using the dev set

{'eval_loss': 1.8127211332321167,
 'eval_recall': 0.52194,
 'eval_precision': 0.5195962262665735,
 'eval_f1': 0.5185528316521014,
 'eval_runtime': 52.1212,
 'eval_samples_per_second': 959.302,
 'eval_steps_per_second': 15.003,
 'epoch': 3.0}

In [None]:
# TODO: evaluate final performance on test set
# predictions = trainer.predict(tokenized_test)