In [1]:
import os

# Restrict/reorder the GPUs visible to this process. Device 3 is listed first
# and devices 0-2 last. This assignment runs before the cell that imports torch.
_gpu_order = (3, 4, 5, 6, 7, 0, 1, 2)
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(gpu_id) for gpu_id in _gpu_order)

In [2]:
import torch

In [3]:
# Restore the complete pickled model object from disk.
# SECURITY: torch.load unpickles the file and can therefore execute arbitrary
# code; only load checkpoints that come from a trusted source.
# map_location="cpu" makes loading independent of the CUDA device the
# checkpoint was saved from (device numbering is remapped in this notebook via
# CUDA_VISIBLE_DEVICES); the Hugging Face Trainer places the model on its
# target device when it is constructed.
model = torch.load('bert-base-uncased_mrpc_1.0_completeModel.pth', map_location="cpu")

In [4]:
# Show the module tree of the loaded model (the cell's last expression is displayed).
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=128, bias=True)
              (key): Linear(in_features=768, out_features=128, bias=True)
              (value): Linear(in_features=768, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [5]:
# --- Libraries needed by the data-preparation steps in this cell ---
import torch
from datasets import load_dataset, load_from_disk, load_metric
from transformers import AutoTokenizer

# --- Run configuration ---
task = 'mrpc'
batch_size = 2

# "mnli-mm" is looked up under the "mnli" GLUE configuration; every other
# task name is used as-is.
if task == "mnli-mm":
    actual_task = "mnli"
else:
    actual_task = task

# GLUE metric object for the selected task; compute_metrics calls its
# .compute() method.
metric = load_metric('glue', actual_task)

# Maps each GLUE task name to the dataset column(s) that hold its text input,
# as (first_column, second_column). The second entry is None for tasks with a
# single text column.
_single_sentence = ("sentence", None)
_sentence_pair = ("sentence1", "sentence2")
_premise_hypothesis = ("premise", "hypothesis")

task_to_keys = dict([
    ("cola", _single_sentence),
    ("mnli", _premise_hypothesis),
    ("mnli-mm", _premise_hypothesis),
    ("mrpc", _sentence_pair),
    ("qnli", ("question", "sentence")),
    ("qqp", ("question1", "question2")),
    ("rte", _sentence_pair),
    ("sst2", _single_sentence),
    ("stsb", _sentence_pair),
    ("wnli", _sentence_pair),
])



from transformers import AutoTokenizer, DataCollatorWithPadding


import os

# Local folder that may hold a copy of the dataset saved with `save_to_disk`.
dataset_dir = '../dataset/' + task
##### Dataset loading from folder or downloading
print("Raw Dataset Loading")
# The original branches were swapped: the "from folder" branch downloaded with
# load_dataset, and the fallback called load_from_disk on a task name. Since
# dataset_dir is always a string, the `is not None` test alone never selected
# the fallback, so the folder's existence is checked as well.
raw_datasets = None
if dataset_dir is not None and os.path.isdir(dataset_dir):
    print("Loading dataset from folder")
    try:
        raw_datasets = load_from_disk(dataset_dir)
    except FileNotFoundError:
        # The folder exists but does not contain a saved Dataset/DatasetDict.
        print("Folder is not a saved dataset; falling back to download")
if raw_datasets is None:
    # No usable local copy: fetch the GLUE configuration for this task.
    raw_datasets = load_dataset("glue", actual_task)




import os

# Model identifier used when no local tokenizer folder is available.
checkpoint = "bert-base-uncased"


# Local folder expected to contain the tokenizer files for this task.
tokenizer_dir = '../checkpoint/' + task
##### Tokenizer Loading from folder or downloading
# tokenizer_dir is always a string, so the original `is not None` test could
# never reach the fallback; also check that the folder exists so a missing
# folder falls back to `checkpoint` instead of failing.
if tokenizer_dir is not None and os.path.isdir(tokenizer_dir):
    print("Loading Tokenizer from folder")
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
else:
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
print("Tokenizer Loaded")


# Text column name(s) for the selected task; the second is None when the task
# has a single text column.
sentence1_key, sentence2_key = task_to_keys[task]


def tokenize_function(examples):
    """Tokenize one batch of dataset examples.

    Reads the column named by ``sentence1_key`` and, when ``sentence2_key`` is
    not None, the column named by ``sentence2_key`` as the second text input.
    The tokenizer is called with ``padding='max_length'``, ``truncation=True``,
    ``max_length=512`` and ``return_tensors="pt"``.

    Args:
        examples: Batch mapping indexed by column name (as passed by
            ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer's output for the batch.
    """
    # NOTE(review): every example is padded to 512 tokens here, so the
    # DataCollatorWithPadding used later presumably has nothing left to pad
    # -- confirm whether dynamic padding was intended.
    text_columns = [examples[sentence1_key]]
    if sentence2_key is not None:
        text_columns.append(examples[sentence2_key])
    return tokenizer(
        *text_columns,
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )


# Tokenize every split in batches; the tokenizer outputs are stored as
# additional columns next to the original ones.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
# Return only these columns, as torch tensors, when the datasets are indexed.
tokenized_datasets.set_format('torch', columns=['input_ids', 'attention_mask', 'label', "token_type_ids"])

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, max_length=512)
print(tokenized_datasets)

train_dataset = tokenized_datasets['train']
# GLUE MNLI has no plain "validation" split: it provides "validation_matched"
# and "validation_mismatched". The original hard-coded 'validation' key fails
# for the "mnli" / "mnli-mm" tasks that this notebook otherwise supports.
if task == "mnli-mm":
    validation_key = "validation_mismatched"
elif task == "mnli":
    validation_key = "validation_matched"
else:
    validation_key = "validation"
eval_dataset = tokenized_datasets[validation_key]

  metric = load_metric('glue', actual_task)
2024-04-06 09:37:09.169841: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Raw Dataset Loading
Loading dataset from folder
Loading Tokenizer from folder
Tokenizer Loaded
DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})


In [6]:
 # Training configuration for the Trainer built further down in this cell.
print("Loading Training Arguments and Trainer")
from transformers import TrainingArguments

# Per-device train batch size of 1 with 8 gradient-accumulation steps,
# gradient checkpointing enabled, and evaluation at the end of every epoch.
_training_options = dict(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    gradient_checkpointing=True,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    num_train_epochs=3,  # total number of training epochs
    metric_for_best_model='accuracy',
)
# "test-trainer" is passed as the first positional argument.
training_args = TrainingArguments("test-trainer", **_training_options)

def compute_metrics(eval_pred):
    """Convert raw model outputs into metric values for the selected task.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` is
            indexed as a 2-D array (one row per example).

    Returns:
        The result of ``metric.compute`` for the derived predictions and the
        given labels.
    """
    # Imported locally: numpy is not imported in any visible cell of the
    # notebook, so the original body raised NameError on `np`.
    import numpy as np

    predictions, labels = eval_pred
    if task != "stsb":
        # Pick the index of the highest score in each row.
        predictions = np.argmax(predictions, axis=1)
    else:
        # "stsb": use the first output column directly (no argmax).
        predictions = predictions[:, 0]
    return metric.compute(predictions=predictions, references=labels)
from transformers import Trainer

# Bundle the loaded model, the training configuration, both data splits, the
# padding collator, the tokenizer and the metric callback into one Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)



Loading Training Arguments and Trainer


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [7]:
# Display the memory footprint reported by the model held by the trainer
# (presumably in bytes, per the Hugging Face API -- confirm).
trainer.model.get_memory_footprint()

199575372

In [9]:
# Baseline check: run the trainer's evaluation with the model as loaded (no
# trainer.train() call precedes this cell) and print the returned metrics.
print("Evaluating before fine-tuning")
result = trainer.evaluate()
print(result)

Evaluating before fine-tuning


RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Inspect the forward method bound to the attention module of the first
# encoder layer (displays the bound method; it is not called here).
trainer.model.bert.encoder.layer[0].attention.forward