In [1]:
## Import packages
import torch
import time
import wandb
import numpy as np
from tqdm import tqdm

import evaluate
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification

In [2]:
## Start the Weights & Biases run that this notebook logs its metrics to.
wandb.init(
    project='first wandb trial',
    name='Full Data run 1',
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmohitydv09[0m ([33mmohitydv09-university-of-minnesota5275[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
## Run on the GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Device: ", device)

Device:  cuda


In [4]:
## Fetch every split of the MAGE dataset.
dataset = load_dataset("yaful/MAGE")

## Tokenizer matching the 'bert-base-cased' checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    'bert-base-cased',
    clean_up_tokenization_spaces=True,
)

## Tokenize the 'text' column of every split in batches, with truncation enabled.
dataset = dataset.map(
    lambda batch: tokenizer(batch['text'], truncation=True),
    batched=True,
)
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'src', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 319071
    })
    validation: Dataset({
        features: ['text', 'label', 'src', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 56792
    })
    test: Dataset({
        features: ['text', 'label', 'src', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 60743
    })
})

In [5]:
# ## Create subset of data for testing.
# from datasets import DatasetDict
# # Select the first 1000 training samples and the first 500 validation and test samples
# subset_train = dataset['train'].select(range(1000))
# subset_validation = dataset['validation'].select(range(500))
# subset_test = dataset['test'].select(range(500))

# # Combine them back into a DatasetDict
# dataset = DatasetDict({
#     'train': subset_train,
#     'validation': subset_validation,
#     'test': subset_test
# })
# dataset


In [6]:
## Collator that pads the tokenized examples when they are assembled into a batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Accuracy metric used by the evaluation function below.
accuracy = evaluate.load('accuracy')

def compute_metrics(eval_prediction):
    """Return the accuracy of the arg-max class predictions.

    `eval_prediction` is unpacked into a (scores, references) pair. The
    predicted class of each row is the index of its largest score along
    axis 1; the result is whatever `accuracy.compute` returns for those
    predictions against the references.
    """
    scores, references = eval_prediction
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

In [7]:
## Build the 'bert-base-cased' sequence classifier with a two-label head.
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-cased',
    num_labels=2,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Train the Model.

In [8]:
## Create a trainer class.
class CustomTrainer(Trainer):
    """Trainer whose inner loop is a hand-written train/evaluate epoch loop.

    The loop optimises with Adam and a per-epoch StepLR decay (gamma=0.9),
    logs per-batch and per-epoch metrics to Weights & Biases, and prints a
    summary after every epoch. It reads `num_train_epochs`, `learning_rate`
    and `device` from the training arguments; it does not reference
    `warmup_steps`, `weight_decay`, `logging_steps` or the evaluation / save /
    best-model options, and it writes no checkpoints itself.
    """

    def _inner_training_loop(
            self,
            batch_size = None,
            args = None,
            resume_from_checkpoint = None,
            trial = None,
            ignore_keys_for_eval = None
    ):
        """Train for `args.num_train_epochs` epochs, evaluating after each one.

        Args:
            batch_size: Accepted for signature compatibility; unused, because
                accuracies are normalised by the number of samples actually seen.
            args: Training arguments. Falls back to `self.args` when None.
            resume_from_checkpoint: Accepted for signature compatibility; unused.
            trial: Accepted for signature compatibility; unused.
            ignore_keys_for_eval: Accepted for signature compatibility; unused.

        Returns:
            None. Per-epoch metrics are kept in `self.epoch_history`, a dict with
            the keys 'train_loss', 'train_acc', 'eval_loss' and 'eval_acc'.
        """
        if args is None:
            args = self.args

        ## Use the trainer's own model and device rather than notebook globals,
        ## so the loop still works if those globals are renamed or rebound.
        model = self.model
        device = args.device
        model.to(device)  # no-op when the model already lives on `device`

        ## `num_train_epochs` may be a float; range() needs an int
        ## (a fractional part is dropped).
        number_of_epochs = int(args.num_train_epochs)
        start_time = time.time()

        self.epoch_history = {'train_loss': [], 'train_acc': [], 'eval_loss': [], 'eval_acc': []}

        criterion = torch.nn.CrossEntropyLoss()
        self.optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
        self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=1, gamma=0.9)

        train_dataloader = self.get_train_dataloader()
        eval_dataloader = self.get_eval_dataloader()

        for epoch in range(number_of_epochs):
            ## ---------------- Training pass ----------------
            ## Enable dropout etc.; the model is otherwise never switched to training mode.
            model.train()
            train_loss_sum = 0.0
            train_correct = 0
            train_samples = 0
            with tqdm(train_dataloader, unit = 'batch') as training_epoch:
                training_epoch.set_description(f"Training Epoch {epoch}")
                for step, inputs in enumerate(training_epoch):
                    inputs = inputs.to(device)
                    labels = inputs['labels']

                    ## Forward pass
                    self.optimizer.zero_grad()
                    logits = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])['logits']

                    ## Compute the loss, back-propagate and update the weights
                    loss = criterion(logits, labels)
                    loss.backward()
                    self.optimizer.step()

                    batch_loss = loss.item()
                    train_loss_sum += batch_loss
                    train_correct += (logits.argmax(1) == labels).sum().item()
                    ## Count real samples: the last batch may be smaller than the rest.
                    train_samples += labels.size(0)

                    ## NOTE: despite the key name, this value is the loss of the current batch.
                    wandb.log({"Train Loss per epoch":batch_loss, "Step":step, "Epoch":epoch})

            ## max(..., 1) keeps an empty dataloader from raising ZeroDivisionError.
            train_loss_per_epoch = train_loss_sum / max(len(train_dataloader), 1)
            train_acc_per_epoch = train_correct / max(train_samples, 1)
            wandb.log({"Train Loss":train_loss_per_epoch,
                       "Train Accuracy":train_acc_per_epoch,
                       "Epoch":epoch})

            ## Change the learning rate.
            self.scheduler.step()

            ## ---------------- Evaluation pass ----------------
            ## Disable dropout so evaluation is deterministic.
            model.eval()
            eval_loss_sum = 0.0
            eval_correct = 0
            eval_samples = 0
            with tqdm(eval_dataloader, unit='batch') as eval_epoch:
                eval_epoch.set_description(f"Evaluation Epoch {epoch}")
                with torch.no_grad():
                    for inputs in eval_epoch:
                        inputs = inputs.to(device)
                        labels = inputs['labels']

                        ## Forward pass
                        logits = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])['logits']

                        ## Accumulate loss and number of correct predictions
                        eval_loss_sum += criterion(logits, labels).item()
                        eval_correct += (logits.argmax(1) == labels).sum().item()
                        eval_samples += labels.size(0)

            eval_loss_per_epoch = eval_loss_sum / max(len(eval_dataloader), 1)
            eval_acc_per_epoch = eval_correct / max(eval_samples, 1)
            wandb.log({"Eval Loss": eval_loss_per_epoch,
                       "Eval Accuracy": eval_acc_per_epoch,
                       "Epoch": epoch})

            self.epoch_history['train_loss'].append(train_loss_per_epoch)
            self.epoch_history['train_acc'].append(train_acc_per_epoch)
            self.epoch_history['eval_loss'].append(eval_loss_per_epoch)
            self.epoch_history['eval_acc'].append(eval_acc_per_epoch)

            print(f'\tTrain Loss: {train_loss_per_epoch :.3f} | Train Acc: {train_acc_per_epoch*100:.2f}%')
            print(f'\tEval Loss: {eval_loss_per_epoch :.3f} | Eval Acc: {eval_acc_per_epoch*100:.2f}%')
        print(f'Time: {(time.time()-start_time)/60:.3f} minutes ')

In [9]:
## Hyper-parameters and bookkeeping options handed to the trainer.
## NOTE: CustomTrainer._inner_training_loop does not reference warmup_steps,
## weight_decay, logging_steps or the eval/save/best-model options itself.
training_args = TrainingArguments(
    # Schedule and batch sizes
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=1,
    weight_decay=0.01,
    # Output locations and logging cadence
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # Evaluation / checkpoint policy
    eval_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    greater_is_better=True,
)

## Wire the model, data splits, tokenizer, collator and metric into the trainer.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

## Note:
Use DistilBERT and increase the batch size to 64; think about which graphs to plot.
Use the [CLS] and [SEP] tokens in the model for the classification task.


In [10]:
## Always close the W&B run, even when training is interrupted or fails;
## otherwise the run stays open in this kernel.
try:
    trainer.train()
except BaseException:
    ## Non-zero exit code marks the run as failed instead of finished.
    wandb.finish(exit_code=1)
    raise
else:
    wandb.finish()

Training Epoch 0:   7%|▋         | 738/9971 [07:39<1:35:46,  1.61batch/s]


KeyboardInterrupt: 