# Providing Custom Inputs to BertClassifier

In [50]:
# import argparse
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModel, TrainingArguments, Trainer
from transformers.trainer_utils import set_seed
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from utils import *

import torch
import torch.nn as nn
from transformers import TrainingArguments, Trainer
import transformers.trainer
from transformers.trainer import *
from transformers.modeling_outputs import SequenceClassifierOutput

# Device string handed to the PET head in model_init(); falls back to CPU when
# no CUDA device is visible to torch.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [51]:
def extractEuphIdx(tokenizer, input):
    """Return the positions strictly between the start and end euphemism markers.

    The marker ids are derived from the tokenizer size: the start marker is
    assumed to be ``len(tokenizer) - 2`` and the end marker ``len(tokenizer) - 1``
    (i.e. the two markers were the last tokens added to the tokenizer).

    Args:
        tokenizer: any object supporting ``len()``; only its size is used.
        input: 1-D sequence of token ids (a tensor or a plain list of numbers).

    Returns:
        A list of integer positions between the first start marker and the
        first end marker that follows it. The list is empty when either marker
        is absent (for example, cut off by truncation) or nothing lies between
        them.
    """
    token_ids = torch.as_tensor(input)
    start_token_id = len(tokenizer) - 2
    end_token_id = start_token_id + 1

    # flatten() (rather than squeeze()) keeps a 1-D tensor whether there are
    # zero, one or several matches, so the cases below can be told apart.
    start_positions = (token_ids == start_token_id).nonzero().flatten()
    end_positions = (token_ids == end_token_id).nonzero().flatten()
    if start_positions.numel() == 0 or end_positions.numel() == 0:
        return []

    start_idx = int(start_positions[0])
    # Only an end marker located after the start marker can close the span.
    closing_positions = end_positions[end_positions > start_idx]
    if closing_positions.numel() == 0:
        return []

    end_idx = int(closing_positions[0])
    return list(range(start_idx + 1, end_idx))

class PET_layer(nn.Module):
    """Classification head that pools the hidden states of the marked span.

    For every example the hidden vectors at the positions returned by
    ``extractEuphIdx`` are summed, then passed through
    ``linear1 -> dropout -> linear2`` to produce two logits.

    Args:
        tokenizer: tokenizer forwarded to ``extractEuphIdx`` to locate the span.
        pet_dim: size of the last dimension of the hidden states.
        device: kept as ``self.device`` for backward compatibility; the forward
            pass no longer relies on it (see ``forward``).
    """

    def __init__(self, tokenizer, pet_dim, device):
        super().__init__()
        self.tokenizer = tokenizer
        self.pet_dim = pet_dim
        self.device = device
        self.linear1 = nn.Linear(pet_dim, pet_dim)
        self.dropout = nn.Dropout(0.1)
        self.linear2 = nn.Linear(pet_dim, 2)

    def forward(self, inputs, input_ids):
        """Sum the span's hidden vectors per example and classify the result.

        Args:
            inputs: hidden states indexed as ``inputs[example][position]``; the
                last dimension must equal ``pet_dim``.
            input_ids: token ids, one row per example, aligned with ``inputs``.

        Returns:
            Tensor with one row of 2 logits per example.
        """
        # Allocate next to the hidden states instead of on the device string
        # captured at construction time, so the head keeps working when the
        # model or its inputs live on a different device than `self.device`.
        pooled = torch.zeros(inputs.shape[0], inputs.shape[-1], device=inputs.device)
        for row in range(input_ids.shape[0]):
            for position in extractEuphIdx(self.tokenizer, input_ids[row]):
                pooled[row] += inputs[row][position]
        return self.linear2(self.dropout(self.linear1(pooled)))

## Data Prep

In [58]:
train_path = 'shared_task_train.csv'
test_path = 'shared_task_test.csv'


def _load_and_mark(csv_path):
    """Read one split, clean its utterances and rewrite the span markers.

    The 'index' column is dropped, every utterance is passed through `clean`
    (removes the @@@ stuff), and the angle brackets around the target phrase
    are replaced with the [START_EUPH] / [END_EUPH] marker tokens.
    """
    frame = pd.read_csv(csv_path).drop(['index'], axis=1)
    frame['utterance'] = (
        frame['utterance']
        .map(clean)
        .map(lambda text: text.replace("<", "[START_EUPH] ").replace(">", " [END_EUPH]"))
    )
    return frame


# No separate validation split is prepared here; only train and test are used.
df_train = _load_and_mark(train_path)
df_test = _load_and_mark(test_path)

# index=False keeps the pandas row index out of the file; without it the CSV
# gains a nameless first column that is read back as "Unnamed: 0".
df_train.to_csv("shared_task_train_processed.csv", index=False)
df_test.to_csv("shared_task_test_processed.csv", index=False)

In [60]:
# Load the processed CSVs through `load_dataset` (instead of
# Dataset.from_pandas); train first, then test.
train_dataset, test_dataset = (
    load_dataset('csv', data_files=processed_csv)
    for processed_csv in (
        'shared_task_train_processed.csv',
        'shared_task_test_processed.csv',
    )
)

Using custom data configuration default-2a13dedf8230a926


Downloading and preparing dataset csv/default to /home/leep/.cache/huggingface/datasets/csv/default-2a13dedf8230a926/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /home/leep/.cache/huggingface/datasets/csv/default-2a13dedf8230a926/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-5294fd14ceb3fac4


Downloading and preparing dataset csv/default to /home/leep/.cache/huggingface/datasets/csv/default-5294fd14ceb3fac4/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /home/leep/.cache/huggingface/datasets/csv/default-5294fd14ceb3fac4/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

## Model Prep

In [61]:
tokenizer = AutoTokenizer.from_pretrained('roberta-large')
# Register the span markers as special tokens so each stays a single token.
special_tokens_dict = {'additional_special_tokens': ['[START_EUPH]','[END_EUPH]']}
tokenizer.add_special_tokens(special_tokens_dict)


def _tokenize_utterances(batch):
    """Tokenize a batch of utterances, padded/truncated to 256 tokens."""
    return tokenizer(
        batch['utterance'],
        max_length=256,
        padding="max_length",
        truncation=True,
    )


# No validation split is tokenized; only train and test exist in this notebook.
train_tokenized = train_dataset.map(
    _tokenize_utterances, batched=True, load_from_cache_file=False
)
test_tokenized = test_dataset.map(
    _tokenize_utterances, batched=True, load_from_cache_file=False
)

# NOTE(review): metric_for_best_model is not set, so with
# load_best_model_at_end=True the "best" checkpoint is presumably chosen by
# evaluation loss (the documented default) - confirm that is intended.
trainer_args = TrainingArguments(
    output_dir="RoBERTa-large-PET",
    seed=111,
    learning_rate=5e-6,
    num_train_epochs=10,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="epoch",
    save_strategy='epoch',
    load_best_model_at_end=True,
)

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /home/leep/.cache/huggingface/hub/models--roberta-large/snapshots/5069d8a2a32a7df4c69ef9b56348be04152a2341/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-large",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from cache at /home/leep/.cache/huggi

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [62]:
def model_init():
    """Build a fresh roberta-large encoder with the PET classification head.

    The embedding matrix is resized to cover the marker tokens added to the
    module-level `tokenizer`, the pooler is replaced with an identity, and a
    `PET_layer` is attached as `model.pet`. Only the PET head is wired up
    here; no `cls_layer` is created.

    Returns:
        The encoder with the extra `pet` submodule.
    """
    model = AutoModel.from_pretrained("roberta-large")
    model.resize_token_embeddings(len(tokenizer))
    model.pooler = nn.Identity()
    # Read the head width from the checkpoint's config rather than hard-coding
    # it, so the head always matches the encoder's hidden size.
    model.pet = PET_layer(tokenizer, model.config.hidden_size, device)
    return model

def compute_metrics(p):
    """Turn (scores, labels) into accuracy and macro precision/recall/F1.

    Args:
        p: a pair that unpacks into the per-class scores and the gold labels.

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    scores, gold = p
    predicted = np.argmax(scores, axis=1)
    macro_kwargs = {"y_true": gold, "y_pred": predicted, "average": "macro"}
    return {
        "accuracy": accuracy_score(y_true=gold, y_pred=predicted),
        "precision": precision_score(**macro_kwargs),
        "recall": recall_score(**macro_kwargs),
        "f1": f1_score(**macro_kwargs),
    }

In [63]:
# The objects produced by load_dataset(...).map(...) above are keyed by split
# name rather than being the tokenized rows themselves. Each CSV was loaded on
# its own, so its rows sit under the 'train' key - including the test file.
# The Trainer needs the rows directly, so unwrap that key here.
# NOTE(review): these names are rebound in place, so re-running this cell
# without re-running the tokenization cell will presumably fail.
train_tokenized = train_tokenized['train']
test_tokenized = test_tokenized['train']

In [64]:
class MyTrainer(Trainer):
    """Trainer that classifies with the custom head attached to the encoder.

    The wrapped model is a bare encoder, so the loss cannot come from the
    model's own forward pass. Both the training loss and the evaluation logits
    are computed by running the encoder and then the `pet` head (or, failing
    that, a `cls_layer` head) found on the model or on `model.module`.
    Labels are always read from `inputs['labels']`.
    """

    @staticmethod
    def _find_head_owner(model, head_name):
        """Return `model` or `model.module`, whichever has `head_name`, else None."""
        for candidate in (model, getattr(model, "module", None)):
            if candidate is not None and hasattr(candidate, head_name):
                return candidate
        return None

    def _classification_logits(self, model, inputs):
        """Run the encoder and the attached classification head.

        Raises:
            AttributeError: if neither a `pet` nor a `cls_layer` head exists.
        """
        encoded = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])

        # Look the head up explicitly instead of probing with try/except, so an
        # error raised inside the head surfaces as itself rather than being
        # replaced by an unrelated AttributeError from a fallback attempt.
        owner = self._find_head_owner(model, "pet")
        if owner is not None:
            return owner.pet(encoded['last_hidden_state'], inputs['input_ids'])

        owner = self._find_head_owner(model, "cls_layer")
        if owner is not None:
            return owner.cls_layer(encoded['pooler_output'])

        raise AttributeError("model has neither a `pet` nor a `cls_layer` classification head")

    def compute_loss(self, model, inputs, return_outputs=False):
        """Compute the classification loss for one batch.

        Args:
            model: the encoder (possibly wrapped, exposing the real model as
                `.module`).
            inputs: dict with 'input_ids', 'attention_mask' and 'labels'.
            return_outputs: when True, also return the model outputs.

        Returns:
            `loss`, or `(loss, outputs)` where `outputs` is a
            `SequenceClassifierOutput` holding the cross-entropy loss and the
            logits.

        Raises:
            KeyError: if `inputs` has no 'labels' entry.
        """
        # Read (do not pop) the labels: the encoder is called with explicit
        # keyword arguments, so leaving 'labels' in `inputs` is harmless, and
        # prediction_step still needs them afterwards.
        labels = inputs['labels']

        logits = self._classification_logits(model, inputs)
        loss = nn.CrossEntropyLoss()(logits, labels)
        outputs = SequenceClassifierOutput(loss=loss, logits=logits)

        # Save past state if it exists
        if self.args.past_index >= 0:
            self._past = outputs[self.args.past_index]

        # With label smoothing enabled the smoothed loss replaces plain
        # cross-entropy as the returned loss.
        if self.label_smoother is not None:
            loss = self.label_smoother(outputs, labels)

        return (loss, outputs) if return_outputs else loss

    def prediction_step(
        self,
        model: nn.Module,
        inputs: Dict[str, Union[torch.Tensor, Any]],
        prediction_loss_only: bool,
        ignore_keys: Optional[List[str]] = None,
    ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]:
        """Perform an evaluation step on `model` using `inputs`.

        Args:
            model (`nn.Module`):
                The model to evaluate.
            inputs (`Dict[str, Union[torch.Tensor, Any]]`):
                The batch; must contain 'input_ids' and 'attention_mask'.
                'labels' is optional.
            prediction_loss_only (`bool`):
                Whether or not to return the loss only.
            ignore_keys (`List[str]`, *optional*):
                Keys of the output dict that should be left out of the
                returned predictions.

        Return:
            Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss,
            logits and labels. Loss and labels are None when the batch has no
            'labels' entry.

        Raises:
            NotImplementedError: under SageMaker model parallelism, whose
                forward-only path would bypass the custom head.
        """
        if is_sagemaker_mp_enabled():
            raise NotImplementedError(
                "MyTrainer does not support SageMaker model parallelism: "
                "the classification head is applied outside the model's forward pass."
            )

        inputs = self._prepare_inputs(inputs)
        if ignore_keys is None:
            if hasattr(self.model, "config"):
                ignore_keys = getattr(self.model.config, "keys_to_ignore_at_inference", [])
            else:
                ignore_keys = []

        # The loss is computed from inputs['labels'], so that same entry (not
        # self.label_names) decides whether this batch is labelled.
        labels = inputs.get('labels')
        has_labels = labels is not None
        if has_labels:
            labels = nested_detach(labels)

        with torch.no_grad():
            if has_labels:
                with self.compute_loss_context_manager():
                    loss, outputs = self.compute_loss(model, inputs, return_outputs=True)
                loss = loss.mean().detach()
                logits = tuple(v for k, v in outputs.items() if k not in ignore_keys + ["loss"])
            else:
                # Unlabelled batch: still go through the head so the returned
                # predictions are class logits, not raw encoder outputs.
                loss = None
                with self.compute_loss_context_manager():
                    logits = (self._classification_logits(model, inputs),)

        if prediction_loss_only:
            return (loss, None, None)

        logits = nested_detach(logits)
        if len(logits) == 1:
            logits = logits[0]

        return (loss, logits, labels)

In [65]:
# Build the trainer; model_init is passed (rather than a model instance) so the
# Trainer constructs the model itself.
trainer = MyTrainer(
    model_init=model_init,
    args=trainer_args,
    train_dataset=train_tokenized,
    # NOTE(review): no validation split is available, so the test split is used
    # for the per-epoch evaluation. Because trainer_args sets
    # load_best_model_at_end=True, checkpoint selection also sees the test
    # data - confirm this is acceptable for the reported numbers.
    eval_dataset = test_tokenized,
    compute_metrics=compute_metrics,
)

# Last expression of the cell, so the returned TrainOutput is displayed.
trainer.train()

loading configuration file config.json from cache at /home/leep/.cache/huggingface/hub/models--roberta-large/snapshots/5069d8a2a32a7df4c69ef9b56348be04152a2341/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-large",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file pytorch_model.bin from cache at /home/leep/.cache/huggingface/hub/models--roberta-large/snapshots/5069d8a2a32a7df4c69ef9b56348be04152

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.353342,0.860051,0.83039,0.841276,0.835433
2,No log,0.313288,0.877863,0.857533,0.846572,0.851726
3,No log,0.356821,0.872774,0.846961,0.850334,0.848613
4,No log,0.41034,0.877863,0.850687,0.863805,0.856709
5,No log,0.516579,0.877863,0.852987,0.85642,0.854669
6,0.190800,0.568931,0.882952,0.862437,0.855119,0.858633
7,0.190800,0.674428,0.888041,0.861746,0.878437,0.869246
8,0.190800,0.692524,0.885496,0.859845,0.871702,0.865355
9,0.190800,0.709454,0.890585,0.866584,0.875325,0.870737
10,0.190800,0.724477,0.890585,0.866584,0.875325,0.870737


The following columns in the evaluation set don't have a corresponding argument in `RobertaModel.forward` and have been ignored: utterance, Unnamed: 0. If utterance, Unnamed: 0 are not expected by `RobertaModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 393
  Batch size = 16
Saving model checkpoint to RoBERTa-large-PET/checkpoint-99
Configuration saved in RoBERTa-large-PET/checkpoint-99/config.json
Model weights saved in RoBERTa-large-PET/checkpoint-99/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `RobertaModel.forward` and have been ignored: utterance, Unnamed: 0. If utterance, Unnamed: 0 are not expected by `RobertaModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 393
  Batch size = 16
Saving model checkpoint to RoBERTa-large-PET/checkpoint-198
Configuration saved in RoBERTa-large-PET/checkpoint-198/config.json
Model weights sav

TrainOutput(global_step=990, training_loss=0.10312627590063847, metrics={'train_runtime': 999.6714, 'train_samples_per_second': 15.725, 'train_steps_per_second': 0.99, 'total_flos': 7324980515758080.0, 'train_loss': 0.10312627590063847, 'epoch': 10.0})