In [1]:
!pip install -q transformers datasets wandb

In [2]:
!huggingface-cli login --token hf_

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
import torch
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import wandb

# Initialize wandb
wandb.init(
    project="bert-crossencoder-classification"
)

# Load dataset
dataset = load_dataset("minoosh/EPITOME_pairs")

# Initialize the tokenizer and model for cross-encoder setup
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Preprocess data for the cross-encoder model by concatenating text1 and text2 with [SEP]
def preprocess_function(examples):
    # Concatenate both texts with a [SEP] token in between
    encodings = tokenizer(examples['text1'], examples['text2'], truncation=True, padding=True, max_length=512)
    encodings['labels'] = examples['label']  # Add labels
    return encodings

# Apply tokenization
tokenized_train = dataset['train'].map(preprocess_function, batched=True)
tokenized_test = dataset['test'].map(preprocess_function, batched=True)
tokenized_val = dataset['validation'].map(preprocess_function, batched=True)

# Set format for PyTorch
tokenized_train.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
tokenized_test.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
tokenized_val.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Define compute_metrics function for classification evaluation
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = predictions.argmax(axis=1)
    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds, average="weighted")
    recall = recall_score(labels, preds, average="weighted")
    f1 = f1_score(labels, preds, average="weighted")
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

# Custom Cross-Encoder model class for classification
class CrossEncoderModel(torch.nn.Module):
    def __init__(self, model_name, num_classes=4, loss_fn="cross_entropy"):
        super(CrossEncoderModel, self).__init__()
        # Load model config
        self.config = AutoConfig.from_pretrained(model_name, num_labels=num_classes)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name, config=self.config)
        self.loss_fn = loss_fn

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits  # Output logits for classification

        loss = None
        if labels is not None:
            if self.loss_fn == "cross_entropy":
                loss_fct = torch.nn.CrossEntropyLoss()  # Use CrossEntropyLoss for classification
                loss = loss_fct(logits, labels)
            elif self.loss_fn == "focal_loss":
                # Focal loss implementation for handling class imbalance
                alpha = 0.25
                gamma = 2.0
                ce_loss = torch.nn.CrossEntropyLoss(reduction="none")(logits, labels)
                pt = torch.exp(-ce_loss)  # Probability of the true class
                loss = (alpha * (1 - pt) ** gamma * ce_loss).mean()
            elif self.loss_fn == "kl_divergence":
                # KL Divergence for soft-label classification
                kl_div = torch.nn.KLDivLoss(reduction="batchmean")
                soft_labels = torch.nn.functional.one_hot(labels, num_classes=self.config.num_labels).float()
                log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
                loss = kl_div(log_probs, soft_labels)
            else:
                raise ValueError(f"Unsupported loss function: {self.loss_fn}")

        return {"loss": loss, "logits": logits}

# Function to initialize and train the cross-encoder model
def train_crossencoder(loss_fn):
    model = CrossEncoderModel(model_name=model_name, loss_fn=loss_fn)

    # Set up TrainingArguments
    training_args = TrainingArguments(
        output_dir=f"./output/empathy-crossencoder-{loss_fn}",
        evaluation_strategy="epoch",
        logging_dir='./logs',
        logging_steps=10,
        per_device_train_batch_size=wandb.config['batch_size'],
        per_device_eval_batch_size=wandb.config['batch_size'],
        num_train_epochs=wandb.config['epochs'],
        warmup_steps=100,
        learning_rate=wandb.config['learning_rate'],
        weight_decay=0.01,
        report_to="wandb",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=True,
        save_total_limit=2
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    # Train the model
    trainer.train()

    # Evaluate the model on the test set
    trainer.evaluate(tokenized_test)

    # Save and push the model to the Hugging Face Hub
    trainer.save_model(f"./output/empathy-crossencoder-{loss_fn}")
    trainer.push_to_hub(f"minoosh/empathy-crossencoder-{loss_fn}")

    # End the wandb run
    wandb.finish()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113508911110632, max=1.0…

README.md:   0%|          | 0.00/588 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/660k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/100k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/88.5k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2467 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/308 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/309 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



Map:   0%|          | 0/2467 [00:00<?, ? examples/s]

Map:   0%|          | 0/308 [00:00<?, ? examples/s]

Map:   0%|          | 0/309 [00:00<?, ? examples/s]

In [4]:
# Specify list of loss functions to try
loss_functions = ["cross_entropy", "focal_loss", "kl_divergence"]

loss_fn = loss_functions[0]  # Change to desired loss function
wandb.init(project="bert-crossencoder-classification", name=f"bert-crossencoder-classification-{loss_fn}", config={"epochs": 7, "batch_size": 16, "learning_rate": 2e-5})
train_crossencoder(loss_fn)
wandb.finish()

VBox(children=(Label(value='0.016 MB of 0.016 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2346,1.16736,0.469256,0.280394,0.469256,0.346311
2,0.993,1.034654,0.576052,0.611152,0.576052,0.544453
3,0.7997,0.931385,0.605178,0.611788,0.605178,0.607487
4,0.6984,0.971698,0.608414,0.616197,0.608414,0.608083
5,0.4427,1.036381,0.61165,0.613078,0.61165,0.610664
6,0.3892,1.054925,0.595469,0.603016,0.595469,0.598054
7,0.2767,1.097122,0.576052,0.585239,0.576052,0.578687


  _warn_prf(average, modifier, msg_start, len(result))
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
HTTP Error 500 thrown while requesting PUT https://hf-hub-lfs-us-east-1.s3-accelerate.amazonaws.com/repos/2c/f7/2cf7041ff51d6d12d44fcc53fb3ebaedfd0100dc90d8d5f9ea9912f5907cbe63/00951279a1a5fd43a7393ff4aaf6cabe926e1f835bb6baa0217d1f499f05b98a?X-Amz-Algorithm=AWS4-HMA

No files have been modified since last commit. Skipping to prevent empty commit.


VBox(children=(Label(value='0.032 MB of 0.032 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/accuracy,▁▆███▇▆▇
eval/f1,▁▆████▇▇
eval/loss,█▄▁▂▄▅▆▃
eval/precision,▁█████▇▇
eval/recall,▁▆███▇▆▇
eval/runtime,▁▅▇█▇█▇▅
eval/samples_per_second,█▄▂▁▂▁▂▄
eval/steps_per_second,█▄▂▁▂▁▂▄
train/epoch,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇█████
train/global_step,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███

0,1
eval/accuracy,0.58442
eval/f1,0.58415
eval/loss,0.98529
eval/precision,0.58427
eval/recall,0.58442
eval/runtime,4.9666
eval/samples_per_second,62.014
eval/steps_per_second,2.013
total_flos,0.0
train/epoch,7.0


In [4]:
# Specify list of loss functions to try
loss_functions = ["cross_entropy", "focal_loss", "kl_divergence"]

loss_fn = loss_functions[1]  # Change to desired loss function
wandb.init(project="bert-crossencoder-classification", name=f"bert-crossencoder-classification-{loss_fn}", config={"epochs": 7, "batch_size": 16, "learning_rate": 2e-5})
train_crossencoder(loss_fn)
wandb.finish()

VBox(children=(Label(value='0.016 MB of 0.016 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1697,0.168782,0.459547,0.457934,0.459547,0.431939
2,0.1318,0.132317,0.576052,0.615534,0.576052,0.553035
3,0.11,0.118426,0.579288,0.581347,0.579288,0.576467
4,0.0876,0.119102,0.595469,0.608861,0.595469,0.598462
5,0.0589,0.122868,0.61165,0.613869,0.61165,0.611476
6,0.0462,0.126162,0.605178,0.619178,0.605178,0.608539
7,0.0337,0.130708,0.595469,0.609251,0.595469,0.600171


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


No files have been modified since last commit. Skipping to prevent empty commit.


VBox(children=(Label(value='0.031 MB of 0.031 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/accuracy,▁▆▇▇██▇▇
eval/f1,▁▆▇▇███▇
eval/loss,█▃▁▁▂▂▃▂
eval/precision,▁█▆████▇
eval/recall,▁▆▇▇██▇▇
eval/runtime,█▃▂▃▃▂▅▁
eval/samples_per_second,▁▇▇▇▇▇▄█
eval/steps_per_second,▁▆▇▆▇▇▄█
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇█
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███

0,1
eval/accuracy,0.60065
eval/f1,0.58985
eval/loss,0.1266
eval/precision,0.59672
eval/recall,0.60065
eval/runtime,5.0018
eval/samples_per_second,61.578
eval/steps_per_second,1.999
total_flos,0.0
train/epoch,7.0


In [4]:
# Specify list of loss functions to try
loss_functions = ["cross_entropy", "focal_loss", "kl_divergence"]

loss_fn = loss_functions[2]  # Change to desired loss function
wandb.init(project="bert-crossencoder-classification", name=f"bert-crossencoder-classification-{loss_fn}", config={"epochs": 7, "batch_size": 16, "learning_rate": 2e-5})
train_crossencoder(loss_fn)
wandb.finish()

VBox(children=(Label(value='0.016 MB of 0.016 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2263,1.17723,0.527508,0.559952,0.527508,0.506894
2,1.0134,1.025421,0.585761,0.611601,0.585761,0.556446
3,0.8362,0.962687,0.61165,0.641748,0.61165,0.613409
4,0.7098,0.949051,0.618123,0.637895,0.618123,0.622453
5,0.5504,0.95313,0.621359,0.62669,0.621359,0.623061
6,0.4489,0.989292,0.608414,0.616237,0.608414,0.610485
7,0.3405,0.998353,0.614887,0.621817,0.614887,0.617107


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


No files have been modified since last commit. Skipping to prevent empty commit.


VBox(children=(Label(value='0.031 MB of 0.031 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/accuracy,▁▅▇██▇██
eval/f1,▁▄▇██▇██
eval/loss,█▃▁▁▁▂▃▂
eval/precision,▁▅██▇▆▆▆
eval/recall,▁▅▇██▇██
eval/runtime,█▄▄▄▄▄▃▁
eval/samples_per_second,▁▅▅▅▅▅▆█
eval/steps_per_second,▁▅▅▅▅▅▆█
train/epoch,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇██

0,1
eval/accuracy,0.61688
eval/f1,0.61596
eval/loss,0.97487
eval/precision,0.62385
eval/recall,0.61688
eval/runtime,5.4699
eval/samples_per_second,56.308
eval/steps_per_second,1.828
total_flos,0.0
train/epoch,7.0
