In [1]:
!pip install -q transformers datasets wandb

In [2]:
!huggingface-cli login --token hf_

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
import torch
from datasets import load_dataset
from transformers import AutoModel, AutoTokenizer, TrainingArguments, Trainer
from transformers import BertConfig, BertModel
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import pearsonr, spearmanr
import wandb
import numpy as np

# Initialize wandb
wandb.init(
    project="bert-biencoder-empathy"
)

# Load dataset
dataset = load_dataset("minoosh/EPITOME_pairs2")

# Initialize bi-encoder model (e.g., BERT as a sentence encoder)
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModel.from_pretrained(model_name)

# Tokenize both text1 and text2 independently
def preprocess_function(examples):
    text1_encodings = tokenizer(examples['text1'], truncation=True, padding=True, max_length=512)
    text2_encodings = tokenizer(examples['text2'], truncation=True, padding=True, max_length=512)
    return {
        'input_ids_text1': text1_encodings['input_ids'],
        'attention_mask_text1': text1_encodings['attention_mask'],
        'input_ids_text2': text2_encodings['input_ids'],
        'attention_mask_text2': text2_encodings['attention_mask'],
        'labels': examples['label']
    }

# Apply tokenization
tokenized_train = dataset['train'].map(preprocess_function, batched=True)
tokenized_test = dataset['test'].map(preprocess_function, batched=True)
tokenized_val = dataset['validation'].map(preprocess_function, batched=True)

# Remove unnecessary columns and set format for PyTorch
columns_to_keep = ['input_ids_text1', 'attention_mask_text1', 'input_ids_text2', 'attention_mask_text2', 'labels']
tokenized_train.set_format(type='torch', columns=columns_to_keep)
tokenized_test.set_format(type='torch', columns=columns_to_keep)
tokenized_val.set_format(type='torch', columns=columns_to_keep)

# Define a custom collator to handle text1 and text2 encoding
class BiEncoderCollator:
    def __call__(self, features):
        # Pad each batch dynamically
        batch = {
            'input_ids_text1': torch.nn.utils.rnn.pad_sequence(
                [torch.tensor(f['input_ids_text1']) for f in features], batch_first=True),
            'attention_mask_text1': torch.nn.utils.rnn.pad_sequence(
                [torch.tensor(f['attention_mask_text1']) for f in features], batch_first=True),
            'input_ids_text2': torch.nn.utils.rnn.pad_sequence(
                [torch.tensor(f['input_ids_text2']) for f in features], batch_first=True),
            'attention_mask_text2': torch.nn.utils.rnn.pad_sequence(
                [torch.tensor(f['attention_mask_text2']) for f in features], batch_first=True),
            'labels': torch.tensor([f['labels'] for f in features], dtype=torch.float)
        }
        '''batch = {
            'input_ids_text1': torch.stack([f['input_ids_text1'] for f in features]),
            'attention_mask_text1': torch.stack([f['attention_mask_text1'] for f in features]),
            'input_ids_text2': torch.stack([f['input_ids_text2'] for f in features]),
            'attention_mask_text2': torch.stack([f['attention_mask_text2'] for f in features]),
            'labels': torch.tensor([f['labels'] for f in features], dtype=torch.float)
        }'''
        return batch

collator = BiEncoderCollator()

# Define the compute_metrics function
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.squeeze()
    labels = labels.squeeze()

    mse = mean_squared_error(labels, predictions)
    mae = mean_absolute_error(labels, predictions)
    pearson_corr, _ = pearsonr(predictions, labels)
    spearman_corr, _ = spearmanr(predictions, labels)
    cosine_sim = torch.nn.functional.cosine_similarity(torch.tensor(predictions), torch.tensor(labels), dim=0).mean().item()

    return {
        "mse": mse,
        "mae": mae,
        "pearson_corr": pearson_corr,
        "spearman_corr": spearman_corr,
        "cosine_sim": cosine_sim  # Optional metric for similarity tasks
    }

# Define a custom BiEncoder model
class BiEncoderModel(torch.nn.Module):
    def __init__(self, base_model, config=None, loss_fn="mse"):
        super(BiEncoderModel, self).__init__()
        self.base_model = base_model
        self.cos = torch.nn.CosineSimilarity(dim=1)
        self.loss_fn = loss_fn
        self.config = config

    def forward(self, input_ids_text1, attention_mask_text1, input_ids_text2, attention_mask_text2, labels=None):
        # Encode text1 and text2 separately
        outputs_text1 = self.base_model(input_ids_text1, attention_mask=attention_mask_text1)
        outputs_text2 = self.base_model(input_ids_text2, attention_mask=attention_mask_text2)

        # Extract [CLS] token embeddings (first token)
        cls_embedding_text1 = outputs_text1.last_hidden_state[:, 0, :]
        cls_embedding_text2 = outputs_text2.last_hidden_state[:, 0, :]

        # Calculate cosine similarity between the two embeddings
        cos_sim = self.cos(cls_embedding_text1, cls_embedding_text2)

        loss = None
        if labels is not None:
            if self.loss_fn == "mse":
                loss_fct = torch.nn.MSELoss()  # Mean Squared Error Loss
            elif self.loss_fn == "mae":
                loss_fct = torch.nn.L1Loss()  # Mean Absolute Error Loss
            elif self.loss_fn == "contrastive":
                loss_fct = self.contrastive_loss
            elif self.loss_fn == "cosine_embedding":
                loss_fct = torch.nn.CosineEmbeddingLoss()  # Cosine Embedding Loss

            if self.loss_fn == "cosine_embedding":
                labels_cosine = 2 * (labels > 0.5).float() - 1  # Convert labels to binary for cosine embedding loss
                loss = loss_fct(cls_embedding_text1, cls_embedding_text2, labels_cosine)
            else:
                loss = loss_fct(cos_sim, labels)

        return {"loss": loss, "logits": cos_sim}

    def contrastive_loss(self, cos_sim, labels, margin=0.5):
        loss = torch.mean((1 - labels) * torch.pow(cos_sim, 2) + labels * torch.pow(torch.clamp(margin - cos_sim, min=0.0), 2))
        return loss

# Initialize the Bi-Encoder model with a specific loss function
def train_biencoder(loss_fn):
    # Load pre-trained BERT configuration and model
    config = BertConfig.from_pretrained(model_name)
    bert_model = BertModel.from_pretrained(model_name)

    # Initialize your custom BiEncoderModel with the BERT model and config
    bi_encoder_model = BiEncoderModel(base_model=bert_model, config=config, loss_fn=loss_fn)
    #bi_encoder_model = BiEncoderModel(base_model, loss_fn)

    # Define TrainingArguments
    training_args = TrainingArguments(
        output_dir=f"./output/empathy-biencoder-{loss_fn}_Ds2",
        evaluation_strategy="epoch",    # Evaluate at the end of each epoch
        logging_dir='./logs',           # Directory for logs
        logging_steps=10,               # Log every 10 steps
        per_device_train_batch_size=wandb.config['batch_size'],
        per_device_eval_batch_size=wandb.config['batch_size'],
        num_train_epochs=wandb.config['epochs'],
        warmup_steps=100,
        learning_rate=wandb.config['learning_rate'],
        weight_decay=0.01,
        report_to="wandb",
        save_strategy="epoch",          # Save checkpoints at the end of each epoch
        load_best_model_at_end=True,
        push_to_hub=True,
        save_total_limit=2              # Keep only the 2 most recent checkpoints
    )

    # Define the Trainer
    trainer = Trainer(
        model=bi_encoder_model,             # Custom BiEncoder model
        args=training_args,                 # Training arguments
        train_dataset=tokenized_train,      # Training dataset
        eval_dataset=tokenized_val,         # Validation dataset
        data_collator=collator,             # Custom collator for handling bi-encoder inputs
        compute_metrics=compute_metrics     # Function to compute metrics
    )

    # Train the model
    trainer.train()

    # Evaluate on the test set
    trainer.evaluate(tokenized_test)

    # Save the model to Hugging Face Hub
    trainer.save_model(f"./output/empathy-biencoder-{loss_fn}_Ds2")
    trainer.push_to_hub(f"minoosh/empathy-biencoder-{loss_fn}_Ds2")

    # Finish wandb run
    wandb.finish()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mminooshayan97[0m ([33mminoosh[0m). Use [1m`wandb login --relogin`[0m to force relogin


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
# Train bi-encoder with different loss functions
loss_functions = ["mse", "mae", "contrastive", "cosine_embedding"]
loss_fn = loss_functions[0]
wandb.init(project="bert-biencoder-empathy", name=f"bert-biencoder-empathy-{loss_fn}_Ds2", config={"epochs": 5, "batch_size": 16, "learning_rate": 2e-5})
train_biencoder(loss_fn)

  [torch.tensor(f['input_ids_text1']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text1']) for f in features], batch_first=True),
  [torch.tensor(f['input_ids_text2']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text2']) for f in features], batch_first=True),


Epoch,Training Loss,Validation Loss,Mse,Mae,Pearson Corr,Spearman Corr,Cosine Sim
1,0.0296,0.035439,0.035439,0.151431,0.549467,0.554085,0.876148
2,0.0207,0.033078,0.033078,0.145318,0.571682,0.570007,0.881709
3,0.0161,0.029965,0.029965,0.140373,0.567467,0.564673,0.87743
4,0.0119,0.032782,0.032782,0.14569,0.566645,0.55756,0.878716
5,0.009,0.033197,0.033197,0.147239,0.569198,0.562387,0.878575


  [torch.tensor(f['input_ids_text1']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text1']) for f in features], batch_first=True),
  [torch.tensor(f['input_ids_text2']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text2']) for f in features], batch_first=True),
  [torch.tensor(f['input_ids_text1']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text1']) for f in features], batch_first=True),
  [torch.tensor(f['input_ids_text2']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text2']) for f in features], batch_first=True),
  [torch.tensor(f['input_ids_text1']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text1']) for f in features], batch_first=True),
  [torch.tensor(f['input_ids_text2']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text2']) for f in features], batch_first=True),
  [torch.tensor(f['input_ids_text1']) for f in

No files have been modified since last commit. Skipping to prevent empty commit.


0,1
eval/cosine_sim,▁█▃▄▄▄
eval/loss,█▅▁▅▅▁
eval/mae,█▅▃▅▆▁
eval/mse,█▅▁▅▅▁
eval/pearson_corr,▁▅▄▄▅█
eval/runtime,▂▂▂▂▁█
eval/samples_per_second,▇▇▇▇█▁
eval/spearman_corr,▁▄▃▂▂█
eval/steps_per_second,▇▇▇▇█▁
train/epoch,▁▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇████

0,1
eval/cosine_sim,0.87866
eval/loss,0.02989
eval/mae,0.13558
eval/mse,0.02989
eval/pearson_corr,0.58575
eval/runtime,15.2168
eval/samples_per_second,20.241
eval/spearman_corr,0.59531
eval/steps_per_second,1.314
total_flos,0.0
