In [1]:
!pip install -q transformers datasets wandb

In [2]:
!huggingface-cli login --token hf_

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
import torch
from datasets import load_dataset
from transformers import AutoModel, AutoTokenizer, TrainingArguments, Trainer
from transformers import BertConfig, BertModel
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import pearsonr, spearmanr
import wandb
import numpy as np

# Initialize wandb
wandb.init(
    project="bert-biencoder-empathy"
)

# Load dataset
dataset = load_dataset("minoosh/EPITOME_pairs2")

# Initialize bi-encoder model (e.g., BERT as a sentence encoder)
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModel.from_pretrained(model_name)

# Tokenize both text1 and text2 independently
def preprocess_function(examples):
    text1_encodings = tokenizer(examples['text1'], truncation=True, padding=True, max_length=512)
    text2_encodings = tokenizer(examples['text2'], truncation=True, padding=True, max_length=512)
    return {
        'input_ids_text1': text1_encodings['input_ids'],
        'attention_mask_text1': text1_encodings['attention_mask'],
        'input_ids_text2': text2_encodings['input_ids'],
        'attention_mask_text2': text2_encodings['attention_mask'],
        'labels': examples['label']
    }

# Apply tokenization
tokenized_train = dataset['train'].map(preprocess_function, batched=True)
tokenized_test = dataset['test'].map(preprocess_function, batched=True)
tokenized_val = dataset['validation'].map(preprocess_function, batched=True)

# Remove unnecessary columns and set format for PyTorch
columns_to_keep = ['input_ids_text1', 'attention_mask_text1', 'input_ids_text2', 'attention_mask_text2', 'labels']
tokenized_train.set_format(type='torch', columns=columns_to_keep)
tokenized_test.set_format(type='torch', columns=columns_to_keep)
tokenized_val.set_format(type='torch', columns=columns_to_keep)

# Define a custom collator to handle text1 and text2 encoding
class BiEncoderCollator:
    def __call__(self, features):
        # Pad each batch dynamically
        batch = {
            'input_ids_text1': torch.nn.utils.rnn.pad_sequence(
                [torch.tensor(f['input_ids_text1']) for f in features], batch_first=True),
            'attention_mask_text1': torch.nn.utils.rnn.pad_sequence(
                [torch.tensor(f['attention_mask_text1']) for f in features], batch_first=True),
            'input_ids_text2': torch.nn.utils.rnn.pad_sequence(
                [torch.tensor(f['input_ids_text2']) for f in features], batch_first=True),
            'attention_mask_text2': torch.nn.utils.rnn.pad_sequence(
                [torch.tensor(f['attention_mask_text2']) for f in features], batch_first=True),
            'labels': torch.tensor([f['labels'] for f in features], dtype=torch.float)
        }
        '''batch = {
            'input_ids_text1': torch.stack([f['input_ids_text1'] for f in features]),
            'attention_mask_text1': torch.stack([f['attention_mask_text1'] for f in features]),
            'input_ids_text2': torch.stack([f['input_ids_text2'] for f in features]),
            'attention_mask_text2': torch.stack([f['attention_mask_text2'] for f in features]),
            'labels': torch.tensor([f['labels'] for f in features], dtype=torch.float)
        }'''
        return batch

collator = BiEncoderCollator()

# Define the compute_metrics function
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.squeeze()
    labels = labels.squeeze()

    mse = mean_squared_error(labels, predictions)
    mae = mean_absolute_error(labels, predictions)
    pearson_corr, _ = pearsonr(predictions, labels)
    spearman_corr, _ = spearmanr(predictions, labels)
    cosine_sim = torch.nn.functional.cosine_similarity(torch.tensor(predictions), torch.tensor(labels), dim=0).mean().item()

    return {
        "mse": mse,
        "mae": mae,
        "pearson_corr": pearson_corr,
        "spearman_corr": spearman_corr,
        "cosine_sim": cosine_sim  # Optional metric for similarity tasks
    }

# Define a custom BiEncoder model
class BiEncoderModel(torch.nn.Module):
    def __init__(self, base_model, config=None, loss_fn="mse"):
        super(BiEncoderModel, self).__init__()
        self.base_model = base_model
        self.cos = torch.nn.CosineSimilarity(dim=1)
        self.loss_fn = loss_fn
        self.config = config

    def forward(self, input_ids_text1, attention_mask_text1, input_ids_text2, attention_mask_text2, labels=None):
        # Encode text1 and text2 separately
        outputs_text1 = self.base_model(input_ids_text1, attention_mask=attention_mask_text1)
        outputs_text2 = self.base_model(input_ids_text2, attention_mask=attention_mask_text2)

        # Extract [CLS] token embeddings (first token)
        cls_embedding_text1 = outputs_text1.last_hidden_state[:, 0, :]
        cls_embedding_text2 = outputs_text2.last_hidden_state[:, 0, :]

        # Calculate cosine similarity between the two embeddings
        cos_sim = self.cos(cls_embedding_text1, cls_embedding_text2)

        loss = None
        if labels is not None:
            if self.loss_fn == "mse":
                loss_fct = torch.nn.MSELoss()  # Mean Squared Error Loss
            elif self.loss_fn == "mae":
                loss_fct = torch.nn.L1Loss()  # Mean Absolute Error Loss
            elif self.loss_fn == "contrastive":
                loss_fct = self.contrastive_loss
            elif self.loss_fn == "cosine_embedding":
                loss_fct = torch.nn.CosineEmbeddingLoss()  # Cosine Embedding Loss

            if self.loss_fn == "cosine_embedding":
                labels_cosine = 2 * (labels > 0.5).float() - 1  # Convert labels to binary for cosine embedding loss
                loss = loss_fct(cls_embedding_text1, cls_embedding_text2, labels_cosine)
            else:
                loss = loss_fct(cos_sim, labels)

        return {"loss": loss, "logits": cos_sim}

    def contrastive_loss(self, cos_sim, labels, margin=0.5):
        loss = torch.mean((1 - labels) * torch.pow(cos_sim, 2) + labels * torch.pow(torch.clamp(margin - cos_sim, min=0.0), 2))
        return loss

# Initialize the Bi-Encoder model with a specific loss function
def train_biencoder(loss_fn):
    # Load pre-trained BERT configuration and model
    config = BertConfig.from_pretrained(model_name)
    bert_model = BertModel.from_pretrained(model_name)

    # Initialize your custom BiEncoderModel with the BERT model and config
    bi_encoder_model = BiEncoderModel(base_model=bert_model, config=config, loss_fn=loss_fn)
    #bi_encoder_model = BiEncoderModel(base_model, loss_fn)

    # Define TrainingArguments
    training_args = TrainingArguments(
        output_dir=f"./output/empathy-biencoder-{loss_fn}_Ds2",
        evaluation_strategy="epoch",    # Evaluate at the end of each epoch
        logging_dir='./logs',           # Directory for logs
        logging_steps=10,               # Log every 10 steps
        per_device_train_batch_size=wandb.config['batch_size'],
        per_device_eval_batch_size=wandb.config['batch_size'],
        num_train_epochs=wandb.config['epochs'],
        warmup_steps=100,
        learning_rate=wandb.config['learning_rate'],
        weight_decay=0.01,
        report_to="wandb",
        save_strategy="epoch",          # Save checkpoints at the end of each epoch
        load_best_model_at_end=True,
        push_to_hub=True,
        save_total_limit=2              # Keep only the 2 most recent checkpoints
    )

    # Define the Trainer
    trainer = Trainer(
        model=bi_encoder_model,             # Custom BiEncoder model
        args=training_args,                 # Training arguments
        train_dataset=tokenized_train,      # Training dataset
        eval_dataset=tokenized_val,         # Validation dataset
        data_collator=collator,             # Custom collator for handling bi-encoder inputs
        compute_metrics=compute_metrics     # Function to compute metrics
    )

    # Train the model
    trainer.train()

    # Evaluate on the test set
    trainer.evaluate(tokenized_test)

    # Save the model to Hugging Face Hub
    trainer.save_model(f"./output/empathy-biencoder-{loss_fn}_Ds2")
    trainer.push_to_hub(f"minoosh/empathy-biencoder-{loss_fn}_Ds2")

    # Finish wandb run
    wandb.finish()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113091077778057, max=1.0…

README.md:   0%|          | 0.00/589 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/660k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/100k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/88.6k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2467 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/308 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/309 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Map:   0%|          | 0/2467 [00:00<?, ? examples/s]

Map:   0%|          | 0/308 [00:00<?, ? examples/s]

Map:   0%|          | 0/309 [00:00<?, ? examples/s]

In [4]:
# Train bi-encoder with different loss functions
loss_functions = ["mse", "mae", "contrastive", "cosine_embedding"]
loss_fn = loss_functions[1]
wandb.init(project="bert-biencoder-empathy", name=f"bert-biencoder-empathy-{loss_fn}_Ds2", config={"epochs": 5, "batch_size": 16, "learning_rate": 2e-5})
train_biencoder(loss_fn)

VBox(children=(Label(value='0.016 MB of 0.016 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

  [torch.tensor(f['input_ids_text1']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text1']) for f in features], batch_first=True),
  [torch.tensor(f['input_ids_text2']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text2']) for f in features], batch_first=True),
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,Mse,Mae,Pearson Corr,Spearman Corr,Cosine Sim
1,0.1531,0.159352,0.041124,0.159252,0.352507,0.360193,0.840801
2,0.1272,0.15288,0.036222,0.152746,0.516035,0.516792,0.868463
3,0.1015,0.139206,0.029945,0.139116,0.558192,0.562953,0.874459
4,0.0983,0.148448,0.034032,0.148342,0.564564,0.567231,0.878567
5,0.0865,0.147851,0.033814,0.147728,0.564612,0.569424,0.877721


  [torch.tensor(f['input_ids_text1']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text1']) for f in features], batch_first=True),
  [torch.tensor(f['input_ids_text2']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text2']) for f in features], batch_first=True),
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  [torch.tensor(f['input_ids_text1']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text1']) for f in features], batch_first=True),
  [torch.tensor(f['input_ids_text2']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text2']) for f in features], batch_first=True),
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  [torch.tensor(f['input_ids_text1']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text1']) for f in features], batch_first=True),
  [torch.te

No files have been modified since last commit. Skipping to prevent empty commit.


VBox(children=(Label(value='0.038 MB of 0.038 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/cosine_sim,▁▆▇██▇
eval/loss,█▆▃▅▅▁
eval/mae,█▆▃▅▅▁
eval/mse,█▅▁▄▃▁
eval/pearson_corr,▁▆████
eval/runtime,▁▂▂▂█▅
eval/samples_per_second,█▇▇▇▁▄
eval/spearman_corr,▁▆████
eval/steps_per_second,█▇▇▇▁▄
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇█████

0,1
eval/cosine_sim,0.87191
eval/loss,0.13309
eval/mae,0.13309
eval/mse,0.02984
eval/pearson_corr,0.56024
eval/runtime,7.4083
eval/samples_per_second,41.575
eval/spearman_corr,0.56971
eval/steps_per_second,1.35
total_flos,0.0


In [5]:
# Train bi-encoder with different loss functions
loss_functions = ["mse", "mae", "contrastive", "cosine_embedding"]
loss_fn = loss_functions[2]
wandb.init(project="bert-biencoder-empathy", name=f"bert-biencoder-empathy-{loss_fn}_Ds2", config={"epochs": 5, "batch_size": 16, "learning_rate": 2e-5})
train_biencoder(loss_fn)

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

  [torch.tensor(f['input_ids_text1']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text1']) for f in features], batch_first=True),
  [torch.tensor(f['input_ids_text2']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text2']) for f in features], batch_first=True),
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,Mse,Mae,Pearson Corr,Spearman Corr,Cosine Sim
1,0.054,0.062897,0.042491,0.168469,0.233082,0.253217,0.769336
2,0.0504,0.05575,0.036857,0.15727,0.360932,0.355529,0.816929
3,0.0483,0.055407,0.03569,0.155001,0.413554,0.404821,0.824564
4,0.0452,0.054082,0.035762,0.155291,0.430628,0.42302,0.829136
5,0.0448,0.053582,0.036198,0.156072,0.433215,0.42842,0.828967


  [torch.tensor(f['input_ids_text1']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text1']) for f in features], batch_first=True),
  [torch.tensor(f['input_ids_text2']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text2']) for f in features], batch_first=True),
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  [torch.tensor(f['input_ids_text1']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text1']) for f in features], batch_first=True),
  [torch.tensor(f['input_ids_text2']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text2']) for f in features], batch_first=True),
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  [torch.tensor(f['input_ids_text1']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text1']) for f in features], batch_first=True),
  [torch.te

No files have been modified since last commit. Skipping to prevent empty commit.


VBox(children=(Label(value='0.038 MB of 0.038 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/cosine_sim,▁▆▇▇▇█
eval/loss,█▃▃▂▂▁
eval/mae,█▄▄▄▄▁
eval/mse,█▃▂▂▃▁
eval/pearson_corr,▁▅▇███
eval/runtime,▃▃▂▂▁█
eval/samples_per_second,▆▇▇▇█▁
eval/spearman_corr,▁▄▆▇▇█
eval/steps_per_second,▆▆▇▇█▁
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇█████

0,1
eval/cosine_sim,0.83856
eval/loss,0.05214
eval/mae,0.14612
eval/mse,0.03394
eval/pearson_corr,0.44453
eval/runtime,7.8293
eval/samples_per_second,39.34
eval/spearman_corr,0.46289
eval/steps_per_second,1.277
total_flos,0.0


In [5]:
# Train bi-encoder with different loss functions
loss_functions = ["mse", "mae", "contrastive", "cosine_embedding"]
loss_fn = loss_functions[3]
wandb.init(project="bert-biencoder-empathy", name=f"bert-biencoder-empathy-{loss_fn}_Ds2", config={"epochs": 5, "batch_size": 16, "learning_rate": 2e-5})
train_biencoder(loss_fn)

VBox(children=(Label(value='0.016 MB of 0.016 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

  [torch.tensor(f['input_ids_text1']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text1']) for f in features], batch_first=True),
  [torch.tensor(f['input_ids_text2']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text2']) for f in features], batch_first=True),
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,Mse,Mae,Pearson Corr,Spearman Corr,Cosine Sim
1,0.1005,0.110309,0.126254,0.295663,0.093024,0.065044,0.109258
2,0.0852,0.084715,0.134844,0.31529,0.080329,0.092797,-0.075253
3,0.0603,0.07298,0.143082,0.328031,0.12236,0.102984,-0.172735
4,0.0655,0.07259,0.132073,0.316032,0.183973,0.182827,-0.091
5,0.0853,0.07108,0.138098,0.324282,0.184668,0.179811,-0.1389


  [torch.tensor(f['input_ids_text1']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text1']) for f in features], batch_first=True),
  [torch.tensor(f['input_ids_text2']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text2']) for f in features], batch_first=True),
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  [torch.tensor(f['input_ids_text1']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text1']) for f in features], batch_first=True),
  [torch.tensor(f['input_ids_text2']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text2']) for f in features], batch_first=True),
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  [torch.tensor(f['input_ids_text1']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text1']) for f in features], batch_first=True),
  [torch.te

No files have been modified since last commit. Skipping to prevent empty commit.


VBox(children=(Label(value='0.038 MB of 0.038 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/cosine_sim,█▅▃▄▄▁
eval/loss,█▃▁▁▁▁
eval/mae,▁▅▇▅▇█
eval/mse,▁▄▇▃▅█
eval/pearson_corr,▂▁▄██▄
eval/runtime,▁▁▁▁▁█
eval/samples_per_second,█████▁
eval/spearman_corr,▁▃▃██▆
eval/steps_per_second,█████▁
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇█████

0,1
eval/cosine_sim,-0.29085
eval/loss,0.07384
eval/mae,0.33128
eval/mse,0.14586
eval/pearson_corr,0.12918
eval/runtime,7.7258
eval/samples_per_second,39.866
eval/spearman_corr,0.15728
eval/steps_per_second,1.294
total_flos,0.0
