In [1]:
!pip install -q transformers datasets wandb

In [2]:
!huggingface-cli login --token hf_

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
import torch
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, TrainingArguments, Trainer
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import pearsonr, spearmanr
import wandb

# Open a Weights & Biases run under the project used by this notebook
wandb.init(project="bert-crossencoder-regression")

# Checkpoint name; only the tokenizer is created here
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Fetch the annotated story-pair dataset from the Hub
dataset = load_dataset("minoosh/Annotated_story_pairs2")

# Preprocess data for the cross-encoder model by concatenating text1 and text2 with [SEP]
def preprocess_function(examples):
    """Tokenize each (text1, text2) pair jointly and attach its target.

    Args:
        examples: Batch mapping that provides the ``text1``, ``text2`` and
            ``label`` columns.

    Returns:
        The tokenizer output for the batch with an added ``labels`` entry
        copied from ``examples['label']``.
    """
    # Passing two text arguments makes the tokenizer encode them as one paired sequence
    batch = tokenizer(
        examples['text1'],
        examples['text2'],
        max_length=512,
        truncation=True,
        padding=True,
    )
    batch['labels'] = examples['label']
    return batch

# Tokenize every split, in the order train -> test -> validation
tokenized_train, tokenized_test, tokenized_val = (
    dataset[split_name].map(preprocess_function, batched=True)
    for split_name in ('train', 'test', 'validation')
)

# Expose only the listed columns, as PyTorch tensors.
# NOTE(review): BERT tokenizers normally also emit token_type_ids; that column
# is not selected here, so confirm that dropping it is intended.
for _split in (tokenized_train, tokenized_test, tokenized_val):
    _split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Define compute_metrics function for regression evaluation
def compute_metrics(eval_pred):
    """Score regression predictions against their reference labels.

    Args:
        eval_pred: A ``(predictions, labels)`` pair; both elements are
            squeezed before any metric is computed.

    Returns:
        dict with the keys ``mse``, ``mae``, ``pearson_corr``,
        ``spearman_corr`` and ``cosine_sim``.
    """
    raw_scores, raw_targets = eval_pred
    scores = raw_scores.squeeze()
    targets = raw_targets.squeeze()

    # Error-based metrics
    metrics = {
        "mse": mean_squared_error(targets, scores),
        "mae": mean_absolute_error(targets, scores),
    }

    # Correlation metrics: keep the coefficient, discard the second returned value
    metrics["pearson_corr"], _ = pearsonr(scores, targets)
    metrics["spearman_corr"], _ = spearmanr(scores, targets)

    # Cosine similarity between predictions and labels, taken along dim 0
    score_vec = torch.tensor(scores)
    target_vec = torch.tensor(targets)
    cosine = torch.nn.functional.cosine_similarity(score_vec, target_vec, dim=0)
    metrics["cosine_sim"] = cosine.mean().item()  # Optional metric for similarity tasks

    return metrics

# Custom Cross-Encoder model class with config
class CrossEncoderModel(torch.nn.Module):
    """Cross-encoder regressor built on a single-output sequence classifier.

    The wrapped model receives both texts of a pair as one sequence and emits
    one logit per example, which is used directly as the predicted score.
    The training loss is selected by name.

    Args:
        model_name: Checkpoint identifier handed to ``AutoConfig`` and
            ``AutoModelForSequenceClassification``.
        loss_fn: One of ``"mse"``, ``"mae"``, ``"contrastive"`` or
            ``"cosine_embedding"``. The name is only checked inside
            ``forward`` when labels are supplied.
    """

    def __init__(self, model_name, loss_fn="mse"):
        super().__init__()
        # num_labels=1 requests a single output unit (regression head)
        self.config = AutoConfig.from_pretrained(model_name, num_labels=1)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name, config=self.config)
        self.loss_fn = loss_fn

    def forward(self, input_ids, attention_mask, labels=None, token_type_ids=None):
        """Run the wrapped model and, when labels are given, compute the loss.

        Args:
            input_ids: Token ids for the batch.
            attention_mask: Attention mask for the batch.
            labels: Optional targets, one per example. They are cast to the
                dtype of the logits and flattened before the loss is computed.
            token_type_ids: Optional segment ids; forwarded to the wrapped
                model only when provided.

        Returns:
            dict with ``"loss"`` (``None`` when ``labels`` is ``None``) and
            ``"logits"`` (one score per example).

        Raises:
            ValueError: If labels are given and ``self.loss_fn`` is not a
                supported loss name.
        """
        backbone_inputs = {"input_ids": input_ids, "attention_mask": attention_mask}
        if token_type_ids is not None:
            backbone_inputs["token_type_ids"] = token_type_ids
        outputs = self.model(**backbone_inputs)

        # Drop only the trailing output dimension. A bare squeeze() would also
        # remove the batch dimension for a batch of one, leaving a 0-d tensor
        # whose shape no longer matches the labels.
        logits = outputs.logits.squeeze(-1)

        loss = None
        if labels is not None:
            targets = labels.to(logits.dtype).reshape(-1)

            if self.loss_fn == "mse":
                loss = torch.nn.MSELoss()(logits, targets)
            elif self.loss_fn == "mae":
                loss = torch.nn.L1Loss()(logits, targets)
            elif self.loss_fn == "cosine_embedding":
                # Convert to +1 / -1 targets as cosine embedding loss expects
                signed_targets = 2 * (targets > 0.5).to(logits.dtype) - 1
                loss = self._cosine_embedding_loss(logits, signed_targets)
            elif self.loss_fn == "contrastive":
                loss = self.contrastive_loss(logits, targets)
            else:
                raise ValueError(f"Unknown loss function: {self.loss_fn}")

        return {"loss": loss, "logits": logits}

    @staticmethod
    def _cosine_embedding_loss(scores, targets, margin=0.0):
        """Cosine-embedding-style loss applied to a predicted similarity score.

        ``torch.nn.CosineEmbeddingLoss`` takes two embedding tensors plus a
        target, which a single-score cross-encoder does not have. This applies
        the same piecewise rule with the predicted score standing in for the
        cosine similarity: ``1 - score`` where the target is +1, and
        ``max(0, score - margin)`` where it is -1, averaged over the batch.
        """
        positive_term = 1.0 - scores
        negative_term = torch.clamp(scores - margin, min=0.0)
        return torch.where(targets > 0, positive_term, negative_term).mean()

    def contrastive_loss(self, logits, labels, margin=0.5):
        """Contrastive loss with ``1 - logits`` used as the pair distance.

        Args:
            logits: Predicted similarity scores.
            labels: Targets weighting the similar (``labels``) and dissimilar
                (``1 - labels``) terms.
            margin: Distance beyond which dissimilar pairs are not penalised.

        Returns:
            Mean over the batch of
            ``labels * d**2 + (1 - labels) * max(0, margin - d)**2``
            with ``d = 1 - logits``.
        """
        # The same distance is used in both terms, so similar pairs are pulled
        # toward score 1 and dissimilar pairs are pushed away from it.
        distance = 1 - logits
        positive_pairs = labels * torch.pow(distance, 2)  # For similar pairs (y=1)
        negative_pairs = (1 - labels) * torch.pow(torch.clamp(margin - distance, min=0.0), 2)  # For dissimilar pairs (y=0)
        return torch.mean(positive_pairs + negative_pairs)

# Function to initialize and train the cross-encoder model
def train_crossencoder(loss_fn):
    """Fine-tune a CrossEncoderModel with the given loss, then save and publish it.

    The batch size, number of epochs and learning rate are read from
    ``wandb.config`` (keys ``batch_size``, ``epochs``, ``learning_rate``), so
    a wandb run carrying those keys must be active before this is called.
    The active wandb run is finished before returning.

    Args:
        loss_fn: Loss name forwarded to ``CrossEncoderModel``; it is also used
            to name the output directory and the Hub repository.
    """
    run_name = f"bert-reg-crossencoder-{loss_fn}"
    output_dir = f"./output/{run_name}"

    # Initialize the cross-encoder model with the specified loss function
    model = CrossEncoderModel(model_name=model_name, loss_fn=loss_fn)

    # Set up TrainingArguments
    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy`; confirm against the installed version.
    training_args = TrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",
        logging_dir='./logs',
        logging_steps=10,
        per_device_train_batch_size=wandb.config['batch_size'],
        per_device_eval_batch_size=wandb.config['batch_size'],
        num_train_epochs=wandb.config['epochs'],
        warmup_steps=100,
        learning_rate=wandb.config['learning_rate'],
        weight_decay=0.01,
        report_to="wandb",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=True,
        # The target repository is chosen here; Trainer.push_to_hub itself
        # takes a commit message, not a repository id.
        hub_model_id=f"minoosh/{run_name}",
        save_total_limit=2
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    # Train the model
    trainer.train()

    # Test-set evaluation is currently disabled:
    #trainer.evaluate(tokenized_test)

    # Swap the wrapper for the wrapped Hugging Face model so that the saved
    # and uploaded artifact is the inner model rather than the custom module.
    trainer.model = trainer.model.model

    # Save and push the model to the Hugging Face Hub
    trainer.save_model(output_dir)
    trainer.push_to_hub(commit_message=f"Upload fine-tuned {run_name}")

    # End the wandb run
    wandb.finish()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mminooshayan97[0m ([33mminoosh[0m). Use [1m`wandb login --relogin`[0m to force relogin


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
# Candidate loss names; only the first entry is trained in this cell
loss_functions = ["mse", "mae", "contrastive", "cosine_embedding"]
loss_fn = loss_functions[0]

# Hyperparameters travel through the wandb run config, where train_crossencoder reads them
run_settings = {"epochs": 7, "batch_size": 16, "learning_rate": 2e-5}
wandb.init(
    project="bert-crossencoder-regression",
    name=f"bert-crossencoder-{loss_fn}",
    config=run_settings,
)
train_crossencoder(loss_fn)
wandb.finish()

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Mse,Mae,Pearson Corr,Spearman Corr,Cosine Sim
1,0.124,0.091279,0.091279,0.253694,0.146698,0.133682,0.904111
2,0.0985,0.075166,0.075166,0.224198,0.101012,0.072596,0.901816
3,0.1071,0.071218,0.071218,0.215689,0.228309,0.205325,0.907049
4,0.0785,0.114128,0.114128,0.263189,0.161349,0.126939,0.895736
5,0.0773,0.088915,0.088915,0.24045,0.140914,0.125374,0.880655
6,0.0664,0.091903,0.091903,0.237298,0.228223,0.142802,0.8981
7,0.0523,0.106984,0.106984,0.259456,0.182481,0.131917,0.88966


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


No files have been modified since last commit. Skipping to prevent empty commit.


0,1
eval/cosine_sim,▇▇█▅▁▆▃▄
eval/loss,▄▂▁█▄▄▇▄
eval/mae,▇▂▁█▅▄▇▇
eval/mse,▄▂▁█▄▄▇▄
eval/pearson_corr,▃▁▆▃▃▆▄█
eval/runtime,▁▅▇█▇▆▅▄
eval/samples_per_second,█▃▂▁▂▃▄▄
eval/spearman_corr,▄▁▇▃▃▄▄█
eval/steps_per_second,█▆▅▅▅▆▆▁
train/epoch,▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████

0,1
eval/cosine_sim,0.8917
eval/loss,0.09104
eval/mae,0.25734
eval/mse,0.09104
eval/pearson_corr,0.28519
eval/runtime,2.3252
eval/samples_per_second,34.406
eval/spearman_corr,0.22604
eval/steps_per_second,2.15
total_flos,0.0
