In [6]:
!pip install -q transformers datasets wandb

In [7]:
!huggingface-cli login --token hf_

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [21]:
import torch

from datasets import load_dataset

from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, TrainingArguments, Trainer

from sklearn.metrics import mean_squared_error, mean_absolute_error

from scipy.stats import pearsonr, spearmanr

import wandb



# Initialize wandb

wandb.init(

    project="bert-crossencoder-regression"

)



# Load dataset

dataset = load_dataset("minoosh/Annotated_story_pairs2")



# Initialize the tokenizer and model for cross-encoder setup

model_name = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)



# Preprocess data for the cross-encoder model by concatenating text1 and text2 with [SEP]

def preprocess_function(examples):

    # Concatenate both texts with a [SEP] token in between

    encodings = tokenizer(examples['text1'], examples['text2'], truncation=True, padding=True, max_length=512)

    encodings['labels'] = examples['label']

    return encodings



# Apply tokenization

tokenized_train = dataset['train'].map(preprocess_function, batched=True)

tokenized_test = dataset['test'].map(preprocess_function, batched=True)

tokenized_val = dataset['validation'].map(preprocess_function, batched=True)



# Set format for PyTorch

tokenized_train.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

tokenized_test.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

tokenized_val.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])



# Define compute_metrics function for regression evaluation

def compute_metrics(eval_pred):

    predictions, labels = eval_pred

    predictions = predictions.squeeze()

    labels = labels.squeeze()



    mse = mean_squared_error(labels, predictions)

    mae = mean_absolute_error(labels, predictions)

    pearson_corr, _ = pearsonr(predictions, labels)

    spearman_corr, _ = spearmanr(predictions, labels)

    cosine_sim = torch.nn.functional.cosine_similarity(torch.tensor(predictions), torch.tensor(labels), dim=0).mean().item()



    return {

        "mse": mse,

        "mae": mae,

        "pearson_corr": pearson_corr,

        "spearman_corr": spearman_corr,

        "cosine_sim": cosine_sim  # Optional metric for similarity tasks

    }



# Custom Cross-Encoder model class with config

class CrossEncoderModel(torch.nn.Module):

    def __init__(self, model_name, loss_fn="mse"):

        super(CrossEncoderModel, self).__init__()

        # Load model config

        self.config = AutoConfig.from_pretrained(model_name, num_labels=1)  # Specify 1 output for regression

        self.model = AutoModelForSequenceClassification.from_pretrained(model_name, config=self.config)

        self.loss_fn = loss_fn



    def forward(self, input_ids, attention_mask, labels=None):

        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)

        logits = outputs.logits.squeeze()  # Output logits for regression



        loss = None

        if labels is not None:

            if self.loss_fn == "mse":

                loss_fct = torch.nn.MSELoss()

            elif self.loss_fn == "mae":

                loss_fct = torch.nn.L1Loss()

            elif self.loss_fn == "cosine_embedding":
                loss_fct = torch.nn.CosineEmbeddingLoss()
                labels_cosine = 2 * (labels > 0.5).float() - 1  # Convert to binary for cosine embedding loss
            
                # Make sure to provide a target similarity score (1 for similar, -1 for dissimilar)
                # Assuming you need to compute the target based on labels
                target = labels_cosine  # This can also be -1 or 1 depending on your implementatio
            elif self.loss_fn == "contrastive":
                loss_fct = self.contrastive_loss
            else:
                raise ValueError(f"Unknown loss function: {self.loss_fn}")

            if self.loss_fn == "cosine_embedding":
                loss = loss_fct(logits, target)  # Compute loss
                print("Logits shape:", logits.shape)
                print("Labels cosine shape:", labels_cosine.shape)
            else:
                loss = loss_fct(logits, labels)

        return {"loss": loss, "logits": logits}



    def contrastive_loss(self, logits, labels, margin=0.5):

        positive_pairs = labels * torch.pow(1 - logits, 2)  # For similar pairs (y=1)

        negative_pairs = (1 - labels) * torch.pow(torch.clamp(margin - logits, min=0.0), 2)  # For dissimilar pairs (y=0)

        return torch.mean(positive_pairs + negative_pairs)



# Function to initialize and train the cross-encoder model

def train_crossencoder(loss_fn):

    # Initialize the cross-encoder model with the specified loss function

    model = CrossEncoderModel(model_name=model_name, loss_fn=loss_fn)



    # Set up TrainingArguments

    training_args = TrainingArguments(

        output_dir=f"./output/bert-reg-crossencoder-{loss_fn}",

        evaluation_strategy="epoch",

        logging_dir='./logs',

        logging_steps=10,

        per_device_train_batch_size=wandb.config['batch_size'],

        per_device_eval_batch_size=wandb.config['batch_size'],

        num_train_epochs=wandb.config['epochs'],

        warmup_steps=100,

        learning_rate=wandb.config['learning_rate'],

        weight_decay=0.01,

        report_to="wandb",

        save_strategy="epoch",

        load_best_model_at_end=True,

        push_to_hub=True,

        save_total_limit=2

    )



    # Initialize Trainer

    trainer = Trainer(

        model=model,

        args=training_args,

        train_dataset=tokenized_train,

        eval_dataset=tokenized_val,

        tokenizer=tokenizer,

        compute_metrics=compute_metrics

    )



    # Train the model

    trainer.train()



    # Evaluate the model on the test set

    #trainer.evaluate(tokenized_test)



    trainer.model = trainer.model.model



    # Save and push the model to the Hugging Face Hub

    trainer.save_model(f"./output/bert-reg-crossencoder-{loss_fn}")

    trainer.push_to_hub(f"minoosh/bert-reg-crossencoder-{loss_fn}")



    # End the wandb run

    wandb.finish()

VBox(children=(Label(value='0.024 MB of 0.024 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))



In [4]:
# Specify list of loss functions to try

loss_functions = ["mse", "mae", "contrastive"]#, "cosine_embedding"]



loss_fn = loss_functions[0]

wandb.init(project="bert-crossencoder-regression", name=f"bert-crossencoder-{loss_fn}", config={"epochs": 7, "batch_size": 16, "learning_rate": 2e-5})

train_crossencoder(loss_fn)

wandb.finish()

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Mse,Mae,Pearson Corr,Spearman Corr,Cosine Sim
1,0.1034,0.070363,0.070363,0.219833,0.191383,0.242899,0.907023
2,0.097,0.073883,0.073883,0.216097,0.218461,0.220823,0.905907
3,0.0877,0.066262,0.066262,0.215413,0.321354,0.242605,0.913325
4,0.0679,0.072344,0.072344,0.205441,0.372181,0.338231,0.917484
5,0.0569,0.064429,0.064429,0.205819,0.386706,0.355188,0.915534
6,0.0408,0.077298,0.077298,0.210162,0.404528,0.310506,0.918981
7,0.0317,0.075221,0.075221,0.211979,0.393699,0.317778,0.916348


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
No files have been modified since last commit. Skipping to prevent empty commit.


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


0,1
eval/cosine_sim,▂▁▅▇▆█▇
eval/loss,▄▆▂▅▁█▇
eval/mae,█▆▆▁▁▃▄
eval/mse,▄▆▂▅▁█▇
eval/pearson_corr,▁▂▅▇▇██
eval/runtime,▁█▅▆▄▃▇
eval/samples_per_second,█▁▄▃▅▆▂
eval/spearman_corr,▂▁▂▇█▆▆
eval/steps_per_second,█▁▄▃▅▆▂
train/epoch,▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████

0,1
eval/cosine_sim,0.91635
eval/loss,0.07522
eval/mae,0.21198
eval/mse,0.07522
eval/pearson_corr,0.3937
eval/runtime,2.2993
eval/samples_per_second,35.228
eval/spearman_corr,0.31778
eval/steps_per_second,2.61
total_flos,0.0


In [4]:
# Specify list of loss functions to try

loss_functions = ["mse", "mae", "contrastive"]#, "cosine_embedding"]



loss_fn = loss_functions[1]

wandb.init(project="bert-crossencoder-regression", name=f"bert-crossencoder-{loss_fn}", config={"epochs": 7, "batch_size": 16, "learning_rate": 2e-5})

train_crossencoder(loss_fn)

wandb.finish()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Mse,Mae,Pearson Corr,Spearman Corr,Cosine Sim
1,0.2886,0.221346,0.074205,0.221346,0.065045,0.060357,0.90374
2,0.2582,0.222265,0.071363,0.222265,0.1319,0.141743,0.905233
3,0.2615,0.209426,0.066962,0.209426,0.285867,0.275261,0.911324
4,0.2247,0.215159,0.073347,0.215159,0.312614,0.270483,0.907462
5,0.1942,0.236252,0.089041,0.236252,0.363061,0.342385,0.911215
6,0.1758,0.219324,0.077558,0.219324,0.352802,0.324698,0.910648
7,0.166,0.220021,0.078092,0.220021,0.346116,0.31293,0.905029


  return F.l1_loss(input, target, reduction=self.reduction)
  return F.l1_loss(input, target, reduction=self.reduction)
  return F.l1_loss(input, target, reduction=self.reduction)
  return F.l1_loss(input, target, reduction=self.reduction)
  return F.l1_loss(input, target, reduction=self.reduction)
  return F.l1_loss(input, target, reduction=self.reduction)
  return F.l1_loss(input, target, reduction=self.reduction)


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


0,1
eval/cosine_sim,▁▂█▄█▇▂
eval/loss,▄▄▁▂█▄▄
eval/mae,▄▄▁▂█▄▄
eval/mse,▃▂▁▃█▄▅
eval/pearson_corr,▁▃▆▇███
eval/runtime,▆▁▁▁▁▁█
eval/samples_per_second,▃█████▁
eval/spearman_corr,▁▃▆▆██▇
eval/steps_per_second,▃█████▁
train/epoch,▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████

0,1
eval/cosine_sim,0.90503
eval/loss,0.22002
eval/mae,0.22002
eval/mse,0.07809
eval/pearson_corr,0.34612
eval/runtime,2.3617
eval/samples_per_second,34.297
eval/spearman_corr,0.31293
eval/steps_per_second,2.541
total_flos,0.0


In [8]:
# Specify list of loss functions to try

loss_functions = ["mse", "mae", "contrastive"]#, "cosine_embedding"]



loss_fn = loss_functions[2]

wandb.init(project="bert-crossencoder-regression", name=f"bert-crossencoder-{loss_fn}", config={"epochs": 7, "batch_size": 16, "learning_rate": 2e-5})

train_crossencoder(loss_fn)

wandb.finish()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Mse,Mae,Pearson Corr,Spearman Corr,Cosine Sim
1,0.0148,0.001377,0.288885,0.461431,-0.124305,-0.062464,0.900278
2,0.0096,0.007441,0.370551,0.545065,-0.043317,-0.034656,0.903027
3,0.0059,0.000123,0.254944,0.42848,-0.037221,-0.058474,0.903231
4,0.004,0.002271,0.317473,0.493974,-0.078334,-0.071454,0.902928
5,0.0026,0.000286,0.276956,0.451916,-0.030789,-0.007025,0.903333
6,0.0019,0.000238,0.277121,0.451176,-0.188415,-0.180459,0.902765
7,0.0018,0.000129,0.271716,0.445143,-0.203379,-0.195299,0.902723


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


0,1
eval/cosine_sim,▁▇█▇█▇▇
eval/loss,▂█▁▃▁▁▁
eval/mae,▃█▁▅▂▂▂
eval/mse,▃█▁▅▂▂▂
eval/pearson_corr,▄▇█▆█▂▁
eval/runtime,▁▃▅▄▃▄█
eval/samples_per_second,█▆▄▅▆▅▁
eval/spearman_corr,▆▇▆▆█▂▁
eval/steps_per_second,█▆▄▅▆▅▁
train/epoch,▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████

0,1
eval/cosine_sim,0.90272
eval/loss,0.00013
eval/mae,0.44514
eval/mse,0.27172
eval/pearson_corr,-0.20338
eval/runtime,2.3119
eval/samples_per_second,35.037
eval/spearman_corr,-0.1953
eval/steps_per_second,2.595
total_flos,0.0
