In [None]:
# Install the Hugging Face libraries and the Weights & Biases client used below
# (-q keeps pip's output short).
# NOTE(review): torch, scikit-learn and scipy are imported later but not installed
# here - presumably they are preinstalled in the notebook runtime; confirm.
!pip install -q transformers datasets wandb

In [None]:
# Authenticate with the Hugging Face Hub; the training code later in this notebook
# pushes the fine-tuned model to the Hub.
# NOTE(review): the value after --token looks redacted/incomplete ("hf_"). Supply a
# real token at run time and avoid committing it with the notebook.
!huggingface-cli login --token hf_

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import pearsonr, spearmanr
import wandb
from transformers import BertConfig

# Start a wandb run in the "bert-crossencoder-empathy" project.
# NOTE(review): wandb.init is called again (with a run name and config) in the
# training cell further down; this earlier call may create a redundant run -
# confirm it is still needed.
wandb.init(project="bert-crossencoder-empathy")

# Load the story-pair dataset from the Hugging Face Hub; its 'train', 'test' and
# 'validation' splits are tokenized below.
dataset = load_dataset("minoosh/Annotated_story_pairs2")

# Checkpoint name shared by the tokenizer here and the model loaded at training time
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Encode each (text1, text2) pair as one cross-encoder input sequence
def preprocess_function(examples):
    """Tokenize a batch of story pairs with the module-level ``tokenizer``.

    Args:
        examples: Batch mapping that provides the 'text1' and 'text2' columns.

    Returns:
        The tokenizer output for the paired texts, truncated to at most 512
        tokens and padded to the longest sequence in the batch.
    """
    first_stories = examples['text1']
    second_stories = examples['text2']
    encoded_pairs = tokenizer(
        first_stories,
        second_stories,
        max_length=512,
        truncation=True,
        padding=True,
    )
    return encoded_pairs

# Apply tokenization to every split (batched, so the tokenizer receives lists of texts)
tokenized_train = dataset['train'].map(preprocess_function, batched=True)
tokenized_test = dataset['test'].map(preprocess_function, batched=True)
tokenized_val = dataset['validation'].map(preprocess_function, batched=True)

# Expose only the model inputs and the target, as PyTorch tensors.
# token_type_ids is kept on purpose: it marks which tokens belong to text1 and
# which to text2, and BERT falls back to all-zero segment ids when it is missing.
# Only columns the tokenizer actually produced are requested, so a tokenizer
# without token_type_ids does not make set_format fail.
columns_to_keep = [
    column
    for column in ('input_ids', 'token_type_ids', 'attention_mask', 'label')
    if column in tokenized_train.column_names
]
tokenized_train.set_format(type='torch', columns=columns_to_keep)
tokenized_test.set_format(type='torch', columns=columns_to_keep)
tokenized_val.set_format(type='torch', columns=columns_to_keep)

# Regression metrics reported by the Trainer at evaluation time
def compute_metrics(eval_pred):
    """Compute error and correlation metrics for one evaluation pass.

    Args:
        eval_pred: Pair of (predictions, labels) arrays; both are squeezed
            before the metrics are computed.

    Returns:
        dict with the keys "mse", "mae", "pearson_corr" and "spearman_corr".
    """
    raw_predictions, raw_labels = eval_pred
    y_pred = raw_predictions.squeeze()
    y_true = raw_labels.squeeze()

    # Error metrics take (y_true, y_pred)
    results = {
        "mse": mean_squared_error(y_true, y_pred),
        "mae": mean_absolute_error(y_true, y_pred),
    }

    # Correlation metrics: only the coefficient is kept, the p-value is discarded
    pearson_r, _pearson_p = pearsonr(y_pred, y_true)
    spearman_r, _spearman_p = spearmanr(y_pred, y_true)
    results["pearson_corr"] = pearson_r
    results["spearman_corr"] = spearman_r

    return results

# Loss that rewards predictions pointing in the same direction as the targets
class CosineSimilarityLoss(torch.nn.Module):
    """Loss equal to one minus the cosine similarity of predictions and targets.

    The similarity is computed along dim 0, so for 1-D inputs the whole batch
    of predictions is compared with the whole batch of targets as two vectors.
    """

    def forward(self, predictions, targets):
        """Return ``1 - mean(cosine_similarity(predictions, targets, dim=0))``."""
        cosine = torch.nn.functional.cosine_similarity
        similarity = cosine(predictions, targets, dim=0)
        mean_similarity = similarity.mean()
        # Perfect alignment gives a loss of 0
        return 1 - mean_similarity

# CrossEncoder Model with SequenceClassification head
def train_crossencoder(loss_fn="mse"):
    """Fine-tune a single-output BERT cross-encoder with the selected loss.

    Loads ``model_name`` with a one-logit sequence-classification head, trains
    it on ``tokenized_train`` (evaluating on ``tokenized_val`` after every
    epoch), scores ``tokenized_test``, saves the model locally, pushes it to
    the Hugging Face Hub and finally closes the active wandb run.

    Args:
        loss_fn: Name of the training loss: "mse", "mae", "cross_entropy" or
            "cosine_sim".

    Returns:
        The metrics dict produced by evaluating on the test split (keys are
        prefixed with "test_").

    Raises:
        ValueError: If ``loss_fn`` is not one of the supported names.
    """
    # Fail fast on a bad name instead of after the model has been loaded
    supported_losses = ("mse", "mae", "cross_entropy", "cosine_sim")
    if loss_fn not in supported_losses:
        raise ValueError(
            f"Unsupported loss_fn {loss_fn!r}; expected one of {supported_losses}"
        )

    # Load pre-trained BERT model for sequence classification with config
    config = BertConfig.from_pretrained(model_name, num_labels=1)  # Set num_labels=1 for regression
    model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)

    # Build the loss module once rather than on every training step
    if loss_fn == "mse":
        loss_fct = torch.nn.MSELoss()  # Mean Squared Error Loss
    elif loss_fn == "mae":
        loss_fct = torch.nn.L1Loss()  # Mean Absolute Error Loss
    elif loss_fn == "cross_entropy":
        # NOTE(review): the model has a single output column (num_labels=1),
        # while CrossEntropyLoss expects (batch, num_classes) logits; this
        # option probably needs a multi-class head to be usable - confirm.
        loss_fct = torch.nn.CrossEntropyLoss()
    else:
        loss_fct = CosineSimilarityLoss()  # Custom Cosine Similarity Loss

    def custom_loss_fn(logits, labels):
        # Drop only the trailing num_labels=1 axis. A bare squeeze() would also
        # remove the batch axis when a batch holds a single example.
        logits = logits.squeeze(-1)
        if loss_fn == "cross_entropy":
            labels = labels.long()  # Convert labels to long for CrossEntropy
        else:
            # Regression losses need targets in the same floating dtype as the logits
            labels = labels.to(logits.dtype)
        return loss_fct(logits, labels)

    # Wrap the custom loss function in a Trainer-compatible format
    class CustomTrainer(Trainer):
        # **kwargs absorbs extra keyword arguments that newer Trainer versions
        # pass to compute_loss (e.g. num_items_in_batch).
        def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
            labels = inputs.pop("labels")
            outputs = model(**inputs)
            logits = outputs.get("logits")
            loss = custom_loss_fn(logits, labels)
            return (loss, outputs) if return_outputs else loss

    # Define TrainingArguments
    training_args = TrainingArguments(
        output_dir=f"./output/crossencoder-{loss_fn}",
        evaluation_strategy="epoch",    # Evaluate at the end of each epoch
        logging_dir='./logs',           # Directory for logs
        logging_steps=10,               # Log every 10 steps
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        warmup_steps=100,
        learning_rate=2e-5,
        weight_decay=0.01,
        report_to="wandb",
        save_strategy="epoch",          # Save checkpoints at the end of each epoch
        save_total_limit=2,             # Keep only the 2 most recent checkpoints
        push_to_hub=True,               # Automatically push to Hugging Face Hub
        # The target repository is configured here; Trainer.push_to_hub() does
        # not take a repository id.
        hub_model_id=f"minoosh/crossencoder-{loss_fn}",
    )

    # Initialize Trainer with custom loss function
    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    # Train the model
    trainer.train()

    # Evaluate on the test set; the "test" prefix keeps these metrics apart
    # from the per-epoch validation metrics logged as "eval_*".
    test_metrics = trainer.evaluate(tokenized_test, metric_key_prefix="test")

    # Save the model locally and to Hugging Face Hub
    trainer.save_model(f"./output/crossencoder-{loss_fn}")
    # The first positional argument of push_to_hub is the commit message, so the
    # repository id must not be passed here (see hub_model_id above).
    trainer.push_to_hub()

    wandb.finish()

    return test_metrics

In [None]:
# Loss names accepted by train_crossencoder; one of them is trained per run of this cell
loss_functions = ["mse", "mae", "cross_entropy", "cosine_sim"]

# Select the loss to train with (index 0 is "mse"); change the index to try another one
loss_fn = loss_functions[0]

# Open a wandb run named after the selected loss. The config records the same
# epochs / batch size / learning rate that train_crossencoder uses.
wandb.init(
    project="bert-crossencoder-empathy",
    name=f"cross_encoder_{loss_fn}_run",
    config={"epochs": 3, "batch_size": 16, "learning_rate": 2e-5},
)
train_crossencoder(loss_fn)