In [1]:
!pip install -q transformers datasets wandb sentence-transformers



In [2]:
!huggingface-cli login --token hf_

'huggingface-cli' is not recognized as an internal or external command,
operable program or batch file.


In [3]:
from sentence_transformers import CrossEncoder

# Initialize CrossEncoder model (BERT-based) with regression output
model = CrossEncoder('bert-base-uncased', num_labels=1)

  from tqdm.autonotebook import tqdm, trange
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
import wandb

# Initialize wandb
wandb.init(
    project="TESTbert-crossencoder-empathy",
    config={"epochs": 3, "batch_size": 16, "learning_rate": 2e-5}
)

In [26]:
from datasets import load_dataset
from sentence_transformers import InputExample

dataset = load_dataset("minoosh/Annotated_story_pairs2")

train_examples = [InputExample(texts=[row['text1'], row['text2']], label=row['label']) for row in dataset['train']]
val_examples = [InputExample(texts=[row['text1'], row['text2']], label=row['label']) for row in dataset['validation']]

# Tokenize the dataset for CrossEncoder
def preprocess_function(examples):
    return model.tokenizer(
        examples['text1'],
        examples['text2'],
        truncation=True,
        padding=True,
        max_length=512
    )

# Apply tokenization to each split in the dataset
tokenized_train = dataset['train'].map(preprocess_function, batched=True)
tokenized_test = dataset['test'].map(preprocess_function, batched=True)
tokenized_val = dataset['validation'].map(preprocess_function, batched=True)

# Remove unneeded columns and set format for PyTorch
tokenized_train = tokenized_train.remove_columns(["text1", "text2"])
tokenized_test = tokenized_test.remove_columns(["text1", "text2"])
tokenized_val = tokenized_val.remove_columns(["text1", "text2"])

'''# Convert to PyTorch tensors
tokenized_train.set_format("torch")
tokenized_test.set_format("torch")
tokenized_val.set_format("torch")'''

'# Convert to PyTorch tensors\ntokenized_train.set_format("torch")\ntokenized_test.set_format("torch")\ntokenized_val.set_format("torch")'

In [32]:
from transformers import TrainingArguments, Trainer
from MyCEmetrics import compute_metrics

# Define TrainingArguments
training_args = TrainingArguments(
    output_dir="./output/empathy-crossencoder",  # Save model here
    eval_strategy="steps",                 # Evaluate model every few steps
    eval_steps=40,                              # Evaluation frequency
    logging_dir='./logs',                        # Directory for logs
    logging_steps=10,                            # Log every 10 steps
    per_device_train_batch_size=wandb.config['batch_size'],
    per_device_eval_batch_size=wandb.config['batch_size'],
    num_train_epochs=wandb.config['epochs'],
    warmup_steps=100,                            # Warmup steps
    learning_rate=wandb.config['learning_rate'],  # Learning rate
    weight_decay=0.01,                           # Prevent overfitting
    report_to="wandb",                           # Log to wandb
    save_steps=40,                              # Save every 40 steps
    save_total_limit=2,                          # Save only the 2 latest checkpoints
    load_best_model_at_end=True,                 # Load the best model at the end
    metric_for_best_model="mse",                 # Use MSE as the metric
)


# Define the Trainer for training the CrossEncoder model
trainer = Trainer(
    model=model.model,                    # CrossEncoder model (inner model)
    args=training_args,                   # Training arguments
    train_dataset=tokenized_train,         # Training dataset
    eval_dataset=tokenized_val,            # Validation dataset
    tokenizer=model.tokenizer,            # Tokenizer from CrossEncoder
    compute_metrics=compute_metrics,       # Function to compute metrics
)

# Start training
trainer.train()

Step,Training Loss,Validation Loss,Mse,Mae,Pearson Corr,Spearman Corr,R2
40,0.1038,0.074989,0.074989,0.229586,0.06926,0.053132,-0.033615
80,0.0962,0.076002,0.076002,0.228981,0.229442,0.103004,-0.047581
120,0.1048,0.096649,0.096649,0.249318,0.260964,0.208432,-0.332173


TrainOutput(global_step=123, training_loss=0.10035543180093533, metrics={'train_runtime': 251.1591, 'train_samples_per_second': 7.68, 'train_steps_per_second': 0.49, 'total_flos': 507536668781568.0, 'train_loss': 0.10035543180093533, 'epoch': 3.0})

In [None]:
import pickle

# Evaluate on the test set
trainer.evaluate(tokenized_test)

# Save the model to Hugging Face Hub
model.push_to_hub("minoosh/crossencoder-empathy-model1")
model.tokenizer.push_to_hub("minoosh/crossencoder-empathy-model1")

# Get predictions on the test set
predictions_output = trainer.predict(tokenized_test)

# Finish wandb run
wandb.finish()

with open('predictions.pkl', 'wb') as f:
    pickle.dump(predictions_output, f)