In [1]:
!pip install -q transformers datasets wandb

In [2]:
!huggingface-cli login --token hf_

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [4]:
import torch
from datasets import load_dataset
from transformers import AutoModel, AutoTokenizer, TrainingArguments, Trainer
from transformers import BertConfig, BertModel
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import pearsonr, spearmanr
import wandb
import numpy as np

# Initialize wandb
wandb.init(
    project="bert-biencoder-regression"
)


# Load dataset
dataset = load_dataset("minoosh/Annotated_story_pairs2")


# Initialize bi-encoder model (e.g., BERT as a sentence encoder)
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModel.from_pretrained(model_name)


# Tokenize both text1 and text2 independently
def preprocess_function(examples):
    text1_encodings = tokenizer(examples['text1'], truncation=True, padding=True, max_length=512)
    text2_encodings = tokenizer(examples['text2'], truncation=True, padding=True, max_length=512)
    return {
        'input_ids_text1': text1_encodings['input_ids'],
        'attention_mask_text1': text1_encodings['attention_mask'],
        'input_ids_text2': text2_encodings['input_ids'],
        'attention_mask_text2': text2_encodings['attention_mask'],
        'labels': examples['label']
    }



# Apply tokenization
tokenized_train = dataset['train'].map(preprocess_function, batched=True)
#tokenized_test = dataset['test'].map(preprocess_function, batched=True)
tokenized_val = dataset['validation'].map(preprocess_function, batched=True)

# Remove unnecessary columns and set format for PyTorch
columns_to_keep = ['input_ids_text1', 'attention_mask_text1', 'input_ids_text2', 'attention_mask_text2', 'labels']
tokenized_train.set_format(type='torch', columns=columns_to_keep)
#tokenized_test.set_format(type='torch', columns=columns_to_keep)
tokenized_val.set_format(type='torch', columns=columns_to_keep)



# Define a custom collator to handle text1 and text2 encoding
class BiEncoderCollator:
    def __call__(self, features):
        batch = {
            'input_ids_text1': torch.stack([f['input_ids_text1'] for f in features]),
            'attention_mask_text1': torch.stack([f['attention_mask_text1'] for f in features]),
            'input_ids_text2': torch.stack([f['input_ids_text2'] for f in features]),
            'attention_mask_text2': torch.stack([f['attention_mask_text2'] for f in features]),
            'labels': torch.tensor([f['labels'] for f in features], dtype=torch.float)
        }
        return batch


collator = BiEncoderCollator()


# Define the compute_metrics function
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.squeeze()
    labels = labels.squeeze()

    mse = mean_squared_error(labels, predictions)
    mae = mean_absolute_error(labels, predictions)
    pearson_corr, _ = pearsonr(predictions, labels)
    spearman_corr, _ = spearmanr(predictions, labels)
    cosine_sim = torch.nn.functional.cosine_similarity(torch.tensor(predictions), torch.tensor(labels), dim=0).mean().item()

    return {
        "mse": mse,
        "mae": mae,
        "pearson_corr": pearson_corr,
        "spearman_corr": spearman_corr,
        "cosine_sim": cosine_sim  # Optional metric for similarity tasks
    }


# Define a custom BiEncoder model
class BiEncoderModel(torch.nn.Module):
    def __init__(self, base_model, config=None, loss_fn="mse"):
        super(BiEncoderModel, self).__init__()
        self.base_model = base_model
        self.cos = torch.nn.CosineSimilarity(dim=1)
        self.loss_fn = loss_fn
        self.config = config


    def forward(self, input_ids_text1, attention_mask_text1, input_ids_text2, attention_mask_text2, labels=None):
        # Encode text1 and text2 separately
        outputs_text1 = self.base_model(input_ids_text1, attention_mask=attention_mask_text1)
        outputs_text2 = self.base_model(input_ids_text2, attention_mask=attention_mask_text2)


        # Extract [CLS] token embeddings (first token)
        cls_embedding_text1 = outputs_text1.last_hidden_state[:, 0, :]
        cls_embedding_text2 = outputs_text2.last_hidden_state[:, 0, :]


        # Calculate cosine similarity between the two embeddings
        cos_sim = self.cos(cls_embedding_text1, cls_embedding_text2)

        loss = None
        if labels is not None:
            if self.loss_fn == "mse":
                loss_fct = torch.nn.MSELoss()  # Mean Squared Error Loss
            elif self.loss_fn == "mae":
                loss_fct = torch.nn.L1Loss()  # Mean Absolute Error Loss
            elif self.loss_fn == "contrastive":
                loss_fct = self.contrastive_loss
            elif self.loss_fn == "cosine_embedding":
                loss_fct = torch.nn.CosineEmbeddingLoss()  # Cosine Embedding Loss


            if self.loss_fn == "cosine_embedding":
                labels_cosine = 2 * (labels > 0.5).float() - 1  # Convert labels to binary for cosine embedding loss
                loss = loss_fct(cls_embedding_text1, cls_embedding_text2, labels_cosine)
            else:
                loss = loss_fct(cos_sim, labels)

        return {"loss": loss, "logits": cos_sim}


    def contrastive_loss(self, cos_sim, labels, margin=0.5):
        loss = torch.mean((1 - labels) * torch.pow(cos_sim, 2) + labels * torch.pow(torch.clamp(margin - cos_sim, min=0.0), 2))
        return loss



# Initialize the Bi-Encoder model with a specific loss function
def train_biencoder(loss_fn):
    # Load pre-trained BERT configuration and model
    config = BertConfig.from_pretrained(model_name)
    bert_model = BertModel.from_pretrained(model_name)

    # Initialize your custom BiEncoderModel with the BERT model and config
    bi_encoder_model = BiEncoderModel(base_model=bert_model, config=config, loss_fn=loss_fn)
    #bi_encoder_model = BiEncoderModel(base_model, loss_fn)

    # Define TrainingArguments
    training_args = TrainingArguments(
        output_dir=f"./output/bert-reg-biencoder-{loss_fn}",
        evaluation_strategy="epoch",    # Evaluate at the end of each epoch
        logging_dir='./logs',           # Directory for logs
        logging_steps=10,               # Log every 10 steps
        per_device_train_batch_size=wandb.config['batch_size'],
        per_device_eval_batch_size=wandb.config['batch_size'],
        num_train_epochs=wandb.config['epochs'],
        warmup_steps=100,
        learning_rate=wandb.config['learning_rate'],
        weight_decay=0.01,
        report_to="wandb",
        save_strategy="epoch",          # Save checkpoints at the end of each epoch
        load_best_model_at_end=True,
        push_to_hub=True,
        save_total_limit=2              # Keep only the 2 most recent checkpoints

    )


    # Define the Trainer
    trainer = Trainer(
        model=bi_encoder_model,             # Custom BiEncoder model
        args=training_args,                 # Training arguments
        train_dataset=tokenized_train,      # Training dataset
        eval_dataset=tokenized_val,         # Validation dataset
        data_collator=collator,             # Custom collator for handling bi-encoder inputs
        compute_metrics=compute_metrics     # Function to compute metrics
    )

    # Train the model
    trainer.train()


    # Evaluate the model on the test set
    #trainer.evaluate(tokenized_test)

    # Save the model to Hugging Face Hub
    trainer.save_model(f"./output/bert-reg-biencoder-{loss_fn}")
    trainer.push_to_hub(f"minoosh/bert-reg-biencoder-{loss_fn}")



    # Finish wandb run
    wandb.finish()

    return trainer

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113643588888382, max=1.0…

README.md:   0%|          | 0.00/589 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/655k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/118k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/118k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/643 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/80 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/81 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Map:   0%|          | 0/643 [00:00<?, ? examples/s]

Map:   0%|          | 0/81 [00:00<?, ? examples/s]

# 0

In [4]:
# Train bi-encoder with different loss functions
loss_functions = ["mse", "mae", "contrastive", "cosine_embedding"]
loss_fn = loss_functions[0]
wandb.init(project="bert-biencoder-regression", name=f"bert-biencoder-regression-{loss_fn}", config={"epochs": 7, "batch_size": 16, "learning_rate": 2e-5})
tr = train_biencoder(loss_fn)

VBox(children=(Label(value='0.016 MB of 0.016 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,Mse,Mae,Pearson Corr,Spearman Corr,Cosine Sim
1,0.1219,0.112369,0.111656,0.255983,0.140632,0.099309,0.905528
2,0.1017,0.083762,0.083299,0.224776,0.131244,0.123856,0.904485
3,0.0872,0.077782,0.077472,0.220506,0.251986,0.137354,0.909682
4,0.0694,0.085977,0.085593,0.232805,0.192284,0.145615,0.903664
5,0.0533,0.095826,0.095081,0.241779,0.308914,0.225189,0.913172
6,0.0478,0.078237,0.077783,0.221579,0.291349,0.232497,0.909628
7,0.0385,0.08166,0.081212,0.227752,0.283509,0.233073,0.909724


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.


VBox(children=(Label(value='0.030 MB of 0.030 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/cosine_sim,▂▂▅▁█▅▅
eval/loss,█▂▁▃▅▁▂
eval/mae,█▂▁▃▅▁▂
eval/mse,█▂▁▃▅▁▂
eval/pearson_corr,▁▁▆▃█▇▇
eval/runtime,▁▇▆▅▅▅█
eval/samples_per_second,█▂▃▄▄▄▁
eval/spearman_corr,▁▂▃▃███
eval/steps_per_second,█▂▃▄▄▄▁
train/epoch,▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇▇███

0,1
eval/cosine_sim,0.90972
eval/loss,0.08166
eval/mae,0.22775
eval/mse,0.08121
eval/pearson_corr,0.28351
eval/runtime,2.512
eval/samples_per_second,32.246
eval/spearman_corr,0.23307
eval/steps_per_second,1.194
total_flos,0.0


In [6]:
tr.tokenizer = tokenizer
repo_id = f"minoosh/bert-reg-biencoder-{loss_fn}" 
save_and_push_to_hub(tr, repo_id)

Saving model to temp_save_bert-reg-biencoder-mse...
Saving tokenizer...
Pushing to hub at minoosh/bert-reg-biencoder-mse...


- The pipeline tag "text-similarity" is not in the official list: text-classification, token-classification, table-question-answering, question-answering, zero-shot-classification, translation, summarization, feature-extraction, text-generation, text2text-generation, fill-mask, sentence-similarity, text-to-speech, text-to-audio, automatic-speech-recognition, audio-to-audio, audio-classification, voice-activity-detection, depth-estimation, image-classification, object-detection, image-segmentation, text-to-image, image-to-text, image-to-image, image-to-video, unconditional-image-generation, video-classification, reinforcement-learning, robotics, tabular-classification, tabular-regression, tabular-to-text, table-to-text, multiple-choice, text-retrieval, time-series-forecasting, text-to-video, image-text-to-text, visual-question-answering, document-question-answering, zero-shot-image-classification, graph-ml, mask-generation, zero-shot-object-detection, text-to-3d, image-to-3d, image-feat

Successfully pushed model to minoosh/bert-reg-biencoder-mse


In [7]:
# Load dataset
dataset = load_dataset("minoosh/Annotated_story_pairs2")

# Tokenize both text1 and text2 independently
def preprocess_function(examples):
    #tokenizer = loaded_tokenizer
    text1_encodings = tokenizer(examples['text1'], truncation=True, padding=True, max_length=512)
    text2_encodings = tokenizer(examples['text2'], truncation=True, padding=True, max_length=512)
    return {
        'input_ids_text1': text1_encodings['input_ids'],
        'attention_mask_text1': text1_encodings['attention_mask'],
        'input_ids_text2': text2_encodings['input_ids'],
        'attention_mask_text2': text2_encodings['attention_mask'],
        'labels': examples['label']
    }

tokenized_test = dataset['test'].map(preprocess_function, batched=True)

# Remove unnecessary columns and set format for PyTorch
columns_to_keep = ['input_ids_text1', 'attention_mask_text1', 'input_ids_text2', 'attention_mask_text2', 'labels']
tokenized_test.set_format(type='torch', columns=columns_to_keep)

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

In [8]:
wandb.init()
tr.predict(tokenized_test)

[34m[1mwandb[0m: Currently logged in as: [33mminooshayan97[0m ([33mminoosh[0m). Use [1m`wandb login --relogin`[0m to force relogin


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


PredictionOutput(predictions=array([0.5411869 , 0.72022855, 0.55109036, 0.7318852 , 0.6615393 ,
       0.6674884 , 0.6390757 , 0.54861987, 0.6876057 , 0.7989178 ,
       0.65871495, 0.7100351 , 0.7401358 , 0.701087  , 0.632648  ,
       0.6298427 , 0.6468965 , 0.51127005, 0.5996416 , 0.58648777,
       0.6594692 , 0.71406525, 0.70586884, 0.62428755, 0.7296514 ,
       0.7332828 , 0.6785795 , 0.6275251 , 0.68053985, 0.6697133 ,
       0.78167963, 0.667819  , 0.48748174, 0.6629039 , 0.56138086,
       0.71961564, 0.74248636, 0.6166283 , 0.6198367 , 0.5173968 ,
       0.51861715, 0.6012773 , 0.39831746, 0.70240736, 0.7234752 ,
       0.75781965, 0.60142493, 0.74426496, 0.60273045, 0.72714597,
       0.7692679 , 0.62940055, 0.7208171 , 0.7243372 , 0.69598484,
       0.74242675, 0.6271779 , 0.5507619 , 0.73643994, 0.71929026,
       0.7324075 , 0.41290402, 0.69370973, 0.77956635, 0.72937536,
       0.49146456, 0.74269587, 0.3367164 , 0.79455215, 0.6604707 ,
       0.58996105, 0.7397574 , 0.

# 1

In [5]:
# Train bi-encoder with different loss functions
loss_functions = ["mse", "mae", "contrastive", "cosine_embedding"]
loss_fn = loss_functions[1]
wandb.init(project="bert-biencoder-regression", name=f"bert-biencoder-regression-{loss_fn}", config={"epochs": 7, "batch_size": 16, "learning_rate": 2e-5})
tr = train_biencoder(loss_fn)

VBox(children=(Label(value='0.016 MB of 0.016 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,Mse,Mae,Pearson Corr,Spearman Corr,Cosine Sim
1,0.2846,0.261682,0.115271,0.26097,0.132692,0.093613,0.905303
2,0.2728,0.230977,0.088586,0.23036,0.01876,0.031608,0.899428
3,0.2511,0.228228,0.084699,0.227604,0.171599,0.111065,0.905807
4,0.2253,0.233306,0.086442,0.232866,0.19062,0.119114,0.904066
5,0.1993,0.232854,0.082225,0.232499,0.23029,0.124551,0.901608
6,0.1844,0.235681,0.082849,0.235202,0.228399,0.125433,0.901838
7,0.165,0.234043,0.081939,0.233532,0.247543,0.132941,0.902216


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
No files have been modified since last commit. Skipping to prevent empty commit.


VBox(children=(Label(value='0.030 MB of 0.030 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/cosine_sim,▇▁█▆▃▄▄
eval/loss,█▂▁▂▂▃▂
eval/mae,█▂▁▂▂▃▂
eval/mse,█▂▂▂▁▁▁
eval/pearson_corr,▄▁▆▆▇▇█
eval/runtime,▁▇████▇
eval/samples_per_second,█▂▁▁▁▁▂
eval/spearman_corr,▅▁▆▇▇▇█
eval/steps_per_second,█▂▁▁▁▁▂
train/epoch,▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇▇███

0,1
eval/cosine_sim,0.90222
eval/loss,0.23404
eval/mae,0.23353
eval/mse,0.08194
eval/pearson_corr,0.24754
eval/runtime,2.6546
eval/samples_per_second,30.514
eval/spearman_corr,0.13294
eval/steps_per_second,1.13
total_flos,0.0


In [6]:
tr.tokenizer = tokenizer
repo_id = f"minoosh/bert-reg-biencoder-{loss_fn}" 
save_and_push_to_hub(tr, repo_id)

Saving model to temp_save_bert-reg-biencoder-mae...
Saving tokenizer...
Pushing to hub at minoosh/bert-reg-biencoder-mae...


- The pipeline tag "text-similarity" is not in the official list: text-classification, token-classification, table-question-answering, question-answering, zero-shot-classification, translation, summarization, feature-extraction, text-generation, text2text-generation, fill-mask, sentence-similarity, text-to-speech, text-to-audio, automatic-speech-recognition, audio-to-audio, audio-classification, voice-activity-detection, depth-estimation, image-classification, object-detection, image-segmentation, text-to-image, image-to-text, image-to-image, image-to-video, unconditional-image-generation, video-classification, reinforcement-learning, robotics, tabular-classification, tabular-regression, tabular-to-text, table-to-text, multiple-choice, text-retrieval, time-series-forecasting, text-to-video, image-text-to-text, visual-question-answering, document-question-answering, zero-shot-image-classification, graph-ml, mask-generation, zero-shot-object-detection, text-to-3d, image-to-3d, image-feat

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Successfully pushed model to minoosh/bert-reg-biencoder-mae


In [7]:
# Load dataset
dataset = load_dataset("minoosh/Annotated_story_pairs2")

# Tokenize both text1 and text2 independently
def preprocess_function(examples):
    #tokenizer = loaded_tokenizer
    text1_encodings = tokenizer(examples['text1'], truncation=True, padding=True, max_length=512)
    text2_encodings = tokenizer(examples['text2'], truncation=True, padding=True, max_length=512)
    return {
        'input_ids_text1': text1_encodings['input_ids'],
        'attention_mask_text1': text1_encodings['attention_mask'],
        'input_ids_text2': text2_encodings['input_ids'],
        'attention_mask_text2': text2_encodings['attention_mask'],
        'labels': examples['label']
    }

tokenized_test = dataset['test'].map(preprocess_function, batched=True)

# Remove unnecessary columns and set format for PyTorch
columns_to_keep = ['input_ids_text1', 'attention_mask_text1', 'input_ids_text2', 'attention_mask_text2', 'labels']
tokenized_test.set_format(type='torch', columns=columns_to_keep)

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

In [8]:
wandb.init()
tr.predict(tokenized_test)

[34m[1mwandb[0m: Currently logged in as: [33mminooshayan97[0m ([33mminoosh[0m). Use [1m`wandb login --relogin`[0m to force relogin


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


PredictionOutput(predictions=array([0.60868603, 0.76147115, 0.63553685, 0.7573016 , 0.6355754 ,
       0.66637826, 0.629447  , 0.5855908 , 0.6867771 , 0.7636793 ,
       0.6991831 , 0.65475047, 0.7566673 , 0.6781672 , 0.6747706 ,
       0.63560665, 0.59526014, 0.5547283 , 0.63644236, 0.60931593,
       0.6662388 , 0.71907336, 0.7167965 , 0.6326246 , 0.75798404,
       0.71789443, 0.69856375, 0.64524287, 0.6728792 , 0.7459571 ,
       0.76497203, 0.6523702 , 0.47621325, 0.6602509 , 0.727394  ,
       0.7407073 , 0.759524  , 0.65505433, 0.6859933 , 0.6314428 ,
       0.6089138 , 0.6519295 , 0.62294126, 0.6884976 , 0.73099774,
       0.79346174, 0.6113372 , 0.7708464 , 0.6391624 , 0.7504729 ,
       0.77726305, 0.7240142 , 0.7362951 , 0.7212137 , 0.70402807,
       0.7742696 , 0.6966531 , 0.6064799 , 0.74522364, 0.76917815,
       0.7406305 , 0.52029717, 0.70760185, 0.75581133, 0.73019034,
       0.6023003 , 0.7116103 , 0.40294296, 0.83097863, 0.6867243 ,
       0.5872477 , 0.77481234, 0.

# 2

In [4]:
# Train bi-encoder with different loss functions
loss_functions = ["mse", "mae", "contrastive", "cosine_embedding"]
loss_fn = loss_functions[2]
wandb.init(project="bert-biencoder-regression", name=f"bert-biencoder-regression-{loss_fn}", config={"epochs": 7, "batch_size": 16, "learning_rate": 2e-5})
tr = train_biencoder(loss_fn)

VBox(children=(Label(value='0.016 MB of 0.016 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,Mse,Mae,Pearson Corr,Spearman Corr,Cosine Sim
1,0.2152,0.22763,0.097925,0.238648,0.10758,0.100026,0.904071
2,0.0862,0.086637,0.114746,0.290732,0.031203,0.050001,0.868783
3,0.0663,0.078817,0.106616,0.281235,0.103186,0.126869,0.890286
4,0.0569,0.080645,0.104658,0.281388,0.126067,0.138566,0.889682
5,0.0524,0.080121,0.111437,0.285503,0.110318,0.103321,0.881939
6,0.0487,0.080797,0.111748,0.287061,0.118664,0.09506,0.880619
7,0.0451,0.080986,0.114097,0.291096,0.102437,0.084281,0.877414


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
No files have been modified since last commit. Skipping to prevent empty commit.


VBox(children=(Label(value='0.030 MB of 0.030 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/cosine_sim,█▁▅▅▄▃▃
eval/loss,█▁▁▁▁▁▁
eval/mae,▁█▇▇▇▇█
eval/mse,▁█▅▄▇▇█
eval/pearson_corr,▇▁▆█▇▇▆
eval/runtime,▁▃▅▇██▇
eval/samples_per_second,█▆▄▂▁▁▂
eval/spearman_corr,▅▁▇█▅▅▄
eval/steps_per_second,█▆▄▂▁▁▂
train/epoch,▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇▇███

0,1
eval/cosine_sim,0.87741
eval/loss,0.08099
eval/mae,0.2911
eval/mse,0.1141
eval/pearson_corr,0.10244
eval/runtime,2.4795
eval/samples_per_second,32.668
eval/spearman_corr,0.08428
eval/steps_per_second,1.21
total_flos,0.0


In [6]:
tr.tokenizer = tokenizer
repo_id = f"minoosh/bert-reg-biencoder-{loss_fn}" 
save_and_push_to_hub(tr, repo_id)

Saving model to temp_save_bert-reg-biencoder-contrastive...
Saving tokenizer...
Pushing to hub at minoosh/bert-reg-biencoder-contrastive...


- The pipeline tag "text-similarity" is not in the official list: text-classification, token-classification, table-question-answering, question-answering, zero-shot-classification, translation, summarization, feature-extraction, text-generation, text2text-generation, fill-mask, sentence-similarity, text-to-speech, text-to-audio, automatic-speech-recognition, audio-to-audio, audio-classification, voice-activity-detection, depth-estimation, image-classification, object-detection, image-segmentation, text-to-image, image-to-text, image-to-image, image-to-video, unconditional-image-generation, video-classification, reinforcement-learning, robotics, tabular-classification, tabular-regression, tabular-to-text, table-to-text, multiple-choice, text-retrieval, time-series-forecasting, text-to-video, image-text-to-text, visual-question-answering, document-question-answering, zero-shot-image-classification, graph-ml, mask-generation, zero-shot-object-detection, text-to-3d, image-to-3d, image-feat

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Successfully pushed model to minoosh/bert-reg-biencoder-contrastive


In [8]:
# Load dataset
dataset = load_dataset("minoosh/Annotated_story_pairs2")

# Tokenize both text1 and text2 independently
def preprocess_function(examples):
    #tokenizer = loaded_tokenizer
    text1_encodings = tokenizer(examples['text1'], truncation=True, padding=True, max_length=512)
    text2_encodings = tokenizer(examples['text2'], truncation=True, padding=True, max_length=512)
    return {
        'input_ids_text1': text1_encodings['input_ids'],
        'attention_mask_text1': text1_encodings['attention_mask'],
        'input_ids_text2': text2_encodings['input_ids'],
        'attention_mask_text2': text2_encodings['attention_mask'],
        'labels': examples['label']
    }

tokenized_test = dataset['test'].map(preprocess_function, batched=True)

# Remove unnecessary columns and set format for PyTorch
columns_to_keep = ['input_ids_text1', 'attention_mask_text1', 'input_ids_text2', 'attention_mask_text2', 'labels']
tokenized_test.set_format(type='torch', columns=columns_to_keep)

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

In [11]:
wandb.init()
tr.predict(tokenized_test)

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


PredictionOutput(predictions=array([0.30174786, 0.3470901 , 0.46558392, 0.33070868, 0.38689995,
       0.3594342 , 0.30435553, 0.37352902, 0.28839645, 0.5199138 ,
       0.25260442, 0.38173229, 0.3256558 , 0.29137397, 0.31153888,
       0.38343948, 0.30422994, 0.28533125, 0.2705326 , 0.27401757,
       0.3912194 , 0.52563477, 0.31145102, 0.2694178 , 0.43835   ,
       0.45095843, 0.3691133 , 0.24168783, 0.3217611 , 0.30706072,
       0.41727334, 0.3716106 , 0.32730258, 0.4408971 , 0.5407703 ,
       0.4682087 , 0.48464617, 0.36408383, 0.43082076, 0.3985268 ,
       0.36366433, 0.334626  , 0.33188298, 0.43867353, 0.38331282,
       0.46235886, 0.16892852, 0.31238768, 0.33027142, 0.29585904,
       0.49059904, 0.30969012, 0.50948894, 0.2946757 , 0.3532772 ,
       0.3430931 , 0.36803997, 0.2926734 , 0.47204572, 0.50095546,
       0.41397455, 0.3904289 , 0.3625942 , 0.58675456, 0.43666878,
       0.18831   , 0.40957376, 0.30673474, 0.4662912 , 0.4768079 ,
       0.26910836, 0.39610055, 0.

# 3

In [5]:
# Train bi-encoder with different loss functions
loss_functions = ["mse", "mae", "contrastive", "cosine_embedding"]
loss_fn = loss_functions[3]
wandb.init(project="bert-biencoder-regression", name=f"bert-biencoder-regression-{loss_fn}", config={"epochs": 7, "batch_size": 16, "learning_rate": 2e-5})
tr = train_biencoder(loss_fn)

VBox(children=(Label(value='0.016 MB of 0.016 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,Mse,Mae,Pearson Corr,Spearman Corr,Cosine Sim
1,0.5366,0.521217,0.136733,0.291817,0.107817,0.104922,0.90468
2,0.5294,0.506096,0.099628,0.239027,0.138539,0.151546,0.904774
3,0.4811,0.486177,0.085464,0.238144,0.132545,0.08993,0.885686
4,0.4292,0.44941,0.151136,0.320175,0.215457,0.177835,0.787157
5,0.3672,0.451259,0.147007,0.30673,0.220642,0.184861,0.797957
6,0.3259,0.464371,0.220932,0.390079,0.18635,0.166244,0.669631
7,0.2866,0.460768,0.2163,0.372918,0.181961,0.161866,0.69408


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
No files have been modified since last commit. Skipping to prevent empty commit.


VBox(children=(Label(value='0.030 MB of 0.030 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/cosine_sim,██▇▄▅▁▂
eval/loss,█▇▅▁▁▂▂
eval/mae,▃▁▁▅▄█▇
eval/mse,▄▂▁▄▄██
eval/pearson_corr,▁▃▃██▆▆
eval/runtime,▁█▅▅▇▄▅
eval/samples_per_second,█▁▄▄▂▅▄
eval/spearman_corr,▂▆▁▇█▇▆
eval/steps_per_second,█▁▄▃▂▅▄
train/epoch,▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇▇███

0,1
eval/cosine_sim,0.69408
eval/loss,0.46077
eval/mae,0.37292
eval/mse,0.2163
eval/pearson_corr,0.18196
eval/runtime,2.4514
eval/samples_per_second,33.042
eval/spearman_corr,0.16187
eval/steps_per_second,1.224
total_flos,0.0


In [6]:
tr.tokenizer = tokenizer
repo_id = f"minoosh/bert-reg-biencoder-{loss_fn}" 
save_and_push_to_hub(tr, repo_id)

Saving model to temp_save_bert-reg-biencoder-cosine_embedding...
Saving tokenizer...
Pushing to hub at minoosh/bert-reg-biencoder-cosine_embedding...


- The pipeline tag "text-similarity" is not in the official list: text-classification, token-classification, table-question-answering, question-answering, zero-shot-classification, translation, summarization, feature-extraction, text-generation, text2text-generation, fill-mask, sentence-similarity, text-to-speech, text-to-audio, automatic-speech-recognition, audio-to-audio, audio-classification, voice-activity-detection, depth-estimation, image-classification, object-detection, image-segmentation, text-to-image, image-to-text, image-to-image, image-to-video, unconditional-image-generation, video-classification, reinforcement-learning, robotics, tabular-classification, tabular-regression, tabular-to-text, table-to-text, multiple-choice, text-retrieval, time-series-forecasting, text-to-video, image-text-to-text, visual-question-answering, document-question-answering, zero-shot-image-classification, graph-ml, mask-generation, zero-shot-object-detection, text-to-3d, image-to-3d, image-feat

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Successfully pushed model to minoosh/bert-reg-biencoder-cosine_embedding


In [7]:
# Load dataset
dataset = load_dataset("minoosh/Annotated_story_pairs2")

# Tokenize both text1 and text2 independently
def preprocess_function(examples):
    #tokenizer = loaded_tokenizer
    text1_encodings = tokenizer(examples['text1'], truncation=True, padding=True, max_length=512)
    text2_encodings = tokenizer(examples['text2'], truncation=True, padding=True, max_length=512)
    return {
        'input_ids_text1': text1_encodings['input_ids'],
        'attention_mask_text1': text1_encodings['attention_mask'],
        'input_ids_text2': text2_encodings['input_ids'],
        'attention_mask_text2': text2_encodings['attention_mask'],
        'labels': examples['label']
    }

tokenized_test = dataset['test'].map(preprocess_function, batched=True)

# Remove unnecessary columns and set format for PyTorch
columns_to_keep = ['input_ids_text1', 'attention_mask_text1', 'input_ids_text2', 'attention_mask_text2', 'labels']
tokenized_test.set_format(type='torch', columns=columns_to_keep)

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

In [8]:
wandb.init()
tr.predict(tokenized_test)

[34m[1mwandb[0m: Currently logged in as: [33mminooshayan97[0m ([33mminoosh[0m). Use [1m`wandb login --relogin`[0m to force relogin


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


PredictionOutput(predictions=array([ 0.26072887,  0.28672993,  0.64897376,  0.84406495,  0.678409  ,
        0.18588527,  0.23285288,  0.45452306,  0.07121143,  0.6681566 ,
        0.12951618,  0.4418021 ,  0.14077725,  0.5943586 , -0.00660199,
        0.12853542,  0.6531828 ,  0.22709721,  0.2340945 ,  0.19466656,
        0.4856698 ,  0.52867496,  0.0326823 ,  0.4308136 ,  0.71294254,
        0.5404089 ,  0.44032383,  0.02640997,  0.54363096,  0.28043038,
        0.39087322,  0.81582737,  0.45609728,  0.5776437 ,  0.4997825 ,
        0.63424945,  0.70435804,  0.55719954,  0.39546803,  0.16888575,
        0.01424574,  0.09749764, -0.07720552,  0.5839576 ,  0.2537427 ,
        0.31187263,  0.54344535,  0.3098923 ,  0.66727346,  0.66106015,
        0.7535709 , -0.04317653,  0.44905603,  0.2072299 ,  0.7977514 ,
        0.44362992,  0.30361336, -0.1363585 ,  0.40101823,  0.54933596,
        0.7151332 , -0.11199361,  0.41664487,  0.80006826,  0.68898124,
       -0.05778718,  0.61715233, -0

# save and push to hub

In [3]:
import os
import json
from huggingface_hub import HfApi
from transformers import AutoModel, AutoConfig, AutoTokenizer, BertConfig

def save_and_push_to_hub(trainer, repo_id, token=None):
    """
    Save and push Regression BiEncoder model to Hugging Face Hub
    
    Args:
        trainer: Trainer instance containing the model
        repo_id: String like 'username/model-name'
        token: Optional Hugging Face token
    """
    api = HfApi()
    
    try:
        temp_save_path = f"temp_save_{repo_id.split('/')[-1]}"
        os.makedirs(temp_save_path, exist_ok=True)
        
        print(f"Saving model to {temp_save_path}...")
        
        # 1. Save base model configuration
        base_config = trainer.model.base_model.config.to_dict()
        base_config.update({
            "model_type": "bert",
            "architectures": ["BiEncoderModelRegression"],
            "loss_fn": trainer.model.loss_fn,
            "task_type": "regression",
            "is_regression": True
        })
        
        with open(os.path.join(temp_save_path, "config.json"), 'w') as f:
            json.dump(base_config, f)
            
        # 2. Save complete model weights
        torch.save(trainer.model.state_dict(), os.path.join(temp_save_path, "pytorch_model.bin"))
        
        # 3. Save tokenizer
        print("Saving tokenizer...")
        tokenizer.save_pretrained(temp_save_path)
        
        # 4. Save model code
        model_code = """
import torch
from transformers import PreTrainedModel

class BiEncoderModelRegression(torch.nn.Module):
    def __init__(self, base_model, config=None, loss_fn="mse"):
        super().__init__()
        self.base_model = base_model
        self.cos = torch.nn.CosineSimilarity(dim=1)
        self.loss_fn = loss_fn
        self.config = config

    def forward(self, input_ids_text1, attention_mask_text1, input_ids_text2, attention_mask_text2, labels=None):
        outputs_text1 = self.base_model(input_ids_text1, attention_mask=attention_mask_text1)
        outputs_text2 = self.base_model(input_ids_text2, attention_mask=attention_mask_text2)
        
        cls_embedding_text1 = outputs_text1.last_hidden_state[:, 0, :]
        cls_embedding_text2 = outputs_text2.last_hidden_state[:, 0, :]
        
        cos_sim = self.cos(cls_embedding_text1, cls_embedding_text2)
        
        loss = None
        if labels is not None:
            if self.loss_fn == "mse":
                loss_fct = torch.nn.MSELoss()
            elif self.loss_fn == "mae":
                loss_fct = torch.nn.L1Loss()
            elif self.loss_fn == "cosine_embedding":
                loss_fct = torch.nn.CosineEmbeddingLoss()
                labels_cosine = 2 * (labels > 0.5).float() - 1
                return {"loss": loss_fct(cls_embedding_text1, cls_embedding_text2, labels_cosine), "logits": cos_sim}
            
            loss = loss_fct(cos_sim, labels)
            
        return {"loss": loss, "logits": cos_sim}
"""
        with open(os.path.join(temp_save_path, "modeling.py"), 'w') as f:
            f.write(model_code)
        
        # 5. Save custom collator
        collator_code = """
import torch

class BiEncoderCollator:
    def __call__(self, features):
        batch = {
            'input_ids_text1': torch.stack([f['input_ids_text1'] for f in features]),
            'attention_mask_text1': torch.stack([f['attention_mask_text1'] for f in features]),
            'input_ids_text2': torch.stack([f['input_ids_text2'] for f in features]),
            'attention_mask_text2': torch.stack([f['attention_mask_text2'] for f in features]),
            'labels': torch.tensor([f['labels'] for f in features], dtype=torch.float)
        }
        return batch
"""
        with open(os.path.join(temp_save_path, "data_collator.py"), 'w') as f:
            f.write(collator_code)
        
        # 6. Create model card
        model_card = f"""---
language: en
tags:
- bert
- regression
- biencoder
- similarity
pipeline_tag: text-similarity
---

# BiEncoder Regression Model

This model is a BiEncoder architecture that outputs similarity scores between text pairs.

## Model Details
- Base Model: bert-base-uncased
- Task: Regression
- Architecture: BiEncoder with cosine similarity
- Loss Function: {trainer.model.loss_fn}

## Usage

```python
from transformers import AutoTokenizer, AutoModel
from modeling import BiEncoderModelRegression

# Load model components
tokenizer = AutoTokenizer.from_pretrained("{repo_id}")
base_model = AutoModel.from_pretrained("bert-base-uncased")
model = BiEncoderModelRegression(base_model, loss_fn="{trainer.model.loss_fn}")

# Load weights
state_dict = torch.load("pytorch_model.bin")
model.load_state_dict(state_dict)

# Prepare inputs
texts1 = ["first text"]
texts2 = ["second text"]
inputs = tokenizer(
    texts1, texts2,
    padding=True,
    truncation=True,
    return_tensors="pt"
)

# Get similarity scores
outputs = model(**inputs)
similarity_scores = outputs["logits"]
```

## Metrics
The model was trained using {trainer.model.loss_fn} loss and evaluated using:
- Mean Squared Error (MSE)
- Mean Absolute Error (MAE)
- Pearson Correlation
- Spearman Correlation
- Cosine Similarity
"""
        with open(os.path.join(temp_save_path, "README.md"), 'w') as f:
            f.write(model_card)
        
        # 7. Push to hub
        print(f"Pushing to hub at {repo_id}...")
        api.upload_folder(
            folder_path=temp_save_path,
            repo_id=repo_id,
            token=token
        )
        
        print(f"Successfully pushed model to {repo_id}")
        
    except Exception as e:
        print(f"Error during push to hub: {str(e)}")
        raise
    finally:
        if os.path.exists(temp_save_path):
            import shutil
            shutil.rmtree(temp_save_path)

def load_from_hub(repo_id):
    """
    Load regression BiEncoder model from Hugging Face Hub
    """
    try:
        print(f"Loading model from {repo_id}...")
        
        # 1. Load configuration and determine loss function
        config = AutoConfig.from_pretrained(repo_id)
        loss_fn = config.loss_fn if hasattr(config, 'loss_fn') else "mse"
        
        # 2. Initialize base model
        base_model = AutoModel.from_pretrained("bert-base-uncased")
        
        # 3. Create BiEncoder model
        model = BiEncoderModel(
            base_model=base_model,
            config=config,
            loss_fn=loss_fn
        )
        
        # 4. Load weights
        state_dict = torch.hub.load_state_dict_from_url(
            f"https://huggingface.co/{repo_id}/resolve/main/pytorch_model.bin",
            map_location="cpu"
        )
        model.load_state_dict(state_dict)
        
        # 5. Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(repo_id)
        
        # 6. Create trainer
        trainer = Trainer(
            model=model,
            data_collator=BiEncoderCollator(),
            compute_metrics=compute_metrics
        )
        
        print("Model loaded successfully!")
        return trainer, model, tokenizer
        
    except Exception as e:
        print(f"Error loading model from hub: {str(e)}")
        raise

In [18]:
# Save and push to hub
#repo_id = "minoosh/bert-biencoder-regression"
#save_and_push_to_hub(tr, repo_id)

# Load from hub later
#loaded_trainer, loaded_model, loaded_tokenizer = load_from_hub(repo_id)

Loading model from minoosh/rep...


config.json:   0%|          | 0.00/1.98k [00:00<?, ?B/s]

Downloading: "https://huggingface.co/minoosh/rep/resolve/main/pytorch_model.bin" to /root/.cache/torch/hub/checkpoints/pytorch_model.bin
100%|██████████| 418M/418M [00:10<00:00, 42.3MB/s] 


tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Model loaded successfully!
