In [1]:
!pip install -q transformers datasets wandb

In [2]:
!huggingface-cli login --token hf_

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [4]:
import torch

from datasets import load_dataset

from transformers import AutoModel, AutoTokenizer, TrainingArguments, Trainer

from transformers import BertConfig, BertModel

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

import wandb

import numpy as np



# Initialize wandb

wandb.init(

    project="bert-biencoder-classification"

)



# Load dataset

dataset = load_dataset("minoosh/EPITOME_pairs")



# Initialize bi-encoder model (e.g., BERT as a sentence encoder)

model_name = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)

base_model = AutoModel.from_pretrained(model_name)



# Tokenize both text1 and text2 independently

def preprocess_function(examples):

    text1_encodings = tokenizer(examples['text1'], truncation=True, padding=True, max_length=512)

    text2_encodings = tokenizer(examples['text2'], truncation=True, padding=True, max_length=512)

    return {

        'input_ids_text1': text1_encodings['input_ids'],

        'attention_mask_text1': text1_encodings['attention_mask'],

        'input_ids_text2': text2_encodings['input_ids'],

        'attention_mask_text2': text2_encodings['attention_mask'],

        'labels': examples['label']

    }



# Apply tokenization

tokenized_train = dataset['train'].map(preprocess_function, batched=True)

tokenized_test = dataset['test'].map(preprocess_function, batched=True)

tokenized_val = dataset['validation'].map(preprocess_function, batched=True)



# Remove unnecessary columns and set format for PyTorch

columns_to_keep = ['input_ids_text1', 'attention_mask_text1', 'input_ids_text2', 'attention_mask_text2', 'labels']

tokenized_train.set_format(type='torch', columns=columns_to_keep)

tokenized_test.set_format(type='torch', columns=columns_to_keep)

tokenized_val.set_format(type='torch', columns=columns_to_keep)



# Define a custom collator to handle text1 and text2 encoding

class BiEncoderCollator:

    def __call__(self, features):

        batch = {

            'input_ids_text1': torch.nn.utils.rnn.pad_sequence(

                [torch.tensor(f['input_ids_text1']) for f in features], batch_first=True),

            'attention_mask_text1': torch.nn.utils.rnn.pad_sequence(

                [torch.tensor(f['attention_mask_text1']) for f in features], batch_first=True),

            'input_ids_text2': torch.nn.utils.rnn.pad_sequence(

                [torch.tensor(f['input_ids_text2']) for f in features], batch_first=True),

            'attention_mask_text2': torch.nn.utils.rnn.pad_sequence(

                [torch.tensor(f['attention_mask_text2']) for f in features], batch_first=True),

            'labels': torch.tensor([f['labels'] for f in features], dtype=torch.long)  # Change to long for classification

        }

        return batch



collator = BiEncoderCollator()



# Define the compute_metrics function for classification with precision and recall

def compute_metrics(eval_pred):

    predictions, labels = eval_pred

    preds = np.argmax(predictions, axis=1)

    accuracy = accuracy_score(labels, preds)

    f1 = f1_score(labels, preds, average="weighted")

    precision = precision_score(labels, preds, average="weighted")

    recall = recall_score(labels, preds, average="weighted")

    return {

        "accuracy": accuracy,

        "f1": f1,

        "precision": precision,

        "recall": recall,

    }



# Define a custom BiEncoder model with options for different loss functions

class BiEncoderModel(torch.nn.Module):

    def __init__(self, base_model, config=None, num_classes=4, loss_fn="cross_entropy"):

        super(BiEncoderModel, self).__init__()

        self.base_model = base_model

        self.config = config  # Add this line to set the config attribute

        self.classifier = torch.nn.Linear(base_model.config.hidden_size * 2, num_classes)  # Updated for 4 classes

        self.loss_fn = loss_fn



    def forward(self, input_ids_text1, attention_mask_text1, input_ids_text2, attention_mask_text2, labels=None):

        # Encode text1 and text2 separately

        outputs_text1 = self.base_model(input_ids_text1, attention_mask=attention_mask_text1)

        outputs_text2 = self.base_model(input_ids_text2, attention_mask=attention_mask_text2)



        # Extract [CLS] token embeddings (first token)

        cls_embedding_text1 = outputs_text1.last_hidden_state[:, 0, :]

        cls_embedding_text2 = outputs_text2.last_hidden_state[:, 0, :]



        # Concatenate embeddings and apply classifier

        concatenated_embeddings = torch.cat([cls_embedding_text1, cls_embedding_text2], dim=1)

        logits = self.classifier(concatenated_embeddings)



        loss = None

        if labels is not None:

            if self.loss_fn == "cross_entropy":

                loss_fct = torch.nn.CrossEntropyLoss()  # Cross-entropy loss for classification

                loss = loss_fct(logits, labels)

            elif self.loss_fn == "focal_loss":

                # Focal loss implementation

                alpha = 0.25

                gamma = 2.0

                ce_loss = torch.nn.CrossEntropyLoss(reduction="none")(logits, labels)

                pt = torch.exp(-ce_loss)  # Probability of the true class

                loss = (alpha * (1 - pt) ** gamma * ce_loss).mean()

            elif self.loss_fn == "kl_divergence":

                # KL Divergence for soft-label classification

                kl_div = torch.nn.KLDivLoss(reduction="batchmean")

                soft_labels = torch.nn.functional.one_hot(labels, num_classes=self.classifier.out_features).float()

                log_probs = torch.nn.functional.log_softmax(logits, dim=-1)

                loss = kl_div(log_probs, soft_labels)

            else:

                raise ValueError(f"Unsupported loss function: {self.loss_fn}")



        return {"loss": loss, "logits": logits}



# Initialize the Bi-Encoder model with specified loss function

def train_biencoder(loss_fn="cross_entropy"):

    # Load pre-trained BERT configuration and model

    config = BertConfig.from_pretrained(model_name)

    bert_model = BertModel.from_pretrained(model_name)



    # Initialize your custom BiEncoderModel with the BERT model, config, and loss function

    bi_encoder_model = BiEncoderModel(base_model=bert_model, config=config, loss_fn=loss_fn)



    # Define TrainingArguments

    training_args = TrainingArguments(

        output_dir=f"./output/bert-clf-biencoder-{loss_fn}",

        evaluation_strategy="epoch",    # Evaluate at the end of each epoch

        logging_dir='./logs',           # Directory for logs

        logging_steps=10,               # Log every 10 steps

        per_device_train_batch_size=wandb.config['batch_size'],

        per_device_eval_batch_size=wandb.config['batch_size'],

        num_train_epochs=wandb.config['epochs'],

        warmup_steps=100,

        learning_rate=wandb.config['learning_rate'],

        weight_decay=0.01,

        report_to="wandb",

        save_strategy="epoch",          # Save checkpoints at the end of each epoch

        load_best_model_at_end=True,

        push_to_hub=True,

        save_total_limit=2              # Keep only the 2 most recent checkpoints

    )



    # Define the Trainer

    trainer = Trainer(

        model=bi_encoder_model,             # Custom BiEncoder model

        args=training_args,                 # Training arguments

        train_dataset=tokenized_train,      # Training dataset

        eval_dataset=tokenized_val,         # Validation dataset

        data_collator=collator,             # Custom collator for handling bi-encoder inputs

        compute_metrics=compute_metrics     # Function to compute metrics

    )



    # Train the model

    trainer.train()



    # Evaluate the model on the test set

    #trainer.evaluate(tokenized_test)



    #trainer.model = trainer.model.base_model



    # Save and push the model to the Hugging Face Hub

    trainer.save_model(f"./output/bert-clf-biencoder-{loss_fn}")

    trainer.push_to_hub(f"minoosh/bert-clf-biencoder-{loss_fn}")



    # Finish wandb run

    wandb.finish()



    return trainer

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112502588889583, max=1.0…

README.md:   0%|          | 0.00/588 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/660k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/100k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/88.5k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2467 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/308 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/309 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Map:   0%|          | 0/2467 [00:00<?, ? examples/s]

Map:   0%|          | 0/308 [00:00<?, ? examples/s]

Map:   0%|          | 0/309 [00:00<?, ? examples/s]

# 0

In [5]:
# Start training with classification setup and selected loss function

loss_fns = ["cross_entropy", "focal_loss", "kl_divergence"]

loss_fn = loss_fns[0]

wandb.init(project="bert-biencoder-classification", name=f"bert-biencoder-classification-{loss_fn}", config={"epochs": 7, "batch_size": 16, "learning_rate": 2e-5})

tr = train_biencoder(loss_fn)

VBox(children=(Label(value='0.016 MB of 0.016 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

  [torch.tensor(f['input_ids_text1']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text1']) for f in features], batch_first=True),
  [torch.tensor(f['input_ids_text2']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text2']) for f in features], batch_first=True),
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.1754,1.059528,0.595469,0.578422,0.595608,0.595469
2,0.8707,0.863345,0.650485,0.642537,0.664873,0.650485
3,0.6367,0.829994,0.68932,0.693869,0.708117,0.68932
4,0.5392,0.842188,0.68932,0.69029,0.697068,0.68932
5,0.3485,0.875202,0.68932,0.689807,0.69244,0.68932
6,0.2629,0.930244,0.679612,0.679704,0.679968,0.679612
7,0.1981,0.963187,0.666667,0.667124,0.669312,0.666667


  [torch.tensor(f['input_ids_text1']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text1']) for f in features], batch_first=True),
  [torch.tensor(f['input_ids_text2']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text2']) for f in features], batch_first=True),
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  [torch.tensor(f['input_ids_text1']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text1']) for f in features], batch_first=True),
  [torch.tensor(f['input_ids_text2']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text2']) for f in features], batch_first=True),
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  [torch.tensor(f['input_ids_text1']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text1']) for f in features], batch_first=True),
  [torch.te

VBox(children=(Label(value='0.040 MB of 0.040 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/accuracy,▁▅███▇▆
eval/f1,▁▅███▇▆
eval/loss,█▂▁▁▂▄▅
eval/precision,▁▅█▇▇▆▆
eval/recall,▁▅███▇▆
eval/runtime,▁▃▄▄█▄▅
eval/samples_per_second,█▆▅▄▁▄▄
eval/steps_per_second,█▆▅▄▁▄▄
train/epoch,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███

0,1
eval/accuracy,0.66667
eval/f1,0.66712
eval/loss,0.96319
eval/precision,0.66931
eval/recall,0.66667
eval/runtime,7.221
eval/samples_per_second,42.792
eval/steps_per_second,1.385
total_flos,0.0
train/epoch,7.0


In [6]:
tr.tokenizer = tokenizer



# To save and push to hub:

repo_id = f"minoosh/bert-clf-biencoder-{loss_fn}"  

save_and_push_to_hub(tr, repo_id)

Saving model to temp_save_bert-clf-biencoder-cross_entropy...
Saving tokenizer...
Pushing to hub at minoosh/bert-clf-biencoder-cross_entropy...


pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Successfully pushed model to minoosh/bert-clf-biencoder-cross_entropy


In [8]:
wandb.init()

preds = tr.predict(tokenized_test)
preds

VBox(children=(Label(value='0.018 MB of 0.018 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
test/accuracy,▁
test/f1,▁
test/loss,▁
test/precision,▁
test/recall,▁
test/runtime,▁
test/samples_per_second,▁
test/steps_per_second,▁

0,1
test/accuracy,0.64286
test/f1,0.64267
test/loss,0.93339
test/precision,0.65018
test/recall,0.64286
test/runtime,7.3686
test/samples_per_second,41.799
test/steps_per_second,1.357


  [torch.tensor(f['input_ids_text1']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text1']) for f in features], batch_first=True),
  [torch.tensor(f['input_ids_text2']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text2']) for f in features], batch_first=True),
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


PredictionOutput(predictions=array([[ 0.47875184, -0.7571652 ,  3.1247444 , -1.6508647 ],
       [ 0.6576831 ,  2.9659288 , -0.61126363, -2.3105402 ],
       [ 2.7631958 ,  0.03781628, -0.4214085 , -1.7669917 ],
       ...,
       [ 1.9563907 ,  0.5080263 ,  0.912665  , -2.8388667 ],
       [ 0.47819823, -0.4523617 ,  1.0313622 ,  0.24573353],
       [ 2.052886  ,  0.44165656, -0.18419163, -3.007797  ]],
      dtype=float32), label_ids=array([0, 1, 0, 0, 1, 2, 0, 3, 2, 2, 0, 0, 2, 2, 2, 1, 0, 1, 1, 2, 0, 3,
       2, 1, 3, 0, 0, 2, 3, 2, 2, 2, 0, 0, 2, 2, 2, 3, 2, 3, 1, 2, 2, 3,
       0, 1, 2, 2, 0, 2, 2, 3, 3, 2, 1, 3, 0, 1, 3, 2, 2, 1, 2, 2, 2, 0,
       0, 2, 2, 1, 0, 0, 2, 2, 2, 1, 2, 3, 2, 1, 0, 1, 0, 2, 0, 2, 2, 1,
       2, 2, 2, 0, 2, 1, 0, 2, 0, 1, 2, 0, 3, 2, 2, 1, 3, 2, 1, 1, 2, 3,
       3, 3, 3, 2, 1, 2, 0, 3, 0, 3, 3, 3, 3, 2, 2, 0, 0, 2, 1, 2, 0, 0,
       2, 0, 0, 2, 2, 2, 3, 0, 2, 2, 2, 2, 3, 2, 2, 0, 2, 2, 2, 2, 2, 2,
       0, 2, 2, 2, 2, 1, 3, 0, 1, 0, 1, 0, 0, 1, 

In [12]:
preds.predictions.argmax(axis=1)

array([2, 1, 0, 0, 1, 1, 0, 3, 0, 2, 0, 0, 2, 2, 0, 0, 0, 1, 1, 2, 0, 1,
       2, 1, 1, 1, 0, 2, 3, 2, 2, 2, 0, 0, 2, 2, 2, 3, 0, 0, 1, 1, 2, 2,
       1, 1, 2, 2, 2, 2, 0, 2, 1, 2, 1, 1, 2, 0, 2, 2, 2, 2, 2, 2, 1, 0,
       0, 2, 1, 1, 0, 0, 3, 1, 2, 1, 2, 3, 1, 0, 1, 0, 2, 2, 2, 2, 2, 0,
       2, 2, 2, 0, 2, 2, 2, 3, 0, 1, 2, 1, 3, 2, 0, 0, 1, 2, 1, 2, 2, 1,
       1, 1, 3, 2, 1, 2, 0, 2, 0, 3, 3, 2, 1, 2, 2, 0, 0, 2, 2, 2, 1, 0,
       1, 0, 2, 2, 2, 2, 1, 1, 2, 2, 0, 2, 3, 2, 2, 0, 2, 2, 2, 2, 2, 2,
       2, 2, 1, 3, 2, 2, 3, 1, 1, 0, 1, 0, 0, 2, 2, 3, 3, 1, 2, 2, 3, 0,
       1, 2, 2, 2, 1, 2, 3, 2, 2, 2, 3, 2, 2, 1, 0, 2, 2, 3, 2, 1, 1, 2,
       2, 2, 3, 1, 2, 2, 0, 1, 2, 0, 0, 2, 0, 1, 2, 0, 3, 2, 1, 2, 0, 2,
       1, 1, 0, 1, 1, 2, 1, 2, 0, 3, 2, 2, 3, 2, 2, 0, 1, 0, 0, 2, 2, 0,
       1, 0, 0, 0, 2, 0, 2, 2, 0, 0, 1, 1, 2, 3, 0, 1, 1, 3, 3, 1, 2, 1,
       0, 1, 2, 2, 3, 2, 1, 1, 1, 2, 2, 3, 3, 1, 2, 0, 2, 2, 2, 2, 2, 0,
       0, 0, 0, 3, 2, 0, 3, 0, 0, 2, 2, 1, 3, 3, 3,

# 1

In [5]:
# Start training with classification setup and selected loss function

loss_fns = ["cross_entropy", "focal_loss", "kl_divergence"]

loss_fn = loss_fns[1]

wandb.init(project="bert-biencoder-classification", name=f"bert-biencoder-classification-{loss_fn}", config={"epochs": 7, "batch_size": 16, "learning_rate": 2e-5})

tr = train_biencoder(loss_fn)

VBox(children=(Label(value='0.016 MB of 0.016 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

  [torch.tensor(f['input_ids_text1']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text1']) for f in features], batch_first=True),
  [torch.tensor(f['input_ids_text2']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text2']) for f in features], batch_first=True),
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1571,0.133447,0.576052,0.559657,0.584601,0.576052
2,0.1019,0.100859,0.647249,0.639938,0.66601,0.647249
3,0.0711,0.092316,0.686084,0.68686,0.690961,0.686084
4,0.0475,0.09719,0.660194,0.661132,0.674644,0.660194
5,0.0258,0.108519,0.660194,0.659646,0.669853,0.660194
6,0.0188,0.109882,0.66343,0.662647,0.666683,0.66343
7,0.0155,0.113551,0.660194,0.65959,0.664228,0.660194


  [torch.tensor(f['input_ids_text1']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text1']) for f in features], batch_first=True),
  [torch.tensor(f['input_ids_text2']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text2']) for f in features], batch_first=True),
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  [torch.tensor(f['input_ids_text1']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text1']) for f in features], batch_first=True),
  [torch.tensor(f['input_ids_text2']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text2']) for f in features], batch_first=True),
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  [torch.tensor(f['input_ids_text1']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text1']) for f in features], batch_first=True),
  [torch.te

VBox(children=(Label(value='0.040 MB of 0.040 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/accuracy,▁▆█▆▆▇▆
eval/f1,▁▅█▇▇▇▆
eval/loss,█▂▁▂▄▄▅
eval/precision,▁▆█▇▇▆▆
eval/recall,▁▆█▆▆▇▆
eval/runtime,█▅▆▅▁█▁
eval/samples_per_second,▁▄▃▄█▁█
eval/steps_per_second,▁▄▃▄█▂█
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███

0,1
eval/accuracy,0.66019
eval/f1,0.65959
eval/loss,0.11355
eval/precision,0.66423
eval/recall,0.66019
eval/runtime,7.5282
eval/samples_per_second,41.045
eval/steps_per_second,1.328
total_flos,0.0
train/epoch,7.0


In [6]:
tr.tokenizer = tokenizer



# To save and push to hub:

repo_id = f"minoosh/bert-clf-biencoder-{loss_fn}"  

save_and_push_to_hub(tr, repo_id)

Saving model to temp_save_bert-clf-biencoder-focal_loss...
Saving tokenizer...
Pushing to hub at minoosh/bert-clf-biencoder-focal_loss...


pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Successfully pushed model to minoosh/bert-clf-biencoder-focal_loss


In [7]:
wandb.init()

preds = tr.predict(tokenized_test)
preds

[34m[1mwandb[0m: Currently logged in as: [33mminooshayan97[0m ([33mminoosh[0m). Use [1m`wandb login --relogin`[0m to force relogin


  [torch.tensor(f['input_ids_text1']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text1']) for f in features], batch_first=True),
  [torch.tensor(f['input_ids_text2']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text2']) for f in features], batch_first=True),
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


PredictionOutput(predictions=array([[ 0.4862593 , -0.89750385,  1.7197946 , -0.72526085],
       [ 0.09727408,  1.843831  , -0.56674606, -1.7101706 ],
       [ 2.0809734 ,  0.02295816,  0.5901209 , -1.7126127 ],
       ...,
       [ 1.5558326 ,  0.46905598,  0.5399975 , -1.3892521 ],
       [ 0.4268964 , -0.7688354 ,  1.0717008 ,  0.1452835 ],
       [ 1.5584437 ,  0.8424485 ,  0.29216307, -1.8913709 ]],
      dtype=float32), label_ids=array([0, 1, 0, 0, 1, 2, 0, 3, 2, 2, 0, 0, 2, 2, 2, 1, 0, 1, 1, 2, 0, 3,
       2, 1, 3, 0, 0, 2, 3, 2, 2, 2, 0, 0, 2, 2, 2, 3, 2, 3, 1, 2, 2, 3,
       0, 1, 2, 2, 0, 2, 2, 3, 3, 2, 1, 3, 0, 1, 3, 2, 2, 1, 2, 2, 2, 0,
       0, 2, 2, 1, 0, 0, 2, 2, 2, 1, 2, 3, 2, 1, 0, 1, 0, 2, 0, 2, 2, 1,
       2, 2, 2, 0, 2, 1, 0, 2, 0, 1, 2, 0, 3, 2, 2, 1, 3, 2, 1, 1, 2, 3,
       3, 3, 3, 2, 1, 2, 0, 3, 0, 3, 3, 3, 3, 2, 2, 0, 0, 2, 1, 2, 0, 0,
       2, 0, 0, 2, 2, 2, 3, 0, 2, 2, 2, 2, 3, 2, 2, 0, 2, 2, 2, 2, 2, 2,
       0, 2, 2, 2, 2, 1, 3, 0, 1, 0, 1, 0, 0, 1, 

In [8]:
preds.predictions.argmax(axis=1)

array([2, 1, 0, 0, 2, 1, 0, 3, 2, 2, 0, 0, 0, 2, 0, 0, 0, 1, 0, 2, 0, 3,
       2, 1, 1, 1, 0, 2, 3, 2, 0, 2, 0, 0, 2, 2, 0, 3, 0, 0, 1, 3, 2, 3,
       0, 1, 2, 2, 2, 2, 0, 2, 0, 2, 1, 1, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0,
       1, 2, 1, 1, 0, 0, 3, 1, 2, 1, 2, 3, 1, 0, 0, 0, 2, 2, 2, 2, 2, 0,
       2, 2, 2, 0, 2, 2, 0, 3, 0, 1, 2, 1, 3, 2, 2, 0, 1, 2, 1, 2, 2, 1,
       1, 3, 3, 2, 1, 2, 2, 2, 0, 2, 3, 2, 3, 2, 2, 0, 0, 2, 2, 2, 1, 0,
       1, 0, 2, 2, 2, 2, 2, 1, 2, 2, 0, 2, 3, 2, 3, 0, 2, 1, 2, 2, 2, 2,
       2, 2, 1, 3, 2, 2, 3, 1, 1, 0, 1, 0, 0, 2, 2, 3, 3, 1, 2, 2, 3, 0,
       1, 2, 2, 2, 1, 2, 3, 2, 2, 2, 3, 2, 2, 1, 0, 2, 2, 3, 2, 1, 1, 2,
       2, 2, 3, 2, 2, 0, 0, 1, 2, 1, 0, 2, 0, 3, 2, 0, 3, 2, 1, 2, 0, 2,
       1, 1, 0, 1, 1, 2, 1, 2, 0, 3, 2, 2, 3, 2, 1, 0, 1, 0, 0, 0, 2, 0,
       1, 0, 0, 0, 2, 0, 2, 2, 0, 0, 0, 0, 3, 3, 0, 1, 3, 3, 3, 3, 2, 3,
       0, 0, 2, 2, 3, 2, 1, 1, 1, 2, 2, 3, 3, 2, 2, 0, 2, 2, 2, 2, 2, 0,
       0, 0, 0, 2, 2, 0, 3, 0, 0, 2, 0, 1, 3, 3, 3,

# 2

In [5]:
# Start training with classification setup and selected loss function

loss_fns = ["cross_entropy", "focal_loss", "kl_divergence"]

loss_fn = loss_fns[2]

wandb.init(project="bert-biencoder-classification", name=f"bert-biencoder-classification-{loss_fn}", config={"epochs": 7, "batch_size": 16, "learning_rate": 2e-5})

tr = train_biencoder(loss_fn)

VBox(children=(Label(value='0.016 MB of 0.016 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

  [torch.tensor(f['input_ids_text1']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text1']) for f in features], batch_first=True),
  [torch.tensor(f['input_ids_text2']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text2']) for f in features], batch_first=True),
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.1449,1.056463,0.598706,0.582026,0.606798,0.598706
2,0.8672,0.847126,0.650485,0.644873,0.66113,0.650485
3,0.6288,0.800347,0.682848,0.685784,0.693318,0.682848
4,0.5023,0.817851,0.68932,0.69112,0.700848,0.68932
5,0.332,0.860958,0.686084,0.686556,0.690745,0.686084
6,0.2637,0.907476,0.686084,0.686973,0.68905,0.686084
7,0.1895,0.946877,0.682848,0.684449,0.693678,0.682848


  [torch.tensor(f['input_ids_text1']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text1']) for f in features], batch_first=True),
  [torch.tensor(f['input_ids_text2']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text2']) for f in features], batch_first=True),
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  [torch.tensor(f['input_ids_text1']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text1']) for f in features], batch_first=True),
  [torch.tensor(f['input_ids_text2']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text2']) for f in features], batch_first=True),
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  [torch.tensor(f['input_ids_text1']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text1']) for f in features], batch_first=True),
  [torch.te

VBox(children=(Label(value='0.040 MB of 0.040 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/accuracy,▁▅█████
eval/f1,▁▅█████
eval/loss,█▂▁▁▃▄▅
eval/precision,▁▅▇█▇▇▇
eval/recall,▁▅█████
eval/runtime,▃▁▅█▁▂▅
eval/samples_per_second,▆█▄▁█▇▄
eval/steps_per_second,▆█▄▁██▄
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇████
train/global_step,▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇████

0,1
eval/accuracy,0.68285
eval/f1,0.68445
eval/loss,0.94688
eval/precision,0.69368
eval/recall,0.68285
eval/runtime,7.3416
eval/samples_per_second,42.089
eval/steps_per_second,1.362
total_flos,0.0
train/epoch,7.0


In [6]:
tr.tokenizer = tokenizer



# To save and push to hub:

repo_id = f"minoosh/bert-clf-biencoder-{loss_fn}"  

save_and_push_to_hub(tr, repo_id)

Saving model to temp_save_bert-clf-biencoder-kl_divergence...
Saving tokenizer...
Pushing to hub at minoosh/bert-clf-biencoder-kl_divergence...


pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Successfully pushed model to minoosh/bert-clf-biencoder-kl_divergence


In [7]:
wandb.init()

preds = tr.predict(tokenized_test)
preds

[34m[1mwandb[0m: Currently logged in as: [33mminooshayan97[0m ([33mminoosh[0m). Use [1m`wandb login --relogin`[0m to force relogin


  [torch.tensor(f['input_ids_text1']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text1']) for f in features], batch_first=True),
  [torch.tensor(f['input_ids_text2']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text2']) for f in features], batch_first=True),
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


PredictionOutput(predictions=array([[ 0.39821813, -1.4689566 ,  2.45921   , -1.5977148 ],
       [ 0.42356625,  2.5593278 , -0.574579  , -1.5304163 ],
       [ 1.9899086 , -0.00462926, -0.44955316, -1.7662003 ],
       ...,
       [ 1.9871001 , -0.21687815,  0.94965726, -2.630942  ],
       [ 0.07549342, -1.3629589 ,  1.504989  , -0.8616574 ],
       [ 2.2410529 ,  1.0336057 ,  0.05006527, -2.8397596 ]],
      dtype=float32), label_ids=array([0, 1, 0, 0, 1, 2, 0, 3, 2, 2, 0, 0, 2, 2, 2, 1, 0, 1, 1, 2, 0, 3,
       2, 1, 3, 0, 0, 2, 3, 2, 2, 2, 0, 0, 2, 2, 2, 3, 2, 3, 1, 2, 2, 3,
       0, 1, 2, 2, 0, 2, 2, 3, 3, 2, 1, 3, 0, 1, 3, 2, 2, 1, 2, 2, 2, 0,
       0, 2, 2, 1, 0, 0, 2, 2, 2, 1, 2, 3, 2, 1, 0, 1, 0, 2, 0, 2, 2, 1,
       2, 2, 2, 0, 2, 1, 0, 2, 0, 1, 2, 0, 3, 2, 2, 1, 3, 2, 1, 1, 2, 3,
       3, 3, 3, 2, 1, 2, 0, 3, 0, 3, 3, 3, 3, 2, 2, 0, 0, 2, 1, 2, 0, 0,
       2, 0, 0, 2, 2, 2, 3, 0, 2, 2, 2, 2, 3, 2, 2, 0, 2, 2, 2, 2, 2, 2,
       0, 2, 2, 2, 2, 1, 3, 0, 1, 0, 1, 0, 0, 1, 

In [8]:
preds.predictions.argmax(axis=1)

array([2, 1, 0, 0, 1, 2, 0, 3, 2, 2, 0, 0, 2, 2, 0, 0, 0, 1, 0, 2, 0, 3,
       2, 1, 1, 1, 0, 2, 3, 2, 2, 2, 0, 0, 2, 2, 2, 1, 0, 0, 1, 3, 2, 3,
       1, 1, 2, 2, 2, 2, 0, 2, 1, 2, 1, 1, 2, 0, 2, 2, 2, 2, 2, 2, 1, 0,
       0, 2, 1, 1, 0, 0, 3, 1, 2, 1, 2, 3, 1, 0, 0, 0, 2, 2, 2, 2, 2, 0,
       2, 2, 2, 0, 2, 2, 0, 3, 0, 1, 2, 1, 3, 2, 0, 0, 1, 2, 1, 2, 2, 0,
       1, 1, 3, 2, 1, 2, 2, 2, 0, 3, 3, 2, 3, 2, 2, 0, 0, 2, 2, 2, 1, 0,
       1, 0, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 3, 2, 2, 0, 2, 2, 2, 2, 2, 3,
       2, 2, 1, 3, 2, 2, 3, 1, 0, 0, 1, 0, 0, 2, 2, 3, 3, 0, 2, 2, 3, 0,
       1, 2, 2, 2, 1, 2, 3, 2, 2, 2, 3, 2, 2, 1, 1, 2, 2, 3, 2, 1, 1, 2,
       2, 2, 3, 1, 2, 2, 0, 1, 2, 0, 0, 2, 0, 1, 2, 0, 3, 2, 1, 2, 0, 2,
       1, 1, 0, 1, 1, 2, 1, 2, 0, 3, 2, 2, 3, 2, 2, 0, 1, 0, 0, 2, 3, 0,
       1, 0, 0, 0, 2, 0, 2, 2, 0, 0, 1, 1, 2, 3, 0, 1, 1, 3, 3, 3, 2, 1,
       0, 0, 2, 2, 3, 2, 1, 1, 1, 2, 2, 3, 3, 1, 3, 0, 2, 2, 2, 2, 2, 1,
       1, 0, 0, 3, 2, 0, 3, 0, 0, 2, 2, 1, 1, 3, 3,

# Save and Push to hub

In [3]:
import os

import json

from huggingface_hub import HfApi

from transformers import AutoModel, AutoConfig, AutoTokenizer, BertConfig



def save_and_push_to_hub(trainer, repo_id, token=None):

    """

    Save and push BiEncoder model to Hugging Face Hub

    """

    api = HfApi()

    

    try:

        temp_save_path = f"temp_save_{repo_id.split('/')[-1]}"

        os.makedirs(temp_save_path, exist_ok=True)

        

        print(f"Saving model to {temp_save_path}...")

        

        # 1. Save the base model configuration

        base_config = trainer.model.base_model.config.to_dict()

        base_config["model_type"] = "bert"  # Ensure we're using BERT as base

        base_config["architectures"] = ["BertModel"]

        

        with open(os.path.join(temp_save_path, "config.json"), 'w') as f:

            json.dump(base_config, f)

            

        # 2. Save model weights

        torch.save(trainer.model.state_dict(), os.path.join(temp_save_path, "pytorch_model.bin"))

        

        # 3. Save tokenizer

        print("Saving tokenizer...")

        if hasattr(trainer, 'tokenizer'):

            trainer.tokenizer.save_pretrained(temp_save_path)

        

        # 4. Create model card

        model_card = f"""---

language: en

tags:

- bert

- classification

- pytorch

pipeline_tag: text-classification

---



# BiEncoder Classification Model



This model is a BiEncoder architecture based on BERT for text pair classification.



## Model Details

- Base Model: bert-base-uncased

- Architecture: BiEncoder with BERT base

- Number of classes: {trainer.model.classifier.out_features}



## Usage



```python

from transformers import AutoTokenizer

import torch



# Load tokenizer

tokenizer = AutoTokenizer.from_pretrained("{repo_id}")



# Load model weights

state_dict = torch.load("pytorch_model.bin")



# Initialize model (you'll need the BiEncoderModel class)

model = BiEncoderModel(

    base_model=AutoModel.from_pretrained("bert-base-uncased"),

    num_classes={trainer.model.classifier.out_features}

)

model.load_state_dict(state_dict)

```

"""

        with open(os.path.join(temp_save_path, "README.md"), 'w') as f:

            f.write(model_card)

        

        # 5. Push to hub

        print(f"Pushing to hub at {repo_id}...")

        api.upload_folder(

            folder_path=temp_save_path,

            repo_id=repo_id,

            token=token

        )

        

        print(f"Successfully pushed model to {repo_id}")

        

    except Exception as e:

        print(f"Error during push to hub: {str(e)}")

        raise

    finally:

        if os.path.exists(temp_save_path):

            import shutil

            shutil.rmtree(temp_save_path)



def load_from_hub(repo_id, num_classes=4):

    """

    Load BiEncoder model from Hugging Face Hub

    """

    try:

        print(f"Loading model from {repo_id}...")

        

        # 1. Initialize base model with BERT

        base_model = AutoModel.from_pretrained("bert-base-uncased")

        

        # 2. Create BiEncoder model

        model = BiEncoderModel(

            base_model=base_model,

            num_classes=num_classes

        )

        

        # 3. Load state dict

        state_dict = torch.hub.load_state_dict_from_url(

            f"https://huggingface.co/{repo_id}/resolve/main/pytorch_model.bin",

            map_location="cpu"

        )

        model.load_state_dict(state_dict)

        

        # 4. Load tokenizer

        tokenizer = AutoTokenizer.from_pretrained(repo_id)

        

        # 5. Create trainer

        trainer = Trainer(

            model=model,

            data_collator=BiEncoderCollator(),

            compute_metrics=compute_metrics

        )

        

        print("Model loaded successfully!")

        return trainer, model, tokenizer

        

    except Exception as e:

        print(f"Error loading model from hub: {str(e)}")

        raise

In [8]:
# To save and push to hub:

repo_id = f"minoosh/bert-clf-biencoder-{loss_fn}"  

save_and_push_to_hub(tr, repo_id)



# To load from hub later:

#loaded_trainer, loaded_model, loaded_tokenizer = load_from_hub(repo_id)

Saving model to temp_save_bert-clf-biencoder-cross_entropy...
Saving tokenizer...
Pushing to hub at minoosh/bert-clf-biencoder-cross_entropy...


pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Successfully pushed model to minoosh/bert-clf-biencoder-cross_entropy
