In [1]:
!pip install -q transformers datasets

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.[0m

In [48]:
!huggingface-cli login --token 

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [62]:
import torch
# Define a custom BiEncoder model with options for different loss functions
class BiEncoderModel(torch.nn.Module):
    def __init__(self, base_model, config=None, num_classes=4, loss_fn="cross_entropy"):
        super(BiEncoderModel, self).__init__()
        self.base_model = base_model
        self.config = config  # Add this line to set the config attribute
        self.classifier = torch.nn.Linear(base_model.config.hidden_size * 2, num_classes)  # Updated for 4 classes
        self.loss_fn = loss_fn

    def forward(self, input_ids_text1, attention_mask_text1, input_ids_text2, attention_mask_text2, labels=None):
        # Encode text1 and text2 separately
        outputs_text1 = self.base_model(input_ids_text1, attention_mask=attention_mask_text1)
        outputs_text2 = self.base_model(input_ids_text2, attention_mask=attention_mask_text2)
        # Extract [CLS] token embeddings (first token)
        cls_embedding_text1 = outputs_text1.last_hidden_state[:, 0, :]
        cls_embedding_text2 = outputs_text2.last_hidden_state[:, 0, :]

        # Concatenate embeddings and apply classifier
        concatenated_embeddings = torch.cat([cls_embedding_text1, cls_embedding_text2], dim=1)
        logits = self.classifier(concatenated_embeddings)

        loss = None
        if labels is not None:
            if self.loss_fn == "cross_entropy":
                loss_fct = torch.nn.CrossEntropyLoss()  # Cross-entropy loss for classification
                loss = loss_fct(logits, labels)
            elif self.loss_fn == "focal_loss":
                # Focal loss implementation
                alpha = 0.25
                gamma = 2.0
                ce_loss = torch.nn.CrossEntropyLoss(reduction="none")(logits, labels)
                pt = torch.exp(-ce_loss)  # Probability of the true class
                loss = (alpha * (1 - pt) ** gamma * ce_loss).mean()
            elif self.loss_fn == "kl_divergence":
                # KL Divergence for soft-label classification
                kl_div = torch.nn.KLDivLoss(reduction="batchmean")
                soft_labels = torch.nn.functional.one_hot(labels, num_classes=self.classifier.out_features).float()
                log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
                loss = kl_div(log_probs, soft_labels)
            else:
                raise ValueError(f"Unsupported loss function: {self.loss_fn}")

        return {"loss": loss, "logits": logits}

In [50]:
class BiEncoderCollator:
    def __call__(self, features):
        batch = {
            'input_ids_text1': torch.nn.utils.rnn.pad_sequence(
                [torch.tensor(f['input_ids_text1']) for f in features], batch_first=True),
            'attention_mask_text1': torch.nn.utils.rnn.pad_sequence(
                [torch.tensor(f['attention_mask_text1']) for f in features], batch_first=True),
            'input_ids_text2': torch.nn.utils.rnn.pad_sequence(
                [torch.tensor(f['input_ids_text2']) for f in features], batch_first=True),
            'attention_mask_text2': torch.nn.utils.rnn.pad_sequence(
                [torch.tensor(f['attention_mask_text2']) for f in features], batch_first=True),
            'labels': torch.tensor([f['labels'] for f in features], dtype=torch.long)  # Change to long for classification
        }
        return batch

collator = BiEncoderCollator()

In [51]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def compute_metrics2(labels, preds):
    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="weighted")
    precision = precision_score(labels, preds, average="weighted")
    recall = recall_score(labels, preds, average="weighted")
    return {
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall,
    }

In [52]:
columns_to_keep = ['input_ids_text1', 'attention_mask_text1', 'input_ids_text2', 'attention_mask_text2', 'labels']

# Tokenize both text1 and text2 independently
def preprocess_function(examples):
    text1_encodings = tokenizer(examples['text1'], truncation=True, padding=True, max_length=512)
    text2_encodings = tokenizer(examples['text2'], truncation=True, padding=True, max_length=512)
    return {
        'input_ids_text1': text1_encodings['input_ids'],
        'attention_mask_text1': text1_encodings['attention_mask'],
        'input_ids_text2': text2_encodings['input_ids'],
        'attention_mask_text2': text2_encodings['attention_mask'],
        'labels': examples['label']
    }

# "

In [106]:
from datasets import load_dataset

# Load test dataset
dataset = load_dataset("minoosh/EPITOME_pairs")

# Tokenize the test set
tokenized_test = dataset['test'].map(preprocess_function, batched=True)

# Set the format for the test dataset for PyTorch
tokenized_test.set_format(type='torch', columns=columns_to_keep)

In [115]:
import torch
from transformers import AutoConfig, AutoModel, AutoTokenizer, Trainer

model_name = "minoosh/tmp_trainer"

# Load the tokenizer and model from Hugging Face
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
config = AutoConfig.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

bi_encoder_model = BiEncoderModel(base_model=model, config=config)

trainer = Trainer(
        model=loaded_model,
        data_collator=collator,# Custom collator for handling bi-encoder inputs
    )

Some weights of the model checkpoint at minoosh/tmp_trainer were not used when initializing BertModel: ['base_model.embeddings.LayerNorm.bias', 'base_model.embeddings.LayerNorm.weight', 'base_model.embeddings.position_embeddings.weight', 'base_model.embeddings.token_type_embeddings.weight', 'base_model.embeddings.word_embeddings.weight', 'base_model.encoder.layer.0.attention.output.LayerNorm.bias', 'base_model.encoder.layer.0.attention.output.LayerNorm.weight', 'base_model.encoder.layer.0.attention.output.dense.bias', 'base_model.encoder.layer.0.attention.output.dense.weight', 'base_model.encoder.layer.0.attention.self.key.bias', 'base_model.encoder.layer.0.attention.self.key.weight', 'base_model.encoder.layer.0.attention.self.query.bias', 'base_model.encoder.layer.0.attention.self.query.weight', 'base_model.encoder.layer.0.attention.self.value.bias', 'base_model.encoder.layer.0.attention.self.value.weight', 'base_model.encoder.layer.0.intermediate.dense.bias', 'base_model.encoder.laye

In [116]:
import numpy as np

def predict_test_set(trainer, test_dataset):
    # Get predictions
    predictions = trainer.predict(test_dataset)
    pred_logits = predictions.predictions
    pred_labels = np.argmax(pred_logits, axis=1)  # Get the predicted class labels
    return pred_labels, predictions.label_ids  # Return predicted and actual labels

# Example usage after training
#trainer = train_biencoder(loss_fn="cross_entropy")  # Train the model first
pred_labels, true_labels = predict_test_set(trainer, tokenized_test)

  [torch.tensor(f['input_ids_text1']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text1']) for f in features], batch_first=True),
  [torch.tensor(f['input_ids_text2']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text2']) for f in features], batch_first=True),


In [117]:
compute_metrics2(pred_labels, true_labels)

{'accuracy': 0.6461038961038961,
 'f1': 0.6501035994774385,
 'precision': 0.6706513643205454,
 'recall': 0.6461038961038961}

# ""

# ""

In [113]:
import os
import json
from huggingface_hub import HfApi
from transformers import AutoModel, AutoConfig, AutoTokenizer, BertConfig

def save_and_push_to_hub(trainer, repo_id, token=None):
    """
    Save and push BiEncoder model to Hugging Face Hub
    """
    api = HfApi()

    try:
        temp_save_path = f"temp_save_{repo_id.split('/')[-1]}"
        os.makedirs(temp_save_path, exist_ok=True)

        print(f"Saving model to {temp_save_path}...")

        # 1. Save the base model configuration
        base_config = trainer.model.base_model.config.to_dict()
        base_config["model_type"] = "bert"  # Ensure we're using BERT as base
        base_config["architectures"] = ["BertModel"]

        with open(os.path.join(temp_save_path, "config.json"), 'w') as f:
            json.dump(base_config, f)

        # 2. Save model weights
        torch.save(trainer.model.state_dict(), os.path.join(temp_save_path, "pytorch_model.bin"))

        # 3. Save tokenizer
        print("Saving tokenizer...")
        if hasattr(trainer, 'tokenizer'):
            trainer.tokenizer.save_pretrained(temp_save_path)

        # 4. Create model card
        model_card = f"""---
language: en
tags:
- bert
- classification
- pytorch
pipeline_tag: text-classification
---

# BiEncoder Classification Model

This model is a BiEncoder architecture based on BERT for text pair classification.

## Model Details
- Base Model: bert-base-uncased
- Architecture: BiEncoder with BERT base
- Number of classes: {trainer.model.classifier.out_features}

## Usage

```python
from transformers import AutoTokenizer
import torch

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("{repo_id}")

# Load model weights
state_dict = torch.load("pytorch_model.bin")

# Initialize model (you'll need the BiEncoderModel class)
model = BiEncoderModel(
    base_model=AutoModel.from_pretrained("bert-base-uncased"),
    num_classes={trainer.model.classifier.out_features}
)
model.load_state_dict(state_dict)
```
"""
        with open(os.path.join(temp_save_path, "README.md"), 'w') as f:
            f.write(model_card)

        # 5. Push to hub
        print(f"Pushing to hub at {repo_id}...")
        api.upload_folder(
            folder_path=temp_save_path,
            repo_id=repo_id,
            token=token
        )

        print(f"Successfully pushed model to {repo_id}")

    except Exception as e:
        print(f"Error during push to hub: {str(e)}")
        raise
    finally:
        if os.path.exists(temp_save_path):
            import shutil
            shutil.rmtree(temp_save_path)

def load_from_hub(repo_id, num_classes=4):
    """
    Load BiEncoder model from Hugging Face Hub
    """
    try:
        print(f"Loading model from {repo_id}...")

        # 1. Initialize base model with BERT
        base_model = AutoModel.from_pretrained("bert-base-uncased")

        # 2. Create BiEncoder model
        model = BiEncoderModel(
            base_model=base_model,
            num_classes=num_classes
        )

        # 3. Load state dict
        state_dict = torch.hub.load_state_dict_from_url(
            f"https://huggingface.co/{repo_id}/resolve/main/pytorch_model.bin",
            map_location="cpu"
        )
        model.load_state_dict(state_dict)

        # 4. Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(repo_id)

        # 5. Create trainer
        trainer = Trainer(
            model=model,
            data_collator=BiEncoderCollator(),
            # compute_metrics=compute_metrics
        )

        print("Model loaded successfully!")
        return trainer, model, tokenizer

    except Exception as e:
        print(f"Error loading model from hub: {str(e)}")
        raise

In [114]:
repo_id = "minoosh/repo"  # e.g., "minoosh/bert-biencoder-classification"

# To load from hub later:
loaded_trainer, loaded_model, loaded_tokenizer = load_from_hub(repo_id)

Loading model from minoosh/repo...
Model loaded successfully!
