In [None]:
!pip install -q transformers datasets

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.0/40.0 MB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following de

In [None]:
import os

from huggingface_hub import login

# Never paste an access token into the notebook: read it from the HF_TOKEN
# environment variable instead. When the variable is unset, `token` is None
# and login() falls back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN"))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Define a custom BiEncoder model with options for different loss functions
import torch  # imported here so this cell also runs on a fresh kernel


class BiEncoderModel(torch.nn.Module):
    """Classify a pair of texts by encoding each one with a shared encoder.

    Both texts go through the same ``base_model``; the hidden state at
    position 0 of each sequence (the [CLS] slot for BERT-style encoders) is
    taken as the text embedding, the two embeddings are concatenated and a
    single linear layer maps them to ``num_classes`` logits.

    Args:
        base_model: Encoder module. It must expose ``config.hidden_size`` and
            return an object with a ``last_hidden_state`` attribute.
        config: Optional configuration object, stored unchanged on
            ``self.config``.
        num_classes: Number of output classes of the linear classifier.
        loss_fn: One of ``"cross_entropy"``, ``"focal_loss"`` or
            ``"kl_divergence"``. It is only validated when a loss is actually
            computed, i.e. when ``forward`` receives ``labels``.
        focal_alpha: Scaling factor of the focal loss.
        focal_gamma: Focusing exponent of the focal loss.
    """

    def __init__(self, base_model, config=None, num_classes=4, loss_fn="cross_entropy",
                 focal_alpha=0.25, focal_gamma=2.0):
        super().__init__()
        self.base_model = base_model
        self.config = config
        # The classifier sees both text embeddings side by side, hence * 2.
        self.classifier = torch.nn.Linear(base_model.config.hidden_size * 2, num_classes)
        self.loss_fn = loss_fn
        self.focal_alpha = focal_alpha
        self.focal_gamma = focal_gamma

    def _encode(self, input_ids, attention_mask):
        """Return the position-0 hidden state of ``base_model`` for one text batch."""
        outputs = self.base_model(input_ids, attention_mask=attention_mask)
        return outputs.last_hidden_state[:, 0, :]

    def _compute_loss(self, logits, labels):
        """Compute the loss selected by ``self.loss_fn``.

        Raises:
            ValueError: If ``self.loss_fn`` is not a supported name.
        """
        functional = torch.nn.functional
        if self.loss_fn == "cross_entropy":
            return functional.cross_entropy(logits, labels)
        if self.loss_fn == "focal_loss":
            per_example_ce = functional.cross_entropy(logits, labels, reduction="none")
            # exp(-CE) recovers the probability assigned to the true class.
            true_class_prob = torch.exp(-per_example_ce)
            modulation = (1 - true_class_prob) ** self.focal_gamma
            return (self.focal_alpha * modulation * per_example_ce).mean()
        if self.loss_fn == "kl_divergence":
            # Hard labels are turned into one-hot target distributions.
            soft_labels = functional.one_hot(
                labels, num_classes=self.classifier.out_features
            ).float()
            log_probs = functional.log_softmax(logits, dim=-1)
            return functional.kl_div(log_probs, soft_labels, reduction="batchmean")
        raise ValueError(f"Unsupported loss function: {self.loss_fn}")

    def forward(self, input_ids_text1, attention_mask_text1, input_ids_text2, attention_mask_text2, labels=None):
        """Encode both texts and classify the pair.

        Returns:
            dict with ``"logits"`` (classifier output) and ``"loss"``
            (``None`` when ``labels`` is ``None``).

        Raises:
            ValueError: If ``labels`` is given and ``self.loss_fn`` is unsupported.
        """
        embedding_text1 = self._encode(input_ids_text1, attention_mask_text1)
        embedding_text2 = self._encode(input_ids_text2, attention_mask_text2)

        pair_embedding = torch.cat([embedding_text1, embedding_text2], dim=1)
        logits = self.classifier(pair_embedding)

        loss = None if labels is None else self._compute_loss(logits, labels)
        return {"loss": loss, "logits": logits}

In [None]:
def preprocess_function(examples, tokenizer=None, max_length=512):
    """Tokenize the ``text1`` / ``text2`` columns of a batch independently.

    Args:
        examples: Batch mapping with ``'text1'``, ``'text2'`` and ``'label'``
            entries.
        tokenizer: Callable used to tokenize each text column. When omitted,
            the module-level ``tokenizer`` variable is used, so the function
            can still be passed directly to ``Dataset.map``.
        max_length: Truncation length forwarded to the tokenizer.

    Returns:
        dict holding ``input_ids`` / ``attention_mask`` for each text (suffixed
        ``_text1`` / ``_text2``) and the labels under ``'labels'``.

    Raises:
        NameError: If no tokenizer is passed and no module-level ``tokenizer``
            exists.
    """
    if tokenizer is None:
        try:
            tokenizer = globals()["tokenizer"]
        except KeyError:
            raise NameError(
                "preprocess_function needs a tokenizer: pass tokenizer=... "
                "or define a module-level `tokenizer` first"
            ) from None

    def _encode(texts):
        # padding=True pads to the longest sequence of this call only.
        return tokenizer(texts, truncation=True, padding=True, max_length=max_length)

    encoded_text1 = _encode(examples['text1'])
    encoded_text2 = _encode(examples['text2'])
    return {
        'input_ids_text1': encoded_text1['input_ids'],
        'attention_mask_text1': encoded_text1['attention_mask'],
        'input_ids_text2': encoded_text2['input_ids'],
        'attention_mask_text2': encoded_text2['attention_mask'],
        'labels': examples['label']
    }

# Names of the tokenized fields and the label field produced by preprocessing.
columns_to_keep = [
    "input_ids_text1",
    "attention_mask_text1",
    "input_ids_text2",
    "attention_mask_text2",
    "labels",
]

In [None]:
import numpy as np
import torch
from huggingface_hub import hf_hub_download
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import mean_squared_error, mean_absolute_error
from transformers import Trainer


def predict_test_set(trainer, test_dataset):
    """Run ``trainer.predict`` on a dataset and convert logits to class ids.

    Args:
        trainer: Object whose ``predict(test_dataset)`` result exposes
            ``predictions`` and ``label_ids``.
        test_dataset: Dataset handed to ``trainer.predict`` unchanged.

    Returns:
        Tuple ``(pred_labels, label_ids)``: the argmax over axis 1 of the
        predicted logits, and the ``label_ids`` reported by the trainer.
    """
    prediction_output = trainer.predict(test_dataset)
    pred_logits = prediction_output.predictions
    if isinstance(pred_logits, tuple):
        # Several model outputs were collected; assumes the logits come
        # first -- TODO confirm against the model's output ordering.
        pred_logits = pred_logits[0]
    pred_labels = np.argmax(pred_logits, axis=1)  # predicted class per example
    return pred_labels, prediction_output.label_ids


def compute_metrics2(predictions, labels):
    """Compare predicted values with reference labels.

    Args:
        predictions: Sequence or array of predicted values (e.g. class ids).
        labels: Sequence or array of reference values, same length.

    Returns:
        dict of plain floats with keys ``mse``, ``mae``, ``pearson_corr``,
        ``spearman_corr`` and ``cosine_sim``.
    """
    mse = mean_squared_error(labels, predictions)
    mae = mean_absolute_error(labels, predictions)
    pearson_corr, _ = pearsonr(predictions, labels)
    spearman_corr, _ = spearmanr(predictions, labels)

    def _as_float_tensor(values):
        # cosine_similarity rejects integer tensors, and class ids / labels
        # are integers, so promote anything that is not already floating.
        tensor = torch.as_tensor(values)
        return tensor if tensor.is_floating_point() else tensor.to(torch.float64)

    # .mean() is a no-op for 1-D inputs, where the similarity is a scalar.
    cosine_sim = torch.nn.functional.cosine_similarity(
        _as_float_tensor(predictions), _as_float_tensor(labels), dim=0
    ).mean().item()

    return {
        "mse": float(mse),
        "mae": float(mae),
        "pearson_corr": float(pearson_corr),
        "spearman_corr": float(spearman_corr),
        "cosine_sim": cosine_sim  # Optional metric for similarity tasks
    }

In [None]:
import os
import json
from huggingface_hub import HfApi
from transformers import AutoModel, AutoConfig, AutoTokenizer, BertConfig


def load_from_hub(repo_id, num_classes=4, base_model_name="bert-base-uncased"):
    """
    Load BiEncoder model from Hugging Face Hub

    Args:
        repo_id: Hub repository holding ``pytorch_model.bin`` (the BiEncoder
            state dict) and the tokenizer files.
        num_classes: Number of classes of the classifier head; must match the
            checkpoint being loaded.
        base_model_name: Checkpoint used to instantiate the encoder before
            the fine-tuned weights are loaded on top of it.

    Returns:
        Tuple ``(trainer, model, tokenizer)``.

    Raises:
        Exception: Any failure is reported with ``print`` and re-raised
            unchanged.
    """
    try:
        print(f"Loading model from {repo_id}...")

        # 1. Initialize the encoder that the BiEncoder wraps
        base_model = AutoModel.from_pretrained(base_model_name)

        # 2. Create BiEncoder model
        model = BiEncoderModel(
            base_model=base_model,
            num_classes=num_classes
        )

        # 3. Load state dict.
        # hf_hub_download keeps a separate cache entry per repository and
        # sends the stored Hub credentials. torch.hub.load_state_dict_from_url
        # caches by basename only, so every repo would share one cached
        # "pytorch_model.bin".
        weights_path = hf_hub_download(repo_id=repo_id, filename="pytorch_model.bin")
        # weights_only=True avoids unpickling arbitrary objects from the file.
        state_dict = torch.load(weights_path, map_location="cpu", weights_only=True)
        model.load_state_dict(state_dict)

        # 4. Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(repo_id)

        # 5. Create trainer
        # NOTE(review): BiEncoderCollator and compute_metrics are not defined
        # in the visible notebook cells; they must already exist in the
        # kernel namespace when this function is called -- confirm.
        trainer = Trainer(
            model=model,
            data_collator=BiEncoderCollator(),
            compute_metrics=compute_metrics
        )

        print("Model loaded successfully!")
        return trainer, model, tokenizer

    except Exception as e:
        print(f"Error loading model from hub: {str(e)}")
        raise

# "

In [None]:
from datasets import load_dataset

# Download the EPITOME pairs dataset from the Hub.
dataset = load_dataset(path="minoosh/EPITOME_pairs")

# Tokenize the test split batch by batch.
# NOTE(review): preprocess_function reads a module-level `tokenizer`, and no
# earlier visible cell assigns one -- confirm it is defined before running.
tokenized_test = dataset["test"].map(function=preprocess_function, batched=True)

# Return PyTorch tensors restricted to the columns listed in columns_to_keep.
tokenized_test.set_format("torch", columns=columns_to_keep)

In [None]:
# Hub repository id of the fine-tuned BiEncoder.
# NOTE(review): "repo_id" looks like a placeholder -- replace with the real id.
model = "repo_id"
# load_from_hub requires the repository id as its first argument.
loaded_trainer, loaded_model, loaded_tokenizer = load_from_hub(model)

In [None]:
pred_labels, true_labels = predict_test_set(loaded_trainer, tokenized_test)

# Score the predicted class ids returned above; `preds` was never defined.
compute_metrics2(pred_labels, dataset['test']['label'])

# ""

# ""