In [1]:
!pip install -q transformers datasets wandb

In [2]:
!huggingface-cli login --token 

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
import torch
from datasets import load_dataset
from transformers import AutoModel, AutoTokenizer, TrainingArguments, Trainer
from transformers import BertConfig, BertModel
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import wandb
import numpy as np

# Initialize wandb
wandb.init(
    project="bert-biencoder-classification"
)

# Load dataset
dataset = load_dataset("minoosh/EPITOME_pairs")

# Initialize bi-encoder model (e.g., BERT as a sentence encoder)
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModel.from_pretrained(model_name)

# Tokenize both text1 and text2 independently
def preprocess_function(examples):
    text1_encodings = tokenizer(examples['text1'], truncation=True, padding=True, max_length=512)
    text2_encodings = tokenizer(examples['text2'], truncation=True, padding=True, max_length=512)
    return {
        'input_ids_text1': text1_encodings['input_ids'],
        'attention_mask_text1': text1_encodings['attention_mask'],
        'input_ids_text2': text2_encodings['input_ids'],
        'attention_mask_text2': text2_encodings['attention_mask'],
        'labels': examples['label']
    }

# Apply tokenization
tokenized_train = dataset['train'].map(preprocess_function, batched=True)
tokenized_test = dataset['test'].map(preprocess_function, batched=True)
tokenized_val = dataset['validation'].map(preprocess_function, batched=True)

# Remove unnecessary columns and set format for PyTorch
columns_to_keep = ['input_ids_text1', 'attention_mask_text1', 'input_ids_text2', 'attention_mask_text2', 'labels']
tokenized_train.set_format(type='torch', columns=columns_to_keep)
tokenized_test.set_format(type='torch', columns=columns_to_keep)
tokenized_val.set_format(type='torch', columns=columns_to_keep)

# Define a custom collator to handle text1 and text2 encoding
class BiEncoderCollator:
    def __call__(self, features):
        batch = {
            'input_ids_text1': torch.nn.utils.rnn.pad_sequence(
                [torch.tensor(f['input_ids_text1']) for f in features], batch_first=True),
            'attention_mask_text1': torch.nn.utils.rnn.pad_sequence(
                [torch.tensor(f['attention_mask_text1']) for f in features], batch_first=True),
            'input_ids_text2': torch.nn.utils.rnn.pad_sequence(
                [torch.tensor(f['input_ids_text2']) for f in features], batch_first=True),
            'attention_mask_text2': torch.nn.utils.rnn.pad_sequence(
                [torch.tensor(f['attention_mask_text2']) for f in features], batch_first=True),
            'labels': torch.tensor([f['labels'] for f in features], dtype=torch.long)  # Change to long for classification
        }
        return batch

collator = BiEncoderCollator()

# Define the compute_metrics function for classification with precision and recall
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = np.argmax(predictions, axis=1)
    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="weighted")
    precision = precision_score(labels, preds, average="weighted")
    recall = recall_score(labels, preds, average="weighted")
    return {
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall,
    }

# Define a custom BiEncoder model with options for different loss functions
class BiEncoderModel(torch.nn.Module):
    def __init__(self, base_model, config=None, num_classes=4, loss_fn="cross_entropy"):
        super(BiEncoderModel, self).__init__()
        self.base_model = base_model
        self.config = config  # Add this line to set the config attribute
        self.classifier = torch.nn.Linear(base_model.config.hidden_size * 2, num_classes)  # Updated for 4 classes
        self.loss_fn = loss_fn

    def forward(self, input_ids_text1, attention_mask_text1, input_ids_text2, attention_mask_text2, labels=None):
        # Encode text1 and text2 separately
        outputs_text1 = self.base_model(input_ids_text1, attention_mask=attention_mask_text1)
        outputs_text2 = self.base_model(input_ids_text2, attention_mask=attention_mask_text2)

        # Extract [CLS] token embeddings (first token)
        cls_embedding_text1 = outputs_text1.last_hidden_state[:, 0, :]
        cls_embedding_text2 = outputs_text2.last_hidden_state[:, 0, :]

        # Concatenate embeddings and apply classifier
        concatenated_embeddings = torch.cat([cls_embedding_text1, cls_embedding_text2], dim=1)
        logits = self.classifier(concatenated_embeddings)

        loss = None
        if labels is not None:
            if self.loss_fn == "cross_entropy":
                loss_fct = torch.nn.CrossEntropyLoss()  # Cross-entropy loss for classification
                loss = loss_fct(logits, labels)
            elif self.loss_fn == "focal_loss":
                # Focal loss implementation
                alpha = 0.25
                gamma = 2.0
                ce_loss = torch.nn.CrossEntropyLoss(reduction="none")(logits, labels)
                pt = torch.exp(-ce_loss)  # Probability of the true class
                loss = (alpha * (1 - pt) ** gamma * ce_loss).mean()
            elif self.loss_fn == "kl_divergence":
                # KL Divergence for soft-label classification
                kl_div = torch.nn.KLDivLoss(reduction="batchmean")
                soft_labels = torch.nn.functional.one_hot(labels, num_classes=self.classifier.out_features).float()
                log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
                loss = kl_div(log_probs, soft_labels)
            else:
                raise ValueError(f"Unsupported loss function: {self.loss_fn}")

        return {"loss": loss, "logits": logits}

# Initialize the Bi-Encoder model with specified loss function
def train_biencoder(loss_fn="cross_entropy"):
    # Load pre-trained BERT configuration and model
    config = BertConfig.from_pretrained(model_name)
    bert_model = BertModel.from_pretrained(model_name)

    # Initialize your custom BiEncoderModel with the BERT model, config, and loss function
    bi_encoder_model = BiEncoderModel(base_model=bert_model, config=config, loss_fn=loss_fn)

    # Define TrainingArguments
    training_args = TrainingArguments(
        output_dir=f"./output/bert-clf-biencoder-{loss_fn}",
        evaluation_strategy="epoch",    # Evaluate at the end of each epoch
        logging_dir='./logs',           # Directory for logs
        logging_steps=10,               # Log every 10 steps
        per_device_train_batch_size=wandb.config['batch_size'],
        per_device_eval_batch_size=wandb.config['batch_size'],
        num_train_epochs=wandb.config['epochs'],
        warmup_steps=100,
        learning_rate=wandb.config['learning_rate'],
        weight_decay=0.01,
        report_to="wandb",
        save_strategy="epoch",          # Save checkpoints at the end of each epoch
        load_best_model_at_end=True,
        push_to_hub=True,
        save_total_limit=2              # Keep only the 2 most recent checkpoints
    )

    # Define the Trainer
    trainer = Trainer(
        model=bi_encoder_model,             # Custom BiEncoder model
        args=training_args,                 # Training arguments
        train_dataset=tokenized_train,      # Training dataset
        eval_dataset=tokenized_val,         # Validation dataset
        data_collator=collator,             # Custom collator for handling bi-encoder inputs
        compute_metrics=compute_metrics     # Function to compute metrics
    )

    # Train the model
    trainer.train()

    # Evaluate the model on the test set
    #trainer.evaluate(tokenized_test)

    #trainer.model = trainer.model.base_model

    # Save and push the model to the Hugging Face Hub
    trainer.save_model(f"./output/bert-clf-biencoder-{loss_fn}")
    trainer.push_to_hub(f"minoosh/bert-clf-biencoder-{loss_fn}")

    # Finish wandb run
    wandb.finish()

    return trainer

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113025866666248, max=1.0…

README.md:   0%|          | 0.00/588 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/660k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/100k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/88.5k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2467 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/308 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/309 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Map:   0%|          | 0/2467 [00:00<?, ? examples/s]

Map:   0%|          | 0/308 [00:00<?, ? examples/s]

Map:   0%|          | 0/309 [00:00<?, ? examples/s]

In [4]:
# Start training with classification setup and selected loss function
loss_fns = ["cross_entropy", "focal_loss", "kl_divergence"]
loss_fn = loss_fns[0]
wandb.init(project="bert-biencoder-classification", name=f"bert-biencoder-classification-{loss_fn}", config={"epochs": 7, "batch_size": 16, "learning_rate": 2e-5})
tr = train_biencoder(loss_fn)

VBox(children=(Label(value='0.016 MB of 0.016 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

  [torch.tensor(f['input_ids_text1']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text1']) for f in features], batch_first=True),
  [torch.tensor(f['input_ids_text2']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text2']) for f in features], batch_first=True),
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.1839,1.077525,0.579288,0.553207,0.604092,0.579288
2,0.8361,0.855902,0.660194,0.652314,0.670091,0.660194
3,0.6516,0.85755,0.644013,0.648457,0.658787,0.644013
4,0.5243,0.906198,0.640777,0.640736,0.650274,0.640777
5,0.3446,0.939663,0.647249,0.645739,0.652287,0.647249
6,0.2808,0.974774,0.634304,0.634407,0.635573,0.634304
7,0.1982,1.014879,0.631068,0.631045,0.634338,0.631068


  [torch.tensor(f['input_ids_text1']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text1']) for f in features], batch_first=True),
  [torch.tensor(f['input_ids_text2']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text2']) for f in features], batch_first=True),
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  [torch.tensor(f['input_ids_text1']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text1']) for f in features], batch_first=True),
  [torch.tensor(f['input_ids_text2']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text2']) for f in features], batch_first=True),
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  [torch.tensor(f['input_ids_text1']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text1']) for f in features], batch_first=True),
  [torch.te

VBox(children=(Label(value='0.040 MB of 0.040 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/accuracy,▁█▇▆▇▆▅
eval/f1,▁██▇█▇▆
eval/loss,█▁▁▃▄▅▆
eval/precision,▁█▇▆▆▄▄
eval/recall,▁█▇▆▇▆▅
eval/runtime,▅▄▄▁█▁▁
eval/samples_per_second,▄▅▄▇▁██
eval/steps_per_second,▄▅▅█▁██
train/epoch,▁▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇██

0,1
eval/accuracy,0.63107
eval/f1,0.63105
eval/loss,1.01488
eval/precision,0.63434
eval/recall,0.63107
eval/runtime,7.2224
eval/samples_per_second,42.784
eval/steps_per_second,1.385
total_flos,0.0
train/epoch,7.0


In [26]:
tr.push_to_hub()

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/minoosh/bert-clf-biencoder-cross_entropy/commit/ecebb764b69e70fa600211415fe638f27734ebb9', commit_message='End of training', commit_description='', oid='ecebb764b69e70fa600211415fe638f27734ebb9', pr_url=None, repo_url=RepoUrl('https://huggingface.co/minoosh/bert-clf-biencoder-cross_entropy', endpoint='https://huggingface.co', repo_type='model', repo_id='minoosh/bert-clf-biencoder-cross_entropy'), pr_revision=None, pr_num=None)

In [5]:
tr

<transformers.trainer.Trainer at 0x7ad084279d50>

In [7]:
wandb.init()

[34m[1mwandb[0m: Currently logged in as: [33mminooshayan97[0m ([33mminoosh[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [8]:
# Tokenize the test set
tokenized_test = dataset['test'].map(preprocess_function, batched=True)

# Set the format for the test dataset for PyTorch
tokenized_test.set_format(type='torch', columns=columns_to_keep)

# Predict on the test set after training
def predict_test_set(trainer, test_dataset):
    # Get predictions
    predictions = trainer.predict(test_dataset)
    pred_logits = predictions.predictions
    pred_labels = np.argmax(pred_logits, axis=1)  # Get the predicted class labels
    return pred_labels, predictions.label_ids  # Return predicted and actual labels

# Example usage after training
#trainer = train_biencoder(loss_fn="cross_entropy")  # Train the model first
pred_labels, true_labels = predict_test_set(tr, tokenized_test)

  [torch.tensor(f['input_ids_text1']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text1']) for f in features], batch_first=True),
  [torch.tensor(f['input_ids_text2']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text2']) for f in features], batch_first=True),
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


In [9]:
# Optionally, calculate metrics on test set predictions
accuracy = accuracy_score(true_labels, pred_labels)
f1 = f1_score(true_labels, pred_labels, average="weighted")
precision = precision_score(true_labels, pred_labels, average="weighted")
recall = recall_score(true_labels, pred_labels, average="weighted")

# Print metrics
print("Test Set Evaluation:")
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

# Optionally save the predictions in a CSV file
import pandas as pd
df_predictions = pd.DataFrame({"True Labels": true_labels, "Predicted Labels": pred_labels})
df_predictions.to_csv("test_predictions.csv", index=False)

Test Set Evaluation:
Accuracy: 0.6461038961038961
F1 Score: 0.6421041927303539
Precision: 0.6614733854189244
Recall: 0.6461038961038961


In [22]:
pred_labels, true_labels = predict_test_set(tr, tokenized_val)

  [torch.tensor(f['input_ids_text1']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text1']) for f in features], batch_first=True),
  [torch.tensor(f['input_ids_text2']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text2']) for f in features], batch_first=True),
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


In [23]:
# Optionally, calculate metrics on test set predictions
accuracy = accuracy_score(true_labels, pred_labels)
f1 = f1_score(true_labels, pred_labels, average="weighted")
precision = precision_score(true_labels, pred_labels, average="weighted")
recall = recall_score(true_labels, pred_labels, average="weighted")

# Print metrics
print("Test Set Evaluation:")
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

# Optionally save the predictions in a CSV file
import pandas as pd
df_predictions = pd.DataFrame({"True Labels": true_labels, "Predicted Labels": pred_labels})
df_predictions.to_csv("test_predictions.csv", index=False)

Test Set Evaluation:
Accuracy: 0.6601941747572816
F1 Score: 0.6523140839405329
Precision: 0.6700914000695115
Recall: 0.6601941747572816


In [114]:
tr.model.save_pretrained("MOOO5")

AttributeError: 'BiEncoderModel' object has no attribute 'save_pretrained'

In [157]:
bi_encoder_model = BiEncoderModel(base_model=tr.model.base_model, config=config)

In [176]:
import torch
from transformers import AutoConfig, AutoModel, AutoTokenizer, Trainer
 
model_name = "/kaggle/working/This"

# Load the tokenizer and model from Hugging Face
#tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer = tokenizer
config = AutoConfig.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

bi_encoder_model = BiEncoderModel(base_model=model, config=config)

trainer = Trainer(
        model=loaded_model,
        data_collator=collator,# Custom collator for handling bi-encoder inputs
    )

In [180]:
import numpy as np

def predict_test_set(trainer, test_dataset):
    # Get predictions
    predictions = trainer.predict(test_dataset)
    pred_logits = predictions.predictions
    pred_labels = np.argmax(pred_logits, axis=1)  # Get the predicted class labels
    return pred_labels, predictions.label_ids  # Return predicted and actual labels

# Example usage after training
#trainer = train_biencoder(loss_fn="cross_entropy")  # Train the model first
pred_labels, true_labels = predict_test_set(loaded_trainer, tokenized_test)

  [torch.tensor(f['input_ids_text1']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text1']) for f in features], batch_first=True),
  [torch.tensor(f['input_ids_text2']) for f in features], batch_first=True),
  [torch.tensor(f['attention_mask_text2']) for f in features], batch_first=True),
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


In [182]:
# Optionally, calculate metrics on test set predictions
accuracy = accuracy_score(true_labels, pred_labels)
f1 = f1_score(true_labels, pred_labels, average="weighted")
precision = precision_score(true_labels, pred_labels, average="weighted")
recall = recall_score(true_labels, pred_labels, average="weighted")

# Print metrics
print("Test Set Evaluation:")
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

Test Set Evaluation:
Accuracy: 0.6461038961038961
F1 Score: 0.6421041927303539
Precision: 0.6614733854189244
Recall: 0.6461038961038961


In [99]:
trainer.model.config

BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.45.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [139]:
trainer = tr

In [162]:
trainer.model == tr.model.base_model 

True

In [163]:
trainer.model == trainer.model.base_model

True

In [164]:
trainer.save_model("This")

In [168]:
trainer.push_to_hub()

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.24k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/minoosh/tmp_trainer/commit/cb6b7fcdff83a6ebdd75661ae69040779d88cc12', commit_message='End of training', commit_description='', oid='cb6b7fcdff83a6ebdd75661ae69040779d88cc12', pr_url=None, repo_url=RepoUrl('https://huggingface.co/minoosh/tmp_trainer', endpoint='https://huggingface.co', repo_type='model', repo_id='minoosh/tmp_trainer'), pr_revision=None, pr_num=None)

In [175]:
import os
import torch
from transformers import AutoModel, AutoConfig, AutoTokenizer, Trainer

def save_biencoder_model(trainer, save_path):
    """
    Save both the base model and the complete BiEncoder model structure
    """
    # Create directory if it doesn't exist
    os.makedirs(save_path, exist_ok=True)
    
    # 1. Save the complete model state dict
    torch.save(trainer.model.state_dict(), os.path.join(save_path, "pytorch_model.bin"))
    
    # 2. Save the base model separately
    trainer.model.base_model.save_pretrained(os.path.join(save_path, "base_model"))
    
    # 3. Save the model config
    trainer.model.config.save_pretrained(save_path)
    
    # 4. Save the tokenizer if it exists
    if hasattr(trainer, 'tokenizer'):
        trainer.tokenizer.save_pretrained(save_path)
    
    print(f"Model saved successfully to {save_path}")

def load_biencoder_model(load_path, num_classes=4):
    """
    Load the complete BiEncoder model structure
    """
    try:
        # 1. Load the base model and config
        print("Loading base model...")
        base_model = AutoModel.from_pretrained(os.path.join(load_path, "base_model"))
        config = AutoConfig.from_pretrained(os.path.join(load_path, "base_model"))
        
        # 2. Recreate the BiEncoder model
        print("Creating BiEncoder model...")
        model = BiEncoderModel(
            base_model=base_model,
            config=config,
            num_classes=num_classes
        )
        
        # 3. Load the complete state dict
        print("Loading model state...")
        state_dict_path = os.path.join(load_path, "pytorch_model.bin")
        if os.path.exists(state_dict_path):
            state_dict = torch.load(state_dict_path)
            model.load_state_dict(state_dict)
        else:
            raise FileNotFoundError(f"Model state dict not found at {state_dict_path}")
        
        # 4. Load the tokenizer
        print("Loading tokenizer...")
        try:
            tokenizer = AutoTokenizer.from_pretrained(load_path)
        except:
            print("Warning: Tokenizer not found in save path")
            tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        
        # 5. Create a new trainer instance
        print("Creating trainer...")
        trainer = Trainer(
            model=model,
            data_collator=BiEncoderCollator(),
            compute_metrics=compute_metrics
        )
        
        print("Model loaded successfully!")
        return trainer, model, tokenizer
    
    except Exception as e:
        print(f"Error loading model: {str(e)}")
        raise
        
save_path = "test_claude_save"
save_biencoder_model(tr, save_path)

# Load the model
loaded_trainer, loaded_model, loaded_tokenizer = load_biencoder_model(save_path)

Model saved successfully to test_claude_save
Loading base model...
Creating BiEncoder model...
Loading model state...


  state_dict = torch.load(state_dict_path)


Loading tokenizer...
Creating trainer...
Model loaded successfully!


In [183]:
loaded_trainer.push_to_hub()

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/minoosh/tmp_trainer/commit/b22bf8476fc56f9acca6e58304666b5a2b63b7c0', commit_message='End of training', commit_description='', oid='b22bf8476fc56f9acca6e58304666b5a2b63b7c0', pr_url=None, repo_url=RepoUrl('https://huggingface.co/minoosh/tmp_trainer', endpoint='https://huggingface.co', repo_type='model', repo_id='minoosh/tmp_trainer'), pr_revision=None, pr_num=None)

In [194]:
import os
import json
from huggingface_hub import HfApi
from transformers import AutoModel, AutoConfig, AutoTokenizer, BertConfig

def save_and_push_to_hub(trainer, repo_id, token=None):
    """
    Save and push BiEncoder model to Hugging Face Hub
    """
    api = HfApi()
    
    try:
        temp_save_path = f"temp_save_{repo_id.split('/')[-1]}"
        os.makedirs(temp_save_path, exist_ok=True)
        
        print(f"Saving model to {temp_save_path}...")
        
        # 1. Save the base model configuration
        base_config = trainer.model.base_model.config.to_dict()
        base_config["model_type"] = "bert"  # Ensure we're using BERT as base
        base_config["architectures"] = ["BertModel"]
        
        with open(os.path.join(temp_save_path, "config.json"), 'w') as f:
            json.dump(base_config, f)
            
        # 2. Save model weights
        torch.save(trainer.model.state_dict(), os.path.join(temp_save_path, "pytorch_model.bin"))
        
        # 3. Save tokenizer
        print("Saving tokenizer...")
        if hasattr(trainer, 'tokenizer'):
            trainer.tokenizer.save_pretrained(temp_save_path)
        
        # 4. Create model card
        model_card = f"""---
language: en
tags:
- bert
- classification
- pytorch
pipeline_tag: text-classification
---

# BiEncoder Classification Model

This model is a BiEncoder architecture based on BERT for text pair classification.

## Model Details
- Base Model: bert-base-uncased
- Architecture: BiEncoder with BERT base
- Number of classes: {trainer.model.classifier.out_features}

## Usage

```python
from transformers import AutoTokenizer
import torch

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("{repo_id}")

# Load model weights
state_dict = torch.load("pytorch_model.bin")

# Initialize model (you'll need the BiEncoderModel class)
model = BiEncoderModel(
    base_model=AutoModel.from_pretrained("bert-base-uncased"),
    num_classes={trainer.model.classifier.out_features}
)
model.load_state_dict(state_dict)
```
"""
        with open(os.path.join(temp_save_path, "README.md"), 'w') as f:
            f.write(model_card)
        
        # 5. Push to hub
        print(f"Pushing to hub at {repo_id}...")
        api.upload_folder(
            folder_path=temp_save_path,
            repo_id=repo_id,
            token=token
        )
        
        print(f"Successfully pushed model to {repo_id}")
        
    except Exception as e:
        print(f"Error during push to hub: {str(e)}")
        raise
    finally:
        if os.path.exists(temp_save_path):
            import shutil
            shutil.rmtree(temp_save_path)

def load_from_hub(repo_id, num_classes=4):
    """
    Load BiEncoder model from Hugging Face Hub
    """
    try:
        print(f"Loading model from {repo_id}...")
        
        # 1. Initialize base model with BERT
        base_model = AutoModel.from_pretrained("bert-base-uncased")
        
        # 2. Create BiEncoder model
        model = BiEncoderModel(
            base_model=base_model,
            num_classes=num_classes
        )
        
        # 3. Load state dict
        state_dict = torch.hub.load_state_dict_from_url(
            f"https://huggingface.co/{repo_id}/resolve/main/pytorch_model.bin",
            map_location="cpu"
        )
        model.load_state_dict(state_dict)
        
        # 4. Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(repo_id)
        
        # 5. Create trainer
        trainer = Trainer(
            model=model,
            data_collator=BiEncoderCollator(),
            compute_metrics=compute_metrics
        )
        
        print("Model loaded successfully!")
        return trainer, model, tokenizer
        
    except Exception as e:
        print(f"Error loading model from hub: {str(e)}")
        raise

In [195]:
# To save and push to hub:
repo_id = "minoosh/repo"  # e.g., "minoosh/bert-biencoder-classification"
save_and_push_to_hub(tr, repo_id)

# To load from hub later:
loaded_trainer, loaded_model, loaded_tokenizer = load_from_hub(repo_id)

Saving model to temp_save_repo...
Saving tokenizer...
Pushing to hub at minoosh/repo...
Successfully pushed model to minoosh/repo
Loading model from minoosh/repo...


Downloading: "https://huggingface.co/minoosh/repo/resolve/main/pytorch_model.bin" to /root/.cache/torch/hub/checkpoints/pytorch_model.bin
100%|██████████| 418M/418M [00:10<00:00, 42.3MB/s] 


tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Model loaded successfully!


In [193]:
!zip test_claude_save  

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



zip error: Nothing to do! (test_claude_save.zip)
