In [1]:
!pip install -q transformers datasets wandb

In [2]:
!huggingface-cli login --token 

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
import torch

from datasets import load_dataset

from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, TrainingArguments, Trainer

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import wandb



# Initialize wandb

wandb.init(

    project="bert-crossencoder-classification"

)



# Load dataset

dataset = load_dataset("minoosh/EPITOME_pairs")



# Initialize the tokenizer and model for cross-encoder setup

model_name = "google-bert/bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)



# Preprocess data for the cross-encoder model by concatenating text1 and text2 with [SEP]

def preprocess_function(examples):

    # Concatenate both texts with a [SEP] token in between

    encodings = tokenizer(examples['text1'], examples['text2'], truncation=True, padding=True, max_length=512)

    encodings['labels'] = examples['label']  # Add labels

    return encodings



# Apply tokenization

tokenized_train = dataset['train'].map(preprocess_function, batched=True)

tokenized_test = dataset['test'].map(preprocess_function, batched=True)

tokenized_val = dataset['validation'].map(preprocess_function, batched=True)



# Set format for PyTorch

tokenized_train.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

tokenized_test.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

tokenized_val.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])



# Define compute_metrics function for classification evaluation

def compute_metrics(eval_pred):

    predictions, labels = eval_pred

    preds = predictions.argmax(axis=1)

    accuracy = accuracy_score(labels, preds)

    precision = precision_score(labels, preds, average="weighted")

    recall = recall_score(labels, preds, average="weighted")

    f1 = f1_score(labels, preds, average="weighted")

    return {

        "accuracy": accuracy,

        "precision": precision,

        "recall": recall,

        "f1": f1

    }



# Custom Cross-Encoder model class for classification

class CrossEncoderModel(torch.nn.Module):

    def __init__(self, model_name, num_classes=4, loss_fn="cross_entropy"):

        super(CrossEncoderModel, self).__init__()

        # Load model config

        self.config = AutoConfig.from_pretrained(model_name, num_labels=num_classes)

        self.model = AutoModelForSequenceClassification.from_pretrained(model_name, config=self.config)

        self.loss_fn = loss_fn



    def forward(self, input_ids, attention_mask, labels=None):

        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)

        logits = outputs.logits  # Output logits for classification



        loss = None

        if labels is not None:

            if self.loss_fn == "cross_entropy":

                loss_fct = torch.nn.CrossEntropyLoss()  # Use CrossEntropyLoss for classification

                loss = loss_fct(logits, labels)

            elif self.loss_fn == "focal_loss":

                # Focal loss implementation for handling class imbalance

                alpha = 0.25

                gamma = 2.0

                ce_loss = torch.nn.CrossEntropyLoss(reduction="none")(logits, labels)

                pt = torch.exp(-ce_loss)  # Probability of the true class

                loss = (alpha * (1 - pt) ** gamma * ce_loss).mean()

            elif self.loss_fn == "kl_divergence":

                # KL Divergence for soft-label classification

                kl_div = torch.nn.KLDivLoss(reduction="batchmean")

                soft_labels = torch.nn.functional.one_hot(labels, num_classes=self.config.num_labels).float()

                log_probs = torch.nn.functional.log_softmax(logits, dim=-1)

                loss = kl_div(log_probs, soft_labels)

            else:

                raise ValueError(f"Unsupported loss function: {self.loss_fn}")



        return {"loss": loss, "logits": logits}


# Function to initialize and train the cross-encoder model

def train_crossencoder(loss_fn):

    model = CrossEncoderModel(model_name=model_name, loss_fn=loss_fn)



    # Set up TrainingArguments

    training_args = TrainingArguments(

        output_dir=f"./output/TTTTempathy-crossencoder-{loss_fn}",

        evaluation_strategy="epoch",

        logging_dir='./logs',

        logging_steps=10,

        per_device_train_batch_size=wandb.config['batch_size'],

        per_device_eval_batch_size=wandb.config['batch_size'],

        num_train_epochs=wandb.config['epochs'],

        warmup_steps=100,

        learning_rate=wandb.config['learning_rate'],

        weight_decay=0.01,

        report_to="wandb",

        save_strategy="epoch",

        load_best_model_at_end=True,

        push_to_hub=True,

        save_total_limit=2

    )



    # Initialize Trainer

    trainer = Trainer(

        model=model,

        args=training_args,

        train_dataset=tokenized_train,

        eval_dataset=tokenized_val,

        tokenizer=tokenizer,

        compute_metrics=compute_metrics

    )



    # Train the model

    trainer.train()



    # Evaluate the model on the test set

    trainer.evaluate(tokenized_test)



    #trainer.model.save_pretrained(f"./output/TTTTempathy-crossencoder-{loss_fn}")



    # Save and push the model to the Hugging Face Hub

    #trainer.save_model(f"./output/TTTTempathy-crossencoder-{loss_fn}")

    #trainer.push_to_hub(f"minoosh/TTTTempathy-crossencoder-{loss_fn}")



    # End the wandb run

    wandb.finish()

    return trainer

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112901844444422, max=1.0…

README.md:   0%|          | 0.00/588 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/660k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/100k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/88.5k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2467 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/308 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/309 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



Map:   0%|          | 0/2467 [00:00<?, ? examples/s]

Map:   0%|          | 0/308 [00:00<?, ? examples/s]

Map:   0%|          | 0/309 [00:00<?, ? examples/s]

In [4]:
# Specify list of loss functions to try

loss_functions = ["cross_entropy", "focal_loss", "kl_divergence"]



loss_fn = loss_functions[0]  # Change to desired loss function

wandb.init(project="bert-crossencoder-classification", name=f"bert-crossencoder-classification-{loss_fn}", config={"epochs": 3, "batch_size": 16, "learning_rate": 2e-5})

tr = train_crossencoder(loss_fn)

wandb.finish()

VBox(children=(Label(value='0.016 MB of 0.016 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2236,1.136467,0.517799,0.525199,0.517799,0.457086
2,1.0094,0.998384,0.579288,0.574253,0.579288,0.563826
3,0.878,0.955802,0.598706,0.589249,0.598706,0.587973


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


VBox(children=(Label(value='0.029 MB of 0.029 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/accuracy,▁▆█▇
eval/f1,▁▇██
eval/loss,█▃▁▃
eval/precision,▁▆██
eval/recall,▁▆█▇
eval/runtime,▆█▆▁
eval/samples_per_second,▃▁▃█
eval/steps_per_second,▃▁▃█
train/epoch,▁▁▂▂▂▃▃▃▃▃▄▄▄▅▅▅▆▆▆▆▇▇▇█████
train/global_step,▁▁▂▂▂▃▃▃▃▃▄▄▄▅▅▅▆▆▆▆▇▇▇█████

0,1
eval/accuracy,0.59091
eval/f1,0.58041
eval/loss,1.01562
eval/precision,0.59106
eval/recall,0.59091
eval/runtime,5.1342
eval/samples_per_second,59.99
eval/steps_per_second,1.948
total_flos,0.0
train/epoch,3.0


# prediction

In [30]:
def compute_metrics2(preds, labels):

    #predictions, labels = eval_pred

    #preds = predictions.argmax(axis=1)

    accuracy = accuracy_score(labels, preds)

    precision = precision_score(labels, preds, average="weighted")

    recall = recall_score(labels, preds, average="weighted")

    f1 = f1_score(labels, preds, average="weighted")

    return {

        "accuracy": accuracy,

        "precision": precision,

        "recall": recall,

        "f1": f1

    }

In [40]:
# Load dataset

dataset = load_dataset("minoosh/EPITOME_pairs")



tokenizer = tr.tokenizer



# Preprocess data for the cross-encoder model by concatenating text1 and text2 with [SEP]

def preprocess_function(examples):

    # Concatenate both texts with a [SEP] token in between

    encodings = tokenizer(examples['text1'], examples['text2'], truncation=True, padding=True, max_length=512)

    encodings['labels'] = examples['label']  # Add labels

    return encodings



# Apply tokenization

tokenized_train = dataset['train'].map(preprocess_function, batched=True)

tokenized_test = dataset['test'].map(preprocess_function, batched=True)

tokenized_val = dataset['validation'].map(preprocess_function, batched=True)



# Set format for PyTorch

tokenized_train.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

tokenized_test.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

tokenized_val.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

Map:   0%|          | 0/2467 [00:00<?, ? examples/s]

Map:   0%|          | 0/308 [00:00<?, ? examples/s]

Map:   0%|          | 0/309 [00:00<?, ? examples/s]

In [41]:
wandb.init()
a = tr.predict(tokenized_test)
a

VBox(children=(Label(value='0.016 MB of 0.016 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))



PredictionOutput(predictions=array([[-0.45503232, -1.0947487 ,  1.9089473 , -0.24962118],
       [ 0.80990165,  0.97936547, -0.61233205, -0.8243233 ],
       [ 0.36533967, -0.05923421,  0.59694076, -0.83456737],
       ...,
       [ 0.98752993, -0.4267132 ,  1.1035475 , -1.1273757 ],
       [-0.8679848 , -1.2133474 ,  1.8939304 ,  0.01474193],
       [ 0.9232104 , -0.33683768,  1.0794307 , -1.1720507 ]],
      dtype=float32), label_ids=array([0, 1, 0, 0, 1, 2, 0, 3, 2, 2, 0, 0, 2, 2, 2, 1, 0, 1, 1, 2, 0, 3,
       2, 1, 3, 0, 0, 2, 3, 2, 2, 2, 0, 0, 2, 2, 2, 3, 2, 3, 1, 2, 2, 3,
       0, 1, 2, 2, 0, 2, 2, 3, 3, 2, 1, 3, 0, 1, 3, 2, 2, 1, 2, 2, 2, 0,
       0, 2, 2, 1, 0, 0, 2, 2, 2, 1, 2, 3, 2, 1, 0, 1, 0, 2, 0, 2, 2, 1,
       2, 2, 2, 0, 2, 1, 0, 2, 0, 1, 2, 0, 3, 2, 2, 1, 3, 2, 1, 1, 2, 3,
       3, 3, 3, 2, 1, 2, 0, 3, 0, 3, 3, 3, 3, 2, 2, 0, 0, 2, 1, 2, 0, 0,
       2, 0, 0, 2, 2, 2, 3, 0, 2, 2, 2, 2, 3, 2, 2, 0, 2, 2, 2, 2, 2, 2,
       0, 2, 2, 2, 2, 1, 3, 0, 1, 0, 1, 0, 0, 1, 

In [42]:
actual_labels = tokenized_test['labels'].tolist()
predicted_classes = list(a.predictions.argmax(axis=1))
res = compute_metrics2(actual_labels, predicted_classes)

res

{'accuracy': 0.5909090909090909,
 'precision': 0.6299192354580286,
 'recall': 0.5909090909090909,
 'f1': 0.6014127717614769}

# Predictions from saved model

In [72]:
# Load dataset

dataset = load_dataset("minoosh/EPITOME_pairs")


model_name = "minoosh/tmp_trainer"

tokenizer = AutoTokenizer.from_pretrained(model_name)


# Preprocess data for the cross-encoder model by concatenating text1 and text2 with [SEP]

def preprocess_function(examples):

    # Concatenate both texts with a [SEP] token in between

    encodings = tokenizer(examples['text1'], examples['text2'], truncation=True, padding=True, max_length=512)

    encodings['labels'] = examples['label']  # Add labels

    return encodings



# Apply tokenization

tokenized_train = dataset['train'].map(preprocess_function, batched=True)

tokenized_test = dataset['test'].map(preprocess_function, batched=True)

tokenized_val = dataset['validation'].map(preprocess_function, batched=True)



# Set format for PyTorch

tokenized_train.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

tokenized_test.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

tokenized_val.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

OSError: Can't load tokenizer for 'minoosh/tmp_trainer'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'minoosh/tmp_trainer' is the correct path to a directory containing all relevant files for a BertTokenizerFast tokenizer.

In [73]:
# Load the model and tokenizer

model_name = "minoosh/tmp_trainer"

model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Initialize the Trainer

trainer = Trainer(model=model)

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [74]:
b = trainer.predict(tokenized_test)

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


In [75]:
actual_labels = tokenized_test['labels'].tolist()
predicted_classes = list(b.predictions.argmax(axis=1))
res = compute_metrics2(actual_labels, predicted_classes)

res

{'accuracy': 0.5909090909090909,
 'precision': 0.6299192354580286,
 'recall': 0.5909090909090909,
 'f1': 0.6014127717614769}

In [48]:
actual_labels = tokenized_test['labels'].tolist()
predicted_classes = list(b.predictions.argmax(axis=1))
res = compute_metrics2(actual_labels, predicted_classes)

res

  _warn_prf(average, modifier, msg_start, len(result))


{'accuracy': 0.16233766233766234,
 'precision': 0.9967532467532467,
 'recall': 0.16233766233766234,
 'f1': 0.2792025901269598}

# This line saves moo

In [63]:
tr.model.model.save_pretrained("moo")

In [None]:
trainer.push_to_hub() #push 'moo' to hub

# saved from disk

In [64]:
# Load dataset

dataset = load_dataset("minoosh/EPITOME_pairs")


model_name = "moo"

tokenizer = AutoTokenizer.from_pretrained(model_name)


# Preprocess data for the cross-encoder model by concatenating text1 and text2 with [SEP]

def preprocess_function(examples):

    # Concatenate both texts with a [SEP] token in between

    encodings = tokenizer(examples['text1'], examples['text2'], truncation=True, padding=True, max_length=512)

    encodings['labels'] = examples['label']  # Add labels

    return encodings



# Apply tokenization

tokenized_train = dataset['train'].map(preprocess_function, batched=True)

tokenized_test = dataset['test'].map(preprocess_function, batched=True)

tokenized_val = dataset['validation'].map(preprocess_function, batched=True)



# Set format for PyTorch

tokenized_train.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

tokenized_test.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

tokenized_val.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

Map:   0%|          | 0/2467 [00:00<?, ? examples/s]

Map:   0%|          | 0/308 [00:00<?, ? examples/s]

Map:   0%|          | 0/309 [00:00<?, ? examples/s]

In [65]:
# Load the model and tokenizer

model_name = "moo"

model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Initialize the Trainer

trainer = Trainer(model=model)

In [66]:
c = trainer.predict(tokenized_test)

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


In [69]:
c

PredictionOutput(predictions=array([[-0.45503157, -1.0947485 ,  1.9089471 , -0.24962176],
       [ 0.80990183,  0.97936547, -0.6123324 , -0.8243238 ],
       [ 0.36533806, -0.05923416,  0.5969407 , -0.83456695],
       ...,
       [ 0.98752975, -0.42671373,  1.1035484 , -1.1273757 ],
       [-0.86798465, -1.2133476 ,  1.8939301 ,  0.01474158],
       [ 0.92321026, -0.33683765,  1.0794307 , -1.1720507 ]],
      dtype=float32), label_ids=array([0, 1, 0, 0, 1, 2, 0, 3, 2, 2, 0, 0, 2, 2, 2, 1, 0, 1, 1, 2, 0, 3,
       2, 1, 3, 0, 0, 2, 3, 2, 2, 2, 0, 0, 2, 2, 2, 3, 2, 3, 1, 2, 2, 3,
       0, 1, 2, 2, 0, 2, 2, 3, 3, 2, 1, 3, 0, 1, 3, 2, 2, 1, 2, 2, 2, 0,
       0, 2, 2, 1, 0, 0, 2, 2, 2, 1, 2, 3, 2, 1, 0, 1, 0, 2, 0, 2, 2, 1,
       2, 2, 2, 0, 2, 1, 0, 2, 0, 1, 2, 0, 3, 2, 2, 1, 3, 2, 1, 1, 2, 3,
       3, 3, 3, 2, 1, 2, 0, 3, 0, 3, 3, 3, 3, 2, 2, 0, 0, 2, 1, 2, 0, 0,
       2, 0, 0, 2, 2, 2, 3, 0, 2, 2, 2, 2, 3, 2, 2, 0, 2, 2, 2, 2, 2, 2,
       0, 2, 2, 2, 2, 1, 3, 0, 1, 0, 1, 0, 0, 1, 

In [70]:
actual_labels = tokenized_test['labels'].tolist()
predicted_classes = list(c.predictions.argmax(axis=1))
res = compute_metrics2(actual_labels, predicted_classes)

res

{'accuracy': 0.5909090909090909,
 'precision': 0.6299192354580286,
 'recall': 0.5909090909090909,
 'f1': 0.6014127717614769}

In [71]:
trainer.push_to_hub()

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.24k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/minoosh/tmp_trainer/commit/271cb78d69d72f989b362e7f4e74900e4003b2dd', commit_message='End of training', commit_description='', oid='271cb78d69d72f989b362e7f4e74900e4003b2dd', pr_url=None, repo_url=RepoUrl('https://huggingface.co/minoosh/tmp_trainer', endpoint='https://huggingface.co', repo_type='model', repo_id='minoosh/tmp_trainer'), pr_revision=None, pr_num=None)