In [None]:
! pip install datasets transformers evaluate peft bitsandbytes
! pip install accelerate -U



In [None]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset
from transformers import Trainer, TrainingArguments, PretrainedConfig, PreTrainedModel, DataCollatorWithPadding
from transformers.modeling_outputs import ModelOutput
from peft import LoraConfig, LoftQConfig, TaskType, get_peft_model
from dataclasses import dataclass
import evaluate
from typing import Optional

# Choose the compute backend in order of preference: CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'
# Trailing expression so the notebook displays the selected backend.
device

'cuda'

In [None]:
class E5DataLoader:
    """Load a CSV of query/passage pairs and expose tokenized train/eval splits.

    The CSV must provide the columns ``comment`` (query text), ``description``
    (passage text) and ``label``. The ``label`` column is class-encoded and used
    to stratify the train/test split. Tokenization happens lazily through
    ``set_transform``, so rows are encoded when they are accessed.

    Args:
        tokenizer: Callable tokenizer applied to the prefixed texts.
        data_file: Path (or paths) handed to ``load_dataset("csv", data_files=...)``.
        test_size: Fraction of rows held out for evaluation. Defaults to 0.2.
        seed: Optional seed for the split; ``None`` keeps the split unseeded.
        max_length: Truncation length passed to the tokenizer. Defaults to 512.

    Attributes:
        train_dataset: Training split with the tokenizing transform attached.
        eval_dataset: Held-out split with the tokenizing transform attached.
    """

    def __init__(self, tokenizer, data_file, test_size=0.2, seed=None, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length
        dataset = load_dataset("csv", data_files=data_file, split='train')
        dataset = dataset.class_encode_column('label')
        # Passing a seed makes the stratified split repeatable across kernel restarts.
        dataset = dataset.train_test_split(test_size=test_size,
                                           stratify_by_column='label',
                                           seed=seed)
        self.train_dataset, self.eval_dataset = dataset['train'], dataset['test']
        self.train_dataset.set_transform(self._transform)
        self.eval_dataset.set_transform(self._transform)

    def _encode(self, texts):
        """Tokenize ``texts`` with truncation at ``self.max_length`` (no padding here)."""
        return self.tokenizer(texts,
                              max_length=self.max_length,
                              truncation=True,
                              )

    def _transform(self, examples):
        """Tokenize one batch of rows into ``q_*`` / ``d_*`` features plus ``label``.

        Args:
            examples: Batch dict with ``comment``, ``description`` and ``label`` lists.

        Returns:
            Dict holding the query encoding under ``q_``-prefixed keys, the
            passage encoding under ``d_``-prefixed keys, and the untouched
            ``label`` list.
        """
        docs = [f'passage: {doc}' for doc in examples['description']]
        queries = [f'query: {query}' for query in examples['comment']]

        assert len(docs) == len(queries)
        assert len(queries) == len(examples['label'])

        merged_batch_dict = {}
        # Queries are encoded first, then passages; the prefixes keep the two
        # encodings apart inside a single flat dict.
        for prefix, texts in (('q_', queries), ('d_', docs)):
            for k, v in self._encode(texts).items():
                merged_batch_dict[f'{prefix}{k}'] = v

        merged_batch_dict['label'] = examples['label']

        return merged_batch_dict

In [None]:
class E5Trainer(Trainer):
    """Trainer that uses the loss the wrapped model computes in its own forward pass."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model on ``inputs`` and return its ``loss`` attribute.

        Args:
            model: The model to call; invoked as ``model(**inputs)``.
            inputs: Batch dict produced by the data collator.
            return_outputs: When true, return ``(loss, outputs)`` instead of ``loss``.
            num_items_in_batch: Accepted because newer ``transformers`` releases pass
                it to ``compute_loss``; unused, since the model produces the loss itself.

        Returns:
            The loss, or a ``(loss, outputs)`` tuple when ``return_outputs`` is true.
        """
        outputs = model(**inputs)
        loss = outputs.loss

        return (loss, outputs) if return_outputs else loss

In [None]:
@dataclass
class E5Output(ModelOutput):
    """Structured result returned by ``E5.forward``.

    NOTE(review): the field order is presumably significant — ``compute_metrics``
    reads ``predictions[0]`` and ``predictions[1]`` as the query and document
    embeddings. Verify before reordering fields.
    """
    # Embeddings taken from the first half of the batch.
    query_embeddings: Optional[torch.Tensor] = None
    # Embeddings taken from the second half of the batch.
    doc_embeddings: Optional[torch.Tensor] = None
    # Loss produced by the model's nn.CosineEmbeddingLoss.
    loss: Optional[torch.Tensor] = None
    # Labels that were passed to forward, echoed back unchanged.
    labels: Optional[torch.Tensor] = None

class E5Config(PretrainedConfig):
    """Configuration object for the ``E5`` wrapper model.

    Args:
        num_labels: Value stored on the config (``E5.__init__`` copies it to
            ``self.num_labels``). Defaults to 2.
        **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.
    """
    # Identifier string for this configuration class.
    model_type = 'E5'

    def __init__(self, num_labels=2, **kwargs):
        super().__init__(**kwargs)
        # Assigned after the parent initialiser has run.
        self.num_labels = num_labels


class E5(PreTrainedModel):
    """E5 bi-encoder trained with a cosine-embedding loss on query/passage pairs.

    A batch is expected to hold all queries in its first half and the matching
    passages, in the same order, in its second half (``E5DataCollator`` builds
    batches as ``queries + docs``).
    """
    config_class = E5Config

    def __init__(self, config):
        super(E5, self).__init__(config)
        self.num_labels = config.num_labels
        self.e5 = AutoModel.from_pretrained('intfloat/multilingual-e5-large')
        self.cosine_loss = nn.CosineEmbeddingLoss()

    @staticmethod
    def _average_pool(last_hidden_state, attention_mask):
        """Mean of the token states over non-padding positions (one vector per row)."""
        keep = attention_mask.unsqueeze(-1).bool()
        summed = last_hidden_state.masked_fill(~keep, 0.0).sum(dim=1)
        # clamp guards against a division by zero for an all-padding row.
        counts = attention_mask.sum(dim=1, keepdim=True).clamp(min=1)
        return summed / counts

    def forward(self, input_ids, attention_mask, labels=None, **kwargs):
        """Embed a ``queries + docs`` batch and, if labels are given, compute the loss.

        Args:
            input_ids: Token ids for the stacked batch (queries first, then docs).
            attention_mask: Matching attention mask; also used for pooling.
            labels: Optional per-pair labels. Values greater than 0 are treated as
                matching pairs (target ``+1``), everything else as non-matching
                (target ``-1``). When omitted no loss is computed.
            **kwargs: Ignored; accepted so wrappers may pass extra arguments.

        Returns:
            ``E5Output`` with the query/doc embeddings, the loss (``None`` without
            labels) and the labels as received.
        """
        e5_outputs = self.e5(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True
        )
        # The E5 model card's usage example pools with a masked mean over
        # last_hidden_state rather than reading pooler_output.
        embeddings = self._average_pool(e5_outputs.last_hidden_state, attention_mask)

        half = embeddings.shape[0] // 2
        query_embeddings = embeddings[:half]
        doc_embeddings = embeddings[half:]

        loss = None
        if labels is not None:
            # nn.CosineEmbeddingLoss only defines a loss for targets of 1 and -1;
            # a 0 target contributes nothing, so class-0 pairs must be mapped to -1
            # or the negative examples never influence training.
            target = torch.where(labels > 0, 1, -1)
            loss = self.cosine_loss(input1=doc_embeddings,
                                    input2=query_embeddings,
                                    target=target
                                    )
        return E5Output(loss=loss,
                        query_embeddings=query_embeddings,
                        doc_embeddings=doc_embeddings,
                        labels=labels
                        )

In [None]:
class E5DataCollator(DataCollatorWithPadding):
    """Pad query and passage encodings together into one ``queries + docs`` batch."""

    def __call__(self, examples):
        """Collate examples carrying ``q_*`` / ``d_*`` features and a ``label``.

        Args:
            examples: List of dicts whose query features are stored under
                ``q_``-prefixed keys, passage features under ``d_``-prefixed keys,
                and the class under ``label``.

        Returns:
            The padded batch from ``self.tokenizer.pad`` — all queries followed by
            all passages — with an added ``labels`` tensor holding one entry per
            example.
        """
        q_prefix, d_prefix = 'q_', 'd_'

        def strip_prefix(example, prefix):
            # Match on a real prefix: the key is sliced by len(prefix), which is
            # only correct when the prefix sits at the start of the key.
            return {k[len(prefix):]: v for k, v in example.items() if k.startswith(prefix)}

        queries = [strip_prefix(example, q_prefix) for example in examples]
        docs = [strip_prefix(example, d_prefix) for example in examples]

        batch_collated = self.tokenizer.pad(
            queries + docs,
            padding=self.padding,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors=self.return_tensors
        )

        batch_collated['labels'] = torch.tensor([example['label'] for example in examples])

        return batch_collated

In [None]:
# Tokenizer and lazily tokenized train/eval splits.
tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-large')
data_file = 'temp_data.csv'
data_loader = E5DataLoader(tokenizer,data_file=data_file)
train_data = data_loader.train_dataset
eval_data = data_loader.eval_dataset
# Cosine-similarity cut-off above which compute_metrics predicts the positive class.
threshold = 0.7

# LoftQ initialisation is only requested when running on CUDA.
use_loftq = device == 'cuda'
loftq_config = LoftQConfig(loftq_bits=8)
peft_config = LoraConfig(task_type=TaskType.SEQ_CLS if use_loftq else None,
                         # Off CUDA, fall back to the default LoRA initialisation (True).
                         # The previous fallback, an empty dict, is falsy and so is
                         # presumably treated by PEFT like False, i.e. a fully random
                         # adapter initialisation.
                         init_lora_weights="loftq" if use_loftq else True,
                         loftq_config=loftq_config,
                         # Adapt only the attention query/key projections.
                         target_modules=[
                           'query',
                           'key'
                         ],
                         inference_mode=False,
                         r=8,
                         lora_alpha=32,
                         lora_dropout=0.1
                         )


# Build the base model and wrap it with the LoRA adapters.
config = E5Config()
model = E5(config)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
print(model)

data_collator = E5DataCollator(
    tokenizer=tokenizer,
    max_length=512
)

def compute_metrics(eval_pred):
    """Score query/passage similarity predictions against the reference labels.

    The first two entries of ``eval_pred.predictions`` are read as the query and
    document embeddings. A pair is predicted positive (1) when the cosine
    similarity of its two embeddings exceeds the module-level ``threshold``,
    otherwise negative (0).

    Returns:
        The result of the combined accuracy / f1 / precision / recall metric.
    """
    query_vecs, doc_vecs = (torch.from_numpy(eval_pred.predictions[i]) for i in (0, 1))
    assert len(query_vecs) == len(doc_vecs)

    # Row-wise similarity between each query and its paired document.
    scores = torch.cosine_similarity(query_vecs, doc_vecs, dim=1)
    predicted = torch.where(scores > threshold, 1, 0)

    references = torch.tensor(eval_pred.label_ids, dtype=int)
    scorer = evaluate.combine(["accuracy", "f1", "precision", "recall"])
    return scorer.compute(predictions=predicted, references=references)


training_args = TrainingArguments(
    # Where checkpoints are written.
    output_dir='saved_models/e5nn',
    # Optimisation settings.
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Step-based evaluation, logging and checkpointing. The fractional step values
    # are presumably read as a share of the total training steps (the logged run
    # reports every 15 of 73 steps).
    evaluation_strategy='steps',
    logging_steps=0.2,
    save_strategy='steps',
    save_steps=0.2,
    load_best_model_at_end=True,
    # Keep the q_*/d_* columns so the custom collator can read them.
    remove_unused_columns=False,
    label_names=['labels'],
)

trainer = E5Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_metrics=compute_metrics,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


E5(
  (e5): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              

In [None]:
# Launch fine-tuning with the `trainer` built in the previous cell.
trainer.train()

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([-0.6524,  0.3310, -0.5092,  0.8414, -0.3137], device='cuda:0',
       grad_fn=<SliceBackward0>) tensor([-0.7115,  0.2365, -0.4976,  0.9175, -0.3596], device='cuda:0',
       grad_fn=<SliceBackward0>) torch.Size([4, 1024])


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
15,0.0769,0.003709,0.945946,0.972222,0.945946,1.0
30,0.0198,0.000882,0.945946,0.972222,0.945946,1.0
45,0.0119,0.000545,0.945946,0.972222,0.945946,1.0
60,0.012,0.000456,0.945946,0.972222,0.945946,1.0


tensor([-0.6379,  0.1746, -0.7789,  0.8625, -0.2154], device='cuda:0',
       grad_fn=<SliceBackward0>) tensor([-0.5069,  0.5813, -0.2542,  0.7018, -0.4372], device='cuda:0',
       grad_fn=<SliceBackward0>) torch.Size([4, 1024])
tensor([-0.2040,  0.4908, -0.0055,  0.9404, -0.3184], device='cuda:0',
       grad_fn=<SliceBackward0>) tensor([-0.8366,  0.3551, -0.2691,  0.7686, -0.6373], device='cuda:0',
       grad_fn=<SliceBackward0>) torch.Size([4, 1024])
tensor([-0.7290,  0.7581, -0.6951,  0.8206, -0.5719], device='cuda:0',
       grad_fn=<SliceBackward0>) tensor([-0.7879,  0.6017, -0.5773,  0.9162, -0.5781], device='cuda:0',
       grad_fn=<SliceBackward0>) torch.Size([4, 1024])
tensor([-0.5708,  0.5636, -0.5094,  0.9093, -0.5205], device='cuda:0',
       grad_fn=<SliceBackward0>) tensor([-0.6929,  0.4823, -0.6933,  0.8674, -0.6882], device='cuda:0',
       grad_fn=<SliceBackward0>) torch.Size([4, 1024])
tensor([-0.7797,  0.6060, -0.7878,  0.8839, -0.7012], device='cuda:0',
       gr

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

tensor([-0.8048,  0.5707, -0.6840,  0.9106, -0.7562], device='cuda:0',
       grad_fn=<SliceBackward0>) tensor([-0.8414,  0.6095, -0.7559,  0.8693, -0.6884], device='cuda:0',
       grad_fn=<SliceBackward0>) torch.Size([4, 1024])
tensor([-0.7356,  0.5174, -0.7485,  0.7228, -0.7475], device='cuda:0',
       grad_fn=<SliceBackward0>) tensor([-0.7174,  0.7276, -0.6974,  0.9350, -0.7841], device='cuda:0',
       grad_fn=<SliceBackward0>) torch.Size([4, 1024])
tensor([-0.8361,  0.6145, -0.7594,  0.8695, -0.8305], device='cuda:0',
       grad_fn=<SliceBackward0>) tensor([-0.7947,  0.6792, -0.6512,  0.9348, -0.6874], device='cuda:0',
       grad_fn=<SliceBackward0>) torch.Size([4, 1024])
tensor([-0.8050,  0.5161, -0.6461,  0.8926, -0.7077], device='cuda:0',
       grad_fn=<SliceBackward0>) tensor([-0.7051,  0.6861, -0.5122,  0.8982, -0.7493], device='cuda:0',
       grad_fn=<SliceBackward0>) torch.Size([4, 1024])
tensor([-0.8025,  0.6727, -0.7428,  0.9232, -0.8356], device='cuda:0',
       gr

TrainOutput(global_step=73, training_loss=0.02665455210698794, metrics={'train_runtime': 365.5396, 'train_samples_per_second': 0.399, 'train_steps_per_second': 0.2, 'total_flos': 203895011450880.0, 'train_loss': 0.02665455210698794, 'epoch': 1.0})