In [1]:
import logging
from dataclasses import dataclass, field
import os
import sys
from typing import Any, Dict, List, Optional, Tuple, Union

import torch
from torch import nn

from accelerate import Accelerator, DistributedDataParallelKwargs
from accelerate.logging import get_logger

import transformers
from transformers import (
    MODEL_FOR_MASKED_LM_MAPPING,
    HfArgumentParser,
    TrainingArguments,
    Trainer,
    TrainerCallback,
    set_seed,
)
from transformers.trainer_utils import seed_worker

from peft import LoraConfig, get_peft_model

from llm2vec import LLM2Vec
from llm2vec.dataset.utils import load_dataset
from llm2vec.loss.utils import load_loss

from tqdm import tqdm

# Keys of the masked-LM model mapping, and the `model_type` attribute of each key.
MODEL_CONFIG_CLASSES = [*MODEL_FOR_MASKED_LM_MAPPING.keys()]
MODEL_TYPES = tuple(config_cls.model_type for config_cls in MODEL_CONFIG_CLASSES)

In [2]:
from bg2vec.arguments import simcse_parser
    
# Read the four argument groups (model, data, training, custom) from the JSON
# run configuration.
model_args, data_args, training_args, custom_args = simcse_parser.parse_json_file(
    "model_configurations/bggpt-7b-simcse.json"
)

# Kwargs handlers for the Accelerator: a single DistributedDataParallelKwargs
# entry when unused-parameter detection is requested, otherwise none.
kwargs = (
    [
        DistributedDataParallelKwargs(
            dim=0,
            broadcast_buffers=True,
            bucket_cap_mb=25,
            find_unused_parameters=True,
            check_reduction=False,
            gradient_as_bucket_view=False,
        )
    ]
    if training_args.ddp_find_unused_parameters
    else []
)

In [3]:
# Build the Accelerator with whichever kwargs handlers were selected above.
accelerator = Accelerator(kwargs_handlers=kwargs)

# Seed from the training configuration so runs are repeatable.
set_seed(training_args.seed)

# When gradient checkpointing is enabled, request the non-reentrant variant.
if training_args.gradient_checkpointing:
    training_args.gradient_checkpointing_kwargs = dict(use_reentrant=False)

In [4]:
import datasets

In [5]:
# Load the "mboyanov/bgwiki" dataset by its hub identifier; the result is
# indexed by split name further down (e.g. bgwiki['train']).
bgwiki = datasets.load_dataset("mboyanov/bgwiki")

In [6]:
from bg2vec.data_util import PairedDataset

In [7]:
# Wrap the train split in the project's PairedDataset.
# NOTE(review): presumably this yields the text pairs SimCSE trains on --
# confirm against bg2vec.data_util.
train_dataset = PairedDataset(bgwiki['train'])

In [8]:
# Pull every item of the dataset into a plain list. The progress bar is only
# shown on the main process.
train_examples = [
    train_dataset[idx]
    for idx in tqdm(
        range(len(train_dataset)),
        desc="Loading train examples...",
        disable=not accelerator.is_main_process,
    )
]

# Resolve the configured dtype: "auto" and None pass through unchanged, any
# other value is looked up as an attribute of the torch module.
if model_args.torch_dtype in ("auto", None):
    torch_dtype = model_args.torch_dtype
else:
    torch_dtype = getattr(torch, model_args.torch_dtype)
    

Loading train examples...: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 294757/294757 [00:00<00:00, 687366.68it/s]


In [9]:
# Display the parsed model arguments as a sanity check before loading weights.
model_args

SimCSEModelArguments(model_name_or_path='INSAIT-Institute/BgGPT-7B-Instruct-v0.2', peft_model_name_or_path='mboyanov/bggpt-mntp', bidirectional=True, max_seq_length=512, torch_dtype='bfloat16', attn_implementation='sdpa', pooling_mode='mean')

In [10]:
# Build the LLM2Vec encoder from the configured base checkpoint and PEFT
# adapter. All values come from the parsed arguments except the cache location.
model = LLM2Vec.from_pretrained(
    # Checkpoints to load.
    base_model_name_or_path=model_args.model_name_or_path,
    peft_model_name_or_path=model_args.peft_model_name_or_path,
    merge_peft=True,
    # NOTE(review): absolute, machine-specific path -- consider making it configurable.
    cache_dir="/data/bggpt/",
    # Backbone options.
    enable_bidirectional=model_args.bidirectional,
    torch_dtype=torch_dtype,
    attn_implementation=model_args.attn_implementation,
    attention_dropout=custom_args.simcse_dropout,
    # Encoder options.
    pooling_mode=model_args.pooling_mode,
    max_length=model_args.max_seq_length,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [13]:
# IPython introspection (`??` prints the signature and source of the object).
# Development aid only: this is not plain Python and can be dropped from a
# finished notebook.
model.encode??

[0;31mSignature:[0m
[0mmodel[0m[0;34m.[0m[0mencode[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0msentences[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbatch_size[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m32[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mshow_progress_bar[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mconvert_to_numpy[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mconvert_to_tensor[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdevice[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mstr[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
    [0;32mdef[0m [0mencode[0m[0;34m([0m[0;34m[0m


In [15]:
# Smoke test: encode two short strings on CPU and display the shape of the
# resulting embeddings.
# NOTE(review): `.cpu()` is called on the model object itself, so (assuming
# standard torch.nn.Module semantics) the model stays on CPU after this cell
# until something moves it back -- confirm this is intended before training.
model.cpu().encode(["hello","hi"], device='cpu').shape

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

torch.Size([2, 4096])

In [16]:
from bg2vec.model import initialize_peft

In [17]:
# Display the module tree of the loaded encoder (before LoRA adapters are added below).
model

LLM2Vec(
  (model): MistralBiModel(
    (embed_tokens): Embedding(38000, 4096)
    (layers): ModuleList(
      (0-31): 32 x ModifiedMistralDecoderLayer(
        (self_attn): ModifiedMistralSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
    (norm): MistralR

In [18]:
# The LLM2Vec wrapper keeps the Hugging Face backbone in its `.model` attribute,
# so the PEFT adapters are attached to that inner module, not to the wrapper.
# NOTE: this cell replaces `model.model` with the result, so re-running it
# applies initialize_peft to an already-wrapped model.
model.model = initialize_peft(
    model.model,
    lora_r=custom_args.lora_r,
    lora_alpha=custom_args.lora_r * 2,  # alpha is fixed at twice the rank
    lora_dropout=custom_args.lora_dropout,
)

Model's Lora trainable parameters:
trainable params: 41,943,040 || all params: 7,177,179,136 || trainable%: 0.5843944982453898


In [19]:
# Reuse the tokenizer exposed by the LLM2Vec wrapper; it is handed to the trainer below.
tokenizer = model.tokenizer

In [20]:
from bg2vec.training import SimCSEDefaultCollator, SimCSETrainer

In [21]:
# Build the training loss from the configured loss class name and scale.
train_loss = load_loss(custom_args.loss_class, scale=custom_args.loss_scale)

# The collator is constructed from the model object itself.
# NOTE(review): presumably it uses the model's tokenization helpers to batch
# examples -- confirm in bg2vec.training.
data_collator = SimCSEDefaultCollator(model)

In [23]:
class StopTrainingCallback(TrainerCallback):
    """Ask the trainer to stop once a fixed number of steps has completed.

    The notebook referenced this callback without defining or importing it,
    so a fresh kernel raised NameError whenever `stop_after_n_steps` was set.

    Args:
        stop_after_n_steps: global step count at which training should stop.
    """

    def __init__(self, stop_after_n_steps: int):
        self.stop_after_n_steps = stop_after_n_steps

    def on_step_end(self, args, state, control, **kwargs):
        """Flag the control object once the global step reaches the limit."""
        if state.global_step >= self.stop_after_n_steps:
            control.should_training_stop = True


# Assemble the trainer from the pieces built in the earlier cells.
trainer = SimCSETrainer(
    model=model,
    args=training_args,
    train_dataset=train_examples,
    data_collator=data_collator,
    tokenizer=tokenizer,
    loss_function=train_loss,
)

# Optional early stop, useful for short debugging runs.
if custom_args.stop_after_n_steps is not None:
    trainer.add_callback(StopTrainingCallback(custom_args.stop_after_n_steps))

trainer.train()

OutOfMemoryError: CUDA out of memory. Tried to allocate 32.00 MiB. GPU 0 has a total capacity of 7.75 GiB of which 13.06 MiB is free. Including non-PyTorch memory, this process has 7.73 GiB memory in use. Of the allocated memory 7.50 GiB is allocated by PyTorch, and 138.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)