In [1]:
import datasets

In [9]:
import transformers

In [2]:
from bg2vec.arguments import *
from bg2vec.data_util import *
from bg2vec.model import *
from bg2vec.training import *

In [3]:
import os 
# SECURITY: a literal Hugging Face access token was previously pasted on this line.
# Treat that token as leaked (revoke it) and provide HF_TOKEN through the process
# environment or a secrets manager instead of committing it to the notebook.
# NOTE: prefer assigning to os.environ["HF_TOKEN"] over os.putenv(), since
# os.putenv() does not update os.environ.

In [11]:
# Read the four argument groups for this run from the JSON configuration file.
parsed_argument_groups = parser.parse_json_file("model_configurations/bggpt-7b.json")
model_args, data_args, training_args, custom_args = parsed_argument_groups

# Seed the RNGs up front, before any model is instantiated.
transformers.set_seed(training_args.seed)

# When gradient checkpointing is requested, ask for the non-reentrant variant.
if training_args.gradient_checkpointing:
    training_args.gradient_checkpointing_kwargs = {"use_reentrant": False}

# Keyword arguments forwarded to ``AutoConfig.from_pretrained`` further down.
config_kwargs = {
    "cache_dir": model_args.cache_dir,
    "revision": model_args.model_revision,
    "token": model_args.token,
    "trust_remote_code": model_args.trust_remote_code,
}

## Loading the model

In [11]:
from transformers import AutoConfig

# Load only the configuration of the checkpoint named in the model arguments.
checkpoint_name = model_args.model_name_or_path
config = AutoConfig.from_pretrained(checkpoint_name, **config_kwargs)

# Pick the model class that matches this configuration and display it.
model_class = get_model_class(config)
model_class

llm2vec.models.bidirectional_mistral.MistralBiForMNTP

In [13]:
# ``torch`` is needed right here for the dtype lookup. The notebook's only other
# explicit ``import torch`` sits in a later cell, so import it in this cell to keep
# a fresh top-to-bottom run from depending on out-of-order execution.
import torch

# Resolve the dtype option: "auto" and None are passed through unchanged, any
# other value is interpreted as the name of an attribute on ``torch``
# (e.g. "bfloat16" -> torch.bfloat16).
if model_args.torch_dtype in ["auto", None]:
    torch_dtype = model_args.torch_dtype
else:
    torch_dtype = getattr(torch, model_args.torch_dtype)

# Instantiate the selected model class from the pretrained checkpoint.
# ``from_tf`` is enabled only when the checkpoint name contains ".ckpt".
# NOTE(review): the recorded output warns that Flash Attention 2.0 is requested
# while the model is initialised off-GPU; the model has to end up on a GPU
# before it is run — confirm the trainer takes care of this.
model = model_class.from_pretrained(
    model_args.model_name_or_path,
    from_tf=".ckpt" in model_args.model_name_or_path,
    config=config,
    cache_dir=model_args.cache_dir,
    revision=model_args.model_revision,
    token=model_args.token,
    trust_remote_code=model_args.trust_remote_code,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=model_args.low_cpu_mem_usage,
    attn_implementation=model_args.attn_implementation,
)

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [14]:
# Echo the loaded model so the notebook renders its module tree.
model

MistralBiForMNTP(
  (model): MistralBiModel(
    (embed_tokens): Embedding(38000, 4096)
    (layers): ModuleList(
      (0-31): 32 x ModifiedMistralDecoderLayer(
        (self_attn): ModifiedMistralFlashAttention2(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
    (norm

In [12]:
# Display the ``is_causal`` flag of the first decoder layer's attention module.
# This cell needs ``model`` from the model-loading cell above.
first_layer_attention = model.model.layers[0].self_attn
first_layer_attention.is_causal

NameError: name 'model' is not defined

In [24]:
# Attach LoRA adapters; alpha is fixed at twice the configured rank.
# NOTE(review): this rebinds ``model`` to the result of wrapping itself, so the
# cell is presumably not safe to run twice without reloading the model first.
lora_rank = custom_args.lora_r
model = initialize_peft(
    model,
    lora_r=lora_rank,
    lora_alpha=2 * lora_rank,
    lora_dropout=custom_args.lora_dropout,
)

Model's Lora trainable parameters:
trainable params: 41,943,040 || all params: 7,177,179,136 || trainable%: 0.5843944982453898


In [34]:
# Display the first decoder layer to check that its projections are now LoRA-wrapped.
wrapped_decoder_layers = model.model.base_model.model.layers
wrapped_decoder_layers[0]

ModifiedMistralDecoderLayer(
  (self_attn): ModifiedMistralFlashAttention2(
    (q_proj): lora.Linear(
      (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
      (lora_dropout): ModuleDict(
        (default): Dropout(p=0.05, inplace=False)
      )
      (lora_A): ModuleDict(
        (default): Linear(in_features=4096, out_features=16, bias=False)
      )
      (lora_B): ModuleDict(
        (default): Linear(in_features=16, out_features=4096, bias=False)
      )
      (lora_embedding_A): ParameterDict()
      (lora_embedding_B): ParameterDict()
    )
    (k_proj): lora.Linear(
      (base_layer): Linear(in_features=4096, out_features=1024, bias=False)
      (lora_dropout): ModuleDict(
        (default): Dropout(p=0.05, inplace=False)
      )
      (lora_A): ModuleDict(
        (default): Linear(in_features=4096, out_features=16, bias=False)
      )
      (lora_B): ModuleDict(
        (default): Linear(in_features=16, out_features=1024, bias=False)
      )
      (l

## Setting up data collation

In [23]:
from transformers import AutoTokenizer

# Options forwarded to the tokenizer loader; they mirror the model arguments.
tokenizer_kwargs = {
    "cache_dir": model_args.cache_dir,
    "use_fast": model_args.use_fast_tokenizer,
    "revision": model_args.model_revision,
    "token": model_args.token,
    "trust_remote_code": model_args.trust_remote_code,
}

# Load the tokenizer from the same checkpoint as the model and display it.
tokenizer_checkpoint = model_args.model_name_or_path
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint, **tokenizer_kwargs)
tokenizer

LlamaTokenizerFast(name_or_path='INSAIT-Institute/BgGPT-7B-Instruct-v0.2', vocab_size=38000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<unk>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [31]:
# Give the tokenizer a mask token when it has none, according to
# ``custom_args.mask_token_type``:
#   "blank" -> the literal "_"
#   "eos"   -> the tokenizer's end-of-sequence token
#   "mask"  -> a newly added "<mask>" token
# Any other value is rejected with a ValueError.
# NOTE(review): the "mask" option grows the tokenizer vocabulary; no cell visible
# in this notebook resizes the model embeddings — confirm before using it.
if tokenizer.mask_token is None:
    requested_mask_type = custom_args.mask_token_type
    if requested_mask_type == "mask":
        tokenizer.add_tokens(["<mask>"])
        tokenizer.mask_token = "<mask>"
    elif requested_mask_type in ("blank", "eos"):
        tokenizer.mask_token = (
            "_" if requested_mask_type == "blank" else tokenizer.eos_token
        )
    else:
        raise ValueError(
            f"mask_token_type {custom_args.mask_token_type} is not supported."
        )

# Fall back to the end-of-sequence token for padding when no pad token is set.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [70]:
# Build the masking collator from the tokenizer and the configured masking probability.
masking_probability = data_args.mlm_probability
data_collator = DataCollatorForLanguageModelingWithFullMasking(
    mlm_probability=masking_probability,
    tokenizer=tokenizer,
)

In [71]:
import torch

In [72]:
# Look up the vocabulary id of "_" in the collator's tokenizer; "_" is the string
# assigned as mask token when mask_token_type is "blank".
data_collator.tokenizer.vocab['_']

28730

In [78]:
# Smoke-test the collator on one random sequence of ten token ids drawn from [0, 10).
# The collator is handed a one-element tuple containing that tensor.
random_token_ids = torch.randint(0, 10, (1, 10))
data_collator((random_token_ids,))

{'input_ids': tensor([[[28730,     6,     9, 28730, 28730, 28730, 28730, 28730, 28730,     4]]]),
 'labels': tensor([[[   4, -100, -100,    8,    0,    4,    7,    0,    5, -100]]])}

## Loading up the dataset

In [None]:
# Load the dataset stored on disk under "grouped_512"; the following cells read
# its "train" and "validation" splits.
# NOTE(review): presumably already tokenized and grouped into 512-token blocks —
# confirm against the preprocessing step that produced it.
tokenized_datasets = datasets.load_from_disk("grouped_512")

In [None]:
# Take the training split and, when a cap is configured, keep only its first rows.
train_dataset = tokenized_datasets["train"]
train_sample_cap = data_args.max_train_samples
if train_sample_cap is not None:
    # Never request more rows than the split actually holds.
    max_train_samples = min(train_sample_cap, len(train_dataset))
    train_dataset = train_dataset.select(range(max_train_samples))

In [None]:
# Take the validation split and, when a cap is configured, keep only its first rows.
eval_dataset = tokenized_datasets["validation"]
eval_sample_cap = data_args.max_eval_samples
if eval_sample_cap is not None:
    # Never request more rows than the split actually holds.
    max_eval_samples = min(eval_sample_cap, len(eval_dataset))
    eval_dataset = eval_dataset.select(range(max_eval_samples))

## Setting up the trainer

In [80]:
# Metric hooks are only attached when evaluation is requested and no TPU is in use.
metrics_enabled = training_args.do_eval and not is_torch_tpu_available()

# Datasets are passed only for the phases that are switched on in the training arguments.
trainer = MNTPTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset if training_args.do_train else None,
    eval_dataset=eval_dataset if training_args.do_eval else None,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics if metrics_enabled else None,
    preprocess_logits_for_metrics=(
        preprocess_logits_for_metrics if metrics_enabled else None
    ),
)

# Register the callback built from the configured step limit.
stop_callback = StopTrainingCallback(custom_args.stop_after_n_steps)
trainer.add_callback(stop_callback)

# Remove the Weights & Biases callback from the trainer's callback handler.
wandb_callback_class = transformers.integrations.integration_utils.WandbCallback
trainer.callback_handler.remove_callback(wandb_callback_class)

NameError: name 'train_dataset' is not defined

## Training

In [79]:
# Run training and keep the result; its ``metrics`` attribute is read in the
# export section below.
train_result = trainer.train()    

NameError: name 'trainer' is not defined

## Model export

In [None]:
# Write the model to the local directory "bggpt-mntp-pretrained-2".
# NOTE(review): presumably this stores the trained LoRA adapter rather than the
# full base weights — confirm what initialize_peft returns.
model.save_pretrained("bggpt-mntp-pretrained-2")

In [None]:
# Persist the trained model through the trainer (the tokenizer is saved along
# with it, which makes uploading easier).
trainer.save_model()
metrics = train_result.metrics

# Record how many training samples were used: the configured cap when one is
# set, but never more than the size of the training set.
train_set_size = len(train_dataset)
configured_cap = data_args.max_train_samples
max_train_samples = train_set_size if configured_cap is None else configured_cap
metrics["train_samples"] = min(max_train_samples, train_set_size)

# Log and store the training metrics, then save the trainer state.
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()