In [2]:
import evaluate
import numpy as np
import os
import torch

from datasets import Dataset
from data_processing import util
from model_utils.evaluate import compute_metrics
from model_utils.train import prepare_model_for_kbit_training
from peft import get_peft_model, LoraConfig
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BitsAndBytesConfig, EarlyStoppingCallback, TrainingArguments, Trainer

In [3]:
# Root of the project workspace; raises KeyError when WORKSPACE_PATH is not exported.
PWD = os.environ["WORKSPACE_PATH"]
# Dataset variant identifier, forwarded to the project's `util` helpers.
DATA_TYPE = "mbpt_0_top"
# Hugging Face download cache. NOTE(review): machine-specific absolute path.
CACHE_DIR = "/nlp/scr/neigbe/.cache"

# Short project key -> Hugging Face Hub repo id. Deriving both the checkpoint name and
# the output path from one key guarantees that the weights that get loaded always match
# the directory the fine-tuned model is saved under.
HF_MODEL_IDS = {
    "llama3-8b-instruct": "meta-llama/Meta-Llama-3-8B-Instruct",
    "llama3-70b-instruct": "meta-llama/Meta-Llama-3-70B-Instruct",
}
# Deliberately not called `model`: that name is bound to the loaded network further down.
MODEL_KEY = "llama3-8b-instruct"
MODEL_NAME = HF_MODEL_IDS[MODEL_KEY]
MODEL_PATH = util.get_model_path(MODEL_KEY, DATA_TYPE)

In [4]:
# Explicitly opt in to multi-threaded tokenization in the `tokenizers` backend.
os.environ.update(TOKENIZERS_PARALLELISM="true")

In [5]:
# Split the data for this DATA_TYPE; the third element of the result is not used in this notebook.
train_df, valid_df, _ = util.get_data_splits(
    DATA_TYPE,
    test_size=0.25,
    valid_size=0.2,
)

# The return value of encode_labels is discarded, so it presumably edits the frame
# in place — verify against data_processing.util.
for split_df in (train_df, valid_df):
    util.encode_labels(split_df, DATA_TYPE)

In [6]:
# Seed torch's RNGs before any weights are created: the classification head
# (`score.weight`) is newly initialised at load time, so without a seed every kernel
# restart would begin fine-tuning from a different head.
# 42 mirrors the default `seed` of `TrainingArguments`.
torch.manual_seed(42)

# 4-bit NF4 quantisation with double quantisation; compute and storage dtype are bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_storage=torch.bfloat16,
)

# Binary sequence classifier on top of the quantised base checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    num_labels=2,
    use_cache=False,  # the KV cache is switched off for training
    quantization_config=bnb_config,
    attn_implementation="flash_attention_2",
    cache_dir=CACHE_DIR,
)


`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Meta-Llama-3-8B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Tokenizer for the same checkpoint, with model_max_length fixed at 8192.
tkr = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    model_max_length=8192,
    cache_dir=CACHE_DIR,
)
# Reuse the end-of-sequence id as the padding id.
tkr.pad_token_id = tkr.eos_token_id

# Keep the model's embedding table and padding id in sync with the tokenizer.
vocab_size = len(tkr)
model.resize_token_embeddings(vocab_size)
model.config.pad_token_id = tkr.pad_token_id

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
# LoRA adapter settings for the sequence-classification task.
lora_settings = dict(
    task_type="SEQ_CLS",
    target_modules="all-linear",
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    inference_mode=False,
)
peft_config = LoraConfig(**lora_settings)

# Project helper from model_utils.train; its return value is unused here, so it
# presumably prepares `model` in place — verify.
prepare_model_for_kbit_training(model)
# Wrap the prepared model with the LoRA adapters.
model = get_peft_model(model, peft_config)

In [9]:
# Convert each pandas split into a Dataset, then switch its output format to torch.
train = Dataset.from_pandas(train_df, split="train")
train = train.with_format("torch")

valid = Dataset.from_pandas(valid_df, split="valid")
valid = valid.with_format("torch")

In [10]:
def tokenize(data):
    """Tokenize the ``text`` field of a batch, truncating and padding to the tokenizer's maximum length."""
    # NOTE(review): padding="max_length" pads every example to the tokenizer's
    # model_max_length (set to 8192 where `tkr` is created); dynamic padding would be
    # cheaper but needs a padding-aware data collator in the Trainer.
    return tkr(data["text"], padding="max_length", truncation=True, return_tensors="pt")


train_tk = train.map(tokenize, batched=True)
valid_tk = valid.map(tokenize, batched=True)

Map:   0%|          | 0/1146 [00:00<?, ? examples/s]

Map:   0%|          | 0/279 [00:00<?, ? examples/s]

In [12]:
# Per-device batch size, shared by training and evaluation.
batch_size = 8

training_args = TrainingArguments(
    output_dir=MODEL_PATH,
    # Schedule: evaluate, log and checkpoint once per epoch.
    num_train_epochs=3,
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=5,
    # Model selection: reload the checkpoint with the best "f1" when training ends.
    metric_for_best_model="f1",
    load_best_model_at_end=True,
    # Batching: 8 per device x 4 accumulation steps per optimizer update.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=4,
    # Memory and precision.
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": True},
    bf16=True,
    optim="paged_adamw_8bit",
    # torch_compile is intentionally left disabled.
    disable_tqdm=False,
)

In [13]:
# Stop once the tracked metric fails to improve by at least 0.005 for 3 evaluations.
# NOTE(review): the training arguments request 3 epochs with one evaluation per epoch,
# so a patience of 3 can presumably never be exhausted — confirm whether more epochs
# or a smaller patience was intended.
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.005,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tk,
    eval_dataset=valid_tk,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

In [None]:
# Run fine-tuning, keeping the returned training summary for later inspection.
train_output = trainer.train()

# Write the final model to the run's output directory.
trainer.save_model(output_dir=MODEL_PATH)

: 