In [1]:
import os
import torch

from datasets import Dataset
from data_processing import util
from model_utils.train import prepare_model_for_kbit_training
from peft import get_peft_model, LoraConfig
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

In [2]:
DATA_TYPE = "mbpt_0_top"
MODEL_MAX_LENGTH = 8192
CACHE_DIR = "/nlp/scr/neigbe/.cache"
MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"
model = ["llama3-8b-instruct", "llama3-70b-instruct"][0]
MODEL_PATH = util.get_model_path(model, DATA_TYPE)

In [3]:
os.environ["TOKENIZERS_PARALLELISM"] = "true"

In [4]:
train_df, valid_df, _ = util.get_data_splits(DATA_TYPE, .90, .5)

In [5]:
train_df.shape

(94, 4)

In [6]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_storage=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    use_cache=False,
    quantization_config=bnb_config,
    attn_implementation = "flash_attention_2",
    cache_dir=CACHE_DIR
)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [7]:
tkr = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    cache_dir=CACHE_DIR,
    model_max_length=MODEL_MAX_LENGTH,
    pad_token = "<|pad_id|>"
)

model.resize_token_embeddings(len(tkr))
model.config.pad_token_id = tkr.pad_token_id

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules="all-linear",
    bias= "none",
    task_type= "CAUSAL_LM",
    lora_dropout=0.05,
    inference_mode= False,
)

prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

In [9]:
final_train_df = util.convert_to_message_fmt(train_df, tkr)
final_valid_df = util.convert_to_message_fmt(valid_df, tkr)

train = Dataset.from_pandas(final_train_df, split="train").with_format("torch")
valid = Dataset.from_pandas(final_valid_df, split="valid").with_format("torch")

In [10]:
instruction_template="<|start_header_id|>user<|end_header_id|>\n\n"
response_template = "<|start_header_id|>assistant<|end_header_id|>\n\n"
collator = DataCollatorForCompletionOnlyLM(instruction_template=instruction_template, response_template=response_template, tokenizer=tkr)

In [11]:
batch_size = 1

training_args = TrainingArguments(
    output_dir=util.get_model_path("llama-3-8b-instruct", DATA_TYPE),
    evaluation_strategy="steps",
    logging_strategy="epoch",
    save_strategy="steps",
    eval_steps=5,
    num_train_epochs=3,
    save_total_limit = 5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    disable_tqdm=False,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    lr_scheduler_type="cosine",
    gradient_checkpointing_kwargs={"use_reentrant": True},
    load_best_model_at_end=True
)

In [12]:
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=valid,
    peft_config=lora_config,
    max_seq_length=MODEL_MAX_LENGTH,
    data_collator=collator,
    packing=False,
    dataset_text_field="text",
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/94 [00:00<?, ? examples/s]

Map:   0%|          | 0/89 [00:00<?, ? examples/s]

In [13]:
trainer.train()

trainer.save_model(MODEL_PATH)

Step,Training Loss,Validation Loss
5,No log,0.427156
10,No log,0.494469
15,No log,0.356485
20,No log,0.355603
25,0.589800,0.377654
30,0.589800,0.386952
35,0.589800,0.387686
40,0.589800,0.364336
45,0.589800,0.35366
50,0.325400,0.363482



` in the following instance: <|begin_of_text|><|begin_of_text|><|start_header_id|>user<|end_header_id|>

Read the scenes below and then categorize randal's personality as "I" for introverted or "E" for extroverted, according to the mbpt personality typology. Response with only one word.

scenes:

randal: Am I missing something here?
caitlin: I went back there, and Dante was already waiting for me.
randal: He was?
caitlin: It was so cool. He didn't say a word. He was just... ready, you know?  And we didn't kiss or talk or anything. He just sat there and let me do all the work.
randal: You dog! I didn't see you go back there.

caitlin: Randal Graves-scourge of the video renter.
randal: Ladies and gentleman, Mrs. Asian Design Major herself: Caitlin Bree!
caitlin: You saw that article? God, isn't it awful? My mother sent that in.
randal: I take it she likes the guy.
caitlin: You'd think she was marrying him. What are you watching?
randal: Children's programming. What did your mom say when

In [14]:
torch.cuda.empty_cache()