In [None]:
import os
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, pipeline
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datetime import datetime
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
from trl import setup_chat_format
from trl import SFTTrainer

In [None]:
# Filesystem path of the base model checkpoint; passed to the tokenizer and
# model `from_pretrained` calls in later cells.
base_model_id = "/home/namnt/md1/NAMNT_DA2/base_llm/T-VisStar-7B-v0.1"
# Directory given to TrainingArguments(output_dir=...) for training outputs.
output_dir = "/mnt/md1/mlflow/DATN/26_09_2024"

In [None]:
# Prompt template and helper that convert one QA row into a chat conversation.
# The template has a single placeholder, {context}, filled in per row.
system_message = """Là một chuyên gia đọc hiểu, hãy trả lời question dưới đây dựa vào context mà tôi cung cấp.
Câu trả lời ngắn gọn, chính xác. Nếu câu nào không có câu trả lời thì hãy trả về không tìm thấy thông tin.
Dưới đây là thông tin của context: {context}
"""


def create_conversation(row):
  """Build a system/user/assistant conversation from one dataset row.

  Args:
    row: Mapping that provides "context", "question" and "answer" entries.

  Returns:
    A dict with one key, "messages", holding three {"role", "content"} dicts:
    the system prompt with the row's context substituted in, the question as
    the user turn, and the answer as the assistant turn.

  Raises:
    KeyError: If the row lacks one of the three required entries.
  """
  return {
    "messages": [
      # The keyword must match the template's {context} placeholder; any other
      # keyword makes str.format raise KeyError.
      {"role": "system", "content": system_message.format(context=row["context"])},
      {"role": "user", "content": row["question"]},
      {"role": "assistant", "content": row["answer"]}
    ]
  }

In [None]:
# Load the train/test CSV files and attach a "messages" conversation to each row.

train_csv = '/point/namnt/DATN/genneration/data/qa_output.csv'
test_csv = '/point/namnt/DATN/genneration/data/data_test.csv'

# Read both splits into pandas frames.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_csv, test_csv))

# Wrap each frame in a `datasets.Dataset` and run the prompt template over
# every row, for the training split and the evaluation split alike.
train_dataset = Dataset.from_pandas(train_df).map(create_conversation)
test_dataset = Dataset.from_pandas(test_df).map(create_conversation)

In [None]:
# init tokenizer
# Maximum sequence length for the tokenizer. There is no `self` at notebook
# top level, so the limit is defined here; 3072 mirrors the max_seq_length
# passed to SFTTrainer in the training cell.
max_length = 3072

tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    model_max_length=max_length,
    padding_side="left",
    add_eos_token=True,
)
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token



In [None]:
# 4-bit NF4 quantization with double quantization; compute runs in bfloat16.
quantization_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    load_in_4bit=True,
)

# Load the quantized base model, letting `device_map="auto"` place the weights
# and requesting the FlashAttention-2 attention implementation.
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    attn_implementation="flash_attention_2",
    device_map="auto",
    quantization_config=quantization_config,
)

# Turn on gradient checkpointing, then prepare the quantized model for
# k-bit training with PEFT.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [None]:
# Apply TRL's default chat format (ChatML) to both the model and the tokenizer.
# Drop this cell when starting from a model that was already fine-tuned with
# its own chat template.
model, tokenizer = setup_chat_format(model=model, tokenizer=tokenizer)

In [None]:
# LoRA adapter settings: rank 16, alpha 64, 10% dropout, no bias training.
# Adapters are attached to the attention projections, the MLP projections and
# the output head.
peft_config = LoraConfig(
    r=16,
    lora_alpha=64,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=(
        ["q_proj", "k_proj", "v_proj", "o_proj"]
        + ["gate_proj", "up_proj", "down_proj"]
        + ["lm_head"]
    ),
)

# Wrap the base model with the adapters and report how many parameters train.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

In [None]:
# Training configuration for the QLoRA run.
# Each optimizer step covers per_device_train_batch_size (1) x
# gradient_accumulation_steps (4) = 4 sequences per device.
training_args = TrainingArguments(
    report_to="tensorboard",
    # %S is seconds (00-59); lowercase %s is a non-portable platform extension.
    run_name= f"Vistral-QLoRA-{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}",
    output_dir= output_dir,
    do_train = True,
    do_eval= True,
    per_device_train_batch_size= 1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    optim="paged_adamw_8bit",
    bf16=True,
    learning_rate=1e-5,
    # NOTE(review): the plain "constant" schedule does not appear to apply
    # warmup_steps; switch to "constant_with_warmup" if the 250 warmup steps
    # below are meant to take effect -- confirm against the transformers docs.
    lr_scheduler_type="constant",
    eval_strategy = "steps",
    save_strategy = "steps",
    max_steps=8000,
    # Save, log and evaluate on the same 250-step cadence.
    save_steps=250,
    logging_steps=250,
    eval_steps=250,
    warmup_steps=250,
    ddp_find_unused_parameters=False,
    )

# Supervised fine-tuning trainer over the conversational datasets.
trainer = SFTTrainer(
    model=model,
    # Pass the TrainingArguments built above; `args` is not defined in this notebook.
    args=training_args,
    train_dataset= train_dataset, 
    eval_dataset = test_dataset,
    max_seq_length= 3072,
    tokenizer=tokenizer,
    packing=True,
    dataset_kwargs={
        "add_special_tokens": False,  # We template with special tokens
        "append_concat_token": False, # No need to add additional separator token
    }
)

In [None]:
# Start training. Checkpoints are written to `output_dir` according to the
# save_strategy/save_steps configured in TrainingArguments; push_to_hub is not
# set there, so nothing is uploaded to the Hub.
trainer.train()