In [1]:
import os
import pandas as pd
from datetime import datetime
from datasets import Dataset
import torch
from transformers import AutoTokenizer, pipeline, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import setup_chat_format,SFTTrainer


  from .autonotebook import tqdm as notebook_tqdm


[2024-10-07 09:45:43,804] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/point/anaconda/envs/datn/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/point/anaconda/envs/datn/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::runtime_error::~runtime_error()@GLIBCXX_3.4'
/point/anaconda/envs/datn/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__gxx_personality_v0@CXXABI_1.3'
/point/anaconda/envs/datn/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::tellp()@GLIBCXX_3.4'
/point/anaconda/envs/datn/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::chrono::_V2::steady_clock::now()@GLIBCXX_3.4.19'
/point/anaconda/envs/datn/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_replace_aux(unsigned long, unsigned long, unsigned long, char)@GLIBCXX_3.4'
/point/anaconda/envs/datn/compiler_compat/ld: /usr/local/cuda/lib64/libc

In [2]:
base_model_id = "/home/namnt/md1/NAMNT_DA2/base_llm/T-VisStar-7B-v0.1"
output_dir = "/mnt/md1/mlflow/DATN/07_10_2024"

In [3]:
# define function for convert dataset to type of conversation
system_message = """Là một chuyên gia đọc hiểu, hãy trả lời question dưới đây dựa vào context mà tôi cung cấp.
Câu trả lời ngắn gọn, chính xác. Nếu câu nào không có câu trả lời thì hãy trả về không tìm thấy thông tin.
Dưới đây là thông tin của context: {context}
"""
 
def create_conversation(row):
  return {
    "messages": [
      {"role": "system", "content": system_message.format(context=row["context"])},
      {"role": "user", "content": row["question"]},
      {"role": "assistant", "content": row["answer"]}
    ]
  }

In [4]:
# apply create_conversation

train_csv = '/point/namnt/DATN/genneration/data/qa_output.csv'
test_csv = '/point/namnt/DATN/genneration/data/data_test.csv'

train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Apply the prompt template to the train dataset
train_dataset = train_dataset.map(create_conversation)
test_dataset = test_dataset.map(create_conversation)

Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 1166/1166 [00:00<00:00, 6019.35 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 63/63 [00:00<00:00, 4734.57 examples/s]


In [5]:
# init tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
tokenizer.padding_side = 'right'



In [20]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    base_model_id, 
    quantization_config=quantization_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2"
)

# Enable gradient checkpointing for efficiency
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 17/17 [00:21<00:00,  1.29s/it]


In [21]:
# set chat template to OAI chatML, remove if you start from a fine-tuned model
model, tokenizer = setup_chat_format(model, tokenizer)

In [22]:
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=64,
    lora_dropout=0.1,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", 
        "up_proj", "down_proj", "lm_head"
    ],
    bias="none",
)
# Load peft_config
# model = get_peft_model(model, peft_config)
# model.print_trainable_parameters()

In [23]:
import torch

# Check if GPU is available
if torch.cuda.is_available():
    print(f"GPU is available: {torch.cuda.get_device_name(0)}")

GPU is available: NVIDIA GeForce RTX 3090 Ti


In [32]:
args = TrainingArguments(
    output_dir=output_dir, #  directory to save and repository id
    num_train_epochs=5,                     # number of training epochs
    per_device_train_batch_size=2,          # batch size per device during training
    gradient_accumulation_steps=2,          # number of steps before performing a backward/update pass           # use gradient checkpointing to save memory
    optim="adamw_torch_fused",              # use fused adamw optimizer
    logging_steps= 20,                       # log every 10 steps
    save_strategy="epoch",                  # save checkpoint every epoch
    learning_rate=2e-5,                     # learning rate, based on QLoRA paper
    bf16=True,                              # use bfloat16 precision
    tf32=True,                              # use tf32 precision
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,                      # warmup ratio based on QLoRA paper
    lr_scheduler_type="constant",           # use constant learning rate scheduler
    push_to_hub= False,                       # push model to hub
    report_to="tensorboard",                # report metrics to tensorboard
)

In [33]:
# training_args = TrainingArguments(
#     report_to="tensorboard",
#     run_name= f"Vistral-QLoRA-{datetime.now().strftime('%Y-%m-%d-%H-%M-%s')}",
#     output_dir= output_dir,
#     do_train = True,
#     do_eval= True,
#     per_device_train_batch_size= 1,
#     per_device_eval_batch_size=1,
#     gradient_accumulation_steps=4,
#     gradient_checkpointing=True,
#     optim="paged_adamw_8bit",
#     bf16=True,
#     learning_rate=1e-5,
#     lr_scheduler_type="constant",
#     eval_strategy = "steps",
#     save_strategy = "steps",
#     max_steps=8000,
#     save_steps=250,
#     logging_steps=250,
#     eval_steps=250,
#     warmup_steps=250,
#     ddp_find_unused_parameters=False,
#     )


In [34]:
import torch
print(torch.__version__)


2.4.0+cu121


In [35]:

trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset= train_dataset, 
    eval_dataset = test_dataset,
    peft_config=peft_config,
    max_seq_length= 3072,
    tokenizer=tokenizer,
    packing=True,
    dataset_kwargs={
        "add_special_tokens": False,  # We template with special tokens
        "append_concat_token": False, # No need to add additional separator token
    }
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
Generating train split: 168 examples [00:01, 152.03 examples/s]
Generating train split: 8 examples [00:00, 123.30 examples/s]


In [36]:
# start training, the model will be automatically saved to the hub and the output directory
trainer.train()

  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
20,1.3213
40,0.8896
60,0.6687
80,0.4857
100,0.3502
120,0.2785
140,0.2067
160,0.1816
180,0.1374
200,0.1285


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


TrainOutput(global_step=210, training_loss=0.4485364976383391, metrics={'train_runtime': 3352.5393, 'train_samples_per_second': 0.251, 'train_steps_per_second': 0.063, 'total_flos': 1.111576425136128e+17, 'train_loss': 0.4485364976383391, 'epoch': 5.0})