In [None]:
!pip install accelerate==0.21.0
!pip install peft==0.4.0
!pip install bitsandbytes==0.40.2
!pip install transformers==4.31.0
!pip install trl==0.4.7
# !pip install xformers
!pip install torch==2.0.1
!pip install neptune
!pip install scipy
!pip install sentencepiece

Collecting accelerate==0.21.0
  Downloading accelerate-0.21.0-py3-none-any.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.21.0
Collecting peft==0.4.0
  Downloading peft-0.4.0-py3-none-any.whl (72 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting transformers (from peft==0.4.0)
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
Collecting safetensors (from peft==0.4.0)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m38.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,

In [None]:
# Authenticate with the Hugging Face Hub; the CLI asks for an access token interactively,
# so no token is stored in the notebook source.
!huggingface-cli login



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) Y
Token is valid (permission: write).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the '

In [None]:
import os
from google.colab import drive

# Mount Google Drive at /content/gdrive; Config.output_dir points below this mount.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
from typing import Optional


class Config:
    """Static hyper-parameter container for the QLoRA fine-tuning run.

    The class object itself is used as the configuration (``config = Config``),
    so every field is a class attribute with a plain default value.
    """

    # Base model / tokenizer identifiers on the Hugging Face Hub.
    # model_name: Optional[str] = "naot97/medical"
    # tokenizer_name: Optional[str] = "naot97/medical"
    model_name: Optional[str] = "ura-hcmut/ura-llama-7b"
    tokenizer_name: Optional[str] = "ura-hcmut/ura-llama-7b"

    # bitsandbytes parameters
    use_4bit: bool = True
    bnb_4bit_compute_dtype: str = "bfloat16"
    bnb_4bit_quant_type: str = "nf4"
    use_nested_quant: bool = False

    # Torch dtype parameters.
    # These three fields used to be declared twice with conflicting values; the
    # later (all-False) declaration always won, so only those values are kept.
    fp16: Optional[bool] = False
    bf16: Optional[bool] = False
    tf32: Optional[bool] = False

    # LoRa parameters
    use_lora: Optional[bool] = True
    lora_r: Optional[int] = 32
    lora_alpha: Optional[int] = 16
    lora_dropout: Optional[float] = 0.1

    # TrainingArguments parameters
    output_dir: str = "/content/gdrive/MyDrive/kalapa/results"
    new_model: str = "kalapa"
    num_train_epochs: Optional[int] = 100

    auto_find_batch_size: Optional[bool] = True

    per_device_train_batch_size: Optional[int] = 16
    per_device_eval_batch_size: Optional[int] = 1

    gradient_accumulation_steps: Optional[int] = 8

    gradient_checkpointing: Optional[bool] = True
    max_grad_norm: Optional[float] = 0.3

    learning_rate: Optional[float] = 1e-5

    # The value is a float; the annotation previously said int.
    weight_decay: Optional[float] = 0.001

    optim: Optional[str] = "adamw_bnb_8bit"
    lr_scheduler_type: str = "constant"
    max_steps: int = -1

    warmup_ratio: float = 0.03
    group_by_length: bool = True
    save_steps: int = 1
    logging_steps: int = 1
    resume_from_checkpoint: bool = False

    # SFT parameters
    max_seq_length: Optional[int] = 1024
    # No trailing comma here: `False,` would create the truthy tuple (False,).
    packing: Optional[bool] = False


# The class itself (not an instance) serves as the shared configuration object.
config = Config

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModelForCausalLM, AutoPeftModelForCausalLM

def get_model(config):
    """Load the quantized causal language model and its tokenizer.

    Args:
        config: Object exposing ``model_name``, ``tokenizer_name``, ``use_4bit``,
            ``bnb_4bit_compute_dtype``, ``bnb_4bit_quant_type`` and
            ``use_nested_quant``. Its ``fp16``/``bf16``/``tf32`` attributes are
            overwritten in the one case described below.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # Resolve the dtype name (e.g. "bfloat16") to the torch dtype object.
    quant_dtype = getattr(torch, config.bnb_4bit_compute_dtype)

    # This capability probe only runs for 4-bit loading with a float16 compute
    # dtype; with any other compute dtype the precision flags are left untouched.
    if config.use_4bit and quant_dtype == torch.float16:
        capability = torch.cuda.get_device_capability()
        if capability[0] >= 8:
            banner = "=" * 80
            print(banner)
            print("Your GPU supports bfloat16: accelerate training with --bf16")
            print(banner)

            # Switch the training precision flags on the passed-in config.
            config.fp16, config.bf16, config.tf32 = False, True, True

    quantization = BitsAndBytesConfig(
        load_in_4bit=config.use_4bit,
        bnb_4bit_quant_type=config.bnb_4bit_quant_type,
        bnb_4bit_compute_dtype=quant_dtype,
        bnb_4bit_use_double_quant=config.use_nested_quant,
    )

    # Base model load. (An AutoPeftModelForCausalLM.from_pretrained(...,
    # is_trainable=True) variant was tried here previously and is disabled.)
    model = AutoModelForCausalLM.from_pretrained(
        config.model_name,
        device_map="auto",
        quantization_config=quantization,
    )
    model.config.use_cache = False
    model.config.pretraining_tp = 1

    # Tokenizer: reuse EOS as the padding token and pad on the right
    # (the right-side padding works around an overflow issue with fp16 training).
    tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    return model, tokenizer

In [None]:
# Load the quantized base model and its tokenizer using the settings in `config`.
model, tokenizer = get_model(config)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
from datasets import load_dataset
def get_dataset(mode):
    """Return the training split selected by ``mode``.

    ``'pretraining'`` reads the local CSV at ``/content/kalapa.csv``; any other
    value loads the first 100 rows of the ``vietgpt/wikipedia_vi`` train split.
    """
    if mode != 'pretraining':
        return load_dataset('vietgpt/wikipedia_vi', split='train[:100]', num_proc=8)
    return load_dataset('csv', data_files="/content/kalapa.csv", split='train', num_proc=8)

# Use the local CSV corpus ('pretraining' mode) as the training dataset.
dataset = get_dataset('pretraining')

In [None]:
# Peek at one record to check the available columns (the trainer reads the 'text' field).
dataset[1]

{'Unnamed: 0': 1,
 'id': 1,
 'name': 'alzheimer',
 'text': 'bệnh alzheimer là gì?\n alzheimer là một căn bệnh gây ra tình trạng mất trí nhớ, mất các chức năng nhận thức, làm ảnh hưởng nhiều đến chất lượng sống và làm việc của người bệnh. tuy nhiên đây không phải là sự lão hóa bình thường, vì vậy đừng nhầm lẫn alzheimer với hiện tượng suy giảm trí nhớ thông thường ở người già. (1)\n có một ngày bạn bỗng thấy ông, bà, cha, mẹ,&#8230; càng có tuổi sẽ càng trở nên khó tính, dễ nổi cáu, hay hờn dỗi&#8230; điều đó có thể xuất phát từ tính cách trước nay vẫn vậy, nhưng cũng rất có thể họ đang bị hội chứng alzheimer âm thầm tấn công…\n'}

In [None]:
# Confirm the kind of object returned by load_dataset.
type(dataset)

datasets.arrow_dataset.Dataset

In [None]:
import os
import neptune
import torch
import transformers
from peft import AutoPeftModelForCausalLM, LoraConfig, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
)
from trl import SFTTrainer


# Build the LoRA adapter configuration and wrap the base model with it.
# When LoRA is disabled the model is left unwrapped: get_peft_model needs a real
# config object, so it must not be called with peft_config=None.
if config.use_lora:
    peft_config = LoraConfig(
        lora_alpha=config.lora_alpha,
        lora_dropout=config.lora_dropout,
        r=config.lora_r,
        # target_modules=["query_key_value","dense","dense_h_to_4h","dense_4h_to_h"],
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, peft_config)
    # Report how many parameters the adapter leaves trainable.
    model.print_trainable_parameters()
else:
    peft_config = None

trainable params: 16,777,216 || all params: 3,517,190,144 || trainable%: 0.477006226934315


In [None]:
# Set training parameters
training_arguments = TrainingArguments(
    # Output / checkpointing
    output_dir=config.output_dir,
    # NOTE(review): checkpoints are written every 10 steps here; Config.save_steps
    # is not consulted. Confirm which value is intended before wiring it through.
    save_steps=10,
    save_total_limit=10,
    load_best_model_at_end=False,
    push_to_hub=False,
    # Schedule and batch sizing
    num_train_epochs=config.num_train_epochs,
    max_steps=config.max_steps,
    auto_find_batch_size=config.auto_find_batch_size,
    per_device_train_batch_size=config.per_device_train_batch_size,
    gradient_accumulation_steps=config.gradient_accumulation_steps,
    # Read from the config so that changing Config.group_by_length takes effect
    # (it was previously a hard-coded literal with the same value).
    group_by_length=config.group_by_length,
    dataloader_num_workers=2,
    # Optimization
    learning_rate=config.learning_rate,
    weight_decay=config.weight_decay,
    max_grad_norm=config.max_grad_norm,
    warmup_ratio=config.warmup_ratio,
    lr_scheduler_type=config.lr_scheduler_type,
    # Precision
    fp16=config.fp16,
    bf16=config.bf16,
    tf32=config.tf32,
    # Logging: built-in reporters are off; Neptune is attached below as a callback.
    logging_steps=config.logging_steps,
    report_to="none",
)

# Attach Neptune monitoring only when a non-empty API token is present in the
# environment; without one, training runs with no extra callbacks.
callbacks = []
if os.environ.get("NEPTUNE_API_TOKEN"):
    neptune_api_token = os.environ["NEPTUNE_API_TOKEN"]
    run = neptune.init_run(
        project=os.environ["NEPTUNE_PROJECT"], api_token=neptune_api_token
    )
    neptune_monitor = transformers.integrations.NeptuneCallback(
        run=run, log_parameters=False
    )
    callbacks.append(neptune_monitor)

In [None]:
from transformers.trainer_pt_utils import get_parameter_names
import bitsandbytes as bnb
from torch import nn

# Names of the parameters that receive weight decay: everything except the
# parameters of nn.LayerNorm modules and any parameter whose name contains "bias".
decay_parameters = get_parameter_names(model, [nn.LayerNorm])
decay_parameters = [name for name in decay_parameters if "bias" not in name]
# Set copy for constant-time membership tests while splitting the parameters.
_decay_names = set(decay_parameters)

# Two groups: decayed parameters and non-decayed parameters. The grouping is
# unchanged, so optimizer state saved in existing checkpoints still matches.
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if n in _decay_names],
        "weight_decay": training_arguments.weight_decay,
    },
    {
        "params": [p for n, p in model.named_parameters() if n not in _decay_names],
        "weight_decay": 0.0,
    },
]

# Adam hyper-parameters are taken from the TrainingArguments so that the custom
# optimizer stays in sync with the rest of the training setup.
optimizer_kwargs = {
    "betas": (training_arguments.adam_beta1, training_arguments.adam_beta2),
    "eps": training_arguments.adam_epsilon,
    "lr": training_arguments.learning_rate,
}
# 8-bit Adam from bitsandbytes, built from the kwargs above instead of repeating them.
adam_bnb_optim = bnb.optim.Adam8bit(
    optimizer_grouped_parameters,
    **optimizer_kwargs,
)

In [None]:

# Local import: only this cell needs the checkpoint lookup helper.
from transformers.trainer_utils import get_last_checkpoint

# Set supervised fine-tuning parameters
# NOTE(review): max_seq_length and packing are fixed here (512 / False) and do not
# read Config.max_seq_length / Config.packing — confirm which values are intended.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=512,
    # Custom 8-bit Adam optimizer; None lets the Trainer create the LR scheduler.
    optimizers=(adam_bnb_optim, None),
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
    callbacks=callbacks,
)

# Train model.
# Resume from the newest checkpoint in the output directory when one exists and
# start from scratch otherwise. Passing resume_from_checkpoint=True unconditionally
# fails on a fresh output directory that has no checkpoint yet.
last_checkpoint = (
    get_last_checkpoint(config.output_dir)
    if os.path.isdir(config.output_dir)
    else None
)
trainer.train(resume_from_checkpoint=last_checkpoint)

# Save trained model (adapter weights) under the name given by config.new_model
trainer.model.save_pretrained(config.new_model)

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using t

Step,Training Loss
1,1.1869
2,1.1341
3,1.05
4,1.1146
5,1.1021
6,1.081
7,1.3247
8,1.317
9,1.1557
10,1.1719


Step,Training Loss
1,1.1869
2,1.1341
3,1.05
4,1.1146
5,1.1021
6,1.081
7,1.3247
8,1.317
9,1.1557
10,1.1719


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


KeyboardInterrupt: ignored

In [None]:
# Free memory for merging weights
# del model
# torch.cuda.empty_cache()

# model = AutoPeftModelForCausalLM.from_pretrained(
#     '/content/gdrive/MyDrive/kalapa/results/checkpoint-250', device_map="auto", torch_dtype=torch.bfloat16
# )
# model = model.merge_and_unload()



In [None]:
# model.push_to_hub('naot97/medical', use_temp_dir=True)
# tokenizer.push_to_hub('naot97/medical', use_temp_dir=True)

In [None]:
# Move the local `medical` directory into the Drive project folder.
# NOTE(review): the training cell saves the adapter under config.new_model ("kalapa"),
# not "medical" — confirm this is the directory that should be moved.
!mv medical /content/gdrive/MyDrive/kalapa