In [None]:
# Mount Google Drive at /content/drive so the dataset stored under MyDrive is reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install datasets transformers peft fire

Collecting datasets
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.32.1-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.5.0-py3-none-any.whl (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.6/85.6 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fire
  Downloading fire-0.5.0.tar.gz (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.3/88.3 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m

In [None]:
# Default the custom CUDA_DEVICE variable to "0" when it is unset or empty;
# it is read back later in this notebook to build the model's device_map.
import os; os.environ["CUDA_DEVICE"] = os.environ.get("CUDA_DEVICE") or "0"

import sys
import fire
import torch
import transformers
from datasets import load_dataset

from peft import (
    TaskType,
    LoraConfig,
    PrefixTuningConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_int8_training,
    set_peft_model_state_dict,
)
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# NOTE(review): this is the same file that `data_path` points to; nothing else visible in this
# notebook reads `dir_data` — confirm it is unused before removing it.
dir_data = ["/content/drive/MyDrive/Colab Notebooks/data/vi_alpaca_reduced.jsonl"]

In [None]:
# Run configuration. Every later cell reads these module-level names directly.
data_path: str = "/content/drive/MyDrive/Colab Notebooks/data/vi_alpaca_reduced.jsonl"
base_model: str = "VietAI/gpt-neo-1.3B-vietnamese-news"
output_dir: str = "./chat-gpt-neo-1.3B"
# Alternative (larger) base model, kept for reference:
# base_model: str = "VietAI/gpt-j-6B-vietnamese-news",
# output_dir: str = "./chat-gpt-j-6B-1e",

# training hyperparams
# batch_size is the effective batch; it is divided by micro_batch_size to get the
# number of gradient-accumulation steps.
batch_size: int = 128
micro_batch_size: int = 2
num_epochs: int = 1
learning_rate: float = 3e-4
# cutoff_len is the max_length handed to the tokenizer (longer prompts are truncated).
cutoff_len: int = 256
# val_set_size > 0 enables a held-out validation split; 0 trains on everything.
val_set_size: int = 0

## Select finetune method
finetune_method: str = "lora" # one of: "lora", "prefix"

# prefix tuning hyperparams
# Reference: https://github.com/huggingface/peft/blob/main/examples/causal_language_modeling/peft_prefix_tuning_clm.ipynb
num_virtual_tokens: int = 32

# lora hyperparams
lora_r: int = 6
lora_alpha: int = 12
lora_dropout: float = 0.05
lora_target_modules: str = "q_proj k_proj v_proj" # space-separated module names; split into a list for LoraConfig

# llm hyperparams
bf16: bool = False # whether to use bf16 (preferred on A100's).
load_in_8bit: bool = True # 8-bit lowers VRAM usage but makes training several times slower
# NOTE(review): the from_pretrained call later in this notebook has its `load_in_8bit`
# argument commented out, so during training this flag only forces bf16 off — confirm intent.
group_by_length: bool = False  # faster, but produces an odd training loss curve
resume_from_checkpoint: str = None  # either training checkpoint or final adapter

# Print the shared parameters
print("\nFINE-TUNE METHOD:", finetune_method)
print(
    f"Mô hình được finetune và các tham số chung:\n"
    f"base_model: {base_model}\n"
    f"data_path: {data_path}\n"
    f"output_dir: {output_dir}\n"
    f"batch_size: {batch_size}\n"
    f"micro_batch_size: {micro_batch_size}\n"
    f"num_epochs: {num_epochs}\n"
    f"learning_rate: {learning_rate}\n"
    f"cutoff_len: {cutoff_len}\n"
    f"val_set_size: {val_set_size}\n"
    f"group_by_length: {group_by_length}\n"
    f"resume_from_checkpoint: {resume_from_checkpoint}\n"
)


FINE-TUNE METHOD: lora
Mô hình được finetune và các tham số chung:
base_model: VietAI/gpt-neo-1.3B-vietnamese-news
data_path: /content/drive/MyDrive/Colab Notebooks/data/vi_alpaca_reduced.jsonl
output_dir: ./chat-gpt-neo-1.3B
batch_size: 128
micro_batch_size: 2
num_epochs: 1
learning_rate: 0.0003
cutoff_len: 256
val_set_size: 0
group_by_length: False
resume_from_checkpoint: None



In [None]:
# Build the PEFT configuration object (`config`) for the selected fine-tuning method.
if finetune_method == "lora":
  config = LoraConfig(
      r=lora_r,
      lora_alpha=lora_alpha,
      target_modules=lora_target_modules.split(), # split the space-separated string into a list
      lora_dropout=lora_dropout,
      bias="none",
      task_type=TaskType.CAUSAL_LM,
  )
  print(
      f"Training LoRA model with params:\n"
      f"lora_r: {lora_r}\n"
      f"lora_alpha: {lora_alpha}\n"
      f"lora_dropout: {lora_dropout}\n"
      f"lora_target_modules: {lora_target_modules}\n"
  )
elif finetune_method == "prefix":
  config = PrefixTuningConfig(
      task_type=TaskType.CAUSAL_LM,
      num_virtual_tokens=num_virtual_tokens
  )
  print(
      f"Training Prefix-tuning model with params:\n"
      f"num_virtual_tokens: {num_virtual_tokens}\n"
  )
else:
  # Fail here with a clear message; otherwise `config` stays undefined and the
  # notebook only breaks later with a NameError when the model is wrapped.
  raise ValueError(
      f"Unsupported finetune_method: {finetune_method!r} (expected 'lora' or 'prefix')"
  )

assert (
    base_model
), "Please specify a --base_model, e.g. --base_model='VietAI/gpt-j-6B-vietnamese-news'"


Training LoRA model with params:
lora_r: 6
lora_alpha: 12
lora_dropout: 0.05
lora_target_modules: q_proj k_proj v_proj



In [None]:
# Derive the accumulation schedule and device placement, then load the base model and tokenizer.
gradient_accumulation_steps = batch_size // micro_batch_size
if load_in_8bit: bf16 = False # 8-bit loading is paired with fp16, so bf16 is switched off
device_map = {"": int(os.environ.get("CUDA_DEVICE") or 0)}
world_size = int(os.environ.get("WORLD_SIZE", 1))
ddp = world_size != 1

if ddp: # multi-GPU training
  device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}
  gradient_accumulation_steps = gradient_accumulation_steps // world_size

# The integer divisions above floor to 0 when batch_size < micro_batch_size * world_size;
# clamp so the trainer always receives at least one accumulation step.
gradient_accumulation_steps = max(1, gradient_accumulation_steps)

model = AutoModelForCausalLM.from_pretrained(
    base_model,
   # load_in_8bit=load_in_8bit,
    torch_dtype=torch.float16,
    device_map=device_map,
)

if finetune_method == "lora":
  # Show the module tree so LoRA target module names can be picked from it.
  # `print(model)` prints the tree directly; `model.state_dict` (without a call)
  # is a bound method whose repr merely happens to embed that tree.
  print(model)

tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.pad_token_id = 0 # unk. we want this to be different from the eos token
tokenizer.padding_side = "left"  # Allow batched inference

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/5.44G [00:00<?, ?B/s]

<bound method Module.state_dict of GPTNeoForCausalLM(
  (transformer): GPTNeoModel(
    (wte): Embedding(60000, 2048)
    (wpe): Embedding(2048, 2048)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPTNeoBlock(
        (ln_1): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (attn): GPTNeoAttention(
          (attention): GPTNeoSelfAttention(
            (attn_dropout): Dropout(p=0.0, inplace=False)
            (resid_dropout): Dropout(p=0.0, inplace=False)
            (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
            (v_proj): Linear(in_features=2048, out_features=2048, bias=False)
            (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
            (out_proj): Linear(in_features=2048, out_features=2048, bias=True)
          )
        )
        (ln_2): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (mlp): GPTNeoMLP(
          (c_fc): Linear(in_features=2048, out_features=8

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.50M [00:00<?, ?B/s]

In [None]:
def tokenize(prompt, add_eos_token=True):
  """Tokenize `prompt` into a causal-LM training example.

  Uses the notebook-level `tokenizer` and truncates to `cutoff_len` tokens.

  Args:
    prompt: Text to encode.
    add_eos_token: When true, append the tokenizer's EOS id if the encoding does
      not already end with it and there is still room below `cutoff_len`.

  Returns:
    The tokenizer output with an extra "labels" entry holding a copy of
    "input_ids".
  """
  result = tokenizer(
    prompt,
    truncation=True,
    max_length=cutoff_len,
    padding=False,
    return_tensors=None,
  )
  input_ids = result["input_ids"]
  # An empty encoding has no last token to inspect; treat it as "EOS missing"
  # instead of indexing into an empty list.
  missing_eos = not input_ids or input_ids[-1] != tokenizer.eos_token_id
  if add_eos_token and missing_eos and len(input_ids) < cutoff_len:
    input_ids.append(tokenizer.eos_token_id)
    result["attention_mask"].append(1)

  # Labels are an independent copy so later edits to one list do not affect the other.
  result["labels"] = input_ids.copy()

  return result

In [None]:
def make_prompt(instruction):
  """Wrap `instruction` in the instruction/response template.

  The returned text ends right after the "### Response:" marker, so the answer
  is whatever follows it.
  """
  header = "Hãy viết một phản hồi thích hợp cho chỉ dẫn dưới đây."
  sections = [
    header,
    "",
    "### Instruction:",
    f"{instruction}",
    "",
    "### Response:",
  ]
  return "\n".join(sections)

In [None]:
def generate_prompt(data_point):
  """Build the full training text for one record.

  Reads the "prompt" and "response" fields, strips surrounding whitespace from
  both, and places the response on the line after the templated prompt.
  """
  instruction, reply = (data_point[field].strip() for field in ("prompt", "response"))
  return "\n".join((make_prompt(instruction), reply))

In [None]:
def generate_and_tokenize_prompt(data_point):
  """Render one record to its full prompt text and return the tokenized result."""
  return tokenize(generate_prompt(data_point))

# Run peft's int8-training preparation on the loaded model, then wrap it with the
# adapter described by `config`.
# NOTE(review): both lines rebind `model` from its previous value, so re-running this
# cell operates on an already-wrapped model — restart the kernel before re-running.
model = prepare_model_for_int8_training(model)
model = get_peft_model(model, config)



In [None]:
def print_number_of_trainable_model_parameters(model):
    """Summarise trainable vs. total parameter counts of `model`.

    Despite the name, nothing is printed: the three-line report is returned as a
    string. A parameter counts as trainable when its `requires_grad` flag is set.
    """
    sizes = [(tensor.numel(), tensor.requires_grad) for _, tensor in model.named_parameters()]
    total = sum(count for count, _ in sizes)
    trainable = sum(count for count, needs_grad in sizes if needs_grad)
    share = 100 * trainable / total
    return (
        f"trainable model parameters: {trainable}\n"
        f"all model parameters: {total}\n"
        f"percentage of trainable model parameters: {share:.2f}%"
    )

# Show trainable vs. total parameter counts for the current `model`.
# NOTE(review): the saved output reports 0 trainable parameters, which suggests it was
# captured while `model` was in a frozen state — re-run on a fresh kernel to confirm.
print(print_number_of_trainable_model_parameters(model))

trainable model parameters: 0
all model parameters: 1337298944
percentage of trainable model parameters: 0.00%


In [None]:
# Local JSON / JSON-Lines files go through the "json" loader; anything else is
# treated as a dataset name or path that `load_dataset` resolves itself.
if data_path.endswith((".json", ".jsonl")):
  data = load_dataset("json", data_files=data_path)
else:
  data = load_dataset(data_path)

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
if resume_from_checkpoint:
  # Two weight files may exist in the checkpoint directory. They are named differently
  # depending on how they were saved, but hold the same kind of content.
  full_ckpt = os.path.join(resume_from_checkpoint, "pytorch_model.bin")      # full training checkpoint
  adapter_ckpt = os.path.join(resume_from_checkpoint, "adapter_model.bin")  # adapter only - the PEFT config above has to fit
  if os.path.exists(full_ckpt):
    checkpoint_name = full_ckpt
  else:
    checkpoint_name = adapter_ckpt
    # With no full checkpoint there is no trainer state to restore, so stop the
    # trainer from trying to load one.
    resume_from_checkpoint = False

  if not os.path.exists(checkpoint_name):
    print(f"Checkpoint {checkpoint_name} not found")
  else:
    print(f"Restarting from {checkpoint_name}")
    adapters_weights = torch.load(checkpoint_name)
    set_peft_model_state_dict(model, adapters_weights)

In [None]:
# Start from the full training split; carve out a validation set only when requested.
val_data = None
train_data = data["train"]
if val_set_size > 0:
  train_val = train_data.train_test_split(test_size=val_set_size, shuffle=True, seed=42)
  train_data = train_val["train"]

# Shuffle and tokenize the training portion first, then the validation portion (if any).
train_data = train_data.shuffle().map(generate_and_tokenize_prompt)
if val_set_size > 0:
  val_data = train_val["test"].shuffle().map(generate_and_tokenize_prompt)

Map:   0%|          | 0/51092 [00:00<?, ? examples/s]

In [None]:
training_args = transformers.TrainingArguments(
  # --- output / checkpointing ---
  output_dir=output_dir,
  save_strategy="steps",
  save_steps=200,
  save_total_limit=3,
  # --- optimisation ---
  num_train_epochs=num_epochs,
  per_device_train_batch_size=micro_batch_size,
  gradient_accumulation_steps=gradient_accumulation_steps,
  learning_rate=learning_rate,
  warmup_steps=100,
  optim="adamw_torch",
  group_by_length=group_by_length,
  # --- precision: exactly one of fp16 / bf16 is enabled ---
  fp16=not bf16, # suits older GPUs and 8-bit training
  bf16=bf16, # suits newer GPUs when 8-bit is not used
  # --- evaluation: only active when a validation split exists ---
  evaluation_strategy="steps" if val_set_size > 0 else "no",
  eval_steps=200 if val_set_size > 0 else None,
  load_best_model_at_end=val_set_size > 0,
  # --- distributed / logging ---
  ddp_find_unused_parameters=False if ddp else None,
  logging_steps=10,
  report_to="none", # do not report to wandb (the default option)
  run_name=None,
)

In [None]:
# Assemble the trainer: arguments, model, batching/padding collator, and datasets.
trainer = transformers.Trainer(
  args=training_args,
  model=model,
  data_collator=transformers.DataCollatorForSeq2Seq(
      tokenizer, padding=True, pad_to_multiple_of=8, return_tensors="pt"
  ),
  train_dataset=train_data,
  eval_dataset=val_data,
)
# Switch off `use_cache` on the model config before training starts.
model.config.use_cache = False

In [None]:
# Compile the model on torch 2+ (skipped on Windows).
# NOTE(review): `trainer` was constructed from the un-compiled `model` in the previous
# cell, so this rebinding presumably does not change what the trainer trains — confirm.
if torch.__version__ >= "2" and sys.platform != "win32":
  model = torch.compile(model)

In [None]:
# Run training (optionally resuming trainer state), then write the model to `output_dir`.
trainer.train(resume_from_checkpoint=resume_from_checkpoint)
# NOTE(review): the target device is hard-coded to "cuda" here even though a `device`
# variable is computed near the top of the notebook — confirm this is intentional.
model = model.to("cuda")
model.save_pretrained(output_dir)

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
10,3.751
20,3.6129
30,3.3672
40,2.9625
50,2.5449
60,2.2975
70,2.1574
80,2.0226
90,1.9222
100,1.869


Step,Training Loss
10,3.751
20,3.6129
30,3.3672
40,2.9625
50,2.5449
60,2.2975
70,2.1574
80,2.0226
90,1.9222
100,1.869


In [None]:
from peft import PeftModel

In [None]:
# Inference settings: base checkpoint, directory holding the trained adapter weights,
# and whether to load the base model in 8-bit (this rebinds the training-time flag).
# NOTE(review): PEFT_WEIGHTS is absolute while training saved to the relative
# "./chat-gpt-neo-1.3B"; they coincide only if the working directory is /content — confirm.
BASE_MODEL = "VietAI/gpt-neo-1.3B-vietnamese-news"
PEFT_WEIGHTS = "/content/chat-gpt-neo-1.3B"
load_in_8bit = False

In [None]:
# Load the base model and attach the fine-tuned PEFT adapter for inference.
# `device` is rebound to a plain string here and is used later to place the input ids.
if torch.cuda.is_available():
  device = "cuda"
  device_map = {'': 0}
  if load_in_8bit:
      model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, load_in_8bit=True, torch_dtype=torch.float16, device_map=device_map)
      model = PeftModel.from_pretrained(model, PEFT_WEIGHTS, torch_dtype=torch.float16, device_map=device_map)
  else:
      model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, device_map=device_map)
      model = PeftModel.from_pretrained(model, PEFT_WEIGHTS, device_map=device_map)
else:
  device = "cpu"
  model = AutoModelForCausalLM.from_pretrained(BASE_MODEL)
  # Use the PEFT_WEIGHTS constant defined above; the previous `PEFT_WEIGHT` name does
  # not exist, so the CPU path raised NameError.
  model = PeftModel.from_pretrained(model, PEFT_WEIGHTS)

In [None]:
# Reload the tokenizer for inference and put the model in evaluation mode.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model.eval()
if torch.__version__ >= "2": # speed-up: compile the model on torch 2+
    model = torch.compile(model)

In [None]:
def get_answer(q, max_new_tokens=196, skip_tl=False):
  """Generate the model's reply to question `q`.

  Args:
    q: The user's question; it is wrapped with `make_prompt` before generation.
    max_new_tokens: Upper bound on the number of tokens generated after the prompt.
    skip_tl: Unused; kept so existing callers that pass it keep working.

  Returns:
    The generated reply as a stripped string. Text from the first "###" the
    model emits onwards (i.e. a new template section) is discarded.
  """
  input_ids = tokenizer(make_prompt(q), return_tensors="pt")["input_ids"].to(device)
  # input_ids has shape (1, seq_len); the last dimension is the prompt length.
  prompt_len = input_ids.shape[-1]
  with torch.no_grad():
      gen_tokens = model.generate(
          input_ids=input_ids,
          # Bound only the continuation. The previous `max_length=len(input_ids) + ...`
          # used the batch dimension (always 1), so long prompts left little or no
          # room for an answer.
          max_new_tokens=max_new_tokens,
          do_sample=True,
          temperature=0.5,
          top_k=20,
          repetition_penalty=1.2,
          eos_token_id=0, # for open-end generation.
          pad_token_id=tokenizer.eos_token_id,
      )
  # Decode only the newly generated tokens, so the reply no longer has to be located
  # by counting "###" markers (which broke when the question itself contained "###"),
  # and drop special tokens so the end-of-text marker does not leak into the reply.
  completion = tokenizer.batch_decode(
      gen_tokens[:, prompt_len:], skip_special_tokens=True
  )[0]
  answer = completion.split("###", 1)[0]
  return answer.strip()

In [None]:
print("\n")
# Simple chat loop: read a question, print the model's reply, repeat.
try:
  while True:
    query = input("\nBạn：")
    print(f"Bot: {get_answer(query)}")
except (KeyboardInterrupt, EOFError):
  # Interrupting the cell (or closing stdin) is the intended way to leave the chat,
  # so end quietly instead of leaving a traceback in the notebook.
  pass




Bạn：Cho hai ví dụ về một chất lỏng.
Bot: Một chất lỏng là một chất rắn có thể được tìm thấy trong nước. Nó có thể được tìm thấy trong nước biển, nước mưa, nước sông, nước suối và nước ngầm.<|endoftext|>


KeyboardInterrupt: ignored