# Install libraries

In [1]:
! pip install -q -U bitsandbytes
! pip install -q -U datasets
! pip install -q -U git+https://github.com/huggingface/transformers.git
! pip install -q -U git+https://github.com/huggingface/peft.git
! pip install -q -U git+https://github.com/huggingface/accelerate.git
! pip install -q -U loralib
! pip install -q -U einops

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


# Import libraries

In [12]:
import json
import os
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from tqdm import tqdm
from datasets import Dataset

from pprint import pprint
from datasets import load_dataset
from huggingface_hub import notebook_login
from peft import(
  LoraConfig,
  PeftConfig,
  PeftModel,
  get_peft_model,
  prepare_model_for_kbit_training
)
from transformers import(
  AutoConfig,
  AutoModelForCausalLM,
  AutoTokenizer,
  BitsAndBytesConfig
)

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Initialize model

In [13]:
MODEL_NAME = "vilm/vinallama-7b-chat"

bnb_config = BitsAndBytesConfig(
  load_in_4bit =True ,
  bnb_4bit_use_double_quant =True ,
  bnb_4bit_quant_type ="nf4",
  bnb_4bit_compute_dtype = torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
  MODEL_NAME ,
  device_map = "auto",
  trust_remote_code =True,
  quantization_config = bnb_config
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

config = LoraConfig(
  r=16,
  lora_alpha =32,
  target_modules =[
  "q_proj",
  "up_proj",
  "o_proj",
  "k_proj",
  "down_proj",
  "gate_proj",
  "v_proj"
  ],
  lora_dropout =0.05,
  bias ="none",
  task_type ="CAUSAL_LM"
)

model = get_peft_model(model, config)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.67M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Build fine-tuning dataset

## Download dataset

In [14]:
data = load_dataset('hllj/vi_grade_school_math_mcq')

Downloading readme:   0%|          | 0.00/2.95k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.60M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

## Build prompt function

In [15]:
def generate_prompt(question, choices, explanation):
  return f"""
  <|im_start|>system
  Bạn là một chuyên gia về toán. Bạn sẽ nhận câu hỏi trắc nghiệm kèm theo các lựa chọn, hãy giải step by step nếu có và chọn phương án đúng.

  <|im_start|>user
  ### Câu hỏi:
  {question}
  ### Các lựa chọn:
  {choices}
  ### Câu trả lời:
  <|im_start|>assistant
  {explanation}
  """.strip()

def generate_and_tokenize_prompt(question, choices, explanation):
  full_prompt = generate_prompt(question, choices, explanation)
  tokenized_full_prompt = tokenizer(
    full_prompt,
    padding =True,
    truncation = True
  )
  return tokenized_full_prompt

## Apply functions into the dataset

In [16]:
training_samples = []
for sample in tqdm(data['train'].shard(num_shards = 6, index = 0)):
  for quest in sample['problems']:
    choices = quest['choices']
    explanation = quest['explanation'].strip()
    question = quest['question']
    if explanation == '' or question == '' or choices == []:
      continue

    try:
      question = question.split('\n \n')[1].strip()
    except:
      continue
    choices = '\n'.join(choices)
    training_sample = generate_and_tokenize_prompt(
      question , choices , explanation
    )

    training_samples.append(training_sample)

  0%|          | 0/456 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 456/456 [00:00<00:00, 741.85it/s]
  0%|          | 0/456 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 456/456 [00:00<00:00, 727.33it/s]


### Change to Hugging Face dataset

In [17]:
choices_data = Dataset.from_list(training_samples)

In [18]:
choices_data

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 1537
})

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 1537
})

### Training

In [19]:
training_args = transformers.TrainingArguments(
  per_device_train_batch_size =1,
  gradient_accumulation_steps =4,
  num_train_epochs =1,
  learning_rate =2e-4,
  fp16 =True,
  save_total_limit =3,
  logging_steps =1,
  output_dir = "experiments",
  optim ="paged_adamw_8bit",
  lr_scheduler_type ="cosine",
  warmup_ratio =0.05,
)

trainer = transformers.Trainer(
  model =model,
  train_dataset = choices_data,
  args = training_args,
  data_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm = False)
)
model.config.use_cache = False
trainer.train()



Step,Training Loss
1,3.9047
2,4.1808
3,3.8012
4,4.0158
5,3.8751
6,3.3798
7,3.3855
8,2.9824
9,2.4665
10,2.1419


TrainOutput(global_step=384, training_loss=0.7419027781579643, metrics={'train_runtime': 2663.278, 'train_samples_per_second': 0.577, 'train_steps_per_second': 0.144, 'total_flos': 8459228680962048.0, 'train_loss': 0.7419027781579643, 'epoch': 1.0})



Step,Training Loss
1,3.9047
2,4.1808
3,3.8012
4,4.0149
5,3.8733
6,3.3779
7,3.3819
8,2.9781
9,2.4616
10,2.1344


TrainOutput(global_step=384, training_loss=0.7414045181746284, metrics={'train_runtime': 2661.9617, 'train_samples_per_second': 0.577, 'train_steps_per_second': 0.144, 'total_flos': 8459228680962048.0, 'train_loss': 0.7414045181746284, 'epoch': 1.0})

# Run fine-tuning model

### Set up parameters

In [27]:
generation_config = model.generation_config
generation_config.max_new_tokens = 200
generation_config.temperature = 0.7
generation_config.top_p = 0.7
generation_config.num_return_sequences = 1
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id

### Run model

In [29]:
%%time
device = 'cuda' if torch.cuda.is_available() else 'cpu'

prompt = """
<|im_start|>system
Bạn là một chuyên già về toán. Bạn sẽ nhận câu hỏi trắc nghiệm kèm theo các lựa chọn, hãy giải step by step nếu có và chọn phương án đúng.
<|im_end|>
<|im_start|>user
### Câu hỏi: Số gồm 1 đơn vị và 2 chục đọc là:
### Các lựa chọn:
A. 20
B. 21
C. 30
D. 31
### Câu trả lời:

<|im_start|>assistant
""".strip()

encoding = tokenizer(prompt , return_tensors ="pt").to(device)
with torch.inference_mode():
  outputs = model.generate(
  input_ids = encoding.input_ids,
  attention_mask = encoding.attention_mask,
  generation_config = generation_config
  )

print(tokenizer.decode(outputs[0], skip_special_tokens = True))

<|im_start|> system
Bạn là một chuyên già về toán. Bạn sẽ nhận câu hỏi trắc nghiệm kèm theo các lựa chọn, hãy giải step by step nếu có và chọn phương án đúng.
 
<|im_start|> user
### Câu hỏi: Số gồm 1 đơn vị và 2 chục đọc là:
### Các lựa chọn:
A. 20
B. 21
C. 30
D. 31
### Câu trả lời:

<|im_start|> assistant
 Đáp
 đáp
 đáp

CPU times: user 3.69 s, sys: 925 ms, total: 4.62 s
Wall time: 7.34 s
