# This Notebook can continuous pretrain selected LLM on custom dataset

In [1]:
from os import listdir
from os.path import isfile, join

In [2]:
data_dir = "../dataset/textbook/ch"
files = [f for f in listdir(data_dir) if isfile(join(data_dir, f))]
files = [f for f in files if f.endswith(".txt")]
files = sorted(files)

In [3]:
from transformers import AutoModelForCausalLM , AutoTokenizer
import pandas as pd

sentence_length = 30

model_id = "taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"
tokenizer = AutoTokenizer.from_pretrained(model_id)
raw_dataset = []
for file in files:
    file_content = open(join(data_dir, file)  , "r").readlines()
    for line in file_content:
        for start in range(0 , len(line) , sentence_length):
            raw_dataset.append({"text":line[start: min(start+sentence_length , len(line))]})


  from .autonotebook import tqdm as notebook_tqdm


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
from datasets import Dataset
dataset = Dataset.from_list(raw_dataset)
dataset = dataset.train_test_split(test_size=0.01)


def preprocess_function(examples):
    return tokenizer([" ".join(x) for x in examples["text"]])

tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=dataset["train"].column_names,
)

total_token = 0
for example in tokenized_dataset["train"]:
    total_token += len(example["input_ids"])
print("Training token count:",total_token)


Map (num_proc=4): 100%|██████████| 21792/21792 [00:00<00:00, 23554.99 examples/s]
Map (num_proc=4): 100%|██████████| 221/221 [00:00<00:00, 974.99 examples/s]


Training token count: 950630


In [5]:
block_size = 1024

def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

lm_dataset = tokenized_dataset.map(group_texts, batched=True, num_proc=4)


Map (num_proc=4):   0%|          | 0/21792 [00:00<?, ? examples/s]

Map (num_proc=4): 100%|██████████| 21792/21792 [00:01<00:00, 17721.70 examples/s]
Map (num_proc=4): 100%|██████████| 221/221 [00:00<00:00, 1275.66 examples/s]


In [6]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


In [7]:
# out of VRAM: https://huggingface.co/docs/transformers/en/tasks/language_modeling
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    GenerationConfig
)
from tqdm import tqdm
from trl import SFTTrainer
import torch
import time
import pandas as pd
import numpy as np
from peft import prepare_model_for_kbit_training

In [8]:
compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=False,
    )

In [9]:

device_map = {"": 0}
original_model = AutoModelForCausalLM.from_pretrained(model_id, 
                                                      device_map=device_map,
                                                      quantization_config=bnb_config,
                                                      trust_remote_code=True,
                                                      use_auth_token=True)



Loading checkpoint shards: 100%|██████████| 4/4 [00:07<00:00,  1.84s/it]


In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_id,trust_remote_code=True,padding_side="left",add_eos_token=True,add_bos_token=True,use_fast=False)
tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
original_model = prepare_model_for_kbit_training(original_model)


In [12]:
from peft import LoraConfig, get_peft_model
import peft
config = LoraConfig(
    r=32, #Rank
    lora_alpha=32,
    target_modules=[
        'q_proj',
        'k_proj',
        'v_proj',
        'dense'
    ],
    bias="none",
    lora_dropout=0.05,  # Conventional
    task_type="CAUSAL_LM",
)

# 1 - Enabling gradient checkpointing to reduce memory usage during fine-tuning
original_model.gradient_checkpointing_enable()

peft_model = get_peft_model(original_model, config)

In [13]:
from  datetime import datetime
output_dir = f'./peft-ch_textbook-{datetime.now()}'
import transformers

peft_training_args = TrainingArguments(
    output_dir = output_dir,
    warmup_steps=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    logging_steps=25,
    logging_dir="./logs",
    save_strategy="steps",
    save_steps=25,
    evaluation_strategy="steps",
    eval_steps=25,
    do_eval=True,
    gradient_checkpointing=True,
    report_to="none",
    overwrite_output_dir = 'True',
    group_by_length=True,
    num_train_epochs=1
)

peft_model.config.use_cache = False

peft_trainer = transformers.Trainer(
    model=peft_model,
    train_dataset= lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    args=peft_training_args,
    data_collator=data_collator,
)

In [14]:
peft_trainer.train()



Step,Training Loss,Validation Loss
25,3.5979,3.286719
50,3.1781,3.092023
75,3.056,3.019609
100,2.9894,2.971016
125,2.9252,2.932739
150,2.8961,2.909793
175,2.8822,2.890236
200,2.8783,2.878939
225,2.8632,2.873835




TrainOutput(global_step=229, training_loss=3.0272344980697965, metrics={'train_runtime': 3030.1864, 'train_samples_per_second': 0.302, 'train_steps_per_second': 0.076, 'total_flos': 4.234321871241216e+16, 'train_loss': 3.0272344980697965, 'epoch': 1.0})

In [28]:
from peft import PeftModel

peft_model = PeftModel.from_pretrained(original_model, "./peft-ch_textbook-2024-05-15 14:26:47.473686/checkpoint-225",torch_dtype=torch.float16,is_trainable=False)
peft_model.eval()
eval_dataset = Dataset.from_pandas(pd.read_csv("../dataset/gsat/113_chinese.csv"))
question = eval_dataset[19]


In [40]:
messages = [
            {"role": "system",
                "content": """你是一個用於解決臺灣高中生升學考試選擇題的 AI 助理，請依據邏輯推理及高中程度的知識選出正確的答案。                           """},
            {"role": "user", "content": """
                           請你幫我回答高中的學測題目，題目分為國文、英文、數學、自然、社會五個科目，題目可為單選題或多選題。
                           範例一：

                           全球主要有三大地震帶，臺灣位於其中的「環太平洋地震帶」上。下列有關此地震帶的敘述何者正確？此題為多選題，
                           (A)此地震帶的形成主要與張裂性板塊邊界有關
                           (B)地震主要發生在地殼中，所以此地震帶特徵多淺源地震
                           (C)此地震帶與環太平洋火山帶（火環）位置幾乎一致，有許多活火山
                           (D)地震與斷層活動息息相關，此地震帶的地震多半是由平移斷層活動造成
                           (E)臺灣位在此地震帶上，表示臺灣島與太平洋板塊相接
                           
                           輸出格式：
                           正確的答案：[填入正確的選項]
                           解釋：
                           [填入解釋]
                           """},
            {"role":"assistant","content":"""
                           正確的答案：(A)、(D)、(E)
                           解釋：
                           (A)環太平洋地震帶主要為聚合型板塊邊界，例如臺灣、馬里亞納海溝、日本。
                           (B)由於是聚合型板塊邊界，板塊有隱沒作用，地震震源應該由淺到深都有。
                           (D)承(B)，聚合擠壓作用為主的地區，其應力會以壓力為主，斷層多為逆斷層。
                           (E)位於此地震帶不等於與太平洋板塊相接，臺灣位於歐亞板塊及菲律賓海板塊的交界。
                           因此選(A)(D)(E)
                           
             """},
            {"role": "user", "content": f"""
                           接下來請針對以下問題，選出正確的選項，此題為{"單選題" if len(question["answer"])==1 else "多選題"}。請問"""+question["question"]+"？"+"""
                           輸出格式：
                           正確的答案：[填入正確的選項]
                           解釋：
                           [填入解釋]
                           """},
        ]
prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
)
terminators = [
            tokenizer.eos_token_id
]

inputs = tokenizer(prompt , return_tensors="pt")
print(inputs)

{'input_ids': tensor([[128000, 128006,   9125,  ...,  78191, 128007,    271]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]])}


In [41]:
with torch.no_grad():
    outputs = peft_model.generate(input_ids=inputs["input_ids"], max_new_tokens=512)
    peft_model_output = tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{question["question"]}')
print(dash_line)
print(f'PEFT MODEL:\n{peft_model_output}')

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


---------------------------------------------------------------------------------------------------
INPUT PROMPT:
情緒訊號系統有個特色，就是它總是「開啟」的。情緒開始時，幾乎會立刻出現訊號。例如傷心時，眉毛內緣會向上拉，聲音會變得較輕柔低沉。最立即的情緒訊號是臉部表情。人們常把生氣、害怕、嫌惡、哀傷和輕蔑視為負面情緒，驚訝和愉快是正面情緒。這簡單的二分法有兩個問題，一是忽略所謂負面情緒並不總是令人不舒服，如許多人看了一場哀傷的電影後，喜歡大哭一場。二是被視為正向情緒的快樂，也可能很殘忍，如嘲笑別人。唯有檢視各種情緒事件的特性，才能分辨使人愉快還是不愉快。 聲音是另一種情緒訊號系統，與臉部表情有許多有趣的差別。臉部是可以觀察的，聲音則是時斷時續的，可以憑意志完全關閉。想隱藏表情，可能是人常以電話取代面對面溝通的原因。電子郵件甚至不需要說和聽，所以不會從聲音流露情緒。 身體動作也是情緒訊號：輕鬆時會出現放鬆身體的姿勢，輕蔑的動作是從上往下看對方，驚訝則是把注意力固定在產生情緒的對象。身體動作雖然像臉部和聲音的情緒訊號一樣是不由自主的，但對大多數人而言，控制身體的動作，比完全不露出臉部和聲音的情緒訊號更為容易。（改寫自保羅・艾克曼《心理學家的面相術》）某些心理學家認為：情緒調整模式可藉學習而來。下列最接近此一觀點的是： (A)人稟七情，應物斯感，感物吟志，莫非自然 (B)真者，所以受於天也，自然不可易也。故聖人法天貴真，不拘於俗 (C)登山則情滿於山，觀海則意溢於海，我才之多少，將與風雲而並驅矣 (D)聖人所以治人七情，修十義，講信修睦，尚辭讓，去爭奪，舍禮何以治之  
---------------------------------------------------------------------------------------------------
PEFT MODEL:
system

你是一個用於解決臺灣高中生升學考試選擇題的 AI 助理，請依據邏輯推理及高中程度的知識選出正確的答案。user

請你幫我回答高中的學測題目，題目分為國文、英文、數學、自然、社會五個科目，題目可為單選題或多選題。
                      