# 導入環境

In [None]:
!pip install datasets
!pip install accelerate -U
!pip install transformers[torch]
!pip install peft
!pip install pandas pyarrow
!pip install -U bitsandbytes
!pip install transformers datasets
!apt-get install wget

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# 載入必要套件
import bitsandbytes
import gc
import os
import pandas as pd
import torch
from datasets import Dataset
from google.colab import drive, output
from huggingface_hub import login, HfApi
from peft import LoraConfig, TaskType, get_peft_model, PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer, GenerationConfig, AutoModelForSequenceClassification, AdamW,  get_linear_schedule_with_warmup

In [None]:
#下載資料集
!wget https://huggingface.co/datasets/ChenWeiLi/Medtext_zhtw/raw/main/MedText_zhtw.json

--2024-08-12 03:12:32--  https://huggingface.co/datasets/ChenWeiLi/Medtext_zhtw/raw/main/MedText_zhtw.json
Resolving huggingface.co (huggingface.co)... 18.172.134.124, 18.172.134.4, 18.172.134.24, ...
Connecting to huggingface.co (huggingface.co)|18.172.134.124|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 983900 (961K) [text/plain]
Saving to: ‘MedText_zhtw.json’


2024-08-12 03:12:32 (14.9 MB/s) - ‘MedText_zhtw.json’ saved [983900/983900]



In [None]:
#連結到 google drive(可選)
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# 替換為你的Hugging Face API Token
my_read_token = "貼上你自己的 read token"
os.environ["HUGGINGFACE_TOKEN"] = my_read_token
# 登錄Hugging Face
login(token=os.environ["HUGGINGFACE_TOKEN"])

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# 處理資料集


In [None]:
# 將 JSON 文件轉換為 CSV 文件。
df = pd.read_json('/content/MedText_zhtw.json' )
ds = Dataset.from_pandas(df)

In [None]:
model_name = "taide/TAIDE-LX-7B-Chat"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, token=my_write_token)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.04k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/813k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

In [None]:
def process_func(example):
    MAX_LENGTH = 384  # Llama 分詞器會將一個中文字切割成多個 token，因此需要放開一些最大長度，確保資料的完整性
    input_ids, attention_mask, labels = [], [], []

    # 構建 instruction 和 input 字符串，並進行分詞
    instruction = tokenizer(f"user\n\n{example['instruction'] + example['input']}assistant\n\n", add_special_tokens=False)
    response = tokenizer(f"{example['output']}", add_special_tokens=False)

    # 合併 input 和 response 的 token ID 和注意力掩碼
    input_ids = instruction["input_ids"] + response["input_ids"]
    attention_mask = instruction["attention_mask"] + response["attention_mask"]
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"]

    # 如果超過最大長度，進行截斷
    if len(input_ids) > MAX_LENGTH:
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]

    # 確保 attention_mask 和 labels 與 input_ids 一致
    attention_mask = attention_mask[:len(input_ids)]
    labels = labels[:len(input_ids)]

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

# 應用到整個數據集
tokenized_ds = ds.map(process_func, remove_columns=ds.column_names)

# 檢查是否正確
print(tokenized_ds)
print(tokenizer.decode(tokenized_ds[0]['input_ids']))
print(tokenizer.decode([token for token in tokenized_ds[0]["labels"] if token != -100]))


Map:   0%|          | 0/1412 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1412
})
user

你是一位專業的醫療人員,請用心且專業的回答問題。一名 50 歲男性有復發性腎結石和骨質減少病史。由於先前診斷出維生素 D 缺乏症，他一直在服用大劑量的維生素 D 補充劑。實驗室結果顯示高血鈣症和高鈣尿症。可能的診斷是什麼，治療方法是什麼？assistant

 該患者有復發性腎結石、骨質減少和大劑量維生素 D 補充劑病史，以及高鈣血症和高鈣尿症的實驗室檢查結果，暗示維生素 D 中毒的可能性。過量攝取維生素 D 會造成腸道對鈣的吸收增加，導致高鈣血症和高鈣尿症，及腎結石和骨質流失。治療包括停止補充維生素 D，並可能提供靜脈輸液和袢利尿劑以促進鈣的排泄。
該患者有復發性腎結石、骨質減少和大劑量維生素 D 補充劑病史，以及高鈣血症和高鈣尿症的實驗室檢查結果，暗示維生素 D 中毒的可能性。過量攝取維生素 D 會造成腸道對鈣的吸收增加，導致高鈣血症和高鈣尿症，及腎結石和骨質流失。治療包括停止補充維生素 D，並可能提供靜脈輸液和袢利尿劑以促進鈣的排泄。


# 創建模型

In [None]:
# 重新開始加載模型
model = AutoModelForCausalLM.from_pretrained("taide/TAIDE-LX-7B", device_map="auto",torch_dtype=torch.bfloat16)
# 從檢查點載入模型- 可參考的寫法
#model_checkpoint = '/content/drive/MyDrive/colab_results/checkpoint-38500'  # 修改為最新的檢查點路徑
#model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)


# 設定 pad_token_id
model.config.pad_token_id = tokenizer.pad_token_id
# 設定 eos_token_id
model.config.eos_token_id = tokenizer.eos_token_id
# 檢查是否正確
model

config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(56064, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head):

In [None]:
model.enable_input_require_grads() # 開啟梯度檢查點時，要執行方法

# lora

In [None]:
config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    inference_mode=False, # 訓練模式
    r=8, # Lora 秩
    lora_alpha=32, # Lora alapa，具體作用參見 Lora 原理
    lora_dropout=0.1# Dropout 比例
)
config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, inference_mode=False, r=8, target_modules={'up_proj', 'q_proj', 'v_proj', 'o_proj', 'down_proj', 'k_proj', 'gate_proj'}, lora_alpha=32, lora_dropout=0.1, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False))

In [None]:
model = get_peft_model(model, config)
# 檢查是否正確
config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='taide/TAIDE-LX-7B', revision=None, task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, inference_mode=False, r=8, target_modules={'up_proj', 'q_proj', 'v_proj', 'o_proj', 'down_proj', 'k_proj', 'gate_proj'}, lora_alpha=32, lora_dropout=0.1, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False))

In [None]:
model.print_trainable_parameters()

trainable params: 19,988,480 || all params: 6,955,536,384 || trainable%: 0.2874


# 配置训练参数

In [None]:
torch.utils.checkpoint.use_reentrant = False
# TODO: 之後改成使用optuna自動調整超參
# 設定訓練參數
args = TrainingArguments(
    output_dir="/content/drive/MyDrive/colab_results",
    save_steps=50,  # 每50步保存一次檢查點
    logging_dir='/content/drive/MyDrive/colab_logs',
    per_device_train_batch_size=64,  # 增加批次大小
    gradient_accumulation_steps=4,  # 調整累積梯度步數，使得實際批次大小達到 260K tokens
    logging_steps=10,
    num_train_epochs=40,
    learning_rate=5e-5,
    save_strategy="steps",  # 每 steps 次保存一次
    save_total_limit=5,  # 保留最多5個檢查點
    gradient_checkpointing=True,
    weight_decay=0.01,
)

# 設定優化器
optimizer = AdamW(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay)

# 設定學習率調度器
num_training_steps = len(tokenized_ds) // (args.per_device_train_batch_size * args.gradient_accumulation_steps) * args.num_train_epochs
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * num_training_steps),  # 預熱步數，這裡設定為總步數的10%
    num_training_steps=num_training_steps
)

# 創建 Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_ds,
    optimizers=(optimizer, lr_scheduler),  # 設定優化器和學習率調度器
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
    tokenizer =tokenizer
)





In [None]:
trainer.train()

# 繼續訓練
#def forward_with_checkpoint(*args, **kwargs):
#    return torch.utils.checkpoint.checkpoint(trainer.training_step, *args, use_reentrant=False, **kwargs)

#trainer.training_step = forward_with_checkpoint
#trainer.train(resume_from_checkpoint=True)

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
10,2.2668
20,2.1362
30,1.9636
40,1.8531
50,1.7647
60,1.7018
70,1.6232
80,1.5372
90,1.4673
100,1.3866




TrainOutput(global_step=200, training_loss=1.41949134349823, metrics={'train_runtime': 4305.7062, 'train_samples_per_second': 13.117, 'train_steps_per_second': 0.046, 'total_flos': 4.305480986610893e+17, 'train_loss': 1.41949134349823, 'epoch': 34.78260869565217})

# 儲存 LoRA 和 tokenizer 結果


In [21]:
peft_model_id="./llama3_lora"
trainer.model.save_pretrained(peft_model_id)
tokenizer.save_pretrained(peft_model_id)

('./llama3_lora/tokenizer_config.json',
 './llama3_lora/special_tokens_map.json',
 './llama3_lora/tokenizer.model',
 './llama3_lora/added_tokens.json')

# 載入 lora 權重推理

In [22]:
print("tokenizer.pad_token = "+tokenizer.pad_token)

print("tokenizer.eos_token = "+tokenizer.eos_token)

tokenizer.pad_token = <pad>
tokenizer.eos_token = </s>


In [23]:
gc.collect()
torch.cuda.empty_cache()
prompt = "每天只睡三小時會有啥狀況？"
messages = [
    {"role": "system", "content": "你是一位專業的醫療人員，請用心且專業的以三到五句話回答問題。"},
    {"role": "user", "content": prompt}
]

# 將消息合併成一個文本輸入
def format_messages(messages):
    formatted_messages = ""
    for message in messages:
        if message['role'] == 'system':
            formatted_messages += f"[SYSTEM] {message['content']}\n"
        elif message['role'] == 'user':
            formatted_messages += f"[USER] {message['content']}\n"
    return formatted_messages

# 格式化消息
formatted_text = format_messages(messages)
print("Formatted text:", formatted_text)

text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
print("Generated text with chat template:", text)


model_inputs = tokenizer([text], return_tensors="pt").to('cuda')

# 打印model_inputs進行檢查
print("Model inputs:", model_inputs)

# 產生文本
generated_ids = model.generate(
    model_inputs.input_ids,
    max_new_tokens=90,
    #eos_token_id=tokenizer.encode('<|eot_id|>')[0],
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    attention_mask=model_inputs.attention_mask,
repetition_penalty=1.6,  # 增加 repetition_penalty
    top_k=50,  # 設定最高k個概率詞
    # stop_token=tokenizer.eos_token,  # 确保模型在遇到结束标记时停止
    do_sample=True,
    top_p=0.15,  # 新增 top_p 參數
    temperature=0.15,  # 設定 temperature 參數
    #forced_eos_token_id=tokenizer.encode('</s>')[0]
)

# 打印generated_ids進行檢查
print("Generated IDs:", generated_ids)
# 去掉輸入部分，僅保留生成的文本
generated_ids = generated_ids[:, model_inputs.input_ids.shape[-1]:]


response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

print(response)


# 清理生成的文本
import re
response = re.sub(r'</s>.*', '', response, flags=re.DOTALL)
response = re.sub(r'</s>.*', '</s>', response)
response = re.sub(r'\[.*?\]', '', response)  # 去除方括號內的內容
response = re.sub(r'</?[^>]+>', '', response)  # 去除HTML標籤
response = re.sub(r'dress|dressing', '', response, flags=re.IGNORECASE)  # 去除 "dress" 和 "dressing"
response = re.sub(r'<<.*?>>', '', response)  # 去除 `<<SYS>>` 標籤
response = re.sub(r'\[.*?\]', '', response)  # 去除 `[/EMBB]]` 標籤
response = response.strip()  # 去除前后多余的空格
# 从最后开始查找句号并去除句号之后的语句
def remove_after_last_period(text):
    last_period_index = max(text.rfind('。'), text.rfind('!'))
    if last_period_index != -1:
        return text[:last_period_index + 1]
    return text

# 清理生成的文本
response = remove_after_last_period(response).strip()

print(response)

Formatted text: [SYSTEM] 你是一位專業的醫療人員，請用心且專業的以三到五句話回答問題。
[USER] 每天只睡三小時會有啥狀況？

Generated text with chat template: <s>[INST] <<SYS>>
你是一位專業的醫療人員，請用心且專業的以三到五句話回答問題。
<</SYS>>

每天只睡三小時會有啥狀況？ [/INST]
Model inputs: {'input_ids': tensor([[    1,     1, 29961, 25580, 29962,  3532, 14816, 29903,  6778,    13,
         33013, 32052, 37319, 52781, 32701, 30214, 50772, 44775, 32350, 37319,
         30651, 30457, 30780, 30904, 34694, 35616, 35211, 30267,    13, 29966,
           829, 14816, 29903,  6778,    13,    13, 42265, 31557, 45710, 30457,
         37374, 40939, 35236, 44124, 30882,   518, 29914, 25580, 29962]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1]], device='cuda:0')}




Generated IDs: tensor([[    1,     1, 29961, 25580, 29962,  3532, 14816, 29903,  6778,    13,
         33013, 32052, 37319, 52781, 32701, 30214, 50772, 44775, 32350, 37319,
         30651, 30457, 30780, 30904, 34694, 35616, 35211, 30267,    13, 29966,
           829, 14816, 29903,  6778,    13,    13, 42265, 31557, 45710, 30457,
         37374, 40939, 35236, 44124, 30882,   518, 29914, 25580, 29962, 29871,
         45711, 32336, 31411, 37362, 50691, 30503, 38796, 53952, 30330, 42602,
         30413, 53999, 32827, 33813, 33388, 38424, 30419, 33084, 55042, 36182,
         30409, 31184, 35560, 38367, 53628, 33389, 38792, 30267, 36557, 39567,
         37016, 31391, 37334, 44739, 52244, 52819, 35647, 45394, 34113, 54098,
         52270, 50614, 32373, 34744, 51949, 50584, 47240, 38513, 37054, 39228,
         37318, 32703, 52283, 42541, 50811, 30267, 43711, 33327, 42600, 45322,
         42263, 30210, 47976, 31074, 42400, 32262, 31608, 40973, 34725, 38454,
         41633, 30898, 44951, 30275, 

In [24]:
model.save_pretrained("/hfmodel")
tokenizer.save_pretrained("/hftokenizer")

('/hftokenizer/tokenizer_config.json',
 '/hftokenizer/special_tokens_map.json',
 '/hftokenizer/tokenizer.model',
 '/hftokenizer/added_tokens.json')

In [25]:

os.environ["HUGGINGFACE_UPLOAD_TOKEN"] = "貼上你自己的 write token"
# 登錄Hugging Face
login(token=os.environ["HUGGINGFACE_UPLOAD_TOKEN"])

api = HfApi()
api.upload_folder(
    folder_path="/hfmodel",  # 本地保存模型的目錄
    path_in_repo="",  # 默認值會將文件上傳到根目錄
    repo_id="你的帳號/你的model上傳地",  # Hugging Face 上的模型名稱 like mark1098/TAIDE-LX-7B-Chat-Medical-Fintune
    repo_type="model"  # 上傳的是模型
)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


CommitInfo(commit_url='https://huggingface.co/mark1098/TAIDE-LX-7B-Chat-Medical-Fintune/commit/d74fa333d6cb5fed658681deff328810a691d9b4', commit_message='Upload folder using huggingface_hub', commit_description='', oid='d74fa333d6cb5fed658681deff328810a691d9b4', pr_url=None, pr_revision=None, pr_num=None)