<a href="https://colab.research.google.com/github/lyzno1/lightning_example/blob/main/ruozhiba_gemma.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 微调 Gemma 2 2B（google/gemma-2-2b-it）
## huggingface 😀


In [77]:
# 安装必要的库
!pip install transformers datasets accelerate bitsandbytes peft
!pip show peft

Name: peft
Version: 0.12.0
Summary: Parameter-Efficient Fine-Tuning (PEFT)
Home-page: https://github.com/huggingface/peft
Author: The HuggingFace team
Author-email: sourab@huggingface.co
License: Apache
Location: /usr/local/lib/python3.10/dist-packages
Requires: accelerate, huggingface-hub, numpy, packaging, psutil, pyyaml, safetensors, torch, tqdm, transformers
Required-by: 


## 加载模型和Tokenizer

In [78]:
import os
from getpass import getpass

from huggingface_hub import login

# Never hardcode an access token in a notebook: everything committed here is
# readable by anyone with access to the file. The token that used to be
# embedded in this cell must be considered leaked and should be revoked at
# https://huggingface.co/settings/tokens.
#
# Read the token from the HF_TOKEN environment variable; when it is not set,
# fall back to a hidden interactive prompt so the value never appears in the
# notebook source or its outputs.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(token=hf_token)


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [79]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

# Checkpoint to load and the sequence-length value written into its config below.
peft_model_id = "google/gemma-2-2b-it"
max_seq_length = 2048

# bitsandbytes settings: 4-bit quantization enabled, 8-bit disabled,
# float16 as the 4-bit compute dtype.
quantization_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    load_in_8bit=False,
    load_in_4bit=True,
)

tokenizer = AutoTokenizer.from_pretrained(peft_model_id)

# Request float16 weights when CUDA is available, float32 otherwise.
if torch.cuda.is_available():
    load_dtype = torch.float16
else:
    load_dtype = torch.float32

# Load the causal LM with the quantization settings above.
model = AutoModelForCausalLM.from_pretrained(
    peft_model_id,
    quantization_config=quantization_config,
    torch_dtype=load_dtype,
    device_map="auto",
)

# Record the intended maximum sequence length on the model config.
# NOTE(review): this only overwrites a config attribute after the model has
# been built; it presumably does not truncate inputs by itself - confirm that
# this override is actually wanted.
model.config.max_position_embeddings = max_seq_length

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## 看看lzy

In [80]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, set_seed
import torch

# Fix the random seed so the generation below is repeatable.
set_seed(3407)

# Prompt template with three slots: instruction, input and response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further collaboration details.

### Instruction:
{}
### Input:
{}
### Response:
{}"""

# Switch the model to inference mode.
model.eval()

# Render the prompt; the response slot stays empty so the model fills it in.
prompt = alpaca_prompt.format(
    "请用中文回答",  # instruction
    "刘震宇是不是一个内蒙古人？",  # input
    "",  # output (initially empty)
)

# Tokenize and move the tensors to the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
inputs = tokenizer([prompt], return_tensors="pt").to(device)

# TextStreamer prints the generated text as it is produced.
text_streamer = TextStreamer(tokenizer)

# Generate without tracking gradients.
with torch.no_grad():
    _ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)

<bos>Below is an instruction that describes a task, paired with an input that provides further collaboration details.

### Instruction:
请用中文回答
### Input:
刘震宇是不是一个内蒙古人？
### Response:
需要更多信息才能回答这个问题。 
 
请提供更多关于刘震宇的信息，例如：
* 他来自哪里？
* 他是否在内蒙古工作或生活？
* 其他任何相关信息。


 
<end_of_turn>


In [81]:
from transformers import set_seed

# Fix the random seed so the generation below is repeatable.
set_seed(3407)

# Render the prompt; the response slot stays empty so the model fills it in.
prompt = alpaca_prompt.format(
    "请用中文回答",  # instruction
    "出生证丢了怎么证明自己出生了",  # input
    "",  # output (left empty)
)

# Put the inputs on the device the model lives on instead of a hard-coded
# "cuda", so this cell does not crash on a CPU-only runtime and stays
# consistent with the other generation cells.
inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

# Generate up to 128 new tokens and print the decoded text without special tokens.
outputs = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


Below is an instruction that describes a task, paired with an input that provides further collaboration details.

### Instruction:
请用中文回答
### Input:
出生证丢了怎么证明自己出生了
### Response:
出生证丢失确实让人慌，但别担心，有很多方法可以证明你的出生事实。以下是一些建议：

**1. 联系当地公证处：** 
   * 他们可以帮你核实你的出生日期和时间。
   * 他们还可以提供出生证明的替代方案。

**2. 申请出生证明：** 
   * 如果你没有出生证，可以申请一份新的出生证明。
   * 申请需要提供一些证明身份的材料，比如身份证、户口本等。

**3. 联系医院：** 
   * 如果你


## 格式化数据

In [82]:
# End-of-sequence token string of the loaded tokenizer; it is appended to
# every formatted training example.
EOS_TOKEN = tokenizer.eos_token
# Prompt template with three positional slots, filled in order with the
# instruction, the input and the response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further collaboration details.

### Instruction:
{}
### Input:
{}
### Response:
{}"""

def formatting_prompts_func(examples, prompt_template=None, eos_token=None):
    """Render a batch of instruction records into training prompts.

    Args:
        examples: Mapping with equal-length sequences under the keys
            "instruction", "input" and "output" (the batch layout used when
            this function is passed to `dataset.map(..., batched=True)`).
        prompt_template: Format string with three positional `{}` slots
            (instruction, input, output). Defaults to the notebook-level
            `alpaca_prompt`.
        eos_token: String appended to every rendered prompt. Defaults to the
            notebook-level `EOS_TOKEN`.

    Returns:
        dict: `{"text": [...]}` holding one rendered prompt per record, in
        the same order as the input batch.
    """
    # Resolve the defaults lazily so explicit arguments work even when the
    # notebook-level globals have not been defined.
    template = alpaca_prompt if prompt_template is None else prompt_template
    terminator = EOS_TOKEN if eos_token is None else eos_token

    # `input_text` rather than `input` avoids shadowing the builtin.
    texts = [
        template.format(instruction, input_text, output) + terminator
        for instruction, input_text, output in zip(
            examples["instruction"], examples["input"], examples["output"]
        )
    ]
    return {"text": texts}


In [83]:
from datasets import load_dataset
# Load the "train" split of the yyh11/ruozhiba-llama3-tt dataset.
dataset = load_dataset("yyh11/ruozhiba-llama3-tt", split="train")
# Run formatting_prompts_func over the dataset in batches; it returns the
# rendered prompts under the "text" key.
dataset = dataset.map(formatting_prompts_func, batched=True)

In [84]:
# Print one record to inspect the formatted result.
print(dataset[4])

{'instruction': '为什么没人说ABCD型的成语？🤔', 'output': '这是因为中文成语一般都是四字成语，每个字都有其特定的含义，四个字合在一起构成一个完整的意思。而ABCD型的成语最常见，所以大家不会刻意强调。', 'input': '', 'text': 'Below is an instruction that describes a task, paired with an input that provides further collaboration details.\n\n### Instruction:\n为什么没人说ABCD型的成语？🤔\n### Input:\n\n### Response:\n这是因为中文成语一般都是四字成语，每个字都有其特定的含义，四个字合在一起构成一个完整的意思。而ABCD型的成语最常见，所以大家不会刻意强调。<eos>'}


## Configure

In [85]:
from transformers import AutoModelForCausalLM, OPTForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model

# Names of the modules that receive a LoRA adapter.
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

# LoRA adapter configuration.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    target_modules=lora_target_modules,
    use_rslora=False,
    loftq_config=None,
)

# Attach the adapter to the already loaded (quantized) model in place;
# get_peft_model is imported above but not used.
model.add_adapter(peft_config)


In [86]:
max_length = 128


def _tokenize_batch(samples):
    """Tokenize the "text" column, truncating and padding every row to `max_length`."""
    return tokenizer(
        samples["text"],
        truncation=True,
        max_length=max_length,
        padding="max_length",
        return_tensors="pt",
    )


dataset = dataset.map(_tokenize_batch, batched=True)

Map:   0%|          | 0/1496 [00:00<?, ? examples/s]

In [87]:
# Sanity check on the first row: number of token ids versus the character
# length of the rendered text.
print(len(dataset[0]['input_ids']))
print(len(dataset[0]['text']))

128
188


In [92]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Pick the mixed-precision mode the hardware supports: bf16 when available,
# otherwise fp16 on CUDA, and neither on CPU.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
use_fp16 = torch.cuda.is_available() and not use_bf16

training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # effective batch size per device: 2 * 4 = 8
    warmup_steps=5,
    max_steps=60,  # overrides num_train_epochs
    learning_rate=2e-4,
    fp16=use_fp16,
    bf16=use_bf16,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
)

# Do NOT cast the model with model.half() / model.bfloat16() here. The base
# weights are 4-bit quantized, and transformers rejects `.half()` on quantized
# models. Down-casting the trainable adapter weights would also make fp16
# mixed-precision training fail when gradients are unscaled. The fp16/bf16
# flags above already make the Trainer run mixed precision.

# Causal-LM collator (mlm=False) builds the labels from the input ids.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

max_steps is given, it will override any value given in num_train_epochs


## Training Model

In [93]:
# Run the training loop and keep the object returned by train().
trainer_stats = trainer.train()

Step,Training Loss
1,3.7796
2,3.6053
3,3.6458
4,3.406
5,3.4071
6,2.9847
7,3.0276
8,2.6342
9,2.3323
10,2.3023




## Test model

In [4]:
import os

from transformers.trainer_utils import get_last_checkpoint

output_dir = "outputs"

# Select the "checkpoint-<step>" directory with the highest step number.
# This is more reliable than comparing creation times, which can change when
# the directory is copied or restored.
latest_checkpoint = get_last_checkpoint(output_dir)
if latest_checkpoint is None:
    # Fail with a clear message instead of the opaque error from max([]).
    raise FileNotFoundError(
        f"No 'checkpoint-*' directory found in {output_dir!r}; run the training cell first."
    )

print("Latest checkpoint directory:", latest_checkpoint)

Latest checkpoint directory: outputs/checkpoint-60


In [5]:
import torch
# BitsAndBytesConfig is used below, so it has to be imported in this cell:
# after a kernel restart nothing from the earlier cells is in scope.
from transformers import set_seed, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Prompt template with three slots: instruction, input and response. It must
# match the template used to format the training data.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further collaboration details.

### Instruction:
{}
### Input:
{}
### Response:
{}"""

# Same quantization settings as for training: 4-bit enabled, 8-bit disabled,
# float16 as the 4-bit compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    load_in_8bit=False,
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the model and tokenizer from the latest training checkpoint.
model = AutoModelForCausalLM.from_pretrained(
    latest_checkpoint,
    device_map="auto",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    quantization_config=quantization_config
)
tokenizer = AutoTokenizer.from_pretrained(latest_checkpoint)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
# Try the fine-tuned model on a new question.
from transformers import set_seed, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextStreamer

# Fix the random seed so the generation below is repeatable.
set_seed(3407)

# Switch the model to inference mode.
model.eval()

# Render the prompt; the response slot stays empty so the model fills it in.
prompt = alpaca_prompt.format(
    "请用中文回答",  # instruction
    "怎么会有风油精这样的滴眼液 擦，眼睛要飘出来了",  # input
    "",  # output (initially empty)
)

# Tokenize and move the tensors to the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
inputs = tokenizer([prompt], return_tensors="pt").to(device)

# TextStreamer prints the generated text as it is produced.
text_streamer = TextStreamer(tokenizer)

# Generate without tracking gradients.
with torch.no_grad():
    _ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=256)

<bos>Below is an instruction that describes a task, paired with an input that provides further collaboration details.

### Instruction:
请用中文回答
### Input:
怎么会有风油精这样的滴眼液 擦，眼睛要飘出来了
### Response:
因为风油精是用于驱蚊、驱虫、驱tick等目的的，而滴眼液是用于治疗眼睛疾病的。所以，风油精的滴眼液不能用来擦眼睛，因为这会对眼睛造成伤害。<eos>
