In [None]:
# Show the kernel's current working directory; the relative paths used in this
# notebook (data/, ./results, ./tinychat_lora) are resolved against it.
%pwd

In [None]:
!pip install -r requirements.txt

In [None]:
import torch
# Report whether CUDA is usable and, if it is, the name of device 0.
has_gpu = torch.cuda.is_available()
print("当前是否有GPU可用：", has_gpu)
if has_gpu:
    gpu_name = torch.cuda.get_device_name(0)
else:
    gpu_name = "无"
print("GPU名称：", gpu_name)


### 数据转换：把 "BelleGroup/train_0.5M_CN" 的数据格式转换成与 Qwen 一致的 messages 对话格式

In [None]:
import json
from datasets import load_dataset
from tqdm import tqdm


# Destination JSONL file for the converted chat-format samples.
output_file="data/converted_belle.jsonl"
# Load the "train" split of the BelleGroup/train_0.5M_CN dataset.
# NOTE(review): trust_remote_code=True permits code shipped with the dataset
# repository to run locally — confirm it is actually required for this dataset.
dataset = load_dataset("BelleGroup/train_0.5M_CN", trust_remote_code=True, split="train")

In [None]:
# Convert every BELLE sample into the {"messages": [...]} chat format and write
# one JSON object per line to `output_file`.
from pathlib import Path

# open(..., "w") does not create missing parent directories, so make sure the
# target folder exists before writing.
Path(output_file).parent.mkdir(parents=True, exist_ok=True)

with open(output_file, "w", encoding="utf-8") as fout:
    for example in tqdm(dataset):
        # `or ""` also covers fields that are present but None; the default of
        # .get() only applies when the key is missing altogether.
        instruction = (example.get("instruction") or "").strip()
        output = (example.get("output") or "").strip()

        # Skip samples in which either side of the exchange is empty.
        if not instruction or not output:
            continue

        messages = [
            {"role": "user", "content": instruction},
            {"role": "assistant", "content": output}
        ]
        # ensure_ascii=False keeps the Chinese text readable instead of \u-escaped.
        fout.write(json.dumps({"messages": messages}, ensure_ascii=False) + "\n")

In [None]:
# Print the first 10 converted records to sanity-check the output format.
!head -n 10 data/converted_belle.jsonl

### LoRA微调

##### 1. 选择模型并下载：Qwen1.5-0.5B-Chat
体积小（1.9GB），支持中文，结构和 Qwen 系列保持一致，非常适合 LoRA 微调。

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
# Identifier of the chat model that is fine-tuned in this notebook.
model_name = "Qwen/Qwen1.5-0.5B-Chat"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# device_map="auto" and torch_dtype="auto" leave device placement and the
# weight dtype to the library.
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", trust_remote_code=True, torch_dtype="auto")

##### 2. 应用 LoRA

In [None]:
from peft import get_peft_model, LoraConfig, TaskType

# LoRA adapter configuration, applied to the loaded model right below.
lora_config = LoraConfig(
    r=8,                             # rank of the low-rank update matrices
    lora_alpha=16,                  # scaling factor (the update is scaled by alpha / r)
    # NOTE(review): "c_attn" is presumably a leftover from an architecture with a
    # fused attention projection — confirm whether any module of this model
    # actually carries that name.
    target_modules=["c_attn", "q_proj", "v_proj"],  # names of the layers that receive LoRA adapters
    lora_dropout=0.05,              # dropout used during training
    bias="none",                    # do not train bias terms
    task_type=TaskType.CAUSAL_LM    # task type: causal language modelling
)
# Rebinds `model` to the PEFT-wrapped model.
model = get_peft_model(model, lora_config)


##### 3. 使用 transformers 的 Trainer API 来训练加了 LoRA 的模型，先加载数据集

In [None]:
from datasets import load_dataset

# Reload the converted JSONL file as a dataset; split='train' selects the single
# split produced by the "json" loader for one data file.
dataset = load_dataset("json", data_files="data/converted_belle.jsonl", split='train')

##### 4. 构建模板：将 instruction + output 合并为 prompt 格式

In [None]:
def format_example(example):
    """Render the first user → assistant exchange of a sample as training text.

    Args:
        example: Mapping with a "messages" list of {"role", "content"} dicts.

    Returns:
        A dict with a single "text" entry in the "### 指令 / ### 回答" layout.
        Both slots stay empty when no user message is directly followed by an
        assistant message.
    """
    messages = example["messages"]
    question, reply = "", ""
    # Walk over adjacent message pairs and stop at the first user/assistant pair.
    for current, following in zip(messages, messages[1:]):
        if current["role"] == "user" and following["role"] == "assistant":
            question, reply = current["content"], following["content"]
            break
    return {"text": f"### 指令：\n{question}\n\n### 回答：\n{reply}"}


# Run format_example over every sample; its "text" result is added as a column.
dataset = dataset.map(format_example)

In [None]:
# Show the first three rendered samples, each followed by a separator line.
separator = "=" * 50
for sample_idx in range(3):
    print(dataset[sample_idx]["text"])
    print(separator)


##### 5. 对belle数据集分词

In [None]:
def tokenize(example):
    """Tokenize the rendered "text" field and build causal-LM labels.

    Works both for a single sample and for a batch
    (``dataset.map(..., batched=True)``).

    Args:
        example: Mapping with a "text" entry — one string, or a list of strings
            in batched mode.

    Returns:
        The tokenizer output, padded or truncated to 512 tokens, plus
        ``labels``: a copy of ``input_ids`` in which every padding position is
        replaced by -100.
    """
    texts = example["text"]
    batched = not isinstance(texts, str)

    # End every sample with the EOS token: once padding no longer contributes
    # to the loss, this is what teaches the model where an answer stops.
    eos = tokenizer.eos_token or ""
    texts = [text + eos for text in texts] if batched else texts + eos

    result = tokenizer(
        texts,
        max_length=512,
        truncation=True,
        padding="max_length",  # or "longest"; a uniform length is recommended for training
    )

    def _mask_padding(ids, mask):
        # -100 is the label value ignored by the loss, so padding positions
        # (attention_mask == 0) never become training targets.
        return [token if keep else -100 for token, keep in zip(ids, mask)]

    if batched:
        result["labels"] = [
            _mask_padding(ids, mask)
            for ids, mask in zip(result["input_ids"], result["attention_mask"])
        ]
    else:
        result["labels"] = _mask_padding(result["input_ids"], result["attention_mask"])
    return result


# Tokenize in batches and drop the original columns so only model inputs remain.
tokenized_dataset = dataset.map(tokenize, batched=True, remove_columns=dataset.column_names)


##### 6. 使用 Trainer 开始训练， LoRA微调

模型：Qwen1.5-0.5B-Chat（约 5 亿参数）。

数据集：Belle 0.5M 中文，约 50 万条样本。

In [None]:
from transformers import TrainingArguments, Trainer
import torch

# Training configuration.
# max_steps takes precedence over num_train_epochs, so the run stops after 3000
# optimizer steps (3000 * 16 = 48k samples).
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,                    # superseded by max_steps below
    per_device_train_batch_size=4,         # raising it allows fewer accumulation steps and more efficient steps
    gradient_accumulation_steps=4,         # equivalent to an effective batch size of 16
    max_steps=3000,
    save_strategy="steps",                 # checkpoint every `save_steps`; with "epoch" save_steps was ignored
    save_steps=500,
    save_total_limit=3,                    # keep only the three most recent checkpoints
    logging_steps=50,
    logging_dir="./logs",
    learning_rate=2e-4,
    report_to="none",                      # do not push metrics to wandb or similar platforms
    fp16=True,                             # mixed-precision training
)

# Debugging aid: train on the first 1000 samples only.
# small_dataset = tokenized_dataset.select(range(1000))

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)
trainer.train()
# To continue an interrupted run from the latest checkpoint in output_dir instead:
# trainer.train(resume_from_checkpoint=True)




In [None]:
# Report current GPU utilisation and memory usage.
!nvidia-smi

### 从 Checkpoint 恢复验证

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer
from peft import PeftModel

# 加载 base model
# Load the base model
base_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen1.5-0.5B-Chat", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B-Chat", trust_remote_code=True)

# Load the last checkpoint (assumed to be step 3000, the max_steps of the training run)
checkpoint_path = "./results/checkpoint-3000"
model = PeftModel.from_pretrained(base_model, checkpoint_path)
model.eval()

from transformers import TextStreamer

# The streamer prints decoded text to stdout while generate() is running.
streamer = TextStreamer(tokenizer)

# The prompt follows the "### 指令 / ### 回答" layout used for the training text.
prompt = "### 指令：\n把这句话翻译成中文：I love learning new things every day.\n\n### 回答："
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Sampled generation (top_p=0.9, temperature=0.7), capped at 100 new tokens.
outputs = model.generate(
    **inputs,
    max_new_tokens=100,
    do_sample=True,
    top_p=0.9,
    temperature=0.7,
    streamer=streamer
)


In [None]:
# Second probe with the same template; relies on `model`, `tokenizer` and
# `streamer` created in the previous cell.
prompt = "### 指令：\n我是谁\n\n### 回答："
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Same sampling settings as the previous generation.
outputs = model.generate(
    **inputs,
    max_new_tokens=100,
    do_sample=True,
    top_p=0.9,
    temperature=0.7,
    streamer=streamer
)

In [None]:
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Device on which the tokenized prompts are placed.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

base_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen1.5-0.5B-Chat", device_map="auto")
base_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B-Chat")

# NOTE(review): PeftModel.from_pretrained is understood to inject the adapter
# layers into the module passed in rather than copying it — confirm; if so,
# `base_model` is no longer an untouched baseline after this line.
lora_model = PeftModel.from_pretrained(base_model, "results/checkpoint-3000")
tokenizer = base_tokenizer  # the tokenizer is the same for both models

def compare_response(prompt):
    """Print the answers of the base model and of the LoRA-tuned model for one prompt.

    Args:
        prompt: Text fed verbatim to both models.

    Both generations produce at most 100 new tokens. The decoded sequences are
    printed and nothing is returned.
    """
    # Follow the model's actual placement instead of assuming "cuda if available".
    inputs = tokenizer(prompt, return_tensors="pt").to(lora_model.device)

    # Base model: the LoRA layers were injected into `base_model` itself when
    # the adapter was loaded, so calling it directly would run the adapter too.
    # Switch the adapter off for the duration of the baseline generation.
    with lora_model.disable_adapter():
        base_out = base_model.generate(**inputs, max_new_tokens=100)
    base_text = tokenizer.decode(base_out[0], skip_special_tokens=True)

    # LoRA fine-tuned model
    lora_out = lora_model.generate(**inputs, max_new_tokens=100)
    lora_text = tokenizer.decode(lora_out[0], skip_special_tokens=True)

    print("\nPrompt:", prompt)
    print("=== 原始模型输出 ===")
    print(base_text)
    print("=== LoRA 微调输出 ===")
    print(lora_text)

# Evaluation prompts: a mix of templated ("### 指令 / ### 回答") and free-form inputs.
prompts = [
    "### 指令：\n将下面这句话翻译成中文：\nLearning a new language can be fun and rewarding.### 回答：",
    "请将下面这句话翻译成中文：\nThe quick brown fox jumps over the lazy dog.",
    "翻成中文：She sells seashells by the seashore.",
    "### 指令：\n翻译：When life gives you lemons, make lemonade.\n\n### 回答：\n",
    "### 指令：\n请解释这句话的含义：\n“Knowledge is power.”\n### 回答：\n",
]

# Run the side-by-side comparison for each prompt in order.
for prompt_text in prompts:
    compare_response(prompt_text)


# 轻量版 RAG 推理流程

##### 1. 加载医疗问答库

In [None]:
import json

# Build the retrieval corpus: one {"question", "answer"} record for every
# assistant turn, paired with the most recent user turn of the same record.
corpus = []
with open("data/medical.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        # Tolerate blank lines (for example a trailing newline at end of file).
        if not line.strip():
            continue
        item = json.loads(line)
        # Reset per record so an answer is never paired with a question left
        # over from a previous record, and never hits an undefined name.
        question = None
        for convo in item["conversations"]:
            if convo["role"] == "user":
                question = convo["content"]
            elif convo["role"] == "assistant" and question is not None:
                corpus.append({"question": question, "answer": convo["content"]})


##### 2. 建立简单的向量索引

In [None]:
!pip install -U sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Only the questions are embedded; answers are looked up by index afterwards.
questions = [item["question"] for item in corpus]
# Load a lighter-weight model
embedder = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2" , trust_remote_code=True)  # small (~200M) multilingual model that can run on CPU
# Encode in batches and convert to numpy to keep CPU memory in check
corpus_embeddings = embedder.encode(
    questions,
    batch_size=32,                # tune up or down depending on available memory
    convert_to_numpy=True,       # avoid piling up GPU / PyTorch tensors
    show_progress_bar=True
)

# Optional: persist the vectors (for RAG or later reuse)
np.save("corpus_embeddings.npy", corpus_embeddings)


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def search_similar(query, top_k=5):
    """Print the `top_k` corpus entries whose question is most similar to `query`.

    Similarity is the cosine similarity between the embedding of `query` and
    the pre-computed `corpus_embeddings`. Matches are printed from most to
    least similar and nothing is returned.
    """
    query_vec = embedder.encode([query], convert_to_numpy=True)
    scores = cosine_similarity(query_vec, corpus_embeddings)[0]
    # argsort is ascending: the last `top_k` positions are the best matches,
    # reversed so the strongest one comes first.
    ranked = scores.argsort()
    best_first = ranked[-top_k:][::-1]

    for hit in best_first:
        entry = corpus[hit]
        print(f"[score: {scores[hit]:.4f}] Q: {entry['question']}\nA: {entry['answer']}\n")


# Smoke test with a sample medical question.
search_similar("孕期如何控制血糖？")

##### 3. 构造 RAG Prompt 并调用 LoRA 模型生成回答

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch

# 原始 Qwen 模型路径
# Path of the original Qwen model
base_model_path = "Qwen/Qwen1.5-0.5B-Chat"

# Path of the LoRA fine-tuned checkpoint
lora_model_path = "./results/checkpoint-3000"

# Load the base model (fp16 when CUDA is available, fp32 otherwise)
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
    trust_remote_code=True
)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_path, trust_remote_code=True)

# Load the LoRA adapter and merge it
model = PeftModel.from_pretrained(base_model, lora_model_path)
model = model.merge_and_unload()  # merge the LoRA weights into the base weights
model.eval()

# Save the merged model together with the tokenizer
save_path = "./tinychat_lora"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
import torch

# Reload the merged model from ./tinychat_lora (fp16 when CUDA is available,
# fp32 otherwise).
model = AutoModelForCausalLM.from_pretrained(
    "./tinychat_lora",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto"
)
# NOTE(review): the tokenizer is loaded by its Hub identifier rather than from
# ./tinychat_lora — presumably equivalent; confirm.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B-Chat", trust_remote_code=True)

def retrieve_context(query, top_k=3):
    """Return the `top_k` corpus records most similar to `query`, best match first.

    Similarity is the cosine similarity between the embedding of `query` and
    the pre-computed `corpus_embeddings`.
    """
    query_tensor = embedder.encode(query, convert_to_tensor=True)
    # cosine_similarity works on 2-D arrays: move to CPU and reshape to one row.
    query_row = query_tensor.cpu().numpy().reshape(1, -1)
    scores = cosine_similarity(query_row, corpus_embeddings)[0]
    descending = np.argsort(scores)[::-1]
    hits = []
    for position in descending[:top_k]:
        hits.append(corpus[position])
    return hits

In [None]:
def rag_ask(query):
    """Answer `query` with retrieval-augmented generation using a plain-text prompt.

    The most similar known Q&A pairs are fetched with `retrieve_context` and
    embedded in an instruction prompt. The model's answer is streamed to
    stdout while it is generated and nothing is returned.
    """
    context = retrieve_context(query)
    context_text = "\n\n".join([f"问：{item['question']}\n答：{item['answer']}" for item in context])

    prompt = f"""你是一个医疗问答助手，请根据以下已知问答来回答用户的新问题。
### 已知问答：
{context_text}
### 用户问题：
{query}
### 回答："""

    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(model.device)
    attention_mask = inputs["attention_mask"].to(model.device)
    # skip_prompt: show only the newly generated answer instead of echoing the
    # whole retrieval prompt; skip_special_tokens: hide special tokens.
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # The streamer already prints the answer as it is produced, so the output is
    # not decoded and printed a second time afterwards.
    model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=256,
        do_sample=True,          # sampling mode, used together with top_p
        top_p=0.8,
        temperature=0.7,
        streamer=streamer
    )


rag_ask("特异性皮炎能根治吗？")  # ChatGPT-generated version; has some issues


In [None]:
def rag_ask_ds(query):
    """Answer `query` with retrieval-augmented generation using the chat template.

    The most similar known Q&A pairs are fetched with `retrieve_context` and
    passed as "known information" in a system + user conversation rendered
    with the tokenizer's chat template. The answer is streamed to stdout while
    it is generated and nothing is returned.
    """
    context = retrieve_context(query)
    context_text = "\n".join([f"问：{item['question']}\n答：{item['answer']}" for item in context])

    prompt = [
        {"role": "system", "content": "你是一个医疗问答助手，请根据已知信息回答问题"},
        {"role": "user", "content": f"已知信息：\n{context_text}\n\n问题：{query}"}
    ]

    # return_dict=True yields input_ids together with the attention mask, so
    # generate() receives an explicit mask and the code does not depend on the
    # template call returning a bare tensor.
    inputs = tokenizer.apply_chat_template(
        prompt,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)

    # skip_prompt: show only the newly generated answer; skip_special_tokens:
    # hide special tokens.
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # The streamer already prints the answer as it is produced, so the output is
    # not decoded and printed a second time afterwards.
    model.generate(
        **inputs,
        max_new_tokens=512,
        do_sample=True,
        top_p=0.9,
        temperature=0.3,
        streamer=streamer,
    )


rag_ask_ds("特异性皮炎能根治吗？")  # this variant works