In [None]:
!pip install peft accelerate bitsandbytes evaluate sentencepiece scipy transformers[deepspeed] 

In [None]:
import torch
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    TaskType,
    PeftModel
)
import json
import random
from accelerate import Accelerator

In [None]:
from kaggle_secrets import UserSecretsClient
import os
import wandb

# Fetch both API credentials from Kaggle Secrets. Each name must match the
# label under which the secret was stored.
user_secrets = UserSecretsClient()
wandb_key, hf_token = (
    user_secrets.get_secret(secret_name)
    for secret_name in ("WANDB_API_KEY", "HF_TOKEN")
)

# Export both values as environment variables.
os.environ.update({"WANDB_API_KEY": wandb_key, "HF_TOKEN": hf_token})

In [None]:
# Authenticate against Weights & Biases with the key read from the secrets
# store, then open the run that this notebook session reports to.
wandb.login(key=wandb_key)
run = wandb.init(project="Fine tuning Qwen 7B", job_type="training")

In [None]:
def load_corpus(file_path):
    """Load a JSON translation corpus and convert it to ChatML training samples.

    The JSON document is expected to hold a ``train`` and a ``validation``
    entry. Each entry is a sequence of items shaped ``[source, target]`` or
    ``[source, target, second_target, ...]``. The first target is used for a
    Simplified Chinese sample; when a second target is present it is used for
    an additional Traditional Chinese sample built from the same source.

    Args:
        file_path: Path of the UTF-8 encoded JSON corpus.

    Returns:
        A ``(train_data, val_data)`` tuple of lists. Every element is a dict
        with the keys ``systemprompt``, ``userprompt``, ``prompt`` and
        ``completion``. Training samples are shuffled within each language
        group; validation samples keep their file order.
    """

    def wrap_prompt(src, tgt, lang):
        """Render one source/target pair as ChatML text for target language ``lang``."""
        # Shared system instruction; only the target language name varies.
        system_prompt = f"""<|im_start|>system
你是专注于xxx领域的资深翻译专家，专门负责将英文文档精准翻译成 {lang}。

## 翻译原则：

### 1. 术语准确性
- 严格使用xxx行业标准术语
- 保持技术参数的精确性和专业性
- 遵循xxx领域的权威表达方式

### 2. 句式结构与技术逻辑忠实性
- 英文常见被动语态，在{lang}中需转换为符合习惯的**主动语态或自然表述**，避免生硬直译
- 对复杂英文长句，按{lang}习惯进行合理拆分或重组，确保**逻辑清晰、易于理解**，同时**不丢失任何细节**
- 逻确保技术逻辑连接词翻译准确，因果关系、条件关系明确

**请逐句审阅，严格遵循以上所有规范。**<|im_end|>"""

        # User turn followed by the opening of the assistant turn with an
        # empty <think> section.
        user_prompt = (
            "<|im_start|>user\n"
            f"### 请将以下文本准确翻译成{lang}：\n"
            f"{src}/no_think<|im_end|>\n"
            "<|im_start|>assistant\n"
            "<think>\n\n</think>\n\n"
        )

        sample = {}
        sample["systemprompt"] = system_prompt
        sample["userprompt"] = user_prompt
        sample["prompt"] = f"{system_prompt}\n{user_prompt}"
        # The expected answer, terminated by the end-of-turn marker.
        sample["completion"] = f"{tgt}<|im_end|>"
        return sample

    def process_pairs(pairs, shuffle=True):
        """Build samples for every pair, Simplified Chinese first, then Traditional."""
        simplified, traditional = [], []
        for source, primary, *extra in pairs:
            simplified.append(wrap_prompt(source, primary, "简体中文"))
            if extra:
                traditional.append(wrap_prompt(source, extra[0], "繁体中文"))

        # Shuffle inside each language group only, so the two groups stay
        # contiguous in the returned list.
        if shuffle:
            random.shuffle(simplified)
            random.shuffle(traditional)

        return simplified + traditional

    with open(file_path, encoding='utf-8') as handle:
        corpus = json.load(handle)

    return process_pairs(corpus['train']), process_pairs(corpus['validation'], False)

def create_datasets(train_data, val_data, tokenizer, accelerator, max_length=512):
    """Tokenize prompt/completion samples into fixed-length causal-LM features.

    For every sample the prompt and the completion are tokenized separately and
    their token ids are concatenated. ``input_ids``, ``attention_mask`` and
    ``labels`` are all derived from that single id list, so a label at position
    ``i`` always refers to the token at position ``i``. Prompt and padding
    positions are labelled ``-100`` so that only completion tokens contribute
    to the loss.

    The previous implementation built the labels from a prompt truncated to
    256 tokens while the model input was a re-tokenization of the concatenated
    text truncated to 512 tokens. For prompts longer than 256 tokens, or with a
    left-padding tokenizer, the labels no longer lined up with the inputs.

    Args:
        train_data: List of dicts carrying ``prompt`` and ``completion`` strings.
        val_data: Same structure, used for evaluation.
        tokenizer: Tokenizer used to encode the text. ``pad_token_id`` is used
            for padding, falling back to ``eos_token_id`` when it is unset.
        accelerator: ``Accelerator`` used to order the preprocessing across
            processes and for logging.
        max_length: Fixed length every sequence is truncated or right-padded to.

    Returns:
        ``(train_dataset, val_dataset)`` with the columns ``input_ids``,
        ``attention_mask`` and ``labels``. Samples whose prompt alone fills
        ``max_length`` are dropped, because they would not contain a single
        supervised token.
    """
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id

    def tokenize_function(examples):
        """Encode one batch of samples into aligned ids, mask and labels."""
        features = {"input_ids": [], "attention_mask": [], "labels": []}

        for prompt, completion in zip(examples["prompt"], examples["completion"]):
            # The ChatML markers are already part of the text, so the
            # tokenizer must not add special tokens of its own.
            prompt_ids = tokenizer(prompt, add_special_tokens=False)["input_ids"]
            completion_ids = tokenizer(completion, add_special_tokens=False)["input_ids"]

            # No completion token would survive truncation: nothing to learn
            # from, and an all -100 label row can yield an undefined loss.
            if len(prompt_ids) >= max_length or not completion_ids:
                continue

            # Truncate on the right; ids and labels are cut at the same index.
            input_ids = (prompt_ids + completion_ids)[:max_length]
            labels = ([-100] * len(prompt_ids) + completion_ids)[:max_length]

            # Explicit right padding keeps the layout independent of the
            # tokenizer's padding_side setting.
            pad_count = max_length - len(input_ids)
            features["input_ids"].append(input_ids + [pad_id] * pad_count)
            features["attention_mask"].append([1] * len(input_ids) + [0] * pad_count)
            features["labels"].append(labels + [-100] * pad_count)

        return features

    # Convert the plain Python lists into Hugging Face datasets.
    train_dataset = Dataset.from_list(train_data)
    val_dataset = Dataset.from_list(val_data)

    # Let the main process run the preprocessing before the other processes.
    with accelerator.main_process_first():
        train_dataset = train_dataset.map(
            tokenize_function,
            batched=True,
            num_proc=1,
            remove_columns=train_dataset.column_names,
            load_from_cache_file=False,
            desc="Processing training dataset",
        )

        val_dataset = val_dataset.map(
            tokenize_function,
            batched=True,
            num_proc=1,
            remove_columns=val_dataset.column_names,
            load_from_cache_file=False,
            desc="Processing validation dataset",
        )

    # Report dropped samples so that a shrinking dataset never goes unnoticed.
    dropped_train = len(train_data) - len(train_dataset)
    dropped_val = len(val_data) - len(val_dataset)
    if dropped_train or dropped_val:
        accelerator.print(
            f"已跳过没有可监督 token 的样本: 训练集 {dropped_train} 条, 验证集 {dropped_val} 条"
        )

    # Block until every process has finished its preprocessing.
    accelerator.wait_for_everyone()

    return train_dataset, val_dataset

In [None]:

def peft_config():
    """Build the LoRA adapter configuration for causal-LM fine-tuning.

    Returns:
        LoraConfig: Training-mode configuration with rank 64, alpha 128,
        dropout 0.05 and no bias training, applied to the ``q_proj``,
        ``v_proj``, ``gate_proj`` and ``down_proj`` modules.
    """
    # Modules that receive adapters. k_proj, o_proj and up_proj are
    # deliberately not adapted.
    adapted_modules = ["q_proj", "v_proj", "gate_proj", "down_proj"]

    lora_settings = dict(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=64,
        lora_alpha=128,
        lora_dropout=0.05,
        target_modules=adapted_modules,
        bias="none",
    )
    return LoraConfig(**lora_settings)

def deepspeed_config():
    """Return the DeepSpeed configuration dictionary used for training.

    The configuration enables fp16 with dynamic loss scaling (``loss_scale``
    of 0) and ZeRO stage 3 with both optimizer state and parameters offloaded
    to pinned CPU memory. Batch sizes are fixed: micro batch 1 per GPU,
    gradient accumulation 1, global batch 2.

    Returns:
        dict: A freshly built configuration on every call.
    """
    fp16_section = {
        "enabled": True,
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1,
    }

    def cpu_offload():
        # A new dict per use, so the two offload sections never share state.
        return {"device": "cpu", "pin_memory": True}

    zero_section = {
        "stage": 3,
        "offload_optimizer": cpu_offload(),
        "offload_param": cpu_offload(),
        "overlap_comm": True,
        "contiguous_gradients": True,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_gather_16bit_weights_on_model_save": True,
    }

    config = {"is_deepspeed_zero3_enabled": True}
    config["fp16"] = fp16_section
    config["zero_optimization"] = zero_section
    config["gradient_clipping"] = "auto"
    # Explicit batch settings instead of "auto".
    config["train_micro_batch_size_per_gpu"] = 1
    config["train_batch_size"] = 2
    config["gradient_accumulation_steps"] = 1
    return config

In [None]:
def train():
    """Fine-tune Qwen2.5-7B-Instruct with LoRA adapters through the HF Trainer.

    Stages, in order: build ``TrainingArguments`` with the DeepSpeed config,
    load the fp16 base model, attach the LoRA adapters (exactly once), load the
    tokenizer, build the datasets from ``./translation_dataset.json``, train,
    and save the result to ``model``. Checkpoints go to ``cpt``.

    Every stage logs its own failure message and re-raises. The outer handler
    then logs the full traceback, and the ``finally`` block releases the model
    and trainer references and clears cached GPU memory.

    Raises:
        Exception: Whatever a stage raised, re-raised unchanged after logging.
    """
    import gc
    import inspect
    import traceback

    # Created before the try block so the except/finally handlers can always
    # log through it. Previously a failure in the constructor surfaced as a
    # NameError raised from the handlers themselves.
    accelerator = Accelerator()
    model = None
    trainer = None

    try:
        accelerator.print("\n=== 初始化训练 ===")
        accelerator.print(f"进程数: {accelerator.num_processes}")
        accelerator.print(f"分布式环境: {accelerator.distributed_type}")

        model_name= "Qwen/Qwen2.5-7B-Instruct"

        # NOTE: 4-bit quantization is not applied. The model below is loaded
        # in plain fp16, so the BitsAndBytesConfig that used to be built (and
        # announced in the log) here was never used and has been removed.

        # Recent Transformers releases renamed `evaluation_strategy` to
        # `eval_strategy`; pick whichever name the installed version accepts.
        args_params = inspect.signature(TrainingArguments.__init__).parameters
        eval_strategy_key = (
            "eval_strategy" if "eval_strategy" in args_params else "evaluation_strategy"
        )

        output_dir = 'cpt'
        ds_config = deepspeed_config()
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=3,
            per_device_train_batch_size=1,
            per_device_eval_batch_size=1,
            eval_steps=100,
            save_strategy="steps",
            save_steps=100,
            save_total_limit=3,
            fp16=True,
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            greater_is_better=False,
            push_to_hub=False,
            optim="adamw_torch",
            # NOTE(review): the TrainingArguments documentation describes this
            # field as informational for user scripts. To actually resume,
            # pass `resume_from_checkpoint` to `trainer.train()` - confirm intent.
            resume_from_checkpoint=True,
            deepspeed=ds_config,
            local_rank=-1,
            **{eval_strategy_key: "steps"},
        )

        # Load the base model.
        accelerator.print("\n=== 加载模型 ===")
        try:
            import deepspeed

            with deepspeed.zero.Init():
                model = AutoModelForCausalLM.from_pretrained(
                    model_name,
                    trust_remote_code=True,
                    torch_dtype=torch.float16
                )
                accelerator.print("模型加载成功")
        except Exception as e:
            accelerator.print(f"模型加载失败: {str(e)}")
            raise

        # Attach the LoRA adapters. This is the only get_peft_model call: the
        # model used to be wrapped a second time right after loading, which
        # stacked another adapter wrapper on an already wrapped model.
        accelerator.print("\n=== 配置 LoRA ===")
        try:
            model = get_peft_model(model, peft_config())
            accelerator.print("LoRA 配置成功")
        except Exception as e:
            accelerator.print(f"LoRA 配置失败: {str(e)}")
            raise

        # Load the tokenizer.
        accelerator.print("\n=== 加载 Tokenizer ===")
        try:
            tokenizer = AutoTokenizer.from_pretrained(
                model_name,
                trust_remote_code=True
            )
            # Reuse the end-of-sequence token for padding.
            tokenizer.pad_token = tokenizer.eos_token
            accelerator.print("Tokenizer 加载成功")
        except Exception as e:
            accelerator.print(f"Tokenizer 加载失败: {str(e)}")
            raise

        # Load and tokenize the corpus.
        accelerator.print("\n=== 加载数据集 ===")
        try:
            train_data, val_data = load_corpus('./translation_dataset.json')
            train_dataset, val_dataset = create_datasets(train_data, val_data, tokenizer, accelerator)
        except Exception as e:
            accelerator.print(f"数据集加载失败: {str(e)}")
            raise

        # Build the trainer.
        accelerator.print("\n=== 配置训练器 ===")
        try:
            # Newer Trainer versions take the tokenizer as `processing_class`;
            # older ones only know `tokenizer`.
            trainer_params = inspect.signature(Trainer.__init__).parameters
            tokenizer_key = (
                "processing_class" if "processing_class" in trainer_params else "tokenizer"
            )
            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=val_dataset,
                **{tokenizer_key: tokenizer},
            )
            accelerator.print("训练器配置成功")
        except Exception as e:
            accelerator.print(f"训练器配置失败: {str(e)}")
            raise

        # Run training.
        accelerator.print("\n=== 开始训练 ===")
        try:
            trainer.train()
            accelerator.print("训练完成")
        except Exception as e:
            accelerator.print(f"训练过程出错: {str(e)}")
            raise

        # Save the final model.
        accelerator.print("\n=== 保存模型 ===")
        try:
            output_dir="model"
            trainer.save_model(output_dir)
            accelerator.print("模型保存成功")
        except Exception as e:
            accelerator.print(f"模型保存失败: {str(e)}")
            raise

    except Exception as e:
        accelerator.print(f"\n=== 训练异常 ===\n{str(e)}")
        # Log the complete stack trace before propagating the error.
        accelerator.print(traceback.format_exc())
        raise
    finally:
        # Release resources.
        accelerator.print("\n=== 清理资源 ===")
        model = None
        trainer = None
        # Collect unreachable objects first so the memory they hold can be
        # returned when the CUDA cache is emptied.
        gc.collect()
        torch.cuda.empty_cache()
        accelerator.print("资源清理完成")

In [None]:
from accelerate import notebook_launcher

# Show whether CUDA has already been initialised in the notebook process
# before the workers are launched.
print(torch.cuda.is_initialized())

# Launch `train` in two processes (intended for two GPUs). It takes no
# arguments, hence the empty tuple. Mixed precision is not requested here.
notebook_launcher(train, args=(), num_processes=2)