diff --git a/examples/transformers/peft/lora/Qwen2.5-7B-Instruct-Lora.ipynb b/examples/transformers/peft/lora/Qwen2.5-7B-Instruct-Lora.ipynb
index 5ffed5b32..359f7e9d4 100644
--- a/examples/transformers/peft/lora/Qwen2.5-7B-Instruct-Lora.ipynb
+++ b/examples/transformers/peft/lora/Qwen2.5-7B-Instruct-Lora.ipynb
@@ -30,7 +30,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!wget https://openi.pcl.ac.cn/mindnlp/self-llm/raw/branch/master/dataset/huanhuan.json -O huanhuan.json --no-check-certificate"
+    "!wget \"https://gh-proxy.com/https://raw.githubusercontent.com/datawhalechina/self-llm/refs/heads/master/dataset/huanhuan.json\" -O huanhuan.json --no-check-certificate"
    ]
   },
   {
@@ -164,7 +164,7 @@
    "source": [
     "import torch\n",
     "\n",
-    "model = AutoModelForCausalLM.from_pretrained('Qwen/Qwen2.5-7B-Instruct', torch_dtype=torch.float16, device_map='auto')"
+    "model = AutoModelForCausalLM.from_pretrained('Qwen/Qwen2.5-7B-Instruct', torch_dtype=torch.float16, device_map=0)"
    ]
   },
   {
@@ -265,8 +265,8 @@
    "source": [
     "args = TrainingArguments(\n",
     "    output_dir=\"./output/Qwen2.5_instruct_lora\",\n",
-    "    per_device_train_batch_size=4,\n",
-    "    gradient_accumulation_steps=4,\n",
+    "    per_device_train_batch_size=3,\n",
+    "    gradient_accumulation_steps=5,\n",
     "    logging_steps=10,\n",
     "    num_train_epochs=3,\n",
     "    save_steps=100, \n",
@@ -336,7 +336,7 @@
     "from peft import PeftModel\n",
     "\n",
     "mode_path = 'Qwen/Qwen2.5-7B-Instruct'\n",
-    "lora_path = './output/Qwen2.5_instruct_lora/checkpoint-702' # 这里改称你的 lora 输出对应 checkpoint 地址\n",
+    "lora_path = './output/Qwen2.5_instruct_lora/checkpoint-747' # 这里改成你的 lora 输出对应 checkpoint 地址\n",
     "\n",
     "# 加载tokenizer\n",
     "tokenizer = AutoTokenizer.from_pretrained(mode_path, trust_remote_code=True)\n",
diff --git a/examples/transformers/peft/lora/Qwen2.5-7B-Instruct-Lora.py b/examples/transformers/peft/lora/Qwen2.5-7B-Instruct-Lora.py
index a3d53f09f..614aaff41 100644
--- a/examples/transformers/peft/lora/Qwen2.5-7B-Instruct-Lora.py
+++ b/examples/transformers/peft/lora/Qwen2.5-7B-Instruct-Lora.py
@@ -5,13 +5,17 @@
 import mindnlp
 import mindspore
 
-# mindspore.set_context(pynative_synchronize=True)
 from datasets import Dataset
 import pandas as pd
 from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer, GenerationConfig
 
+# Download huanhuan.json into the working directory before running this script (shell command):
+"""
+wget "https://gh-proxy.com/https://raw.githubusercontent.com/datawhalechina/self-llm/refs/heads/master/dataset/huanhuan.json"
+"""
+
 # 将JSON文件转换为CSV文件
-df = pd.read_json('/home/lvyufeng/lvyufeng/mindnlp/examples/transformers/peft/lora/huanhuan.json')
+df = pd.read_json('./huanhuan.json')
 ds = Dataset.from_pandas(df)
 
 # 处理数据集
@@ -60,8 +64,8 @@ def process_func(example):
 # 配置训练参数
 args = TrainingArguments(
     output_dir="./output/Qwen2.5_instruct_lora",
-    per_device_train_batch_size=4,
-    gradient_accumulation_steps=4,
+    per_device_train_batch_size=3,
+    gradient_accumulation_steps=5,
     logging_steps=10,
     num_train_epochs=3,
     save_steps=100, 
@@ -87,7 +91,7 @@ def process_func(example):
 from peft import PeftModel
 
 mode_path = 'Qwen/Qwen2.5-7B-Instruct'
-lora_path = './output/Qwen2.5_instruct_lora/checkpoint-702' # 这里改称你的 lora 输出对应 checkpoint 地址
+lora_path = './output/Qwen2.5_instruct_lora/checkpoint-747' # 这里改成你的 lora 输出对应 checkpoint 地址
 
 # 加载tokenizer
 tokenizer = AutoTokenizer.from_pretrained(mode_path, trust_remote_code=True)
diff --git a/mindnlp/core/distributed/device_mesh.py b/mindnlp/core/distributed/device_mesh.py
index 607c7182a..1361cbc94 100644
--- a/mindnlp/core/distributed/device_mesh.py
+++ b/mindnlp/core/distributed/device_mesh.py
@@ -8,8 +8,8 @@
 from typing import Dict, List, Optional, Tuple, TYPE_CHECKING, Union
 
 from mindnlp import core
-from core.distributed import is_available
-from core.utils._typing_utils import not_none
+from mindnlp.core.distributed import is_available
+from mindnlp.core.utils._typing_utils import not_none
 
 
 __all__ = ["init_device_mesh", "DeviceMesh"]
@@ -37,7 +37,7 @@ def _init_device_mesh_stub():
 
 else:
     from .c10d import Backend as C10dBackend
-    from core.distributed.distributed_c10d import (
+    from mindnlp.core.distributed.distributed_c10d import (
         _find_pg_by_ranks_and_tag,
         _get_default_group,
         _get_group_tag,
diff --git a/mindnlp/core/nn/functional.py b/mindnlp/core/nn/functional.py
index 88309432d..eaa23fad0 100644
--- a/mindnlp/core/nn/functional.py
+++ b/mindnlp/core/nn/functional.py
@@ -1197,9 +1197,9 @@ def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.
 
     attn_weight = query.float() @ key.transpose(-2, -1).float() * scale_factor
     attn_weight += attn_bias.float()
-    attn_weight = softmax(attn_weight, dim=-1)
+    attn_weight = softmax(attn_weight, dim=-1, dtype=core.float32).to(query.dtype)
     attn_weight = dropout(attn_weight, dropout_p, training=True)
-    return (attn_weight @ value.float()).to(query.dtype)
+    return attn_weight @ value
 
 
 def _mha_shape_check(query, key, value, key_padding_mask, attn_mask, num_heads):
diff --git a/mindnlp/core/npu/__init__.py b/mindnlp/core/npu/__init__.py
index 2d7af63ab..04a1715ca 100644
--- a/mindnlp/core/npu/__init__.py
+++ b/mindnlp/core/npu/__init__.py
@@ -16,6 +16,7 @@
 from mindnlp import core
 from mindnlp.core.executor import execute
 from ..configs import SUPPORT_BF16, ON_A1
+from . import random
 
 FloatTensor = core.FloatTensor
 HalfTensor = core.FloatTensor
diff --git a/setup.py b/setup.py
index 6c6b44b62..c4e2a8fe8 100644
--- a/setup.py
+++ b/setup.py
@@ -159,7 +159,9 @@ def run(self):
         'mindspore>=2.5.0',
         'tqdm',
         'requests',
+        'accelerate', # hf dependency
         'transformers>=4.55.0', # hf dependency
+        'peft', # hf dependency
         'datasets', # hf dependency
         'evaluate', # hf dependency
         'tokenizers', # hf dependency