# 输出显卡

In [4]:
import torch

print("CUDA available:", torch.cuda.is_available())
print("PyTorch version:", torch.__version__)
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
else:
    print("Running on CPU.")
print(f"Available GPUs: {torch.cuda.device_count()}")
print(f"Current GPU: {torch.cuda.current_device()}")

CUDA available: True
PyTorch version: 2.5.1
GPU: NVIDIA GeForce RTX 2060
Available GPUs: 1
Current GPU: 0


# 输出模型结构

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

device_id = 0  # ← 修改这里选择GPU编号
device = torch.device(f"cuda:{device_id}" if torch.cuda.is_available() else "cpu")

model_path = r"F:\edged\cpm"

# 加载模型时指定设备
model = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)  # 将模型直接加载到指定设备
print(model)

MiniCPM3ForCausalLM(
  (model): MiniCPM3Model(
    (embed_tokens): Embedding(73448, 2560)
    (layers): ModuleList(
      (0-61): 62 x MiniCPMDecoderLayer(
        (self_attn): MiniCPMSdpaAttention(
          (q_a_proj): Linear(in_features=2560, out_features=768, bias=False)
          (q_a_layernorm): MiniCPMRMSNorm()
          (q_b_proj): Linear(in_features=768, out_features=3840, bias=False)
          (kv_a_proj_with_mqa): Linear(in_features=2560, out_features=288, bias=False)
          (kv_a_layernorm): MiniCPMRMSNorm()
          (kv_b_proj): Linear(in_features=256, out_features=5120, bias=False)
          (o_proj): Linear(in_features=2560, out_features=2560, bias=False)
          (rotary_emb): MiniCPMLongRoPE()
        )
        (mlp): MiniCPMMLP(
          (gate_proj): Linear(in_features=2560, out_features=6400, bias=False)
          (up_proj): Linear(in_features=2560, out_features=6400, bias=False)
          (down_proj): Linear(in_features=6400, out_features=2560, bias=False)
   

# 输出模型元信息和张量信息

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# 加载预训练模型和分词器
model_path = "/home/ztf/cpm"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
model.eval()

#  输出模型的元信息
print(model.config)
with torch.no_grad():
    for name, param in model.named_parameters():
        print(f"Layer: {name} | Shape Type: {param.shape}| Data Type: {param}")
        # if name==""
# if name == "transformer.ln_f.weight":
#     print(f"Layer: {name} | Shape Type: {param.shape}| Data Type: {param}")
# if name == "transformer.ln_f.bias":
#     print(f"Layer: {name} | Shape Type: {param.shape} | Data Type: {param}")
#             df = pd.DataFrame(param.numpy())
            # df.to_csv('output.csv', index=False)
        # print(f"Layer: {name} | Shape: {param.shape}")
    # for name, module in model.named_modules():
    #      print(f"Name: {name}, Module: {module}")

MiniCPM3Config {
  "_attn_implementation_autoset": true,
  "_name_or_path": "/home/ztf/cpm",
  "architectures": [
    "MiniCPM3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_minicpm.MiniCPM3Config",
    "AutoModel": "modeling_minicpm.MiniCPM3Model",
    "AutoModelForCausalLM": "modeling_minicpm.MiniCPM3ForCausalLM",
    "AutoModelForSeq2SeqLM": "modeling_minicpm.MiniCPM3ForCausalLM",
    "AutoModelForSequenceClassification": "modeling_minicpm.MiniCPM3ForSequenceClassification"
  },
  "bos_token_id": 1,
  "dim_model_base": 256,
  "eos_token_id": [
    2,
    73440
  ],
  "head_dim": 96,
  "hidden_act": "silu",
  "hidden_size": 2560,
  "initializer_range": 0.1,
  "intermediate_size": 6400,
  "kv_lora_rank": 256,
  "max_position_embeddings": 32768,
  "model_type": "minicpm3",
  "num_attention_heads": 40,
  "num_hidden_layers": 62,
  "num_key_value_heads": 40,
  "pretraining_tp": 1,
  "q_lora_rank": 768,
  "qk_nope_

# 输出每层打印信息

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch


def print_tensor_elements(tensor, label="Tensor", num_elements=5):
    if isinstance(tensor, torch.Tensor):
        elements = tensor.flatten()
        print(f"{label}: shape={tensor.shape}")
        print("First 5 elements:", elements[:num_elements].tolist())
        print(
            "Last 5 elements:",
            (
                elements[-num_elements:].tolist()
                if len(elements) >= num_elements
                else elements.tolist()
            ),
        )
    else:
        print(f"{label}: Not a Tensor")


def hook_fn(module, input, output):
    print(f"Layer: {module.__class__}")

    # 处理输入张量
    if input and isinstance(input[0], torch.Tensor):
        print_tensor_elements(input[0], label="Input")
    else:
        print("error")
    # 处理输出张量
    if input and isinstance(output[0], torch.Tensor):
        print_tensor_elements(output[0], label="Input")
    else:
        print("error")

    print("-" * 50)


# 选择设备
device_id = 0  # ← 修改这里选择GPU编号
device = torch.device(f"cuda:{device_id}" if torch.cuda.is_available() else "cpu")
# 加载预训练模型和分词器
model_path = "/home/ztf/cpm"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
model.eval()
# 遍历所有子模块并注册钩子
hooks = []
for name, module in model.named_modules():
    if not isinstance(module, (torch.nn.ModuleList, torch.nn.Sequential)):
        hooks.append(module.register_forward_hook(hook_fn))
        # if name == 'transformer.ln_f':
        #     hooks.append(module.register_forward_hook(hook_fn))

# 手动构建张量并进行推理
inputs = "Once upon a time,"
generated_tokens = torch.tensor([[59422]])
# 将文本转换为模型输入
input_ids = tokenizer(inputs, return_tensors="pt").input_ids

# 使用模型进行推理
with torch.no_grad():  # 确保推理过程中不计算梯度以节省内存
    outputs = model.generate(input_ids , max_length=7, do_sample=True)
print(outputs)
for i in range(outputs.shape[0]):  # 遍历所有生成的序列
    print(tokenizer.decode(outputs[i], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
`get_max_cache()` is deprecated for all Cache classes. Use `get_max_cache_shape()` instead. Calling `get_max_cache()` will raise error from v4.48


Layer: <class 'torch.nn.modules.sparse.Embedding'>
Input: shape=torch.Size([1, 6])
First 5 elements: [1, 11152, 6138, 1348, 1817]
Last 5 elements: [11152, 6138, 1348, 1817, 59342]
Input: shape=torch.Size([6, 2560])
First 5 elements: [0.00185394287109375, 0.046875, -0.0030517578125, -0.013671875, -0.00518798828125]
Last 5 elements: [-0.007110595703125, -0.0791015625, 0.083984375, -0.0281982421875, -0.05908203125]
--------------------------------------------------
Layer: <class 'transformers_modules.cpm.modeling_minicpm.MiniCPMRMSNorm'>
Input: shape=torch.Size([1, 6, 2560])
First 5 elements: [0.022247314453125, 0.5625, -0.03662109375, -0.1640625, -0.062255859375]
Last 5 elements: [-0.0853271484375, -0.94921875, 1.0078125, -0.33837890625, -0.708984375]
Input: shape=torch.Size([6, 2560])
First 5 elements: [0.05698112025856972, 1.440707802772522, -0.09379608184099197, -0.4202064275741577, -0.15945333242416382]
Last 5 elements: [-0.16926905512809753, -1.88302743434906, 1.9992636442184448, -0

# rope_ext

In [12]:

import math
import torch
from torch import nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dtype = torch.float32
max_seq_len_cached = 1
dim = 32  #  qk_rope_head_dim
rope_theta = 10000.0  # rope_theta
max_position_embeddings = 32768

long_factor = torch.tensor(
    [
        1.0591234137867171,
        1.1241891283591912,
        1.2596935748670968,
        1.5380380402321725,
        2.093982484148734,
        3.1446935121267696,
        4.937952647693647,
        7.524541999994549,
        10.475458000005451,
        13.062047352306353,
        14.85530648787323,
        15.906017515851266,
        16.461961959767827,
        16.740306425132907,
        16.87581087164081,
        16.940876586213285,
    ]
)
original_max_position_embeddings = 32768
short_factor = torch.tensor(
    [
        1.0591234137867171,
        1.1241891283591912,
        1.2596935748670968,
        1.5380380402321725,
        2.093982484148734,
        3.1446935121267696,
        4.937952647693647,
        7.524541999994549,
        10.475458000005451,
        13.062047352306353,
        14.85530648787323,
        15.906017515851266,
        16.461961959767827,
        16.740306425132907,
        16.87581087164081,
        16.940876586213285,
    ]
)


class MiniCPMRotaryEmbedding(nn.Module):
    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        super().__init__()

        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        inv_freq = 1.0 / (
            self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)
        )
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        # Build here to make `torch.jit.trace` work.
        self._set_cos_sin_cache(
            # seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
            seq_len=max_position_embeddings,
            device=self.inv_freq.device,
            dtype=torch.float32,
        )
        #   let t=(theta as f32).powf(k as f32 / dh as f32).recip()*factor.recip()*self.val() as f32;

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        self.max_seq_len_cached = seq_len
        t = torch.arange(
            self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
        )
        freqs = torch.outer(t, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)

    def forward(self, x, seq_len=None):
        # x: [bs, num_attention_heads, seq_len, head_size]
        if seq_len > self.max_seq_len_cached:
            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)

        return (
            # TODO 一次计算完之后按位置取
            self.cos_cached[:seq_len].to(dtype=x.dtype),
            self.sin_cached[:seq_len].to(dtype=x.dtype),
        )


class MiniCPMLongRoPE(MiniCPMRotaryEmbedding):
    """MiniCPMRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""

    def __init__(
        self,
        dim,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        short_factor=None,
        long_factor=None,
        original_max_position_embeddings=None,
    ):
        self.short_factor = short_factor
        self.long_factor = long_factor
        self.original_max_position_embeddings = original_max_position_embeddings
        scale = max_position_embeddings / self.original_max_position_embeddings
        self.scaling_factor = math.sqrt(
            1 + math.log(scale) / math.log(self.original_max_position_embeddings)
        )
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        self.max_seq_len_cached = seq_len
        t = torch.arange(
            self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
        )
        if seq_len > self.original_max_position_embeddings:
            ext_factors = torch.tensor(
                self.long_factor, dtype=torch.float32, device=device
            )
        else:
            ext_factors = torch.tensor(
                self.short_factor, dtype=torch.float32, device=device
            )
        print(torch.outer(t, 1.0 / ext_factors).to(device=device).shape)
        print(self.inv_freq.shape)
        freqs = torch.mul(
            torch.outer(t, 1.0 / ext_factors).to(device=device),
            self.inv_freq.to(device=device).to(dtype),
        ) 
        emb = torch.cat((freqs, freqs), dim=-1)
        print(emb.sin().to(dtype) * self.scaling_factor)
        self.register_buffer(
            "cos_cached", emb.cos().to(dtype) * self.scaling_factor, persistent=False
        )
        self.register_buffer(
            "sin_cached", emb.sin().to(dtype) * self.scaling_factor, persistent=False
        )


model=MiniCPMLongRoPE(
    dim,
    max_position_embeddings=max_position_embeddings,
    short_factor=short_factor,
    long_factor=long_factor,
    base=rope_theta,
    original_max_position_embeddings=max_position_embeddings,
)
model.forward(long_factor,1)

torch.Size([32768, 16])
torch.Size([16])
tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 8.1001e-01,  4.7962e-01,  2.4841e-01,  ...,  3.3592e-05,
          1.8739e-05,  1.0497e-05],
        [ 9.5000e-01,  8.4171e-01,  4.8124e-01,  ...,  6.7184e-05,
          3.7477e-05,  2.0994e-05],
        ...,
        [-6.4275e-01, -2.5325e-03,  4.6815e-01,  ...,  8.9150e-01,
          5.7612e-01,  3.3719e-01],
        [-9.9751e-01, -4.8165e-01,  6.7293e-01,  ...,  8.9151e-01,
          5.7613e-01,  3.3720e-01],
        [-5.2681e-01, -8.4389e-01,  8.3556e-01,  ...,  8.9153e-01,
          5.7615e-01,  3.3721e-01]])


  ext_factors = torch.tensor(


(tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]),
 tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0.]]))

# 输出模型指定

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch


def print_tensor_elements(tensor, label="Tensor", num_elements=5):
    if isinstance(tensor, torch.Tensor):
        elements = tensor.flatten()
        print(f"{label}: shape={tensor.shape}")
        print("First 5 elements:", elements[:num_elements].tolist())
        print(
            "Last 5 elements:",
            (
                elements[-num_elements:].tolist()
                if len(elements) >= num_elements
                else elements.tolist()
            ),
        )
    else:
        print(f"{label}: Not a Tensor")


def hook_fn(module, input, output):
    # 处理输入张量
    if input and isinstance(input[0], torch.Tensor):
        print_tensor_elements(input[0], label="Input")
    else:
        print("error")
    # 处理输出张量
    if input and isinstance(output[0], torch.Tensor):
        print_tensor_elements(output[0], label="Input")
    else:
        print("error")

    print("-" * 50)


# 选择设备
device_id = 0  # ← 修改这里选择GPU编号
device = torch.device(f"cuda:{device_id}" if torch.cuda.is_available() else "cpu")
# 加载预训练模型和分词器
model_path = "/home/ztf/cpm"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
model.eval()
# 遍历所有子模块并注册钩子
hooks = []
for name, module in model.named_modules():
    layer_name=module.__class__.__name__
    match layer_name:
        case "MiniCPMLongRoPE":
            hooks.append(module.register_forward_hook(hook_fn))
        case "MiniCPMSdpaAttention":
            print(module.__class__)

# # 手动构建张量并进行推理
# inputs = "Once upon a time,"
# generated_tokens = torch.tensor([[59422]])
# # 将文本转换为模型输入
# input_ids = tokenizer(inputs, return_tensors="pt").input_ids

# # 使用模型进行推理
# with torch.no_grad():  # 确保推理过程中不计算梯度以节省内存
#     outputs = model.generate(generated_tokens, max_length=2, do_sample=True)
# print(outputs)
# for i in range(outputs.shape[0]):  # 遍历所有生成的序列
#     print(tokenizer.decode(outputs[i], skip_special_tokens=True))

<class 'transformers_modules.cpm.modeling_minicpm.MiniCPMSdpaAttention'>
<class 'transformers_modules.cpm.modeling_minicpm.MiniCPMSdpaAttention'>
<class 'transformers_modules.cpm.modeling_minicpm.MiniCPMSdpaAttention'>
<class 'transformers_modules.cpm.modeling_minicpm.MiniCPMSdpaAttention'>
<class 'transformers_modules.cpm.modeling_minicpm.MiniCPMSdpaAttention'>
<class 'transformers_modules.cpm.modeling_minicpm.MiniCPMSdpaAttention'>
<class 'transformers_modules.cpm.modeling_minicpm.MiniCPMSdpaAttention'>
<class 'transformers_modules.cpm.modeling_minicpm.MiniCPMSdpaAttention'>
<class 'transformers_modules.cpm.modeling_minicpm.MiniCPMSdpaAttention'>
<class 'transformers_modules.cpm.modeling_minicpm.MiniCPMSdpaAttention'>
<class 'transformers_modules.cpm.modeling_minicpm.MiniCPMSdpaAttention'>
<class 'transformers_modules.cpm.modeling_minicpm.MiniCPMSdpaAttention'>
<class 'transformers_modules.cpm.modeling_minicpm.MiniCPMSdpaAttention'>
<class 'transformers_modules.cpm.modeling_minicpm.M