In [1]:
import os
# Restrict the CUDA devices visible to this process to index 7.
os.environ["CUDA_VISIBLE_DEVICES"] = "7"
# Model configuration
from transformers import AutoConfig

hidden_size = 1024
# Disabled alternative: 8/3 x hidden size, bumped up to the next multiple of 128.
# intermediate_size = (int(hidden_size * 8/3 / 128) + 1) * 128
# Active choice: plain 4 x hidden size.
intermediate_size = 4 * hidden_size

# Override only the fields we want to tune; every other field keeps the
# default of the 'qwen2' config class.
config = AutoConfig.for_model(
    'qwen2',
    num_hidden_layers=48,
    hidden_size=hidden_size,
    intermediate_size=intermediate_size,
    num_attention_heads=16,
    num_key_value_heads=8,  # key/value heads are split into 8 groups
)
print(config)

# NOTE(review): the string below is a LlamaConfig printout (model_type 'llama',
# hidden_size 256, 4 layers) and does not describe the qwen2 config built above.
# It is a bare expression at the end of the cell, so the notebook echoes it as
# the cell's Out value.
'''
LlamaConfig {
  'attention_bias': false,                 # 不使用注意力偏置
  'attention_dropout': 0.0,                # 注意力层的 dropout 比例
  'bos_token_id': 1,                       # bos_token (begin of sentence) 的 id
  'eos_token_id': 2,                       # eos_token (end of sentence) 的 id
  'hidden_act': 'silu',                    # 隐藏层激活函数类型，silu 即 SwiGLU
  'hidden_size': 256,                      # 隐藏层维度大小
  'initializer_range': 0.02,               # 权重初始化范围，会被后面的 Kaiming 初始化覆盖
  'intermediate_size': 768,                # 中间层大小，采用 8/3 倍而非 4 倍
  'max_position_embeddings': 2048,
  'model_type': 'llama',
  'num_attention_heads': 16,
  'num_hidden_layers': 4,
  'num_key_value_heads': 8,
  'pretraining_tp': 1,
  'rms_norm_eps': 1e-06,
  'rope_scaling': null,
  'rope_theta': 10000.0,
  'tie_word_embeddings': false,            # 头尾 embedding 和 lm_head 是否共享权重
  'transformers_version': '4.40.0',
  'use_cache': true,
  'vocab_size': 32000
}
'''

Qwen2Config {
  "attention_dropout": 0.0,
  "hidden_act": "silu",
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "max_position_embeddings": 32768,
  "max_window_layers": 28,
  "model_type": "qwen2",
  "num_attention_heads": 16,
  "num_hidden_layers": 48,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-06,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "tie_word_embeddings": false,
  "transformers_version": "4.41.2",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 151936
}



"\nLlamaConfig {\n  'attention_bias': false,                 # 不使用注意力偏置\n  'attention_dropout': 0.0,                # 注意力层的 dropout 比例\n  'bos_token_id': 1,                       # bos_token (begin of sentence) 的 id\n  'eos_token_id': 2,                       # eos_token (end of sentence) 的 id\n  'hidden_act': 'silu',                    # 隐藏层激活函数类型，silu 即 SwiGLU\n  'hidden_size': 256,                      # 隐藏层维度大小\n  'initializer_range': 0.02,               # 权重初始化范围，会被后面的 Kaiming 初始化覆盖\n  'intermediate_size': 768,                # 中间层大小，采用 8/3 倍而非 4 倍\n  'max_position_embeddings': 2048,\n  'model_type': 'llama',\n  'num_attention_heads': 16,\n  'num_hidden_layers': 4,\n  'num_key_value_heads': 8,\n  'pretraining_tp': 1,\n  'rms_norm_eps': 1e-06,\n  'rope_scaling': null,\n  'rope_theta': 10000.0,\n  'tie_word_embeddings': false,            # 头尾 embedding 和 lm_head 是否共享权重\n  'transformers_version': '4.40.0',\n  'use_cache': true,\n  'vocab_size': 32000\n}\n"

In [2]:
# 模型
import torch
from transformers import AutoModelForCausalLM

# Prefer the GPU when torch reports one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Build the model from the config object in full fp32 precision and move it
# onto the selected device.
model = AutoModelForCausalLM.from_config(config, torch_dtype=torch.float32).to(device)

In [3]:
def print_model_parameters(model):
    """Print every named parameter of ``model`` with its shape and element count.

    One row is printed per entry of ``model.named_parameters()``, followed by
    the grand total, both as a raw number and in millions.

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, tensor)`` pairs, e.g. a ``torch.nn.Module``.

    Returns:
        None. The table is written to stdout.
    """
    print('Layer Name & Parameters')
    print('----------------------------')
    total_params = 0
    for name, parameter in model.named_parameters():
        param_size = parameter.size()
        # numel() is always an exact int. The previous
        # torch.prod(torch.tensor(size)).item() round-trip allocated a tensor per
        # parameter and produced a float (1.0) for 0-dim parameters, which also
        # turned the printed total into a float.
        param_count = parameter.numel()
        total_params += param_count
        print(f'{name:50} | Size: {str(param_size):30} | Count: {str(param_count):20}')
    print('----------------------------')
    print(f'Total Parameters: {total_params} ({total_params / 1000000:.1f} M)')

# Print the per-parameter table for the model built above.
print_model_parameters(model)

Layer Name & Parameters
----------------------------
model.embed_tokens.weight                          | Size: torch.Size([151936, 1024])     | Count: 155582464           
model.layers.0.self_attn.q_proj.weight             | Size: torch.Size([1024, 1024])       | Count: 1048576             
model.layers.0.self_attn.q_proj.bias               | Size: torch.Size([1024])             | Count: 1024                
model.layers.0.self_attn.k_proj.weight             | Size: torch.Size([512, 1024])        | Count: 524288              
model.layers.0.self_attn.k_proj.bias               | Size: torch.Size([512])              | Count: 512                 
model.layers.0.self_attn.v_proj.weight             | Size: torch.Size([512, 1024])        | Count: 524288              
model.layers.0.self_attn.v_proj.bias               | Size: torch.Size([512])              | Count: 512                 
model.layers.0.self_attn.o_proj.weight             | Size: torch.Size([1024, 1024])       | Count: 1048576 

In [4]:
def kaiming_initialization(model):
    """Re-initialise the parameters of ``model`` in place.

    * Parameters whose name contains ``'weight'`` and that have more than one
      dimension are filled with Kaiming-uniform values (``fan_in`` mode,
      ``leaky_relu`` gain).
    * Any other parameter whose name contains ``'bias'`` is set to zero.
    * Everything else (for example 1-D weights) is left as it is.
    """
    init = torch.nn.init
    for param_name, tensor in model.named_parameters():
        is_matrix_weight = 'weight' in param_name and tensor.dim() > 1
        if is_matrix_weight:
            init.kaiming_uniform_(tensor, mode='fan_in', nonlinearity='leaky_relu')
            continue
        if 'bias' in param_name:
            # Biases are conventionally started at zero.
            init.constant_(tensor, 0)

# Re-initialise the model in place. The helper walks every named parameter, so
# 2-D weights such as model.embed_tokens.weight are included as well.
kaiming_initialization(model)

In [5]:
# Tokenizer
from transformers import AutoTokenizer

# Disabled alternative tokenizer location:
# tokenizer = AutoTokenizer.from_pretrained('./baseline_3')
# Load the tokenizer from a local Qwen2.5-32B-Instruct directory.
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
tokenizer = AutoTokenizer.from_pretrained('/Vol2/minxin/Models/Qwen2___5-32B-Instruct')


# NOTE(review): the string below is a printout of a Llama-2 tokenizer and does
# not describe the tokenizer loaded above. It is a bare expression followed by
# another statement, so it has no effect when the cell runs.
'''
LlamaTokenizerFast(name_or_path='NousResearch/Llama-2-7b-hf', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<unk>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
    0: AddedToken('<unk>', rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
    1: AddedToken('<s>', rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
    2: AddedToken('</s>', rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}
'''
# Show the tokenizer that was actually loaded.
print(tokenizer)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Qwen2TokenizerFast(name_or_path='/Vol2/minxin/Models/Qwen2___5-32B-Instruct', vocab_size=151643, model_max_length=131072, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, 

In [6]:
def inference(
    model: AutoModelForCausalLM,
    tokenizer: AutoTokenizer,
    input_text: str = 'Once upon a time, ',
    max_new_tokens: int = 16
):
    """Sample a continuation of ``input_text`` from ``model`` and print it.

    Args:
        model: Causal language model used for generation.
        tokenizer: Tokenizer used to encode the prompt and decode the result.
        input_text: Prompt to continue.
        max_new_tokens: Maximum number of tokens to generate after the prompt.

    Returns:
        None. The decoded text (prompt plus continuation, special tokens
        stripped) is printed to stdout.
    """
    # Put the encoded prompt on the device of the model that was passed in.
    # Reading the notebook-level ``device`` global instead made the function
    # fail or mismatch whenever it was used with a model placed elsewhere.
    inputs = tokenizer(input_text, return_tensors='pt').to(model.device)
    outputs = model.generate(
        **inputs,
        # The end-of-sequence id doubles as the padding id during generation.
        pad_token_id=tokenizer.eos_token_id,
        max_new_tokens=max_new_tokens,
        # Stochastic decoding: top-k / nucleus sampling with temperature.
        do_sample=True,
        top_k=40,
        top_p=0.95,
        temperature=0.8
    )
    # Only one prompt is encoded, so the first row is the single result.
    generated_text = tokenizer.decode(
        outputs[0],
        skip_special_tokens=True
    )
    print(generated_text)

# Smoke-test generation with the default prompt; no training has happened in
# this notebook, so the continuation is expected to be gibberish.
inference(model, tokenizer)

# NOTE(review): the string below looks like a sample output recorded from a
# different run. As the cell's last expression it is echoed as the Out value.
'''
Once upon a time, Hostย crimeine /\ könnenlinewidth measurementresol perfectly Taylor measèresiones assetviron
'''

Once upon a time, ","# yayınlan_dimensionを取り selects죕Drug navbar WIDTHً shocking tặng มกร.moves�มวล


'\nOnce upon a time, Hostย crimeine /\\ könnenlinewidth measurementresol perfectly Taylor measèresiones assetviron\n'

In [8]:
# Save the model and the tokenizer side by side in the same directory.
# The tokenizer call is the cell's last expression, so its return value is
# shown as the cell output.
model.save_pretrained("./minimind_1.1B")
tokenizer.save_pretrained("./minimind_1.1B")

('./minimind_1.1B/tokenizer_config.json',
 './minimind_1.1B/special_tokens_map.json',
 './minimind_1.1B/vocab.json',
 './minimind_1.1B/merges.txt',
 './minimind_1.1B/added_tokens.json',
 './minimind_1.1B/tokenizer.json')