In [None]:
!curl -LsSf https://astral.sh/uv/install.sh | sh
!uv init
!uv add torch
!uv add transformers

In [12]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

import gc

# 清理缓存
torch.cuda.empty_cache()
gc.collect()

# 加载模型和分词器
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    device_map="cuda",
    temperature=0.9,
    dtype="auto",
    trust_remote_code=False,
)

# 创建 pipeline
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=False,
    max_new_tokens=50,
    do_sample=False,
)

prompt = "What is Perl 6?"
output = generator(prompt)
print(output[0]['generated_text'])

# 打印模型
print(model)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda




Perl 6, formerly known as Raku, is a new and evolving programming language that aims to combine the best features of Perl and other languages. Perl 6 is designed to be a general-purpose, multi-
Phi3ForCausalLM(
  (model): Phi3Model(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (layers): ModuleList(
      (0-31): 32 x Phi3DecoderLayer(
        (self_attn): Phi3Attention(
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (qkv_proj): Linear(in_features=3072, out_features=9216, bias=False)
        )
        (mlp): Phi3MLP(
          (gate_up_proj): Linear(in_features=3072, out_features=16384, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (activation_fn): SiLUActivation()
        )
        (input_layernorm): Phi3RMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): Phi3RMSNorm((3072,), eps=1e-05)
        (resid_attn_dropout): Dropout(p=0.0, inplace=False)
        (resi

In [17]:
prompt = "The capital of France is"
# 对输入提示词进行分词
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
# 将 token ID 移动到 GPU 上
input_ids = input_ids.to("cuda")
# 获取模型在 llm_head 之前的输出
model_output = model.model(input_ids)
print(model_output)

# 获取 lm_head 的输出
lm_head_output = model.lm_head(model_output[0])
print(lm_head_output)

token_id = lm_head_output[0, -1].argmax(-1)
print(tokenizer.decode(token_id))

print(model_output[0].shape)
print(lm_head_output.shape)

BaseModelOutputWithPast(last_hidden_state=tensor([[[-0.3047,  1.1953,  0.2988,  ..., -0.3008,  0.6758,  0.1406],
         [-0.1318,  0.3320,  0.3906,  ...,  0.5703, -0.1494, -0.6172],
         [-0.5781,  1.0781,  1.5469,  ..., -0.4121,  0.2871,  0.3906],
         [-0.3984,  0.6680,  0.2520,  ..., -0.0732,  0.7969, -0.6719],
         [-0.6758,  0.6953,  0.5586,  ...,  0.2617,  0.1855, -0.2812]]],
       device='cuda:0', dtype=torch.bfloat16, grad_fn=<MulBackward0>), past_key_values=DynamicCache(layers=[DynamicSlidingWindowLayer, DynamicSlidingWindowLayer, DynamicSlidingWindowLayer, DynamicSlidingWindowLayer, DynamicSlidingWindowLayer, DynamicSlidingWindowLayer, DynamicSlidingWindowLayer, DynamicSlidingWindowLayer, DynamicSlidingWindowLayer, DynamicSlidingWindowLayer, DynamicSlidingWindowLayer, DynamicSlidingWindowLayer, DynamicSlidingWindowLayer, DynamicSlidingWindowLayer, DynamicSlidingWindowLayer, DynamicSlidingWindowLayer, DynamicSlidingWindowLayer, DynamicSlidingWindowLayer, Dynamic