In [None]:
import numpy as np
import time
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer
# Local checkpoint directory for Llama-2-7b-chat.
model_path = "/dataset/crosspipe/OriginModel/Llama-2-7b-chat-hf/"

# Target the GPU with index 2 when CUDA is usable; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda:2"
else:
    device = "cpu"

# The tokenizer and the model are loaded independently from the same checkpoint.
tokenizer = LlamaTokenizer.from_pretrained(model_path)
model = LlamaForCausalLM.from_pretrained(model_path, device_map=device)

In [None]:
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer
from transformers import AutoTokenizer
# Load the model and tokenizer.
# NOTE(review): an earlier cell loads the same checkpoint onto cuda:2; this cell
# rebinds `model`, `tokenizer` and `device`.
model_path = "/dataset/crosspipe/OriginModel/Llama-2-7b-chat-hf/"
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Eager attention is requested so that per-layer attention weights can be
# returned; fused attention kernels do not materialize them.
model = LlamaForCausalLM.from_pretrained(model_path, attn_implementation="eager").to(device)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Input text
input_text = "Once upon a time"
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)

# Run one forward pass and collect hidden states and attention weights.
with torch.no_grad():
    outputs = model(
        inputs,
        output_hidden_states=True,  # return the hidden states
        output_attentions=True,  # later cells read outputs.attentions, which is None without this flag
        return_dict=True,  # return a dict-like output object for attribute access
    )

# Fetch the hidden states
hidden_states = outputs.hidden_states

# Print the shape of each entry in the hidden-states tuple
for i, layer_hidden_states in enumerate(hidden_states):
    print(f"Layer {i} hidden states shape: {layer_hidden_states.shape}")


In [None]:
# Split the encoded prompt along the sequence axis: every token except the last
# one becomes the context, and the final token is kept as a width-1 slice.
context_ids, next_word_ids = inputs[:, :-1], inputs[:, -1:]

In [None]:
# A cell only auto-displays its final expression, so listing the three names on
# separate lines would show just `next_word_ids`. Print them explicitly so the
# full prompt, the context slice and the last-token slice are all visible.
print(inputs, context_ids, next_word_ids, sep="\n")

In [None]:
# Grab the attention weights from the forward-pass output.
# NOTE(review): this is None unless the forward pass was run with
# output_attentions=True — confirm against the cell that creates `outputs`.
attention_weights = outputs.attentions

In [None]:
# Report the shape of the last batch element's attention tensor for every
# entry in outputs.attentions, or say so when no attentions were returned.
if outputs.attentions is None:
    print("Attention weights are None!")
else:
    for layer_idx, layer_attention in enumerate(outputs.attentions):
        print(f"Layer {layer_idx}: {layer_attention[-1].shape}")


In [None]:
# Display the whole forward-pass output object.
# NOTE(review): this can be a very large dump; consider showing only selected fields.
outputs

In [None]:
# `outputs` was produced by a plain forward pass (model(...)), which does not
# carry a `.sequences` field; only generate() returns generated sequences.
# Run generation explicitly under a separate name so `outputs` stays intact
# for the cells that read its cache.
generation = model.generate(inputs, max_new_tokens=50, return_dict_in_generate=True)
generated_ids = generation.sequences
generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
print(generated_text)

In [None]:
# Keep a handle on the key/value cache returned with the forward-pass output.
past_key_values = outputs.past_key_values

In [None]:
# Inspect which container type holds the cache.
print(type(past_key_values))

In [None]:
# Number of entries in the cache — presumably one per decoder layer; TODO confirm.
print(len(past_key_values))

In [None]:
# Toy (key, value) pair for experimenting with cache unpacking: two int64
# tensors of shape (2, 2, 2, 2) holding the values 1..16 and 17..32 in
# row-major order.
_toy_shape = (2, 2, 2, 2)
past_key_value1 = (
    torch.arange(1, 17).reshape(_toy_shape),
    torch.arange(17, 33).reshape(_toy_shape),
)

In [None]:
# `past_key_value1` is a single (key, value) pair, so unpack it directly.
# Looping over it and unpacking each element would instead split every tensor
# along its first dimension, which ends up printing a slice of the value
# tensor under the name `key_states` on the second iteration.
key_states, value_states = past_key_value1

print(key_states)

In [None]:
# Walk the cache entry by entry and report the key and value tensor shapes,
# one per line.
for past_key, past_value in past_key_values:
    print(f"Key shape: {past_key.shape}", f"Value shape: {past_value.shape}", sep="\n")

In [None]:
# Display the cache object itself.
# NOTE(review): the full tensor dump can be very large; shapes may be enough.
past_key_values

In [None]:
# Display the cache object again (same expression as the preceding cell).
past_key_values

In [None]:
# generate() expects `input_ids` to contain the whole sequence, with the cache
# covering only a leading part of it. The cache taken from the earlier forward
# pass already spans every token of `inputs`, and `next_word_ids` is only the
# final token, so passing those two together does not satisfy that contract.
# Build a cache over the context tokens only, then hand generate() the full
# prompt so that just the last token still has to be processed.
with torch.no_grad():
    context_cache = model(context_ids, use_cache=True, return_dict=True).past_key_values
output = model.generate(input_ids=inputs, past_key_values=context_cache)

In [None]:
# Tokenize a follow-up sentence that will be used to continue the text.
input_text_continued = "And they come across a huge monster"
inputs_continued = tokenizer(input_text_continued, return_tensors="pt")
# Move the encoded batch to the model's device, then show it.
inputs_continued = inputs_continued.to(device)
inputs_continued

In [None]:
# Continue generating text while reusing a KV cache for the original prompt.
#
# generate() treats the cache as covering the leading tokens of `input_ids`,
# so the ids must be "cached prompt + continuation". Passing only the
# continuation together with a cache computed for a different text would make
# the cache stand in for the first tokens of the new sentence.
#
# A fresh prompt cache is computed here instead of reusing `past_key_values`:
# cache objects may be extended in place by generate(), so an earlier cache is
# not guaranteed to still match the prompt length (NOTE(review): depends on the
# installed transformers version).
with torch.no_grad():
    prefix_cache = model(inputs, use_cache=True, return_dict=True).past_key_values

continuation_ids = inputs_continued["input_ids"]
# Drop a leading BOS token, if the tokenizer added one, so it does not end up
# in the middle of the concatenated sequence.
if tokenizer.bos_token_id is not None and continuation_ids[0, 0].item() == tokenizer.bos_token_id:
    continuation_ids = continuation_ids[:, 1:]
full_input_ids = torch.cat([inputs, continuation_ids], dim=-1)

outputs_continued = model.generate(
    input_ids=full_input_ids,
    attention_mask=torch.ones_like(full_input_ids),  # single unpadded sequence
    max_new_tokens=50,
    use_cache=True,
    return_dict_in_generate=True,
    output_scores=True,
    past_key_values=prefix_cache,  # cache covering the original prompt tokens
)

# Decode the generated text
generated_text = tokenizer.decode(outputs_continued.sequences[0], skip_special_tokens=True)
print(generated_text)

In [None]:
# Re-encode a prompt for the generate-then-continue experiment below; this
# rebinds `input_text`.
# NOTE(review): the literal starts with a stray single quote ('Once ...) —
# looks like a typo; confirm whether it is intended.
input_text = "'Once upon a time"
input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

In [None]:
# Show the encoded prompt ids.
input_ids

In [None]:
# First pass: generate up to 50 new tokens from the prompt and ask for a
# structured result so the sequences and the KV cache can both be read back.
out = model.generate(
    input_ids,
    max_new_tokens=50,
    use_cache=True,
    return_dict_in_generate=True,
)
generated_ids = out.sequences
past_key_values = out.past_key_values

# Second pass: feed the tokens produced so far together with the cache from
# the first pass to extend the text by up to 50 further tokens.
out_continued = model.generate(
    generated_ids,
    max_new_tokens=50,
    past_key_values=past_key_values,
    return_dict_in_generate=True,
)
continued_generated_ids = out_continued.sequences

In [None]:
# Decode both id batches to text (special tokens stripped): first the initial
# generation, then the cache-assisted continuation.
generated_output, continued_output = (
    tokenizer.batch_decode(id_batch, skip_special_tokens=True)
    for id_batch in (generated_ids, continued_generated_ids)
)

In [None]:
# Show the decoded text from the first generate() call.
generated_output

In [None]:
# Show the decoded text from the cache-assisted continuation.
continued_output