In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("/dataset/crosspipe/OriginModel/Llama-2-7b-chat-hf/")
model = AutoModelForCausalLM.from_pretrained("/dataset/crosspipe/OriginModel/Llama-2-7b-chat-hf/", torch_dtype=torch.float16).to("cuda:2")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
import time

In [7]:
inputs = tokenizer("I like rock music because", return_tensors="pt").to(model.device)


time3 = time.time()
out = model.generate(**inputs, do_sample=True, max_new_tokens=500)
print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
time4 = time.time()
time_fp32 = time4 - time3
print(f"FP32 time: {time_fp32}")
time1 = time.time()
out = model.generate(**inputs, do_sample=True, max_new_tokens=500, cache_implementation="quantized", cache_config={"nbits": 2, "backend": "quanto"})
print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
time2 = time.time()
time_quantized = time2 - time1
print(f"Quantized time: {time_quantized}")

I like rock music because it speaks to me on a deep level. I find the lyrics and melodies to be powerful and thought-provoking, and I enjoy the energy and emotion that comes through in the music. I also appreciate the diversity of rock music, which encompasses a wide range of styles and sub-genres, from classic rock to heavy metal to indie rock.
One of my favorite things about rock music is its ability to evoke strong emotions. Whether it's the raw energy of a live performance or the introspective lyrics of a ballad, rock music has a way of connecting with me on a personal level. I find myself reflecting on my own experiences and emotions as I listen to rock music, and I often find comfort in the shared experiences and emotions that the music conveys.
Another aspect of rock music that I enjoy is its ability to be both catchy and complex. Many rock songs have memorable hooks and choruses that stick in my head, but they also often have intricate melodies and rhythms that reward close lis

In [4]:
tokenizer.pad_token_id = tokenizer.eos_token_id
inputs = tokenizer(["Hello, how are you?", "I don't like burgers because"], padding=True, return_tensors="pt").to("cuda:2")

# Feel free to play with generation kwargs. See (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation#transformers.GenerationConfig) for more
generation_kwargs = {"do_sample": False, "temperature": 1.0, "top_p": 1.0, "max_new_tokens": 20, "min_new_tokens": 20}

# Let's generate one with quantized cache, and another with original precision cache. Then check the quality of generations
out = model.generate(**inputs, cache_implementation="quantized", cache_config={"backend": "quanto", "nbits": 4, "q_group_size": 32, "residual_length": 64})
out_fp16 = model.generate(**inputs, **generation_kwargs)

print(f"text with quant cache: {tokenizer.batch_decode(out)}")
print(f"text with fp16 cache: {tokenizer.batch_decode(out_fp16)}")



text with quant cache: ['<s> Hello, how are you?</s></s><s>\n\nI am very glad to meet you, I am a highly skilled and experienced software developer with a strong passion for creating innovative and efficient software solutions. I have a deep understanding of programming languages such as Java, Python, and C++, as well as experience with web development frameworks such as Spring and Django. I am also proficient in Agile methodologies and have experience working in a team environment.\nI am confident that my skills and experience make me a strong candidate for this position, and I am excited to bring my expertise to your team. Thank you for considering my application.\nSincerely,\n[Your Name]</s>', "<s> I don't like burgers because I don't like the taste of ground meat. I find it too bland and uninteresting. I much prefer chicken or fish burgers, or even vegetarian burgers made with beans or mushrooms.\nI also don't like the texture of ground meat. It's too dense and chewy for my liking.