In [None]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

# Identifier handed to both the tokenizer loader and the quantized-model loader.
model_name_or_path = "TheBloke/Yarn-Mistral-7B-128k-AWQ"

# Tokenizer for the checkpoint named above.
# NOTE(review): trust_remote_code=True presumably allows code shipped with the
# checkpoint to run at load time — only use with sources you trust.
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    trust_remote_code=True,
)

# Quantized model for the same identifier (fuse_layers=True, safetensors=True).
model = AutoAWQForCausalLM.from_quantized(
    model_name_or_path,
    fuse_layers=True,
    trust_remote_code=True,
    safetensors=True,
)

# No chat wrapping is applied: the template is just the prompt plus a newline.
prompt = "Tell me about AI"
prompt_template = f'''{prompt}
'''

print("*** Running model.generate:")

# Tokenize with return_tensors='pt', keep only the token ids, and call .cuda()
# on them so they sit on the GPU before generation.
encoded_prompt = tokenizer(prompt_template, return_tensors='pt')
token_input = encoded_prompt.input_ids.cuda()

# Sampling settings, forwarded unchanged to model.generate.
generation_kwargs = dict(
    do_sample=True,
    temperature=0.7,
    top_p=0.95,
    top_k=40,
    max_new_tokens=512,
)

# Generate output
generation_output = model.generate(token_input, **generation_kwargs)

# Get the tokens from the output, decode them, print them.
# Index 0 selects the first entry of the generate() result (a single prompt
# was submitted above).
token_output = generation_output[0]
text_output = tokenizer.decode(token_output)
print("LLM output: ", text_output)

# The pipeline example below is deliberately written as '#' comments: a bare
# triple-quoted string left as the last statement of a notebook cell is an
# expression, so the notebook would echo it back as the cell's Out[] value.
#
# Inference should be possible with transformers pipeline as well in future
# But currently this is not yet supported by AutoAWQ (correct as of September 25th 2023)
# from transformers import pipeline
#
# print("*** Pipeline:")
# pipe = pipeline(
#     "text-generation",
#     model=model,
#     tokenizer=tokenizer,
#     max_new_tokens=512,
#     do_sample=True,
#     temperature=0.7,
#     top_p=0.95,
#     top_k=40,
#     repetition_penalty=1.1
# )
#
# print(pipe(prompt_template)[0]['generated_text'])


In [1]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
# Identifier used for both the tokenizer and the quantized model below.
model_dir = "TheBloke/Yarn-Mistral-7B-128k-AWQ"

tokenizer = AutoTokenizer.from_pretrained(
    model_dir,
    trust_remote_code=True,
)

# Loader options gathered in one place so they are easy to scan and tweak.
# NOTE(review): max_new_tokens is supplied at load time here (not only at
# generate time); its exact effect depends on the installed AutoAWQ
# version — confirm against the library documentation.
quantized_load_kwargs = dict(
    quant_path=model_dir,
    max_new_tokens=512,
    trust_remote_code=True,
    fuse_layers=True,
    batch_size=1,
    safetensors=True,
)

model = AutoAWQForCausalLM.from_quantized(**quantized_load_kwargs)

  from .autonotebook import tqdm as notebook_tqdm
Replacing layers...: 100%|██████████| 32/32 [00:02<00:00, 12.91it/s]
Fusing layers...: 100%|██████████| 32/32 [00:00<00:00, 347.99it/s]


In [None]:
# No chat wrapping is applied: the template is just the prompt plus a newline.
prompt = "Tell me about AI"
prompt_template = f'''{prompt}
'''

print("*** Running model.generate:")

# Relies on `tokenizer` and `model` created by the loading cell.
# Tokenize with return_tensors='pt', keep only the token ids, and call .cuda()
# on them so they sit on the GPU before generation.
encoded_prompt = tokenizer(prompt_template, return_tensors='pt')
token_input = encoded_prompt.input_ids.cuda()

# Sampling settings, forwarded unchanged to model.generate.
generation_kwargs = dict(
    do_sample=True,
    temperature=0.7,
    top_p=0.95,
    top_k=40,
    max_new_tokens=512,
)

# Generate output
generation_output = model.generate(token_input, **generation_kwargs)