In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
import guidance
import json 
from dotenv import load_dotenv

import grammar_guide as gg

# Load environment variables from a local .env file (python-dotenv), if one is present.
load_dotenv()
# Fix the random seed via transformers' set_seed so repeated runs are reproducible.
set_seed(42)

In [2]:
# Checkpoint identifier; reused below both for the main model and for the guidance draft model.
model_name_or_path = "HuggingFaceTB/SmolLM-135M"
# Load the causal language model and its matching tokenizer from the same checkpoint.
model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

In [8]:
from string import Template

# The number of JSON keys we want to generate (not telling the Transformers model this!)
num_json_keys = 6

# Read the Lark grammar template inside a context manager so the file handle is
# closed deterministically instead of being left for the garbage collector.
with open("../examples/benchmarks/json.lark", encoding="utf-8") as grammar_file:
    grammar_template = Template(grammar_file.read())

# Fill in the NUM_REPEATS placeholder. safe_substitute leaves any other
# `$...` sequences in the grammar untouched rather than raising KeyError.
lark_gramar_str = grammar_template.safe_substitute(
    NUM_REPEATS=f"{num_json_keys-1}"
)

# Run grammar-guided generation: the Transformers model produces text, the Lark
# parser validates it, and the guidance model is supplied as the draft model.
res = gg.guide(
    model=model,
    tokenizer=tokenizer,
    parser=gg.load_parser(lark_gramar_str),
    prompt="Here's a JSON object with only string values:",
    draft_model=guidance.models.Transformers(
        model_name_or_path, echo=False
    ),
    max_new_tokens=19,
    max_grammar_corrections=5,
    verbose=True,
    token_healing=True,
    temperature=0.0
)

# Wall-clock processing time reported by the result object, in seconds.
print(res.process_time_seconds)
# Throughput: number of token ids in the re-tokenized response per second of processing time.
print(len(tokenizer(res.response)['input_ids']) / res.process_time_seconds)

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


3.6683290004730225
14.720598941108289


In [9]:
# Pretty-print the response when it parses as JSON; otherwise fall back to the raw string.
try:
    print(json.dumps(json.loads(res.response), indent=4))
except (ValueError, TypeError):
    # json.JSONDecodeError subclasses ValueError; TypeError covers a non-string response.
    # Catching only these (instead of a bare `except:`) keeps KeyboardInterrupt and
    # unrelated bugs from being silently swallowed.
    print(repr(res.response))

{
    "name": "John",
    "age": "25",
    "city": "New York",
    "address": "123 Main St",
    "phone": "123-555-12"
}


So what exactly happened above?

1) The model generated a JSON object with 3 keys (name, age, and city)
2) Our grammar catches that a numeric value is invalid (`"age": 25`) and backtracks the KV cache
3) At this point, we only have one possible candidate from our grammar (`"`), so we feed that in via a forward pass and then continue generating
4) We generate up to `..."-7890", "`, and our grammar constrains our draft model to generate a valid continuation (`email"`)
5) Finally, the grammar returns the final prefix that satisfies the Lark grammar's conditions