In [1]:
from sentencepiece import SentencePieceProcessor
from parser import GrammarConstrainedParser
from pydantic import BaseModel
from typing import Literal

In [2]:
class TestTokenizer(SentencePieceProcessor):
    """Stub tokenizer backed by a small hard-coded list of token strings.

    Only ``vocab_size``, ``Decode`` and ``eos_id`` are overridden. The
    base-class initializer is not called, so no SentencePiece model is loaded.

    NOTE(review): ``'}'`` appears twice in the vocabulary (index 1 and the
    last index). ``eos_id()`` returns -1 and ``Decode`` uses plain list
    indexing, so ``Decode(-1)`` yields that last ``'}'`` -- confirm whether
    the parser relies on this.
    """

    def __init__(self):
        structural = ['{', '}', ':', ',', '"']
        field_names = ['"name"', '"age"', '"gender"', '"employed"']
        values = ['"John"', '30', '"Male"', '"Female"', 'true', 'false']
        # Fragments that split the JSON text at arbitrary positions.
        partial = [
            '{"', 'nam', 'e":', ' "', 'Jo', 'hn"',
            ', "', 'ag', 'e": ', '3', '0, ',
            '"gen', 'der', '": "', 'Ma', 'le"',
            ', "e', 'mpl', 'oyed', '": ', 'tr', 'ue',
        ]
        # Tokens that span several grammar terminals at once.
        combined = [
            '{"name": "',
            '{"name": "John", ',
            '"age": 30',
            ', "gender": "Male"',
            ', "employed": true',
            '}',
        ]
        # Token id == position in this list, so group order must not change.
        self._tokens = [*structural, *field_names, *values, *partial, *combined]

    def vocab_size(self):
        """Return the number of entries in the hard-coded vocabulary."""
        return len(self._tokens)

    def Decode(self, i):
        """Return the token string stored at index ``i``."""
        return self._tokens[i]

    def eos_id(self):
        """Return the end-of-sequence id used by this stub (always -1)."""
        return -1


tokenizer = TestTokenizer()

In [3]:
class Person(BaseModel):
    """Pydantic model with the same four fields as the `person` grammar rule below.

    NOTE(review): no other cell visible in this notebook references this
    class; the grammar is written out by hand rather than derived from it.
    """
    # Free-form string.
    name: str
    # Integer.
    age: int
    # Restricted to exactly these two string values.
    gender: Literal["Male", "Female"]
    # Boolean flag.
    employed: bool

In [4]:
# Lark grammar for a single JSON-like "person" object. The `person` rule fixes
# the field order (name, age, gender, employed), and whitespace between tokens
# is dropped via `%ignore /\s+/`.
# The field-name and gender terminals carry priority `.1`; presumably this lets
# them win over the generic STR terminal, which matches the same text -- TODO confirm.
# NOTE(review): FLOAT, NONE, LSQB, RSQB, LPAREN and RPAREN are declared but not
# referenced by any rule in this grammar.
grammar = r"""
STR: /"[^"]*"/
INT: /-?\d+/
FLOAT: /-?\d+\.\d*/
BOOL: "true" | "false"
NONE: "null"

LBRACE: "{"
RBRACE: "}"
LSQB: "["
RSQB: "]"
LPAREN: "("
RPAREN: ")"
COLON: ":"
COMMA: ","

start: person

person: LBRACE person_field_name COMMA person_field_age COMMA person_field_gender COMMA person_field_employed RBRACE

person_field_name: NAME COLON STR
person_field_age: AGE COLON INT
person_field_gender: GENDER COLON LITERAL_GENDER
person_field_employed: EMPLOYED COLON BOOL

NAME.1: "\"name\""
AGE.1: "\"age\""
GENDER.1: "\"gender\""
EMPLOYED.1: "\"employed\""
LITERAL_GENDER.1: "\"Male\"" | "\"Female\""

%ignore /\s+/
"""

In [5]:
# Build the grammar-constrained parser from the grammar string and the stub
# tokenizer defined in the earlier cells.
parser: GrammarConstrainedParser = GrammarConstrainedParser(
    grammar=grammar,
    llm_tokenizer=tokenizer,
)

In [6]:
import random

# Drive the parser for three steps. Each step prints the text parsed so far,
# lists the token ids the parser reports as acceptable, then accepts one of
# them at random.
for step in range(3):
    print("Parsed till now:", parser.parsed_str)

    ids = parser.get_acceptable_llm_tokens().tolist()
    print("Acceptable token ids:", ids)
    print("Acceptable tokens:", [tokenizer.Decode(candidate) for candidate in ids])

    # Named `token_id` rather than `id` so the builtin `id()` is not rebound
    # at notebook scope for every later cell.
    token_id = random.choice(ids)
    print("Randomly accepting token id:", token_id, f"('{tokenizer.Decode(token_id)}')")
    parser.accept_token(token_id)
    print()

Parsed till now: 
Acceptable token ids: [0, 37, 38, 15]
Acceptable tokens: ['{', '{"name": "', '{"name": "John", ', '{"']
Randomly accepting token id: 0 ('{')

Parsed till now: {
Acceptable token ids: [18, 4, 5]
Acceptable tokens: [' "', '"', '"name"']
Randomly accepting token id: 18 (' "')

Parsed till now: { "











































































































































Acceptable token ids: [16]
Acceptable tokens: ['nam']
Randomly accepting token id: 16 ('nam')



In [None]:
# Display the token ids the parser currently reports as acceptable.
parser.get_acceptable_llm_tokens()

In [None]:
# Decode a handful of token ids to see which vocabulary strings they map to.
tuple(tokenizer.Decode(token_id) for token_id in (0, 15, 37, 38))

In [None]:
# Feed token id 0 to the parser (id 0 is '{' in the TestTokenizer vocabulary).
# NOTE(review): the parser is stateful, so the effect depends on which cells
# ran before this one.
parser.accept_token(0)

In [None]:
from lark import Lark

# Parse the same grammar with Lark directly, to inspect lexing and parsing.
# NOTE(review): this rebinds `parser`, which an earlier cell bound to a
# GrammarConstrainedParser; cells that use `parser` behave differently
# depending on whether this cell has run.
parser: Lark = Lark(
    grammar,
    parser="lalr",
    lexer="basic",
)
# Start an interactive parse over empty input so tokens can be fed by hand.
interactive = parser.parse_interactive("")

In [None]:
# Lex a partial person object that stops right after the "employed" key.
# NOTE(review): the gender value here is -1, which matches INT rather than
# LITERAL_GENDER; the `person_field_gender` rule does not accept INT, so
# feeding these tokens to the parser will presumably fail at that point.
tokens = list(parser.lex('{"name": "John", "age": 30, "gender": -1, "employed": '))

In [None]:
# Display the tokens produced by the most recent lex call.
tokens

In [None]:
# Push the lexed tokens into the interactive parser one at a time, in order.
for token in tokens:
    interactive.feed_token(token)

In [None]:
# Display what the interactive parser reports as acceptable in its current state.
interactive.accepts()

In [None]:
# Lex the fragment ' true' (leading space included); this rebinds `tokens`.
tokens = list(parser.lex(' true'))

In [None]:
# Lex an opening brace followed by trailing spaces and display the token list;
# the grammar's `%ignore /\s+/` applies to the whitespace.
list(parser.lex('{  '))

In [None]:
# No earlier cell imports `Token`; without this import the cell fails with
# NameError before anything reaches the parser.
from lark import Token

# Hand-build an LBRACE token and push it into the interactive parser.
interactive.feed_token(Token('LBRACE', '{'))

In [None]:
# Display what the interactive parser reports as acceptable after the manual feed.
interactive.accepts()

In [None]:
# Load a SentencePiece model from disk (presumably the Llama-2-7b tokenizer,
# judging by the path).
# NOTE(review): this rebinds `tokenizer`, replacing the TestTokenizer instance
# created earlier. The relative path is hard-coded and must exist next to the
# notebook's parent directory.
tokenizer = SentencePieceProcessor()
tokenizer.Load("../Llama-2-7b/tokenizer.model")

In [None]:
# Display the end-of-sequence id of whichever tokenizer is currently bound
# (the TestTokenizer stub hard-codes -1).
tokenizer.eos_id()

In [None]:
from torch import tensor

# Build a tensor from a set's elements. A set has to be converted to a list
# first, and its iteration order is not something to rely on in general.
tensor(list({1, 2, 3}))