In [1]:
from sentencepiece import SentencePieceProcessor
from parser import GrammarConstrainedParser
from pydantic import BaseModel
from typing import Literal

In [2]:
# tokenizer = SentencePieceProcessor()
# tokenizer.Load("../Llama-2-7b/tokenizer.model")
class TestTokenizer(SentencePieceProcessor):
    """Tiny in-memory stand-in for a SentencePiece tokenizer.

    Exposes only ``vocab_size`` and ``Decode`` on top of a fixed list of
    JSON-ish string pieces; a token's id is its position in that list.
    """

    def __init__(self):
        # The base-class initializer is not invoked here; only the toy
        # vocabulary is set up. The groups are concatenated in this exact
        # order, so ids run 0-4, 5-8, 9-14, 15-36 and 37-42.
        structural = ['{', '}', ':', ',', '"']
        field_names = ['"name"', '"age"', '"gender"', '"employed"']
        values = ['"John"', '30', '"Male"', '"Female"', 'true', 'false']
        # Pieces that only form valid JSON when chained together.
        partial_pieces = [
            '{"', 'nam', 'e":', ' "', 'Jo', 'hn"',
            ', "', 'ag', 'e": ', '3', '0, ',
            '"gen', 'der', '": "', 'Ma', 'le"',
            ', "e', 'mpl', 'oyed', '": ', 'tr', 'ue',
        ]
        # Pieces spanning several grammar terminals at once. Note that '}'
        # appears a second time here (id 42) in addition to id 1.
        multi_terminal = [
            '{"name": "',
            '{"name": "John", ',
            '"age": 30',
            ', "gender": "Male"',
            ', "employed": true',
            '}',
        ]
        self._tokens = [
            *structural,
            *field_names,
            *values,
            *partial_pieces,
            *multi_terminal,
        ]

    def vocab_size(self):
        """Return the number of entries in the toy vocabulary."""
        return len(self._tokens)

    def Decode(self, i):
        """Return the string piece stored at index ``i``."""
        return self._tokens[i]


tokenizer = TestTokenizer()

In [3]:
# Pydantic model with the same four fields, in the same order, as the
# hand-written `person` rule in the grammar cell: name, age, gender, employed.
# NOTE(review): not referenced by any other cell visible in this notebook.
class Person(BaseModel): 
    name: str
    age: int
    # Restricted to exactly these two string values.
    gender: Literal["Male", "Female"]
    employed: bool

In [4]:
# Grammar text (passed to both GrammarConstrainedParser and Lark in later
# cells) describing one fixed-shape JSON object:
#   {"name": STR, "age": INT, "gender": "Male"|"Female", "employed": BOOL}
# with the four fields required in exactly that order.
# - The quoted field-name terminals and LITERAL_GENDER carry a `.1` priority;
#   in the recorded lexer output '"name"' is tokenized as NAME rather than STR.
# - FLOAT, NONE, LSQB, RSQB, LPAREN and RPAREN are declared but not referenced
#   by any rule in this grammar.
# - Whitespace between tokens is ignored via `%ignore /\s+/`.
grammar = r"""
STR: /"[^"]*"/
INT: /-?\d+/
FLOAT: /-?\d+\.\d*/
BOOL: "true" | "false"
NONE: "null"

LBRACE: "{"
RBRACE: "}"
LSQB: "["
RSQB: "]"
LPAREN: "("
RPAREN: ")"
COLON: ":"
COMMA: ","

start: person

person: LBRACE person_field_name COMMA person_field_age COMMA person_field_gender COMMA person_field_employed RBRACE

person_field_name: NAME COLON STR
person_field_age: AGE COLON INT
person_field_gender: GENDER COLON LITERAL_GENDER
person_field_employed: EMPLOYED COLON BOOL

NAME.1: "\"name\""
AGE.1: "\"age\""
GENDER.1: "\"gender\""
EMPLOYED.1: "\"employed\""
LITERAL_GENDER.1: "\"Male\"" | "\"Female\""

%ignore /\s+/
"""

In [7]:
# Build the constrained parser from the grammar string and the toy tokenizer
# defined in the earlier cells.
parser: GrammarConstrainedParser = GrammarConstrainedParser(grammar=grammar, llm_tokenizer=tokenizer)

In [8]:
import random

# Take three constrained-decoding steps. Each step prints the text parsed so
# far, lists the token ids the parser currently accepts (with their decoded
# pieces), picks one of them uniformly at random and feeds it to the parser.
# The choice is unseeded, so the printed trace differs from run to run.
for _ in range(3):
    print("Parsed till now:", parser.parsed_str)
    ids = parser.get_acceptable_llm_tokens().tolist()
    print("Acceptable token ids:", ids)
    print("Acceptable tokens:", [tokenizer.Decode(candidate) for candidate in ids])
    # Named `token_id` rather than `id` so the builtin `id` is not shadowed in
    # the kernel's global namespace for the cells that follow.
    token_id = random.choice(ids)
    print("Randomly accepting token id:", token_id, f"('{tokenizer.Decode(token_id)}')")
    parser.accept_token(token_id)
    print()

Parsed till now: 
Acceptable token ids: [0, 15, 37, 38]
Acceptable tokens: ['{', '{"', '{"name": "', '{"name": "John", ']
Randomly accepting token id: 37 ('{"name": "')

Parsed till now: {"name": "
Acceptable token ids: [0, 1, 2, 3, 4, 10, 13, 14, 15, 16, 18, 19, 20, 21, 22, 24, 25, 27, 29, 30, 32, 33, 35, 36, 42]
Acceptable tokens: ['{', '}', ':', ',', '"', '30', 'true', 'false', '{"', 'nam', ' "', 'Jo', 'hn"', ', "', 'ag', '3', '0, ', 'der', 'Ma', 'le"', 'mpl', 'oyed', 'tr', 'ue', '}']
Randomly accepting token id: 21 (', "')

Parsed till now: {"name": ", "
Acceptable token ids: [3, 21]
Acceptable tokens: [',', ', "']
Randomly accepting token id: 21 (', "')



In [6]:
# Show the token ids the parser currently accepts.
# NOTE(review): execution count 6 means this ran before the sampling loop
# (count 8); the recorded ids match that loop's first step, i.e. the state
# before any token was accepted.
parser.get_acceptable_llm_tokens()

tensor([ 0, 15, 37, 38])

In [7]:
# Decode the four ids reported as acceptable in the initial parser state.
tuple(tokenizer.Decode(token_id) for token_id in (0, 15, 37, 38))

('{', '{"', '{"name": "', '{"name": "John", ')

In [None]:
# Feed token id 0 (the '{' piece of the toy vocabulary) to the parser.
# NOTE(review): this cell has no execution count, so it was never run in the
# saved session.
parser.accept_token(0)

In [28]:
from lark import Lark
# NOTE(review): this rebinds `parser`, which an earlier cell bound to a
# GrammarConstrainedParser, to a plain Lark instance built from the same
# grammar. Cells that call `parser.*` therefore behave differently depending on
# whether they run before or after this one.
parser: Lark = Lark(grammar, parser="lalr", lexer="basic")
# Interactive parser over empty input; later cells feed it tokens by hand.
interactive = parser.parse_interactive("")

In [29]:
# Lex an incomplete document whose "gender" value (-1) is not one of the
# grammar's LITERAL_GENDER alternatives and whose "employed" value is missing.
tokens = list(parser.lex('{"name": "John", "age": 30, "gender": -1, "employed": '))

In [30]:
# Display the lexed tokens; whitespace does not appear because the grammar ignores it.
tokens

[Token('LBRACE', '{'),
 Token('NAME', '"name"'),
 Token('COLON', ':'),
 Token('STR', '"John"'),
 Token('COMMA', ','),
 Token('AGE', '"age"'),
 Token('COLON', ':'),
 Token('INT', '30'),
 Token('COMMA', ','),
 Token('GENDER', '"gender"'),
 Token('COLON', ':'),
 Token('INT', '-1'),
 Token('COMMA', ','),
 Token('EMPLOYED', '"employed"'),
 Token('COLON', ':')]

In [31]:
# Feed the lexed tokens to the interactive parser one at a time.
# In the recorded run this raises UnexpectedToken at the INT '-1' token, where
# the grammar expects LITERAL_GENDER, so the remaining tokens are never fed.
for token in tokens:
    interactive.feed_token(token)

UnexpectedToken: Unexpected token Token('INT', '-1') at line 1, column 39.
Expected one of: 
	* LITERAL_GENDER


In [276]:
# Ask the interactive parser which terminals it would accept next.
# NOTE(review): the recorded {'BOOL'} does not follow from the failed feed loop
# recorded above (which stopped at the "gender" value); it presumably comes
# from a different run or grammar version - confirm before relying on it.
interactive.accepts()

{'BOOL'}

In [277]:
# Lex a lone boolean value with leading whitespace; overwrites the earlier `tokens` list.
tokens = list(parser.lex(' true'))

In [367]:
# Trailing whitespace produces no tokens: the recorded result is a single LBRACE.
list(parser.lex('{  '))

[Token('LBRACE', '{')]

In [172]:
# `Token` is not imported by any cell visible in this notebook (only `Lark`
# is), so import it here to keep the cell runnable on a fresh kernel.
from lark import Token

# Hand-build an LBRACE token and feed it to the interactive parser.
# NOTE(review): in top-to-bottom order `interactive` has already been fed
# tokens by an earlier cell; recreate it with parser.parse_interactive("")
# first if a fresh parse is intended.
interactive.feed_token(Token('LBRACE', '{'))

In [174]:
# Terminals accepted next; the recorded {'NAME'} is what the grammar expects
# right after the opening brace.
interactive.accepts()

{'NAME'}

In [280]:
# List the terminal definitions Lark compiled from the grammar.
# NOTE(review): the recorded output shows LITERAL_GENDER with an extra `\-1`
# alternative that is absent from the grammar string in this notebook, so the
# output appears to predate the current grammar.
parser.terminals

[TerminalDef('BOOL', '(?:false|true)'),
 TerminalDef('STR', '"[^"]*"'),
 TerminalDef('INT', '-?\\d+'),
 TerminalDef('NAME', '"name"'),
 TerminalDef('AGE', '"age"'),
 TerminalDef('GENDER', '"gender"'),
 TerminalDef('EMPLOYED', '"employed"'),
 TerminalDef('LITERAL_GENDER', '(?:"Female"|"Male"|\\-1)'),
 TerminalDef('LBRACE', '\\{'),
 TerminalDef('RBRACE', '\\}'),
 TerminalDef('COLON', ':'),
 TerminalDef('COMMA', ','),
 TerminalDef('__IGNORE_0', '\\s+')]

In [285]:
# Inspect the pattern object of the BOOL terminal.
# NOTE(review): `_terminals_dict` is underscore-prefixed (private) on the Lark
# instance - presumably not a stable API; verify against the installed version.
parser._terminals_dict["BOOL"].pattern

'(?:false|true)'

In [368]:
import regex

def match_regex_prefix(prefix_string: str, regex_pattern: str) -> bool:
    """Report whether ``prefix_string`` matches ``regex_pattern`` fully or partially.

    The pattern is anchored with a leading ``^`` when it does not already start
    with one, compiled with the third-party ``regex`` module, and applied with
    ``fullmatch(..., partial=True)``.

    Args:
        prefix_string: Candidate text, possibly an incomplete match.
        regex_pattern: Pattern source understood by ``regex.compile``.

    Returns:
        True when ``fullmatch`` yields a match object (complete or partial),
        False when it yields None.
    """
    anchored = regex_pattern if regex_pattern.startswith('^') else '^' + regex_pattern
    compiled = regex.compile(anchored)
    result = compiled.fullmatch(prefix_string, partial=True)
    return bool(result)

In [360]:
import regex

# Scratch inputs for match_regex_prefix: a small pattern wrapped in optional
# leading/trailing whitespace, and a one-character candidate string.
p = 'ab(c|d)*e'
p = rf"\s*{p}\s*"

# NOTE(review): `pattern` is not used by any cell visible in this notebook.
pattern = regex.compile(p)
s = "a"

In [361]:
# NOTE(review): the recorded output is a (bool, Match) tuple, but
# match_regex_prefix as defined in this notebook returns a bare bool, so the
# output appears to predate the current definition.
match_regex_prefix(s, p)

(True, <regex.Match object; span=(0, 1), match='a', partial=True>)

In [330]:
# NOTE(review): `match` is not assigned in any cell visible in this notebook;
# this raises NameError on a fresh kernel unless a missing cell defines it.
match

<regex.Match object; span=(0, 10), match='   true   '>

In [202]:
# NOTE(review): the recorded run failed with AttributeError because `match` was
# None at that point (execution count 202, earlier than the cell above at 330).
# `match` itself is not assigned in any cell visible here.
match.partial

AttributeError: 'NoneType' object has no attribute 'partial'

In [190]:
# NOTE(review): `is_valid_prefix` is not defined in any cell visible in this
# notebook - presumably an earlier name for match_regex_prefix (same argument
# order); confirm and rename, or this raises NameError on a fresh kernel.
is_valid_prefix('"My name is', r'"[^"]*"')

True