In [None]:
from grammarllm import generate_grammar_parameters, generate_text, get_parsing_table_and_map_tt, create_prompt, chat_template
from transformers import AutoTokenizer, AutoModelForCausalLM
import re

## *Hierarchical Classification Example*

In [None]:
# Grammar for the two-level labelling example.
# NOTE(review): 'S*' is presumably the start symbol and text wrapped in << >>
# a literal terminal -- confirm against the grammarllm grammar syntax.
# Each 'S*' alternative emits a coarse label (with a trailing space) and then
# continues with the non-terminal that lists its fine-grained labels.
productions = {
    "S*": [
        "<<positive >> A",
        "<<negative >> B",
        "<<neutral >> C",
    ],
    # Fine-grained labels reachable after "positive ".
    "A": [
        "<<happy>>",
        "<<peaceful>>",
        "<<joyful>>",
    ],
    # Fine-grained labels reachable after "negative ".
    "B": [
        "<<sad>>",
        "<<angry>>",
        "<<frustrated>>",
    ],
    # Fine-grained labels reachable after "neutral ".
    "C": [
        "<<calm>>",
        "<<indifferent>>",
        "<<unemotional>>",
    ],
}

In [None]:
# System prompt for the hierarchical classification task.
# It is assembled from adjacent string literals (instead of one triple-quoted
# literal) so that no source-code line break or indentation ends up in the
# middle of the sentence that is sent to the model.
system_prompt = (
    "You are a hierarchical classification assistant. Your task is to classify the user input "
    "into one of the following hierarchical categories as shown in the following examples\n\n"
)

# Few-shot demonstrations: alternating user inputs and the expected
# "<coarse label> <fine label>" assistant answers.
examples = [
    {"role": "user", "content": "I just got a promotion!"},
    {"role": "assistant", "content": "positive joyful"},

    {"role": "user", "content": "Nothing ever goes my way."},
    {"role": "assistant", "content": "negative frustrated"},

    {"role": "user", "content": "The lake was still and quiet."},
    {"role": "assistant", "content": "neutral calm"},

    {"role": "user", "content": "I miss my family so much."},
    {"role": "assistant", "content": "negative sad"},
]

# Combine the system prompt, the few-shot examples and the new input.
prompt = create_prompt(
    prompt_input="It's raining and I feel a bit down.",
    system_prompt=system_prompt,
    examples=examples,
)

# Load the causal LM and its tokenizer from the same checkpoint id.
MODEL_ID = "meta-llama/Llama-3.2-1B-Instruct"
model = AutoModelForCausalLM.from_pretrained(MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Derive the parsing table and the terminal/token mapping from the grammar.
pars_table, map_terminal_tokens = get_parsing_table_and_map_tt(tokenizer, productions)

# Build the two grammar objects that generate_text expects.
LogitProcessor, Streamer = generate_grammar_parameters(tokenizer, pars_table, map_terminal_tokens)

# Generate the output, constraining the model via GrammarLLM, and show it.
output = generate_text(model, tokenizer, prompt, LogitProcessor, Streamer, chat_template)
print(output)

## *Vocabulary Restriction Example*

In [None]:
# Words that may appear in the output; each one keeps its leading space.
allowed_words = [
    " Yes", " I'm", " very", " happy",
    " !", " so", " really", " excited",
    " today", " thanks", " you", " much",
    " great", " good", " fine", " amazing",
]

# One alternative per allowed word, each followed by 'S*' again.
# NOTE(review): every alternative recurses into 'S*' and there is no empty
# alternative, so the grammar itself never ends the sequence -- presumably the
# generation length is bounded elsewhere; confirm.
productions = {"S*": [f"<<{word}>> S*" for word in allowed_words]}

In [None]:

# System prompt for the vocabulary-restricted text generation task.
# The literal spans two source lines, so the line break and the indentation of
# the second line are part of the prompt text.
system_prompt = """You are a text generation assistant. When generating responses, you must use only the words that
                    appear in the provided examples below. You should not introduce any new words outside of those examples."""

# Few-shot demonstrations as (user question, assistant answer) pairs.
qa_pairs = [
    ("How are you?", "I'm very happy"),
    ("Is everything okay?", "Yes, I'm so excited!"),
    ("How was your day?", "I'm really happy today!"),
    ("Do you feel good?", "Yes, I feel great, thanks!"),
    ("What's up?", "I'm fine, thank you so much!"),
    ("Anything special today?", "I'm very excited and happy today!"),
]

# Expand every pair into the alternating user/assistant message dictionaries.
examples = []
for question, answer in qa_pairs:
    examples.append({"role": "user", "content": question})
    examples.append({"role": "assistant", "content": answer})

# Combine the system prompt, the few-shot examples and the new input.
prompt = create_prompt(
    prompt_input="Say something of positive:",
    system_prompt=system_prompt,
    examples=examples,
)

# Load the causal LM and its tokenizer from the same checkpoint id.
MODEL_ID = "meta-llama/Llama-3.2-1B-Instruct"
model = AutoModelForCausalLM.from_pretrained(MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Derive the parsing table and the terminal/token mapping from the grammar.
pars_table, map_terminal_tokens = get_parsing_table_and_map_tt(tokenizer, productions)

# Build the two grammar objects that generate_text expects.
LogitProcessor, Streamer = generate_grammar_parameters(tokenizer, pars_table, map_terminal_tokens)

# Generate the output, constraining the model via GrammarLLM, and show it.
output = generate_text(model, tokenizer, prompt, LogitProcessor, Streamer, chat_template)
print(output)

## *Structured Generation Example* 

In [None]:
# Grammar for RDF-triple-like output: a sequence of "SUBJ PRED OBJ ." statements.
# NOTE(review): 'S*' is presumably the start symbol, text wrapped in << >> a
# literal terminal and "ε" the empty alternative -- confirm against the
# grammarllm grammar syntax. Bare lower-case symbols (alfanum, ids, idc) and
# punctuation (<, >, ", ^^, @, .) presumably resolve through the 'regex_<name>'
# entries of the regex dictionary passed alongside this grammar -- confirm.
productions = {
        # Every statement is followed by 'S*' again; there is no empty alternative here.
        'S*': ["SUBJ PRED OBJ . S*"],
        'SUBJ': ["IRI", "BLANKNODE"],
        'PRED': ["IRI"],
        'OBJ': ["IRI", "BLANKNODE", "LITERAL"],
        'IRI': ["< URI >"],
        'BLANKNODE': ["<<_:>> NAME"],
        'LITERAL': ["\" STRING \" DESCRIPTION_LANG"],
        # Optional suffix of a literal: datatype IRI, language tag, or nothing.
        'DESCRIPTION_LANG': ["^^ IRI", "@ LANGTAG", "ε"],

        # Closed set of URIs the output may contain.
        'URI': [
            # People
            "<<http://example.org/people/MarioRossi>>",
            "<<http://example.org/people/LuisaVerdi>>",
            "<<http://example.org/people/GiovanniBianchi>>",

            # Properties
            "<<http://example.org/properties/hasAge>>",
            "<<http://example.org/properties/hasProfession>>",
            "<<http://example.org/properties/hasSalary>>",

            # Other types or datatypes
            "<<http://www.w3.org/2001/XMLSchema#decimal>>",
            "<<http://www.w3.org/2001/XMLSchema#integer>>",
            "<<http://www.w3.org/2001/XMLSchema#string>>"
        ],

        'STRING': ["alfanum STRING", "ε"],
        # A blank-node name is one 'ids' piece followed by any number of 'idc' pieces.
        'NAME': ["ids NAME_C"],
        'NAME_C': ["idc NAME_C", "ε"],

        # Language tags (each with a trailing space). Spanish uses the
        # ISO 639-1 code 'es'; 'sp' is not a registered language subtag.
        'LANGTAG': ['<<it >>', '<<en >>', '<<fr >>', '<<es >>']
}

In [None]:

# Regular expressions for the terminal tokens.
# NOTE(review): whether an unanchored pattern accepts partial matches depends
# on how grammarllm applies it (match / search / fullmatch) -- confirm. The
# single-symbol patterns below are anchored on both sides so that they accept
# exactly that symbol whichever way they are applied.
regex_alfanum = re.compile(r"[a-zA-Z0-9]+")  # e.g. "abc123"
regex_right_round_bracket = re.compile(r"^\)$")  # match only ')'
regex_left_round_bracket = re.compile(r"^\($")  # match only '('
regex_less_than = re.compile(r"^<$")  # match only '<'
regex_greater_than = re.compile(r"^>$")  # match only '>'
regex_double_quote = re.compile(r'^\"$')  # match only '"'
regex_datatype = re.compile(r"^\^\^$")   # match only '^^'
regex_langtag = re.compile(r"^@$")       # match only '@'
regex_dot = re.compile(r"^\.$")  # match only '.'

# Starting identifier: must start with a letter or an underscore
regex_ids = re.compile(r'[A-Za-z_][A-Za-z0-9_-]*')
# Continuation identifier: cannot start with a letter or an underscore.
# The negative lookahead rejects a leading letter or underscore, so in practice
# the first character is a digit or '-' (the '_' in the first class is never reached).
regex_idc = re.compile(r'(?![A-Za-z_])[0-9_-][A-Za-z0-9_-]*')


# Lookup table handed to the grammar tooling; keys follow the 'regex_<name>' form.
regex_dict = {
    'regex_alfanum': regex_alfanum,
    'regex_)': regex_right_round_bracket,
    'regex_(': regex_left_round_bracket,
    'regex_<': regex_less_than,
    'regex_>': regex_greater_than,
    'regex_"': regex_double_quote,
    'regex_^^': regex_datatype,
    'regex_@': regex_langtag,
    'regex_.': regex_dot,

    'regex_ids': regex_ids,
    'regex_idc': regex_idc,
}

In [None]:
# System prompt for the RDF example: tells the model to convert natural
# language sentences into RDF triples and lists three rules (URIs, literals,
# blank nodes), each with an inline sample.
# The text is a single triple-quoted literal, so every line break, indentation
# and trailing space below is part of the prompt.
system_prompt = """You are an assistant that converts natural language sentences into RDF triples syntax.

Follow these rules:

1. Use URIs (`<...>`) for:
- Identifiable entities such as people, properties, or concepts.
- Example:
    <http://example.org/people/MarioRossi> <http://example.org/properties/hasFriend> <http://example.org/people/LuisaVerdi> .

2. Use literals (`"..."`) for:
- Plain values such as professions, cities, names, numbers, dates, or booleans.
- Add datatypes (`^^<...>`) or language tags (`@lang`) if needed.
- Examples:
    "engineer"@en  
    "40"^^<http://www.w3.org/2001/XMLSchema#integer>

3. Use blank nodes (`_:`) only if:
- The object is anonymous and has internal structure (i.e., it has its own properties).
- Example:
    <http://example.org/people/MarioRossi> <http://example.org/properties/hasAddress> _:b1 .
    _:b1 <http://example.org/properties/street> "Via Roma" .
    _:b1 <http://example.org/properties/city> "Milano" .

Never use a blank node (`_:`) for simple values like "engineer" or "teacher". Use a literal (`"..."`) instead.

Now use the following examples to generate clean and correct RDF triples from user input."""

In [None]:
# Few-shot demonstrations as (natural-language sentence, expected triple) pairs.
rdf_demonstrations = [
    (
        "Mario Rossi is 40 years old.",
        '<http://example.org/people/MarioRossi> <http://example.org/properties/hasAge> "40" ^^<http://www.w3.org/2001/XMLSchema#integer> .',
    ),
    (
        "Luisa Verdi is an engineer.",
        '<http://example.org/people/LuisaVerdi> <http://example.org/properties/hasProfession> "engineer" @en .',
    ),
    (
        "Giovanni Bianchi earns 55000.",
        '<http://example.org/people/GiovanniBianchi> <http://example.org/properties/hasSalary> "55000" ^^<http://www.w3.org/2001/XMLSchema#decimal> .',
    ),
    (
        "Mario Rossi has an anonymous node as a contact.",
        '<http://example.org/people/MarioRossi> <http://example.org/properties/hasContact> _:ids .',
    ),
    (
        "Mario Rossi has the profession of teacher.",
        '<http://example.org/people/MarioRossi> <http://example.org/properties/hasProfession> "teacher" @en .',
    ),
]

# Expand every pair into the alternating user/assistant message dictionaries.
examples = [
    message
    for sentence, triple in rdf_demonstrations
    for message in (
        {"role": "user", "content": sentence},
        {"role": "assistant", "content": triple},
    )
]

# Combine the system prompt, the few-shot examples and the new input.
prompt = create_prompt(
    prompt_input="Giovanni Bianchi was born 30 years ago.",
    system_prompt=system_prompt,
    examples=examples,
)

# Load the causal LM and its tokenizer from the same checkpoint id.
MODEL_ID = "meta-llama/Llama-3.2-1B-Instruct"
model = AutoModelForCausalLM.from_pretrained(MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Derive the parsing table and the terminal/token mapping; unlike the calls
# without regexes, this one also passes the regex dictionary.
pars_table, map_terminal_tokens = get_parsing_table_and_map_tt(
    tokenizer,
    productions=productions,
    regex_dict=regex_dict,
)

# Build the two grammar objects that generate_text expects, then generate and show the output.
LogitProcessor, Streamer = generate_grammar_parameters(
    tokenizer, pars_table, map_terminal_tokens
)
output = generate_text(model, tokenizer, prompt, LogitProcessor, Streamer, chat_template)
print(output)