In [None]:
import guidance
from guidance import models, gen, one_or_more, select, zero_or_more, regex, optional, capture

# Base name of the GGUF checkpoint; it is also interpolated into the result
# and mismatch file names written by later cells.
model_name = "your_model_name_here"

# Load "<model_name>.gguf" through guidance's LlamaCpp wrapper.
# NOTE(review): n_gpu_layers=-1 presumably requests full GPU offload and
# n_ctx=2048 the context window size -- confirm against the backend docs.
model = models.LlamaCpp(f"{model_name}.gguf", n_gpu_layers=-1, n_ctx=2048)

In [None]:
# Instruction text inserted under "### Instruction:" in the generation prompt.
# It lists the Topping / BurgerOrder / DrinkOrder / SideOrder constructor
# signatures the model may use and gives one worked input/output example.
# The string is runtime prompt text: every character (including indentation
# and the "\n" escapes) is sent to the model, so edit it deliberately.
instruction_generate_burger = """You are a helpful assistant. You have to take as input a customer order and output a list of the corresponding objects. You should use only the following classes in Python:
      class Topping:
            def __init__(self, name: str, qualifier: Optional[str] = None, negation: Optional[bool] = False) -> None:
            
      class BurgerOrder:
            def __init__(self, number: int = 1, size: Optional[str] = None, main_dish_type: Optional[str] = None, toppings: Optional[List[Topping]] = None) -> None
       
      class DrinkOrder:
            def __init__(self, number: int = 1, drink_type: Optional[str] = None, size: Optional[str] = None) -> None :
      
      class SideOrder:
            def __init__(self, number: int = 1, side_type: Optional[str] = None, size: Optional[str] = None) -> None :
      
      The output should be a list of those objects.\n
      Here's an example:
      'input': 'i would like a vegan burger with lettuce tomatoes and onions and a large order of sweet potato fries',
      'output': '[BurgerOrder(number=1, main_dish_type='vegan_burger', toppings=[Topping(name='lettuce'), Topping(name='tomato'), Topping(name='onion')]), SideOrder(number=1, side_type='french_fries', size='large')]\n',"""

In [None]:
import json
import spacy
from spacy.pipeline import EntityRuler
from spacy.language import Language
import os
import re

def parse_line(line):
    """Split one tab-separated alias line into ``(phrase, category)``.

    The line is stripped of surrounding whitespace and split on tab
    characters. Anything other than exactly two fields yields
    ``(None, None)``; otherwise the phrase is returned as-is and the
    category is stripped once more.
    """
    fields = line.strip().split('\t')
    if len(fields) == 2:
        phrase, category = fields
        return phrase, category.strip()
    return None, None

def get_all_ngrams(tokens, max_n):
    """Return the set of all space-joined n-grams of length 1..``max_n``.

    Args:
        tokens: Sequence of objects exposing a ``.text`` attribute.
        max_n: Largest n-gram length (in tokens) to generate.

    Returns:
        A set of strings, each the ``.text`` values of a contiguous run of
        tokens joined by single spaces.
    """
    words = [tok.text for tok in tokens]
    count = len(words)
    return {
        " ".join(words[lo:lo + size])
        for size in range(1, max_n + 1)
        for lo in range(count - size + 1)
    }

def init_pipeline(dataset="burger"):
    """Build the spaCy pipeline used for entity tagging of customer orders.

    Loads ``en_core_web_sm``, inserts an ``entity_ruler`` (matching on the
    LOWER attribute) before the statistical ``ner`` component, feeds it the
    alias patterns found under ``FoodOrderingDataset/data/<dataset>/alias``,
    and appends the ``disambiguate_size`` component after ``ner``.

    Side effect: sets the module-level ``DRINK_KEYWORDS`` and
    ``SIDE_KEYWORDS`` sets (lower-cased pattern phrases whose label starts
    with ``DRINK_TYPE`` / ``SIDE_TYPE``), which ``disambiguate_size`` reads.

    Args:
        dataset: Name of the sub-directory of ``FoodOrderingDataset/data``.

    Returns:
        The configured spaCy ``Language`` object.
    """
    global DRINK_KEYWORDS, SIDE_KEYWORDS

    def read_file_categories(food_type):
        # Collect one {"pattern", "label"} dict per valid line of every
        # .txt file in the alias directory.
        alias_dir = f"FoodOrderingDataset/data/{food_type}/alias"
        collected = []
        for name in os.listdir(alias_dir):
            if not name.endswith('.txt'):
                continue
            with open(os.path.join(alias_dir, name), 'r', encoding='utf-8') as handle:
                for raw in handle:
                    if not raw.strip():
                        continue
                    phrase, label = parse_line(raw)
                    if phrase and label:
                        collected.append({"pattern": phrase, "label": label})
        return collected

    def phrases_labelled(prefix):
        # Lower-cased phrases of all patterns whose label starts with prefix.
        return {
            entry["pattern"].lower()
            for entry in category_patterns
            if entry["label"].startswith(prefix)
        }

    nlp = spacy.load("en_core_web_sm")
    ner_ruler = nlp.add_pipe("entity_ruler", before="ner", config={"phrase_matcher_attr": "LOWER"})

    category_patterns = read_file_categories(dataset)
    ner_ruler.add_patterns(category_patterns)

    DRINK_KEYWORDS = phrases_labelled("DRINK_TYPE")
    SIDE_KEYWORDS = phrases_labelled("SIDE_TYPE")

    nlp.add_pipe("disambiguate_size", after="ner")
    return nlp

@Language.component("disambiguate_size")
def disambiguate_size(doc):
    """Relabel SIZE entities as drink or side sizes based on nearby keywords.

    For every entity whose label contains ``"SIZE"``, a token window that
    starts at the entity is grown one token at a time towards the end of
    the document. As soon as the window contains a phrase from
    ``DRINK_KEYWORDS`` the entity is relabelled ``"DRINK_" + label``; if it
    contains a phrase from ``SIDE_KEYWORDS`` instead, ``"SIDE_" + label``
    (drink is checked first for a given window). If the whole tail of the
    document yields nothing, the single token preceding the entity is added
    to the window and checked once more.

    SIZE entities for which no keyword is found are dropped from
    ``doc.ents``; all non-SIZE entities are kept unchanged.

    Context n-grams are lower-cased before comparison because the keyword
    sets are stored lower-cased and the entity ruler matches on LOWER;
    comparing raw token text would miss mixed-case input such as "Coke".

    Args:
        doc: The spaCy ``Doc`` being processed.

    Returns:
        The same ``doc`` with ``doc.ents`` replaced.
    """
    # Longest keyword (in whitespace-separated words) bounds the n-gram size.
    max_drink = max((len(phrase.split()) for phrase in DRINK_KEYWORDS), default=1)
    max_side = max((len(phrase.split()) for phrase in SIDE_KEYWORDS), default=1)
    max_n = max(max_drink, max_side)

    def label_prefix(start, end):
        # Return "DRINK_", "SIDE_" or None for the token window [start, end).
        window = list(doc[start:end])
        ngrams = {gram.lower() for gram in get_all_ngrams(window, max_n)}
        if ngrams & DRINK_KEYWORDS:
            return "DRINK_"
        if ngrams & SIDE_KEYWORDS:
            return "SIDE_"
        return None

    new_ents = []
    for ent in doc.ents:
        if "SIZE" not in ent.label_:
            new_ents.append(ent)
            continue

        prefix = None
        # Grow the window forward, from the entity alone to the end of doc.
        for end in range(ent.end, len(doc) + 1):
            prefix = label_prefix(ent.start, end)
            if prefix is not None:
                break

        # Fall back to one token of left context. Guarded so that an entity
        # at token 0 never produces a negative slice start.
        if prefix is None and ent.start > 0:
            prefix = label_prefix(ent.start - 1, len(doc))

        if prefix is not None:
            new_ents.append(
                spacy.tokens.Span(doc, ent.start, ent.end, label=prefix + ent.label_)
            )
        # else: unresolved SIZE entity is intentionally left out.

    doc.ents = tuple(new_ents)
    return doc

def process_NER(input_order):
    """Tag ``input_order`` with the module-level ``nlp`` pipeline.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, in the order the
        entities appear in ``doc.ents``.
    """
    return [(ent.text, ent.label_) for ent in nlp(input_order).ents]

# Build the pipeline once for the default "burger" dataset; process_NER reads
# this module-level name, and init_pipeline also sets DRINK_KEYWORDS /
# SIDE_KEYWORDS as a side effect.
nlp = init_pipeline()

In [None]:
import json
import re
from collections import defaultdict

# Destination for the per-example generations of this model.
file_path = f'FoodOrderingDataset/output/Ablation_NER_Burger_Results_{model_name}.json'

# Only the first MAX_EXAMPLES dataset entries are sent to the model.
MAX_EXAMPLES = 131

existing_data = []

with open('FoodOrderingDataset/processed_data/burger_dataset_disambiguation.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Pre-compute, for every dataset entry, the NER-augmented input text.
input_list = []
for obj in data:
    input_value = obj.get("input", "No input key found")
    output_generate = obj.get("output_generate", "No output key found")

    used_items_value = process_NER(input_value)
    used_items_value_decoupled = [f"{ent_text} - {ent_label}" for ent_text, ent_label in used_items_value]
    used_items_str = ', '.join(used_items_value_decoupled).lower()

    # The entities found by the NER pipeline are appended to the raw order
    # text so the model sees them as extra context.
    input_augmented_file = input_value + "\nItems Found: " + used_items_str
    input_list.append((input_value, input_augmented_file, output_generate, used_items_value, used_items_str))

for i, (initial_input, input_augmented, expected, used_items_value, used_items_str) in enumerate(input_list[:MAX_EXAMPLES]):
    if i % 10 == 0:
        print("Iteration " + str(i))

    # The backslash must be the very last character on its line to act as a
    # line continuation; a trailing space turns it into the invalid escape
    # "\ " and puts a stray backslash line at the top of every prompt.
    lm = model + f"""\
    Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
    ### Instruction:
    {instruction_generate_burger}
    ### Input:
    {input_augmented}

    ### Response:
    """

    # Single-line answer: generation stops at the first newline.
    ans = lm + gen("ans", max_tokens=150, stop='\n')

    existing_data.append({
        "input": initial_input,
        "input_augmented": input_augmented,
        "output": ans["ans"],
        "expected": expected,
        "output_NER": used_items_str
    })

with open(file_path, 'w', encoding='utf-8') as file:
    json.dump(existing_data, file, indent=4)


In [None]:
import re

def canonicalize_order(order_str):
    """Rewrite one ``XxxOrder(...)`` string into a canonical textual form.

    Steps:
      1. Drop ``size='None'`` / ``container_type='None'`` arguments.
      2. Inside every ``Topping(...)``, drop ``qualifier='None'`` and
         ``negation=False``.
      3. If the string looks like ``<Name>Order(<args>)``, split ``<args>``
         into top-level ``key=value`` pairs, drop pairs whose value is
         ``'None'``, and re-emit them in a fixed key order per order type,
         followed by any remaining keys in sorted order. Arguments without
         ``=`` are discarded.

    Strings that do not match ``<Name>Order(...)`` are returned after steps
    1-2 only.

    The argument splitter tracks both parentheses and square brackets, so
    commas between the elements of a ``toppings=[...]`` list are not treated
    as argument separators (otherwise list elements would be split apart and
    collide on the same pseudo-key, losing toppings).

    Args:
        order_str: Textual representation of a single order object.

    Returns:
        The canonicalized string.
    """
    order_str = re.sub(r",\s*(size|container_type)='None'", "", order_str)

    def clean_topping(match):
        inner = match.group(1)
        inner = re.sub(r",\s*qualifier='None'", "", inner)
        inner = re.sub(r",\s*negation=False", "", inner)
        return f"Topping({inner.strip()})"

    order_str = re.sub(r"Topping\((.*?)\)", clean_topping, order_str)

    m = re.match(r"(\w+Order)\((.*)\)", order_str)
    if not m:
        return order_str
    order_type, params_str = m.groups()

    # Split the argument list on commas that sit outside any (...) or [...].
    params = []
    current = []
    depth = 0
    for ch in params_str:
        if ch in "([":
            depth += 1
        elif ch in ")]":
            depth -= 1
        if ch == "," and depth == 0:
            params.append("".join(current).strip())
            current = []
        else:
            current.append(ch)
    tail = "".join(current).strip()
    if tail:
        params.append(tail)

    # Keep only key=value arguments; later duplicates of a key win.
    param_dict = {}
    for param in params:
        if '=' in param:
            key, value = param.split('=', 1)
            param_dict[key.strip()] = value.strip()

    # An argument explicitly set to the string 'None' is treated as absent.
    param_dict = {k: v for k, v in param_dict.items() if v != "'None'"}

    canonical_keys = {
        "BurgerOrder": ["number", "main_dish_type", "toppings"],
        "SideOrder": ["number", "side_type", "size"],
        "DrinkOrder": ["number", "drink_type", "size"],
    }
    canon_order = canonical_keys.get(order_type)
    if canon_order is None:
        # Unknown order type: keep the keys in the order they appeared.
        canon_order = list(param_dict.keys())

    extra = sorted(set(param_dict.keys()) - set(canon_order))
    new_params = [
        f"{key}={param_dict[key]}"
        for key in canon_order + extra
        if key in param_dict
    ]
    return f"{order_type}(" + ", ".join(new_params) + ")"

def split_orders(s):
    """Canonicalize a textual list of order objects.

    The input is stripped, one pair of enclosing square brackets is removed
    if present, and the remainder is split on commas that are not inside
    parentheses. Each piece is passed through ``canonicalize_order`` and the
    results are re-joined as ``"[a, b, ...]"``.

    Args:
        s: String such as ``"[BurgerOrder(...), SideOrder(...)]"``.

    Returns:
        The canonicalized list string.
    """
    text = s.strip()
    if text.startswith("[") and text.endswith("]"):
        text = text[1:-1]

    pieces = []
    buffer = []
    depth = 0
    for char in text:
        if char == "(":
            depth += 1
        elif char == ")":
            depth -= 1
        if char == "," and depth == 0:
            # Comma between two top-level orders: close the current piece.
            pieces.append("".join(buffer).strip())
            buffer = []
        else:
            buffer.append(char)

    remainder = "".join(buffer).strip()
    if remainder:
        pieces.append(remainder)

    return "[" + ", ".join(canonicalize_order(piece) for piece in pieces) + "]"

In [None]:
import json

def calculate_accuracy_and_save_mismatches(json_file, output_file):
    """Score generated orders against expected ones and dump the failures.

    Each record in ``json_file`` must have ``'output'`` and ``'expected'``
    string fields. Both are normalized with ``split_orders`` and compared
    for exact equality. Records that differ are written, unchanged, to
    ``output_file`` as an indented JSON list.

    Args:
        json_file: Path of the JSON results file to evaluate.
        output_file: Path where the mismatching records are written.

    Returns:
        Exact-match accuracy as a percentage in ``[0, 100]``; ``0.0`` when
        the results file contains no records.
    """
    with open(json_file, 'r', encoding='utf-8') as file:
        data = json.load(file)

    mismatches = [
        item for item in data
        if split_orders(item['output']) != split_orders(item['expected'])
    ]

    total = len(data)
    correct = total - len(mismatches)
    # Guard the empty case instead of dividing by zero.
    accuracy = (correct / total) * 100 if total else 0.0

    with open(output_file, 'w', encoding='utf-8') as outfile:
        json.dump(mismatches, outfile, indent=4)

    return accuracy

# Results file produced by the generation cell for this model.
json_file = f'FoodOrderingDataset/output/Ablation_NER_Burger_Results_{model_name}.json'
# Records whose canonicalized output differs from the expected one go here.
mismatch_file = f'FoodOrderingDataset/output/Ablation_NER_Burger_Mismatches_{model_name}.json'

# Compute exact-match accuracy (a percentage) and write the mismatch file.
accuracy = calculate_accuracy_and_save_mismatches(json_file, mismatch_file)
print(f"Accuracy: {accuracy:.2f}%")
print(f"Mismatches have been saved to: {mismatch_file}")