In [1]:
from flask import Flask, request, jsonify
from model_setup_new import NLPCoder # Assuming model_setup.py is in the same directory or accessible
import os
from utils import  split_xml_into_chunks, stitch_json_fragments, split_xml
from utils_v3 import create_xml_blueprint, stitch_json_from_blueprint
import json
import logging
from model_setup_new import NLPCoder # Assuming model_setup.py is in the same directory or accessible


# Logging setup: records at DEBUG and above are sent to two handlers at once.
logging.basicConfig(
    handlers=[
        logging.FileHandler("app.log"),  # copy written to the app.log file
        logging.StreamHandler(),         # copy emitted on the default stream
    ],
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
)

# Logger named after this module.
logger = logging.getLogger(__name__)

# Holds the loaded NLPCoder instance; stays None until load_model() succeeds.
model_inference = None

def load_model(model_path=None):
    """
    Load the NLPCoder model into the module-level ``model_inference`` global.

    The model is only constructed while ``model_inference`` is ``None``, so
    calling this again after a successful load does nothing.

    Args:
        model_path: Optional value passed to ``NLPCoder`` as
            ``model_identifier``. When omitted, the ``NLP_CODER_MODEL_PATH``
            environment variable is used, falling back to the original
            development path so existing ``load_model()`` calls keep working.

    Returns:
        None. The result is stored in ``model_inference``, which stays
        ``None`` if construction raised (the traceback is logged).
    """
    global model_inference
    if model_inference is not None:
        return

    if model_path is None:
        # Let each environment point at its own weights without editing code.
        model_path = os.environ.get(
            'NLP_CODER_MODEL_PATH',
            '/Users/maadi5/nlp_finetuning/master_curriculum_3000_weights_hint0.3_bestmodel_fixed',
        )

    logger.info("Loading NLPCoder model... This will happen only once.")

    # A missing path is only reported, not fatal: NLPCoder gets the final say
    # on whether the identifier is usable.
    if not os.path.exists(model_path):
        logger.warning("Model path '%s' does not exist. Please verify.", model_path)

    try:
        model_inference = NLPCoder(
            model_identifier=model_path,
            load_fine_tuned=True
        )
    except Exception:
        # Keep the full traceback in the log; callers detect failure by
        # checking that model_inference is still None.
        logger.exception("Error loading model from '%s'", model_path)
        model_inference = None
    else:
        logger.info("NLPCoder model loaded successfully.")

  from .autonotebook import tqdm as notebook_tqdm


--- Original Real-World XML ---

<root>
    <item>
        <name>Apple</name>
    </item>
</root>


--- Normalized XML (Matches Training Style) ---
<root>
  <item>
    <name>Apple</name>
  </item>
</root>

Normalization successful!


In [2]:
# Passed as `max_tokens` to create_xml_blueprint and as `list_split_threshold`
# to XmlToJsonPipeline by the endpoint functions in this notebook.
THRESHOLD_FOR_SPLIT = 101
# Number of chunks grouped into a single model_inference.infer_batch call.
MAX_BATCH_SIZE = 2


def infer_endpoint(input_text):
    """
    Run blueprint-based inference on an XML string.

    The text is split with ``create_xml_blueprint``, the chunks are sent to
    ``model_inference.infer_batch`` in groups of ``MAX_BATCH_SIZE``, and the
    per-chunk outputs are reassembled with ``stitch_json_from_blueprint``.
    The raw per-chunk outputs are also written to ``model_outputs.json``.

    Args:
        input_text: The XML text, passed directly (no Flask request is read).

    Returns:
        The stitched result on success;
        ``(jsonify(...), 500)`` when the model has not been loaded;
        ``None`` when no chunks were produced or when inference/stitching
        raised (the traceback is logged instead of being swallowed).
    """
    if model_inference is None:
        return jsonify({"error": "Model not loaded. Please check server logs."}), 500

    blueprint, split_chunks = create_xml_blueprint(xml_string=input_text, max_tokens=THRESHOLD_FOR_SPLIT)

    if not split_chunks:
        logging.warning("Input produced no chunks; nothing to run inference on.")
        return None

    try:
        logging.info(f"Input split into {len(split_chunks)} parts..")

        # Walk the chunks in insertion order, MAX_BATCH_SIZE at a time. A
        # single-chunk input goes through the same path as a split one.
        chunk_items = list(split_chunks.items())
        outputs_to_stitch = {}
        batch_starts = range(0, len(chunk_items), MAX_BATCH_SIZE)
        for batch_number, start in enumerate(batch_starts, start=1):
            batch = chunk_items[start:start + MAX_BATCH_SIZE]
            logging.info(f'Running inference on batch {batch_number}. Batch size: {len(batch)}')
            keys = [chunk_id for chunk_id, _ in batch]
            values_list = [chunk_text for _, chunk_text in batch]

            logging.debug(f'input_{batch_number - 1}: {values_list[0]}')
            outputs = model_inference.infer_batch(values_list)
            if len(outputs) != len(keys):
                # Refuse to pair outputs with the wrong chunk ids.
                raise ValueError(
                    f"infer_batch returned {len(outputs)} outputs for {len(keys)} inputs"
                )
            logging.debug(f'output_{batch_number - 1}: {outputs[0]}')

            outputs_to_stitch.update(zip(keys, outputs))

        # Keep the raw per-chunk outputs on disk for debugging; the context
        # manager guarantees the handle is flushed and closed.
        with open('model_outputs.json', 'w', encoding='utf8') as outputs_file:
            json.dump(outputs_to_stitch, outputs_file, ensure_ascii=False)

        return stitch_json_from_blueprint(blueprint=blueprint, processed_json_chunks=outputs_to_stitch)
    except Exception:
        # Previously a bare `except: pass` hid the real error and the function
        # then failed on an unassigned `output`; log the cause and return None.
        logging.exception("Inference failed; returning None.")
        return None

In [3]:
# Sample dashboard XML used as the inference input by later cells.
# NOTE(review): looks like a Tableau-style dashboard export — confirm provenance.
# The literal begins with a newline because the XML starts on the line after the opening quotes.
input_text=\
'''
<dashboard _.fcp.AccessibleZoneTabOrder.true...enable-sort-zone-taborder='true' name='Area_context_filter'>
    <style />
    <size maxheight='800' maxwidth='1000' minheight='800' minwidth='1000' />
    <datasources>
    <datasource caption='Orders (Super_Store_Sales)' name='federated.01m8s430ttzqwp11ntkqx1t7bri8' />
    </datasources>
    <datasource-dependencies datasource='federated.01m8s430ttzqwp11ntkqx1t7bri8'>
    <column datatype='string' name='[Category]' role='dimension' type='nominal' />
    <column caption='Sub Category' datatype='string' name='[Sub_Category]' role='dimension' type='nominal' />
    <column-instance column='[Category]' derivation='None' name='[none:Category:nk]' pivot='key' type='nominal' />
    <column-instance column='[Sub_Category]' derivation='None' name='[none:Sub_Category:nk]' pivot='key' type='nominal' />
    </datasource-dependencies>
    <zones>
    <zone h='100000' id='4' type-v2='layout-basic' w='100000' x='0' y='0'>
        <zone h='98000' id='7' param='horz' type-v2='layout-flow' w='98400' x='800' y='1000'>
        <zone h='98000' id='5' type-v2='layout-basic' w='82400' x='800' y='1000'>
            <zone h='98000' id='3' name='Simple_area_context_filter' w='82400' x='800' y='1000'>
            <zone-style>
                <format attr='border-color' value='#000000' />
                <format attr='border-style' value='none' />
                <format attr='border-width' value='0' />
                <format attr='margin' value='4' />
            </zone-style>
            </zone>
        </zone>
        <zone fixed-size='160' h='98000' id='6' is-fixed='true' param='vert' type-v2='layout-flow' w='16000' x='83200' y='1000'>
            <zone h='56250' id='8' name='Simple_area_context_filter' param='[federated.01m8s430ttzqwp11ntkqx1t7bri8].[none:Sub_Category:nk]' type-v2='filter' w='16000' x='83200' y='1000'>
            <zone-style>
                <format attr='border-color' value='#000000' />
                <format attr='border-style' value='none' />
                <format attr='border-width' value='0' />
                <format attr='margin' value='4' />
            </zone-style>
            </zone>
            <zone h='16000' id='9' name='Simple_area_context_filter' param='[federated.01m8s430ttzqwp11ntkqx1t7bri8].[none:Category:nk]' type-v2='filter' w='16000' x='83200' y='57250'>
            <zone-style>
                <format attr='border-color' value='#000000' />
                <format attr='border-style' value='none' />
                <format attr='border-width' value='0' />
                <format attr='margin' value='4' />
            </zone-style>
            </zone>
        </zone>
        </zone>
        <zone-style>
        <format attr='border-color' value='#000000' />
        <format attr='border-style' value='none' />
        <format attr='border-width' value='0' />
        <format attr='margin' value='8' />
        </zone-style>
    </zone>
    </zones>
    <devicelayouts>
    <devicelayout auto-generated='true' name='Phone'>
        <size maxheight='700' minheight='700' sizing-mode='vscroll' />
        <zones>
        <zone h='100000' id='11' type-v2='layout-basic' w='100000' x='0' y='0'>
            <zone h='98000' id='10' param='vert' type-v2='layout-flow' w='98400' x='800' y='1000'>
            <zone h='56250' id='8' mode='checkdropdown' name='Simple_area_context_filter' param='[federated.01m8s430ttzqwp11ntkqx1t7bri8].[none:Sub_Category:nk]' type-v2='filter' w='16000' x='83200' y='1000'>
                <zone-style>
                <format attr='border-color' value='#000000' />
                <format attr='border-style' value='none' />
                <format attr='border-width' value='0' />
                <format attr='margin' value='4' />
                <format attr='padding' value='0' />
                </zone-style>
            </zone>
            <zone h='16000' id='9' mode='checkdropdown' name='Simple_area_context_filter' param='[federated.01m8s430ttzqwp11ntkqx1t7bri8].[none:Category:nk]' type-v2='filter' w='16000' x='83200' y='57250'>
                <zone-style>
                <format attr='border-color' value='#000000' />
                <format attr='border-style' value='none' />
                <format attr='border-width' value='0' />
                <format attr='margin' value='4' />
                <format attr='padding' value='0' />
                </zone-style>
            </zone>
            <zone fixed-size='280' h='98000' id='3' is-fixed='true' name='Simple_area_context_filter' w='82400' x='800' y='1000'>
                <zone-style>
                <format attr='border-color' value='#000000' />
                <format attr='border-style' value='none' />
                <format attr='border-width' value='0' />
                <format attr='margin' value='4' />
                <format attr='padding' value='0' />
                </zone-style>
            </zone>
            </zone>
            <zone-style>
            <format attr='border-color' value='#000000' />
            <format attr='border-style' value='none' />
            <format attr='border-width' value='0' />
            <format attr='margin' value='8' />
            </zone-style>
        </zone>
        </zones>
    </devicelayout>
    </devicelayouts>
    <simple-id uuid='{2D1B3BF2-337D-4CC5-8B7B-007CBBACE9BA}' />
</dashboard>
'''

In [4]:
# Populate the module-level `model_inference` before any inference cell runs.
load_model()

Loading NLPCoder model... This will happen only once.
NLPCoder model loaded successfully.


In [5]:
# output = infer_endpoint(input_text=input_text)

In [6]:
from utils_v4 import XmlToJsonPipeline


def real_inference_adapter(chunks_to_process: dict) -> dict:
    """
    Inference callback in the shape XmlToJsonPipeline is given here.

    Splits the incoming ``{chunk_id: chunk_text}`` mapping into consecutive
    batches of ``MAX_BATCH_SIZE`` entries, runs each batch through
    ``model_inference.infer_batch`` and returns ``{chunk_id: model_output}``.
    An empty (or falsy) input yields an empty dict without touching the model.
    """
    if not chunks_to_process:
        return {}

    logging.info(f"Preparing to run inference on {len(chunks_to_process)} chunks.")

    # Partition the mapping, in insertion order, into batch-sized dicts.
    batches = []
    current_batch = {}
    for position, (chunk_id, chunk_text) in enumerate(chunks_to_process.items()):
        if position % MAX_BATCH_SIZE == 0 and position != 0:
            batches.append(current_batch)
            current_batch = {}
        current_batch[chunk_id] = chunk_text
    if current_batch:  # flush the trailing, possibly partial, batch
        batches.append(current_batch)

    results_by_chunk = {}
    for batch_number, batch in enumerate(batches, start=1):
        logging.info(f'Running inference on batch {batch_number}. Batch size: {len(batch)}')
        batch_ids = list(batch.keys())
        batch_inputs = list(batch.values())

        # One model call per batch.
        batch_outputs = model_inference.infer_batch(batch_inputs)

        # Outputs come back positionally; pair each with the id at the same index.
        for slot, chunk_id in enumerate(batch_ids):
            results_by_chunk[chunk_id] = batch_outputs[slot]

    logging.info("Model inference complete for all batches.")
    logging.info(f"outputs to stitch: {results_by_chunk}")
    return results_by_chunk


def infer_endpoint2(input_text):
    """
    Endpoint for performing inference using the new pipeline.

    Builds an ``XmlToJsonPipeline`` wired to ``real_inference_adapter`` with
    ``THRESHOLD_FOR_SPLIT`` as its list-split threshold, then runs it on
    ``input_text``.

    Args:
        input_text: The text handed to ``pipeline.process`` (passed directly;
            no Flask request is read).

    Returns:
        Whatever ``pipeline.process(input_text)`` returns on success;
        ``(jsonify(...), 500)`` when the model has not been loaded;
        ``None`` when the pipeline raises an ``Exception`` (the traceback is
        logged). ``KeyboardInterrupt``/``SystemExit`` are not intercepted, so
        a long-running inference can still be interrupted.
    """
    if model_inference is None:
        return jsonify({"error": "Model not loaded. Please check server logs."}), 500

    try:
        # 1. Instantiate the pipeline, injecting our real inference function
        pipeline = XmlToJsonPipeline(
            inference_function=real_inference_adapter,
            list_split_threshold=THRESHOLD_FOR_SPLIT
        )

        # 2. Run the entire process with one simple call
        return pipeline.process(input_text)
    except Exception:
        # Best-effort contract is kept (None on failure), but the cause is now
        # recorded instead of being silently discarded by a bare `except`.
        logging.exception("Pipeline inference failed; returning None.")
        return None

In [7]:
# Run the XmlToJsonPipeline-based endpoint on the sample XML and keep the result for inspection.
output = infer_endpoint2(input_text=input_text)

2025-07-24 14:10:39,096 - root - INFO - Preparing to run inference on 31 chunks.
2025-07-24 14:10:39,096 - root - INFO - Running inference on batch 1. Batch size: 2


--- Step 1: Splitting XML and Building Blueprint ---
Blueprint created with 31 nodes.
XML split into 31 chunks for processing.

{
  "chunk_entity_5cdd9714": "<dashboard _.fcp.AccessibleZoneTabOrder.true...enable-sort-zone-taborder=\"true\" name=\"Area_context_filter\">\n    <style /><size /><datasources /><datasource-dependencies /><zones /><devicelayouts /><simple-id /></dashboard>",
  "chunk_entity_6e012be9": "<style />",
  "chunk_entity_3d1516f7": "<size maxheight=\"800\" maxwidth=\"1000\" minheight=\"800\" minwidth=\"1000\" />",
  "chunk_entity_68128a15": "<datasources>\n    <datasource caption=\"Orders (Super_Store_Sales)\" name=\"federated.01m8s430ttzqwp11ntkqx1t7bri8\" />\n    </datasources>",
  "chunk_entity_79b4ddaa": "<datasource-dependencies datasource=\"federated.01m8s430ttzqwp11ntkqx1t7bri8\">\n    <column datatype=\"string\" name=\"[Category]\" role=\"dimension\" type=\"nominal\" />\n    <column caption=\"Sub Category\" datatype=\"string\" name=\"[Sub_Category]\" role=\"d

2025-07-24 14:10:42,286 - root - INFO - Running inference on batch 2. Batch size: 2
2025-07-24 14:10:45,769 - root - INFO - Running inference on batch 3. Batch size: 2
2025-07-24 14:10:50,499 - root - INFO - Running inference on batch 4. Batch size: 2
2025-07-24 14:10:54,852 - root - INFO - Running inference on batch 5. Batch size: 2
2025-07-24 14:10:58,914 - root - INFO - Running inference on batch 6. Batch size: 2
2025-07-24 14:11:04,337 - root - INFO - Running inference on batch 7. Batch size: 2
2025-07-24 14:11:10,901 - root - INFO - Running inference on batch 8. Batch size: 2
2025-07-24 14:11:17,681 - root - INFO - Running inference on batch 9. Batch size: 2
2025-07-24 14:11:20,052 - root - INFO - Running inference on batch 10. Batch size: 2
2025-07-24 14:11:22,397 - root - INFO - Running inference on batch 11. Batch size: 2
2025-07-24 14:11:26,331 - root - INFO - Running inference on batch 12. Batch size: 2
2025-07-24 14:11:35,232 - root - INFO - Running inference on batch 13. Ba

Inference complete.

--- Step 3: Reconstructing JSON from Blueprint ---
Reconstruction complete.


In [8]:
# Display the value returned by infer_endpoint2 in the previous cell.
output

{'dashboard': {'fcp.AccessibleZoneTabOrder.true...enable-sort-zone-taborder': 'true',
  'name': 'Area_context_filter',
  'style': {'style': []},
  'size': {'maxheight': '800',
   'maxwidth': '1000',
   'minheight': '800',
   'minwidth': '1000'},
  'datasources': {'datasource': ['Orders (Super_Store_Sales)',
    'Federated.01m8s430ttzqwp11ntkqx1t7bri8']},
  'datasource-dependencies': {'datasource': 'federated.01m8s430ttzqwp11ntkqx1t7bri8',
   'dfe7c9d9': ['Category', 'Sub Category', 'Derivation', 'Pivot', 'key']},
  'zones': {'zone': [{'h': '100000',
     'id': 4,
     'type-v2': 'layout-basic',
     'w': '100000',
     'x': '0',
     'y': '0',
     'zone': {'h': 98000,
      'id': 7,
      'param': 'horz',
      'type-v2': 'layout-flow',
      'w': '98400',
      'x': '800',
      'y': '1000',
      'zone': [{'h': 98000,
        'id': 5,
        'type-v2': 'layout-basic',
        'w': '82400',
        'x': '800',
        'y': '1000',
        'zone': {'h': 98000,
         'id': 3,
     

In [9]:
print(json.dumps({'chunk_entity_5cdd9714': '{"dashboard": {"fcp.AccessibleZoneTabOrder.true...enable-sort-zone-taborder": "true", "name": "Area_context_filter"}}', 'chunk_entity_6e012be9': '{"style": {"style": []}}', 'chunk_entity_3d1516f7': '{"size": {"maxheight": "800", "maxwidth": "1000", "minheight": "800", "minwidth": "1000"}}', 'chunk_entity_68128a15': '{"datasources": {"datasource": ["Orders (Super_Store_Sales)", "Federated.01m8s430ttzqwp11ntkqx1t7bri8"]}}', 'chunk_entity_79b4ddaa': '{"datasource-dependencies": {"datasource": "federated.01m8s430ttzqwp11ntkqx1t7bri8", "dfe7c9d9": ["Category", "Sub Category", "Derivation", "Pivot", "key"]}}', 'chunk_entity_854bdfc1': '{"zones": {"zone": []}}', 'chunk_entity_cde10cec': '{"zone": {"h": "100000", "id": 4, "type-v2": "layout-basic", "w": "100000", "x": "0", "y": "0"}}', 'chunk_entity_da85ff19': '{"zone": {"h": 98000, "id": 7, "param": "horz", "type-v2": "layout-flow", "w": "98400", "x": "800", "y": "1000"}}', 'chunk_entity_69dc841a': '{"zone": {"h": 98000, "id": 5, "type-v2": "layout-basic", "w": "82400", "x": "800", "y": "1000"}}', 'chunk_entity_b145202f': '{"zone": {"h": 98000, "id": 3, "name": "Simple_area_context_filter", "w": "82400", "x": "800", "y": "1000"}}', 'chunk_entity_719f11d3': '{"zone-style": {"formatattr": ["border-color", "border-style", "border-width", "margin"]}}', 'chunk_entity_6538d270': '{"zone": {"fixed-size": 160, "h": "98000", "id": 6, "is-fixed": "true", "param": "vert", "type-v2": "layout-flow", "w": "16000", "x": "83200", "y": "1000"}}', 'chunk_entity_27b8d8ed': '{"zone": {"h": 56250, "id": 8, "name": "Simple_area_context_filter", "param": "federated.01m8s430ttzqwp11ntkqx1t7bri8.", "type-v2": "filter", "w": "16000", "x": "83200", "y": "1000"}}', 'chunk_entity_9fc1c148': '{"zone-style": {"formatattr": ["border-color", "border-style", "border-width", "margin"]}}', 'chunk_entity_2e61da33': '{"zone": {"h": 16000, "id": "9", "name": "Simple_area_context_filter", "param": 
"federated.01m8s430ttzqwp11ntkqx1t7bri8.", "type-v2": "filter", "w": "16000", "x": "83200", "y": "57250"}}', 'chunk_entity_da50f19f': '{"zone-style": {"formatattr": ["border-color", "border-style", "border-width", "margin"]}}', 'chunk_entity_e3a30f9d': '{"zone-style": {"formatattr": ["border-color", "border-style", "border-width", "margin"]}}', 'chunk_entity_f5235ab5': '{"devicelayouts": {"devicelayout": []}}', 'chunk_entity_9846cba4': '{"devicelayout": {"auto-generated": "true", "name": "Phone"}}', 'chunk_entity_a85653ee': '{"size": {"maxheight": "700", "minheight": "700", "sizing-mode": "vscroll"}}', 'chunk_entity_57ba0f2f': '{"zones": {"zone": []}}', 'chunk_entity_1937bf98': '{"zone": {"h": 100000, "id": "11", "type-v2": "layout-basic", "w": "100000", "x": "0", "y": "0"}}', 'chunk_entity_ff5fb238': '{"zone": {"h": 98000, "id": 10, "param": "vert", "type-v2": "layout-flow", "w": "98400", "x": "800", "y": "1000"}}', 'chunk_entity_945bc548': '{"zone": {"h": 56250, "id": 8, "mode": "checkdropdown", "name": "Simple_area_context_filter", "param": "federated.01m8s430ttzqwp11ntkqx1t7bri8.", "type-v2": "filter", "w": "16000", "x": "83200", "y": "1000"}}', 'chunk_entity_6fce5a21': '{"zone-style": {"formatattr": ["border-color", "border-style", "border-width", "margin", "padding"]}}', 'chunk_entity_ce4dd761': '{"zone": {"h": 16000, "id": "9", "mode": "checkdropdown", "name": "Simple_area_context_filter", "param": "federated.01m8s430ttzqwp11ntkqx1t7bri8.", "type-v2": "filter", "w": 16000, "x": "83200", "y": "57250"}}', 'chunk_entity_aad4b5e6': '{"zone-style": {"formatattr": ["border-color", "border-style", "border-width", "margin", "padding"]}}', 'chunk_entity_070e8197': '{"zone": {"fixed-size": 280, "h": "98000", "id": "3", "name": "Simple_area_context_filter", "w": "82400", "x": "800", "y": "1000"}}', 'chunk_entity_387d540d': '{"zone-style": {"formatattr": ["border-color", "border-style", "border-width", "margin", "padding"]}}', 'chunk_entity_c0f3407b': '{"zone-style": 
{"formatattr": ["border-color", "border-style", "border-width", "margin"]}}', 'chunk_entity_3351fd4c': '{"simple_id": {"uuid": "2D1B3BF2-337D-4CC5-8B7B-007CBBACE9BA"}}'}, indent = 2))


{
  "chunk_entity_5cdd9714": "{\"dashboard\": {\"fcp.AccessibleZoneTabOrder.true...enable-sort-zone-taborder\": \"true\", \"name\": \"Area_context_filter\"}}",
  "chunk_entity_6e012be9": "{\"style\": {\"style\": []}}",
  "chunk_entity_3d1516f7": "{\"size\": {\"maxheight\": \"800\", \"maxwidth\": \"1000\", \"minheight\": \"800\", \"minwidth\": \"1000\"}}",
  "chunk_entity_68128a15": "{\"datasources\": {\"datasource\": [\"Orders (Super_Store_Sales)\", \"Federated.01m8s430ttzqwp11ntkqx1t7bri8\"]}}",
  "chunk_entity_79b4ddaa": "{\"datasource-dependencies\": {\"datasource\": \"federated.01m8s430ttzqwp11ntkqx1t7bri8\", \"dfe7c9d9\": [\"Category\", \"Sub Category\", \"Derivation\", \"Pivot\", \"key\"]}}",
  "chunk_entity_854bdfc1": "{\"zones\": {\"zone\": []}}",
  "chunk_entity_cde10cec": "{\"zone\": {\"h\": \"100000\", \"id\": 4, \"type-v2\": \"layout-basic\", \"w\": \"100000\", \"x\": \"0\", \"y\": \"0\"}}",
  "chunk_entity_da85ff19": "{\"zone\": {\"h\": 98000, \"id\": 7, \"param\": \"horz\