In [1]:
import pandas as pd
import re

In [3]:

# 1) Load your data
#    Every cell is read as text (dtype=str); missing cells are then replaced
#    by "" so the string methods used further down never receive NaN.
df = pd.read_excel("test.xlsx", engine="openpyxl", dtype=str)
df = df.fillna("")

# 2) Node class
class Node:
    """One entry of the HS-code tree: a code, a description and child nodes."""

    def __init__(self, code, desc):
        self.code, self.desc = code, desc
        # Each instance gets its own list; children are appended while the
        # tree is being built.
        self.children = []

    def __repr__(self):
        # "<code> – <desc>" when a code is present, otherwise just the description.
        prefix = ""
        if self.code:
            prefix = self.code + ' – '
        return f"{prefix}{self.desc}"

# 3) Build the tree using a stack of (node, dash_cnt)
#    `stack` holds the chain of nodes that the next row may attach to, each
#    paired with the dash count it was pushed with.  A dashed row is attached
#    to the nearest stack entry (searching from the end) whose dash count is
#    strictly smaller than its own.
roots = []
stack = []  # elements are (node, dash_cnt)

for _, row in df.iterrows():
    raw_code = row["HS Code"].replace(" ", "")
    lvl_str  = row["Level"].strip()           # e.g. "", "-", "--", "---", etc.
    desc     = row["Item Description"].strip()
    dash_cnt = len(lvl_str)

    node = Node(raw_code, desc)

    # Case A: a new root (4-digit code, no dashes)
    if dash_cnt == 0 and len(raw_code) == 4:
        roots.append(node)
        stack = [(node, 0)]
        continue

    # Case B: a “container” or code with dashes
    if dash_cnt > 0:
        # find the proper parent: first in reversed stack with a smaller dash count.
        # The depth variable is deliberately NOT named `pd`: that would rebind
        # the pandas alias and break every later `pd.` call in the kernel.
        for parent, parent_dash_cnt in reversed(stack):
            if parent_dash_cnt < dash_cnt:
                parent.children.append(node)
                break
        else:
            # No suitable parent on the stack.  Fall back to the very first
            # root; if no root exists yet (e.g. the sheet starts with a dashed
            # row) promote this node to a root instead of raising IndexError.
            if roots:
                roots[0].children.append(node)
            else:
                roots.append(node)

        # prune out any deeper or equal levels, then push this node
        stack = [(n, d) for (n, d) in stack if d < dash_cnt]
        stack.append((node, dash_cnt))
        continue

    # Case C: no dashes but not a 4-digit code → footnote or continuation
    #    Attach to the very last node in stack
    #    NOTE(review): the row is pushed with dash count 0, so the next Case C
    #    row attaches to this one (consecutive footnotes nest under each
    #    other) — confirm that this is intended.
    if stack:
        stack[-1][0].children.append(node)
        stack.append((node, 0))
    else:
        # absolutely no context? treat as root
        roots.append(node)
        stack = [(node, 0)]

# 4) Pretty‐print
def _print(node, prefix="", is_last=True):
    conn = "└─ " if is_last else "├─ "
    print(prefix + conn + repr(node))
    child_prefix = prefix + ("    " if is_last else "│   ")
    for i, c in enumerate(node.children):
        _print(c, child_prefix, i == len(node.children) - 1)

def print_forest(roots):
    """Print every tree in `roots`.

    Each root is printed on its own line via repr(); its direct children are
    then rendered through `_print`, starting with an empty prefix.
    """
    for root in roots:
        # top-level
        print(repr(root))
        kids = root.children
        for position, kid in enumerate(kids, start=1):
            _print(kid, "", position == len(kids))

# Render the trees collected in `roots` to the cell output.
print_forest(roots)


Chapter 1 - Live Animals
0101 – LIVE HORSES, ASSES, MULES AND HINNIES
├─ Horses:
│   ├─ 01012100 – Pure-bred breeding animals
│   └─ 010129 – Other:
│       ├─ 01012910 – Horses for Polo
│       └─ 01012990 – Other
├─ 010130 – Asses:
│   ├─ 01013010 – Pure-bred breeding animals
│   ├─ 01013020 – Livestock
│   └─ 01013090 – Other
└─ 010190 – Other:
    ├─ 01019030 – Mules and hinnies as livestock
    └─ 01019090 – Other
        └─ w.e.f. 1 May 2022- BCD against tariff item 0101 21 00, the entry substituted by “Free” 
[Clause 98(b) of Finance Act 2022]
            └─ IGST on Live horses in 0101 21 00, 0101 29
[SNo 1 in Sch II of Ntfn 01-IGST/28.06.2017]
                └─ Live asses, mules and hinnies
[SNo(1) in Ntfn 02-IGST/28.06.2017]
                    └─ *Export Policy Condition 2: Restricted - All Horses are free except Kathiawari, Marwari and Manipuri breeds which are permitted for export under a Restricted Export Authorisation.


In [4]:
import json

def node_to_dict(node):
    """Recursively convert `node` into a plain dict.

    The result has the keys "code", "description" and "children", where
    "children" is the list of converted child nodes.
    """
    result = {"code": node.code, "description": node.desc}
    result["children"] = list(map(node_to_dict, node.children))
    return result

# `roots` is the list of your top-level nodes (e.g. Chapter 1, then 0101, etc.)
tree = list(map(node_to_dict, roots))

# Serialise once: the same text goes to the file and to the cell output.
json_text = json.dumps(tree, indent=2, ensure_ascii=False)

with open("hsn_tree.json", mode="w", encoding="utf-8") as f:
    f.write(json_text)

print(json_text)

[
  {
    "code": "",
    "description": "Chapter 1 - Live Animals",
    "children": []
  },
  {
    "code": "0101",
    "description": "LIVE HORSES, ASSES, MULES AND HINNIES",
    "children": [
      {
        "code": "",
        "description": "Horses:",
        "children": [
          {
            "code": "01012100",
            "description": "Pure-bred breeding animals",
            "children": []
          },
          {
            "code": "010129",
            "description": "Other:",
            "children": [
              {
                "code": "01012910",
                "description": "Horses for Polo",
                "children": []
              },
              {
                "code": "01012990",
                "description": "Other",
                "children": []
              }
            ]
          }
        ]
      },
      {
        "code": "010130",
        "description": "Asses:",
        "children": [
          {
            "code": "01013010",
        

In [5]:
import pandas as pd
import json
import os

# 1) Load your sheet
#    All cells are read as text; missing cells become "" (NaN → "").
df = pd.read_excel("test.xlsx", engine="openpyxl", dtype=str)
df = df.fillna("")

# 2) Node class, with a notes list
class Node:
    """A tree entry holding a code, a description, child nodes and notes."""

    def __init__(self, code, desc):
        self.code = code
        self.desc = desc
        # Fresh lists per instance: sub-entries, and note strings collected here.
        self.children, self.notes = [], []

    def __repr__(self):
        # Prefix the description with "<code> – " only when a code is present.
        label = f"{self.desc}"
        return self.code + ' – ' + label if self.code else label

# 3) Build tree, attaching notes under the current chapter
#    `roots` collects the chapter nodes; `stack` holds (node, dash_cnt) pairs
#    that the next tariff row may attach to.
roots = []
stack = []

for _, row in df.iterrows():
    raw_code = row["HS Code"].replace(" ", "")
    lvl_str  = row["Level"].strip()
    desc     = row["Item Description"].strip()
    dash_cnt = len(lvl_str)

    # Note row? (no code, no dashes)
    if not raw_code and dash_cnt == 0:
        # attach to the last chapter in roots; a note that precedes every
        # chapter has nothing to attach to and is skipped
        if roots:
            roots[-1].notes.append(desc)
        continue

    node = Node(raw_code, desc)

    # Chapter row? (4-digit + no dashes)
    if dash_cnt == 0 and len(raw_code) == 4:
        roots.append(node)
        stack = [(node, 0)]
        continue

    # Otherwise it’s a tariff item under the last smaller dash_cnt.
    # The depth variable is deliberately NOT named `pd`: that would rebind the
    # pandas alias and break every later `pd.` call in the kernel.
    for parent, parent_dash_cnt in reversed(stack):
        if parent_dash_cnt < dash_cnt:
            parent.children.append(node)
            break
    else:
        # fallback into last chapter; if no chapter exists yet (a tariff row
        # before any chapter row) promote the node to a root instead of
        # raising IndexError on the empty list
        if roots:
            roots[-1].children.append(node)
        else:
            roots.append(node)

    # update stack: drop entries at the same or a deeper level, push this node
    stack = [(n, d) for (n, d) in stack if d < dash_cnt]
    stack.append((node, dash_cnt))

# 4) Serialize to JSON, including notes under each chapter
def node_to_dict(node):
    """Recursively convert `node` into a plain dict.

    Always emits "code", "description" and "children"; a "notes" key is added
    only when the node's notes list is non-empty.
    """
    converted = [node_to_dict(child) for child in node.children]
    payload = {"code": node.code, "description": node.desc, "children": converted}
    if not node.notes:
        return payload
    payload["notes"] = node.notes
    return payload

# build final tree: one plain dict per chapter in `roots`
tree = list(map(node_to_dict, roots))

# 5) Write it out (the file name is relative, so it lands in the current
#    working directory, which is what the message below reports)
out_dir = os.getcwd()
with open("hsn_tree_with_notes.json", mode="w", encoding="utf-8") as f:
    f.write(json.dumps(tree, indent=2, ensure_ascii=False))

print(f"Written hsn_tree_with_notes.json to {out_dir}")


Written hsn_tree_with_notes.json to /home/nithin/work/cargoa/hsn_agent/Scripts


In [6]:
import pandas as pd
import json
import os

# 1) Load the sheet as text and replace NaNs with ""
df = pd.read_excel("test.xlsx", engine="openpyxl", dtype=str)
df = df.fillna("")

# 2) Extract the topmost chapter heading from the first row, then continue
#    with the remaining rows re-indexed from 0
top_heading = df.at[0, "Item Description"].strip()
df = df.iloc[1:].reset_index(drop=True)

# 3) Define a Node class (with notes)
class Node:
    """Tree entry with a code, a description, child nodes and attached notes."""

    def __init__(self, code, desc):
        self.code, self.desc = code, desc
        self.children = []  # sub-entries appended while the tree is built
        self.notes = []     # note strings attached to this entry

    def __repr__(self):
        # Without a code only the description is shown.
        if not self.code:
            return f"{self.desc}"
        return f"{self.code + ' – '}{self.desc}"

# 4) Initialize the top node and stack
#    top_node sits on the stack with depth -1, which is smaller than any dash
#    count, so it always qualifies as a parent of last resort.
top_node = Node("", top_heading)
stack = [(top_node, -1)]
current_chapter = None

# 5) Build the tree
for _, row in df.iterrows():
    raw_code = row["HS Code"].replace(" ", "")
    lvl_str  = row["Level"].strip()
    desc     = row["Item Description"].strip()
    dash_cnt = len(lvl_str)

    # a) Note row: no code & no dashes → attach to current chapter.
    #    Before the first chapter row there is no current chapter; such notes
    #    are kept on the top node instead of being silently dropped.
    if not raw_code and dash_cnt == 0:
        note_owner = current_chapter if current_chapter is not None else top_node
        note_owner.notes.append(desc)
        continue

    node = Node(raw_code, desc)

    # b) Chapter row: 4-digit + no dashes → child of top_node
    if dash_cnt == 0 and len(raw_code) == 4:
        top_node.children.append(node)
        current_chapter = node
        stack = [(top_node, -1), (node, 0)]
        continue

    # c) Tariff item: attach under nearest parent with fewer dashes.
    #    The depth variable is deliberately NOT named `pd`: that would rebind
    #    the pandas alias and break every later `pd.` call in the kernel.
    for parent, parent_dash_cnt in reversed(stack):
        if parent_dash_cnt < dash_cnt:
            parent.children.append(node)
            break
    else:
        # Not reachable while the (top_node, -1) entry stays on the stack;
        # kept as a safe fallback that cannot dereference a None chapter.
        top_node.children.append(node)

    # d) Update stack to reflect this new node: drop entries at the same or a
    #    deeper level, then push this node
    stack = [(n, d) for (n, d) in stack if d < dash_cnt]
    stack.append((node, dash_cnt))

# 6) Convert the Node tree to plain dicts (including notes)
def node_to_dict(node):
    """Recursively turn `node` into a plain dict.

    The keys are "code", "description" and "children" (converted child
    nodes), plus "notes" when the node carries at least one note.
    """
    data = {"code": node.code, "description": node.desc}
    data["children"] = list(map(node_to_dict, node.children))
    if node.notes:
        data.update(notes=node.notes)
    return data

# The whole hierarchy hangs off top_node, so a single dict is produced.
tree = node_to_dict(top_node)

# 7) Write out the JSON next to the current working directory
out_path = os.path.join(os.getcwd(), "hsn_tree_with_chapter.json")
with open(out_path, mode="w", encoding="utf-8") as f:
    f.write(json.dumps(tree, indent=2, ensure_ascii=False))

print(f"Written hierarchical JSON to: {out_path}")


Written hierarchical JSON to: /home/nithin/work/cargoa/hsn_agent/Scripts/hsn_tree_with_chapter.json
