In [2]:

import re
import pandas as pd

In [3]:
import pandas as pd
import re

def parse_duty_value(x):
    """
    - "Rs. 42 / kg" or "₹42/kg"    -> (42.0, "INR/kg")
    - "120/kg"                     -> (120.0, "INR/kg")
    - plain number "10"            -> (0.10, "percentage")
    - numeric 10 or 10.0           -> (0.10, "percentage")
    - NaN/empty                    -> (None, None)
    """
    if pd.isna(x):
        return None, None

    # Numeric → percentage
    if isinstance(x, (int, float)):
        return float(x) / 100.0, "percentage"

    s = str(x).strip()

    # 1) Currency-per-unit pattern
    m = re.match(r'(?:Rs\.?|₹)?\s*([\d\.]+)\s*/\s*(\w+)', s, flags=re.IGNORECASE)
    if m:
        value = float(m.group(1))
        unit  = m.group(2)
        return value, f"INR/{unit}"

    # 2) Pure number → percentage
    if re.fullmatch(r'[\d\.]+', s):
        return float(s) / 100.0, "percentage"

    return None, None

def format_duty(value, unit):
    if value is None or pd.isna(value):
        return ""
    if unit == "percentage":
        return f"{value * 100:.2f}%"
    else:
        # e.g. unit == "INR/kg"
        return f"{value:.2f} {unit}"

def normalize_and_prepare_display(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy().applymap(lambda x: x.strip() if isinstance(x, str) else x)
    duty_cols = [
        'Basic Duty (SCH)',
        'Basic Duty (NTFN)',
        'Specific Duty (Rs)',
        'IGST',
        '10% SWS',
        'Total duty with SWS of 10% on BCD',
        'Total Duty Specific',
        'Pref. Duty (A)',
    ]

    for col in duty_cols:
        if col not in df:
            continue
        parsed = df[col].apply(parse_duty_value)
        df[f'{col}_value'], df[f'{col}_unit'] = zip(*parsed)
        df[f'{col}_display'] = [
            format_duty(v, u) for v, u in zip(df[f'{col}_value'], df[f'{col}_unit'])
        ]

    return df

# Example:
# df_raw = pd.read_excel('chapter.xlsx')
# df_out = normalize_and_prepare_display(df_raw)
# df_out[['IGST_display','Specific Duty (Rs)_display']]
#   might show: ["10.00%",   "42.00 INR/kg"]


In [4]:


def extract_policy_links(cell: str):
    """
    From a string like "Restricted*1,2" or "Free*" or "Conditional2":
      - returns ("Restricted", ["*","1","2"])
      - returns ("Free", ["*"])
      - returns ("Conditional", ["2"])
    If no trailing markers, returns (original_text, []).
    """
    if pd.isna(cell) or not isinstance(cell, str):
        return None, []

    s = cell.strip()
    # split off trailing sequence of *, digits and commas
    m = re.match(r'^(?P<text>.*?)(?P<refs>[\*\d,]+)$', s)
    if not m:
        return s, []

    base, refs = m.group("text").strip(), m.group("refs")
    # normalize refs into list of individual tokens
    tokens = []
    for part in refs.split(','):
        part = part.strip()
        if not part:
            continue
        # if it's multiple asterisks, keep each
        if set(part) == {"*"}:
            tokens += ["*"] * len(part)
        else:
            # each character that is a digit
            tokens += list(part)
    return base, tokens

def attach_policy_links(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    for col in ("Import Policy", "Export Policy"):
        if col not in df:
            continue

        extracted = df[col].apply(extract_policy_links)
        df[f"{col}_text"], df[f"{col}_note_refs"] = zip(*extracted)

    return df




In [8]:
def compute_level(row):
    lvl_cell = row["Level"]
    desc     = row["Item Description"]
    # 1) Explicitly detect chapter rows by Description text
    if isinstance(desc, str) and desc.strip().lower().startswith("chapter"):
        return 0
    # 2) Otherwise, if Level is hyphens, count them
    if isinstance(lvl_cell, str) and re.fullmatch(r"-+", lvl_cell.strip()):
        return len(lvl_cell.strip())
    # 3) Fallback (if you have other non-chapter, non-hyphen rows):
    return None   # or you could default to a sensible number

In [9]:
def main_excel_read(input_file: str):
    # Read the Excel file
    df = pd.read_excel(input_file)
    df.columns = (
    df.columns
      .str.strip()                      # remove leading/trailing spaces
      .str.replace(r"\s+", " ", regex=True)  # collapse inner whitespace
    )

    # Normalize and prepare display
    df = normalize_and_prepare_display(df)
    # Attach policy links
    df = attach_policy_links(df) 
    df["level"] = df.apply(compute_level, axis=1)
    df.to_excel(f"{input_file}_with_links.xlsx", index=False)

In [None]:
main_excel_read("Chapter .xlsx")

  df = df.copy().applymap(lambda x: x.strip() if isinstance(x, str) else x)


In [None]:
# Hierarchical JSON Generator (Jupyter Notebook)
import json
import pandas as pd
import re 


def build_hierarchy(df, level_col='level', remarks_col='Remark'):
    """
    Construct a nested hierarchy based on the numeric Level column.
    Rows where Level is defined become nodes; rows where Level is blank are treated as notes.

    Each node contains all original columns plus `notes` and `children` lists.
    """
    # Strip whitespace from headers
    df = df.rename(columns=lambda c: c.strip())

    root = []
    level_nodes = {}
    last_node = None

    for _, row in df.iterrows():
        # Parse level
        lvl_raw = row.get(level_col)
        level = None
        if pd.notna(lvl_raw):
            try:
                level = int(lvl_raw)
            except ValueError:
                level = None

        remark = row.get(remarks_col)

        # Note row: attach to most recent node
        if level is None:
            if pd.notna(remark) and last_node is not None:
                last_node['notes'].append(str(remark).strip())
            continue

        # Node row: copy all columns
        node = {col: row[col] for col in df.columns}
        node['notes'] = []
        node['children'] = []

        if level == 0:
            root.append(node)
        else:
            parent = level_nodes.get(level - 1)
            if parent is not None:
                parent['children'].append(node)
            else:
                # fallback if no parent found
                root.append(node)

        # Track current node
        level_nodes[level] = node
        last_node = node

    return root

# --- Usage in Jupyter ---
# 1. Set your input and output file names:
input_path = 'Chapter 25.xlsx_with_links.xlsx'   # e.g. 'Chapter25.xlsx'
output_path = 'Chapter25_hierarchy.json'  # e.g. 'hierarchy.json'

# 2. Read Excel and build hierarchy
df = pd.read_excel(input_path)
hierarchy = build_hierarchy(df)

# 3. Write to JSON file
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(hierarchy, f, indent=2, ensure_ascii=False)

print(f"Hierarchical JSON generated and saved to '{output_path}'.")



Hierarchical JSON generated and saved to 'Chapter25_hierarchy.json'.
