In [1]:
import json

def build_hts_chunks(items):
    """Build one text chunk per coded entry of a flat, indent-annotated listing.

    Each item is a dict read via the keys "htsno", "description" and "indent".
    A chunk is emitted for every item that has an htsno and a description
    other than "other" (case-insensitive); its text joins the descriptions of
    all enclosing levels with " → ".

    Args:
        items: iterable of dicts as loaded from the JSON file.

    Returns:
        list[str]: strings of the form "HTS Code: <htsno>\\nDescription: <path>".

    Raises:
        ValueError: if an "indent" value cannot be converted to int.
    """
    built = []
    path = []  # path[i] is the description currently open at indent level i

    for item in items:
        # `or ""` / `or 0` tolerate explicit nulls as well as missing keys.
        htsno = (item.get("htsno") or "").strip()
        desc = (item.get("description") or "").strip()
        indent = max(int(item.get("indent") or 0), 0)

        # Record EVERY item at its level, including "Other" and blank rows.
        # Skipping them before this point left the previous sibling on the
        # stack, so children of an "Other" row were filed under the wrong parent.
        del path[indent:]
        path.extend([""] * (indent - len(path)))  # pad if the indent jumps levels
        path.append(desc)

        # "Other"/blank rows and header rows without a code produce no chunk.
        if not desc or desc.lower() == "other":
            continue
        if not htsno:
            continue

        # Blank placeholders are dropped so they never show up as "→  →".
        full_desc = " → ".join(part for part in path if part)
        built.append(f"HTS Code: {htsno}\nDescription: {full_desc}")

    return built


# Explicit UTF-8: open() otherwise uses the platform's locale encoding.
with open("htsdata.json", encoding="utf-8") as f:
    data = json.load(f)

chunks = build_hts_chunks(data)

# Save as a JSON list of chunks
with open("rag_hts_chunks.json", "w", encoding="utf-8") as f:
    json.dump(chunks, f, indent=2)

print(f"✅ Created {len(chunks)} RAG-ready chunks.")


✅ Created 84 RAG-ready chunks.


In [3]:
import os
import json

data_folder = "Data"


def load_hts_chunks(file_path):
    """Read one JSON listing and return its text chunks.

    Items are dicts read via the keys "htsno", "description" and "indent".
    A chunk is emitted for every item that has an htsno and a description
    other than "other" (case-insensitive); the chunk text joins the
    descriptions of all enclosing indent levels with " → ".

    Args:
        file_path: path of the JSON file to read.

    Returns:
        list[str]: strings of the form "HTS Code: <htsno>\\nDescription: <path>".

    Raises:
        ValueError: if an "indent" value cannot be converted to int.
    """
    # Explicit UTF-8: open() otherwise uses the platform's locale encoding.
    with open(file_path, encoding="utf-8") as f:
        items = json.load(f)

    file_chunks = []
    path = []  # path[i] is the description currently open at indent level i

    for item in items:
        # `or ""` / `or 0` tolerate explicit nulls as well as missing keys.
        htsno = (item.get("htsno") or "").strip()
        desc = (item.get("description") or "").strip()
        indent = max(int(item.get("indent") or 0), 0)

        # Record EVERY item at its level, including "Other" and blank rows.
        # Skipping them before this point left the previous sibling on the
        # stack, so children of an "Other" row were filed under the wrong parent.
        del path[indent:]
        path.extend([""] * (indent - len(path)))  # pad if the indent jumps levels
        path.append(desc)

        # "Other"/blank rows and header rows without a code produce no chunk.
        if not desc or desc.lower() == "other":
            continue
        if not htsno:
            continue

        # Combine parent + current; blank placeholders are dropped.
        full_desc = " → ".join(part for part in path if part)
        file_chunks.append(f"HTS Code: {htsno}\nDescription: {full_desc}")

    return file_chunks


chunks = []

# Loop through all matching files in the folder. os.listdir() returns names in
# arbitrary order, so sort them to make the combined output reproducible.
for filename in sorted(os.listdir(data_folder)):
    if filename.startswith("htsdata") and filename.endswith(".json"):
        chunks.extend(load_hts_chunks(os.path.join(data_folder, filename)))

print(f"✅ Processed {len(chunks)} total chunks from all files.")

# Save combined output
with open("rag_hts_chunks_all.json", "w", encoding="utf-8") as f:
    json.dump(chunks, f, indent=2)


✅ Processed 24520 total chunks from all files.


In [4]:
import json
import re

# First cleaning pass: parse each "HTS Code: …\nDescription: …" string back
# into a {"hts_code", "description"} record and replace the arrow/colon
# separators in the description with commas.

# Load the previously processed file (explicit UTF-8 instead of locale default)
with open("rag_hts_chunks_all.json", "r", encoding="utf-8") as f:
    chunks = json.load(f)

cleaned_chunks = []

for entry in chunks:
    # Entries that are not chunk strings are ignored silently.
    if not isinstance(entry, str) or not entry.startswith("HTS Code:"):
        continue

    # The first newline separates the code line from the description line;
    # an entry without one cannot be parsed.
    hts_line, newline, desc_line = entry.partition("\n")
    if not newline:
        print(f"Skipping malformed entry: {entry[:50]}...")
        continue

    # Strip only the leading labels. str.replace() would also delete any
    # later occurrence of the same text inside the value.
    hts_code = hts_line[len("HTS Code:"):].strip()
    description = desc_line.strip()
    if description.startswith("Description:"):
        description = description[len("Description:"):].strip()

    # Collapse every run of arrows / colons / commas, together with the
    # whitespace between them, into a single ", ". Handling the run as a
    # whole keeps "A: → B" from turning into "A, , B".
    clean_desc = re.sub(r"(?:\s*[\u2192:,])+\s*", ", ", description)
    clean_desc = re.sub(r"\s{2,}", " ", clean_desc)    # collapse extra spaces
    # NOTE: capitalize() upper-cases the first character and lower-cases the rest.
    clean_desc = clean_desc.strip(", ").capitalize()

    cleaned_chunks.append({
        "hts_code": hts_code,
        "description": clean_desc
    })

# Save cleaned result
with open("cleaned_rag_hts_data.json", "w", encoding="utf-8") as f:
    json.dump(cleaned_chunks, f, indent=2)

print(f"✅ Cleaned {len(cleaned_chunks)} entries saved to 'cleaned_rag_hts_data.json'")


✅ Cleaned 24520 entries saved to 'cleaned_rag_hts_data.json'


In [5]:
import json
import re

def normalize_description(desc):
    """Return `desc` with comma/whitespace noise removed, in lower case.

    Runs of commas (optionally separated by whitespace) become one comma, all
    whitespace runs become a single space, every comma is followed by exactly
    one space, and leading/trailing spaces and commas are stripped.

    Args:
        desc: description string to normalize.

    Returns:
        str: the normalized, lower-cased description.
    """
    # Match the whole run of commas at once: a plain ",\s*," substitution
    # leaves ", ," behind whenever three or more commas are adjacent.
    desc = re.sub(r",(?:\s*,)+", ",", desc)
    desc = re.sub(r"\s+", " ", desc)             # Collapse multiple spaces
    desc = re.sub(r"\s*,\s*", ", ", desc)        # Normalize comma spacing
    desc = desc.strip(" ,")

    # Lowercase everything for uniformity
    return desc.lower()


# Load the semi-cleaned file (explicit UTF-8 instead of locale default)
with open("cleaned_rag_hts_data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

final_data = [
    {
        "hts_code": entry["hts_code"],
        "description": normalize_description(entry["description"]),
    }
    for entry in data
]

# Save final version
with open("final_rag_hts_data.json", "w", encoding="utf-8") as f:
    json.dump(final_data, f, indent=2)

print(f"✅ Final cleaned {len(final_data)} entries saved to 'final_rag_hts_data.json'")


✅ Final cleaned 24520 entries saved to 'final_rag_hts_data.json'
