In [5]:
import json

# Read the HTS export as UTF-8 explicitly so parsing does not depend on the
# platform's default text encoding.
with open("htsdata (85).json", "r", encoding="utf-8") as f:
    data = json.load(f)


In [6]:
def build_hierarchy(data):
    """Flatten indented HTS records into chunks that carry their ancestor path.

    Records are read in order. Each record's ``indent`` is its depth; the
    descriptions of the records currently above it are joined with ``" > "``
    in front of its own description.

    Args:
        data: Iterable of dict records in document order. Every record must
            have an ``indent`` value convertible with ``int()``. The keys
            ``description``, ``htsno``, ``general``, ``special`` and ``other``
            are optional and default to an empty string.

    Returns:
        A list with one dict per input record, with keys ``htsno``,
        ``description`` (the joined ancestor path), ``rate_general``,
        ``rate_special`` and ``rate_other``.

    Raises:
        KeyError: If a record has no ``indent`` key.
        ValueError: If ``indent`` cannot be converted to an integer.
    """
    ancestors = []  # (level, description) pairs forming the current path
    results = []

    for item in data:
        level = int(item["indent"])
        # Prune by the stored level rather than by stack length: when the
        # input skips an indent level (or does not start at 0), length-based
        # pruning leaves the previous sibling on the path as a false parent.
        while ancestors and ancestors[-1][0] >= level:
            ancestors.pop()

        description = item.get("description", "")
        context = " > ".join([text for _, text in ancestors] + [description])

        results.append({
            "htsno": item.get("htsno", ""),
            "description": context,
            "rate_general": item.get("general", ""),
            "rate_special": item.get("special", ""),
            "rate_other": item.get("other", ""),
        })
        ancestors.append((level, description))

    return results

# Build one chunk per loaded record, each carrying its joined ancestor descriptions.
chunks = build_hierarchy(data)



In [7]:
def make_text(chunk):
    """Render a hierarchy chunk as five labelled lines of text.

    Args:
        chunk: Mapping with the keys ``htsno``, ``description``,
            ``rate_general``, ``rate_special`` and ``rate_other``.

    Returns:
        A string of the form ``"<label>: <value>"`` per line, in the order
        HTS Number, Description, General Rate, Special Rate, Other Rate,
        with no trailing newline.
    """
    labelled_fields = (
        ("HTS Number", chunk["htsno"]),
        ("Description", chunk["description"]),
        ("General Rate", chunk["rate_general"]),
        ("Special Rate", chunk["rate_special"]),
        ("Other Rate", chunk["rate_other"]),
    )
    return "\n".join(f"{label}: {value}" for label, value in labelled_fields)

# Render every chunk to its plain-text form, keeping the chunk order.
documents = list(map(make_text, chunks))



In [11]:
# Hand-picked stand-in for a retrieval step: the slice takes the documents at
# indices 3 through 9 (up to seven), separated by blank lines.
retrieved_chunks = "\n\n".join(documents[3:10])  # indices 3-9, not the first three documents

user_question = "What is the duty rate for synchronous motors valued not over $4 each?"



In [12]:
# Assemble the prompt: instructions first, then the retrieved context, then
# the question. The text begins and ends with a newline.
prompt = (
    "\n"
    "You are an expert on HTS tariff data. Use the following context to answer the question.\n"
    'If the answer is not found, say "Not available in the dataset."\n'
    "\n"
    "Context:\n"
    f"{retrieved_chunks}\n"
    "\n"
    f"Question: {user_question}\n"
)


In [13]:
# Display the rendered prompt for inspection.
print(prompt)



You are an expert on HTS tariff data. Use the following context to answer the question.
If the answer is not found, say "Not available in the dataset."

Context:
HTS Number: 8501.10.20.00
Description: Electric motors and generators (excluding generating sets): > Motors of an output not exceeding 37.5 W: > Of under 18.65 W: > Synchronous, valued not over $4 each
General Rate: 6.7%
Special Rate: Free (A,AU,B,BH,CL,CO,D,E,IL,JO,KR,MA,OM,P,PA,PE,S,SG)
Other Rate: 90%

HTS Number: 8501.10.40
Description: Electric motors and generators (excluding generating sets): > Motors of an output not exceeding 37.5 W: > Of under 18.65 W: > Other
General Rate: 4.4%
Special Rate: Free (A,AU,B,BH,CL,CO,D,E,IL,JO,KR,MA,OM,P,PA,PE,S,SG)
Other Rate: 35%

HTS Number: 8501.10.40.20
Description: Electric motors and generators (excluding generating sets): > Motors of an output not exceeding 37.5 W: > Of under 18.65 W: > Other > AC
General Rate: 
Special Rate: 
Other Rate: 

HTS Number: 
Description: Electric mo

In [1]:
# Use the explicit %pip magic instead of relying on automagic to resolve a bare `pip`.
%pip install langchain openai faiss-cpu tiktoken


Note: you may need to restart the kernel to use updated packages.


In [20]:
import json
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.docstore.document import Document
from PIL import Image, ImageDraw, ImageFont
import csv
from pathlib import Path

def collect_feedback(question, given_answer):
    """Ask the user whether an answer was correct and log corrections to CSV.

    The user is prompted on stdin. Only the reply ``no`` (case-insensitive,
    surrounding whitespace ignored) triggers a second prompt for the correct
    answer, which is appended to ``feedback_log.csv`` in the working
    directory. A header row is written first when the file does not exist yet.

    Args:
        question: The question that was asked.
        given_answer: The answer that was generated for it.

    Returns:
        None.
    """
    verdict = input("\nWas this answer correct? (yes/no): ").strip().lower()
    if verdict != "no":
        # Any other reply means there is nothing to record.
        return

    correct_answer = input("Please provide the correct answer: ").strip()
    feedback_file = "feedback_log.csv"

    # Check before opening: append mode creates the file, which would hide
    # whether the header row is still needed.
    needs_header = not Path(feedback_file).exists()
    with open(feedback_file, mode="a", newline="", encoding="utf-8") as log:
        rows = csv.writer(log)
        if needs_header:
            rows.writerow(["Question", "Generated Answer", "Correct Answer"])
        rows.writerow([question, given_answer, correct_answer])

    print(f"\n✅ Feedback saved to {feedback_file}. Thank you!")


# --- Step 1: Load and Prepare HTS JSON Data ---
def load_hts_data(json_file_path):
    """Read an HTS JSON file and wrap each entry in a ``Document``.

    Args:
        json_file_path: Path to a UTF-8 JSON file whose top-level value
            iterates over dict-like entries.

    Returns:
        A list with one ``Document`` per entry. Each ``page_content`` holds
        five labelled lines (HTS Number, Description, General Rate, Special
        Rate, Other Rate). A missing ``htsno`` renders as ``N/A``; the other
        missing fields render as empty strings.
    """
    with open(json_file_path, 'r', encoding='utf-8') as source:
        entries = json.load(source)

    def render(entry):
        # Pair each output label with the entry field it is read from.
        labelled = (
            ("HTS Number", entry.get("htsno", "N/A")),
            ("Description", entry.get("description", "")),
            ("General Rate", entry.get("general", "")),
            ("Special Rate", entry.get("special", "")),
            ("Other Rate", entry.get("other", "")),
        )
        return "\n".join(f"{label}: {value}" for label, value in labelled)

    return [Document(page_content=render(entry)) for entry in entries]

# --- Step 2: Chunk the Text ---
def split_documents(documents, chunk_size=512, chunk_overlap=50):
    """Split documents with a ``RecursiveCharacterTextSplitter``.

    Args:
        documents: Documents to split, e.g. the list returned by
            ``load_hts_data``.
        chunk_size: Forwarded to the splitter as ``chunk_size``. Defaults to
            512, the value that was previously hard-coded.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
            Defaults to 50, the value that was previously hard-coded.

    Returns:
        The result of the splitter's ``split_documents`` call on ``documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)

# --- Step 3: Embed and Index with FAISS ---
def build_vector_store(chunks):
    """Build a FAISS vector store from ``chunks`` using ``OpenAIEmbeddings``.

    Args:
        chunks: Documents to index, e.g. the output of ``split_documents``.

    Returns:
        The store returned by ``FAISS.from_documents``.
    """
    return FAISS.from_documents(chunks, embedding=OpenAIEmbeddings())

# --- Step 4: Set up RAG Pipeline ---
def setup_qa_pipeline(vectorstore):
    """Create a ``RetrievalQA`` chain on top of ``vectorstore``.

    The retriever is configured with ``k=4``, the LLM is ``OpenAI`` with
    ``temperature=0``, and the chain uses the ``"stuff"`` chain type with
    source documents included in its output.

    Args:
        vectorstore: Store exposing ``as_retriever``.

    Returns:
        The chain returned by ``RetrievalQA.from_chain_type``.
    """
    chain_options = {
        "retriever": vectorstore.as_retriever(search_kwargs={"k": 4}),
        "llm": OpenAI(temperature=0),
        "chain_type": "stuff",
        "return_source_documents": True,
    }
    return RetrievalQA.from_chain_type(**chain_options)

def save_proof_as_image(text, file_name="proof.png"):
    """Draw ``text`` in black on a white image and save it to ``file_name``.

    The image is as wide as the longest line plus 20 pixels. Each line takes
    15 pixels of height, with a further 20 pixels added in total. Text is
    drawn with the default PIL font starting 10 pixels from the top-left.

    Args:
        text: Text to render; lines are separated by ``"\\n"``.
        file_name: Output path handed to ``Image.save``.
    """
    font = ImageFont.load_default()
    lines = text.split("\n")

    # Measuring text needs a drawing context, so use a throwaway 1x1 image.
    measurer = ImageDraw.Draw(Image.new("RGB", (1, 1)))
    widest = max(measurer.textlength(line, font=font) for line in lines)
    canvas_size = (int(widest + 20), len(lines) * 15 + 20)

    img = Image.new("RGB", canvas_size, color="white")
    pen = ImageDraw.Draw(img)
    for row, line in enumerate(lines):
        pen.text((10, 10 + row * 15), line, font=font, fill="black")

    img.save(file_name)
    print(f"Screenshot saved as {file_name}")

# --- Step 5: Main Execution ---
def main():
    """Run the demo end to end for one fixed question.

    Loads and splits the HTS file, builds the FAISS index and QA chain, asks
    the question, prints the answer and the retrieved source documents, saves
    those sources as an image, and finally asks the user for feedback.
    """
    json_path = "htsdata (85).json"

    print("Loading and processing HTS data...")
    chunks = split_documents(load_hts_data(json_path))

    print("Building FAISS index...")
    vectorstore = build_vector_store(chunks)

    print("Setting up QA pipeline...")
    qa_pipeline = setup_qa_pipeline(vectorstore)

    question = "What is the HTS number for printed circuits?"
    print(f"\nQuestion: {question}")
    response = qa_pipeline.invoke({"query": question})
    answer = response["result"]
    print("\nAnswer:", answer)

    # Print the retrieved chunks so the answer can be checked against them.
    print("\n--- Source Document(s) Used ---")
    sources = response["source_documents"]
    for number, doc in enumerate(sources, start=1):
        print(f"\n[Document {number}]\n{doc.page_content}")

    # Save the same source text as an image.
    proof_text = "\n".join([doc.page_content for doc in sources])
    save_proof_as_image(proof_text)

    # Ask the user whether the answer was right and log any correction.
    collect_feedback(question, answer)



# Run the demo only when this code executes as the entry point, not on import.
if __name__ == "__main__":
    main()


Loading and processing HTS data...
Building FAISS index...
Setting up QA pipeline...

Question: What is the HTS number for printed circuits?

Answer:  The HTS number for printed circuits is not provided in the given context.

--- Source Document(s) Used ---

[Document 1]
HTS Number: 8514.32.10.00
Description: Of a kind used solely or principally for the manufacture of printed circuits or printed circuit assemblies
General Rate: Free
Special Rate: 
Other Rate: 35%

[Document 2]
HTS Number: 8514.31.10.00
Description: Of a kind used solely or principally for the manufacture of printed circuits or printed circuit assemblies
General Rate: Free
Special Rate: 
Other Rate: 35%

[Document 3]
HTS Number: 8514.39.10.00
Description: Of a kind used solely or principally for the manufacture of printed circuits or printed circuit assemblies
General Rate: Free
Special Rate: 
Other Rate: 35%

[Document 4]
HTS Number: 
Description: Printed circuit assemblies:
General Rate: 
Special Rate: 
Other Rate:
Sc