## Installing libraries

In [1]:
%pip install chromadb
%pip install -U sentence-transformers 
%pip install streamlit

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import requests
from bs4 import BeautifulSoup
import json
import os
import re
import time
import boto3
import chromadb
import numpy as np
import pickle
from chromadb.utils import embedding_functions
from sentence_transformers import SentenceTransformer

## Data Preparation

- ### Corpus: IRS Internal Revenue Manual (IRM) — Part 1, Section 1.1.6 “Chief Counsel”
- ### Access Method: Web scraping from IRS.gov (HTML pages)
- ### Goal: Extract structured, text-rich content for use in the Retrieval-Augmented Generation (RAG) chatbot.
The crawled texts are parsed to extract the sections of this page. Each section such as 1.1.6.1 and subsection such as 1.1.6.1.1 are considered as one document (chunk) and are stored in a json file. Final output is saved in this format for each chunk:
```
{
  "section": "1.1.6.4",
  "date": "12-16-2020",
  "title": "Executive Counsel",
  "text": "The Executive Counsel to the Chief Counsel..."
}
```

In [3]:
URL = "https://www.irs.gov/irm/part1/irm_01-001-006"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
}

In [4]:
def crawl_page(url):
    try:
        response = requests.get(url, headers=HEADERS)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Use main-content div that is confirmed from debug
        content_div = soup.find("div", id="main-content")
        text = content_div.get_text(separator="\n", strip=True) if content_div else ""

        return {
            "url": url,
            "text": text
        }
    except Exception as e:
        print(f"Failed to crawl {url}: {e}")
        return None

In [5]:
def split_sections(text):
    lines = text.splitlines()
    sections = []
    current = None

    for i in range(len(lines)):
        line = lines[i].strip()

        # Start of a new section
        if line.startswith("1.1.6.") and lines[i+1].strip().startswith('('):
            if current:
                # Save previous section
                current["text"] = "\n".join(current["text"]).strip()
                sections.append(current)

            section = line
            date = lines[i + 1].strip().replace("(", "").replace(")", "") if i + 1 < len(lines) else ""
            title = lines[i + 2].strip() if i + 2 < len(lines) else ""

            current = {
                "section": section,
                "date": date,
                "title": title,
                "text": [section]
            }

            # Skip next two lines (date and title)
            i += 2

        elif current:
            current["text"].append(line)

    # Add the last section
    if current:
        current["text"] = "\n".join(current["text"]).strip()
        sections.append(current)

    return sections

In [6]:
def filter_sections(sections, min_length=200):
    """
    Remove sections where the main text is too short.
    """
    return [sec for sec in sections if len(sec["text"]) >= min_length]

In [7]:
def save_sections_to_json(sections, filename="sections.json", folder="data"):
    # Create folder if it doesn't exist
    os.makedirs(folder, exist_ok=True)

    # Full path to file
    file_path = os.path.join(folder, filename)

    # Write to JSON
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(sections, f, ensure_ascii=False, indent=2)

    print(f"Saved {len(sections)} sections to {file_path}")

In [9]:
doc = crawl_page(URL)
all_sections = split_sections(doc['text'])
sections = filter_sections(all_sections)

save_sections_to_json(sections)

Saved 26 sections to data/sections.json


### Store Data on S3

In [10]:
s3 = boto3.client('s3')
bucket = "irs-rag-chatbot-dev"
local_path = "data/sections.json"
key = "irs-rag/sections.json"

s3.upload_file(local_path, bucket, key)
print(f"Uploaded to s3://{bucket}/{key}")

Uploaded to s3://irs-rag-chatbot-dev/irs-rag/sections.json


In [11]:
response = s3.get_object(Bucket=bucket, Key=key)
sections = json.loads(response['Body'].read().decode('utf-8'))

In [12]:
texts = [s['text'] for s in sections]
metadatas = [{"section": s["section"], "title": s["title"], "date": s["date"]} for s in sections]

## Embedding
- ### Generate dense vector embeddings for each document chunk using a Hugging Face model.
- ### These embeddings are later stored in a vector DB for semantic search in the RAG pipeline.

In [13]:
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(texts, show_progress_bar=True)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

## Vector DB
- ### Initialize and populate ChromaDB as the vector store to enable fast similarity search over embedded document chunks.
- ### The model takes as input the top-k most relevant document chunks retrieved from the vector store. I have selected k=3

In [14]:
# Initialize Chroma client
client = chromadb.Client()

# Optional: use an in-memory collection or persist to disk
collection = client.create_collection(name="irs_sections")

# Insert documents
for i, (text, meta, embedding) in enumerate(zip(texts, metadatas, embeddings)):
    collection.add(
        documents=[text],
        embeddings=[embedding],
        metadatas=[meta],
        ids=[str(i)]
    )

In [85]:
def query_chromadb(user_query, model, collection, top_k=3):
    query_embedding = model.encode(user_query)
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k
    )

    documents = results["documents"][0]
    metadatas = results["metadatas"][0]

    return documents, metadatas

## Answer Generation
- ### A natural language answer is generated using Amazon Bedrock's Titan Text Premier model.
- ### Along with the user's query, the context form vector DB and query are formatted into a prompt, which the LLM uses to synthesize a grounded, human-readable response.
- ### The output includes citations or references to the original source chunks.

In [16]:
bedrock = boto3.client("bedrock-runtime", region_name="us-east-1")

In [87]:
def estimate_token_count(text):
    # Approx: 1 token ~ 4 characters (common for LLMs)
    return int(len(text) / 4)


def generate_answer_titan(context, user_query):
    prompt = f"""You are an assistant helping answer questions using the IRS Internal Revenue Manual.

Use the context below to answer the question.
Each document in the context below starts with a section number.
If that section is used to answer a prompt, include inline section references like [1.1.6.4].
If there are multiple sections, include all of them.
Only use the sections from the context below for referencing.
Do not use generic citations like [1], [2], etc.
Return the answer with inline citations. 

Context:
{context}

Question: {user_query}

Answer:"""

    start_time = time.time()

    body = {
        "inputText": prompt,
        "textGenerationConfig": {
            "maxTokenCount": 2000,
            "temperature": 0.5,
            "topP": 0.5,
            "stopSequences": []
        }
    }

    response = bedrock.invoke_model(
        modelId="amazon.titan-text-premier-v1:0",
        body=json.dumps(body),
        contentType="application/json"
    )

    latency = time.time() - start_time

    result = json.loads(response["body"].read())
    output = result["results"][0]["outputText"]

    prompt_tokens = estimate_token_count(prompt)
    output_tokens = estimate_token_count(output)

    return output, {
        "latency": f"{round(latency, 2)} (s)",
        "prompt_tokens": prompt_tokens,
        "output_tokens": output_tokens
    }

## Examples

In [57]:
def test_query_response(user_query, model, collection):
    """
    Run a test query through the full RAG pipeline using ChromaDB and Titan Text.
    
    Args:
        user_query (str): The question to ask the chatbot.
        model: The Bedrock LLM client or callable used to generate the answer.
        collection: The Chroma collection used for document retrieval.
    
    Returns:
        None. Prints the answer and supporting citations.
    """

    # Step 1: Retrieve documents from Chroma
    context, citations = query_chromadb(user_query, model, collection)

    # Step 2: Generate answer using Titan Text
    answer, other = generate_answer_titan(context, user_query)

    # Step 3: Display results
    print("\nAnswer:\n")
    print(answer, "\n")
    for inf in other:
        print (f"{inf}: {other[inf]}")
    print("\nCitations:\n")
    for i, ref in enumerate(citations, 1):
        print(f"[{i}] Section {ref['section']}: {ref['title']}")

### Example 1

In [88]:
user_query = "What does the Associate Chief Counsel (Corporate) handle?"
test_query_response(user_query, model, collection)


Answer:

Corporate organizations, Transfers to controlled corporations, Distributions to shareholders, including dividends, return-of-capital and gain-producing distributions, stock redemptions, spin-offs, split-offs, split-ups, and partial and complete liquidations, Acquisitive and divisive corporate reorganizations, Corporate debt vs. equity, Carryovers and carrybacks of losses, credits and other corporate tax attributes, Bankruptcies, insolvency proceedings, and other debt restructurings involving corporations, Constructive ownership of corporate stock, Consolidated return issues, Other issues affecting affiliated and controlled groups of corporations. [1.1.6.5] 

latency: 3.49 (s)
prompt_tokens: 1435
output_tokens: 166

Citations:

[1] Section 1.1.6.5: Associate Chief Counsel (Corporate)
[2] Section 1.1.6.12: Associate Chief Counsel (Passthroughs & Special Industries)
[3] Section 1.1.6.13: Associate Chief Counsel (Procedure & Administration)


### Example 2

In [89]:
user_query = "What are the responsibilities of the Chief Counsel for the IRS?"
test_query_response(user_query, model, collection)


Answer:

The Chief Counsel for the Internal Revenue Service (IRS) provides advice to the IRS Commissioner on all matters pertaining to the interpretation, administration and enforcement of the Internal Revenue laws, represents the IRS in litigation, and provides all other legal support needed by the IRS to carry out its mission of serving America’s taxpayers. [1.1.6.1] 

latency: 1.94 (s)
prompt_tokens: 1196
output_tokens: 90

Citations:

[1] Section 1.1.6.1: Chief Counsel for the Internal Revenue Service
[2] Section 1.1.6.3: Deputy Chief Counsel (Operations)
[3] Section 1.1.6.4: Executive Counsel


### Example 3

In [90]:
user_query = "Which divisions report to the Deputy Chief Counsel (Technical)?"
test_query_response(user_query, model, collection)


Answer:

The Associate Chief Counsel (Corporate), The Associate Chief Counsel (Employee Benefits, Exempt Organizations, and Employment Taxes), The Associate Chief Counsel (Financial Institutions & Products), The Associate Chief Counsel (Income Tax & Accounting), The Associate Chief Counsel (International), and The Associate Chief Counsel (Passthroughs & Special Industries) [1.1.6.2] 

latency: 1.78 (s)
prompt_tokens: 1005
output_tokens: 94

Citations:

[1] Section 1.1.6.3: Deputy Chief Counsel (Operations)
[2] Section 1.1.6.2: Deputy Chief Counsel (Technical)
[3] Section 1.1.6.19: Division Counsel (Tax Exempt and Government Entities)


### Example 4

In [91]:
user_query = "What legal guidance does the Office of Chief Counsel provide for the IRS?"
test_query_response(user_query, model, collection)


Answer:

The Office of Chief Counsel provides legal guidance to the IRS by preparing legislative proposals, regulations, revenue rulings and procedures, actions on decisions, and other items of public guidance and legal advice. [1.1.6.1] 

latency: 1.23 (s)
prompt_tokens: 1255
output_tokens: 57

Citations:

[1] Section 1.1.6.1: Chief Counsel for the Internal Revenue Service
[2] Section 1.1.6.4: Executive Counsel
[3] Section 1.1.6.16: Division Counsel/Associate Chief Counsel (National Taxpayer Advocate Program)


### Example 5

In [92]:
user_query = "which cities the Area Counsels provide legal services for?"
test_query_response(user_query, model, collection)


Answer:

The Area Counsels provide legal services to the LB&I Division of the Internal Revenue Service Compliance Practice Areas and are located within each of five geographic offices of the Division Counsel (LB&I): Manhattan, Philadelphia, Downers Grove/Chicago, Houston/Dallas, and Oakland/San Francisco. [1.1.6.15.1] 

latency: 1.68 (s)
prompt_tokens: 799
output_tokens: 77

Citations:

[1] Section 1.1.6.14.1: Area Counsel (CT)
[2] Section 1.1.6.15.1: Area Counsel (LB&I)
[3] Section 1.1.6.17.1: Area Counsel (SB/SE)
