<a href="https://colab.research.google.com/github/mukul-mschauhan/GenerativeAI/blob/main/Tax_Audit_Assistant.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Core Issue

Business Problem

Tax auditors and regulatory authorities must continuously interpret and apply a large, evolving body of tax laws, regulations, and official guidance.

Today, this work is largely manual and fragmented:

* Information is spread across acts, rules, circulars, FAQs, and amendments

* Regulatory updates occur frequently and asynchronously

* Research is time-consuming and difficult to scale

* High risk of missed updates, inconsistent interpretation, and rework

* Significant effort spent on finding information instead of analyzing it

Core Issue

Tax audit research is slow, manual, and inconsistent in an environment of rapidly changing regulations.

In [1]:
!pip -q install -U \
  gradio==4.44.0 \
  langchain==0.2.16 \
  langchain-community==0.2.16 \
  langchain-openai==0.1.23 \
  openai==1.42.0 \
  diskcache==5.6.3\
  langchain-tavily\
  langgraph

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: Cannot install langchain-community==0.2.16, langchain-tavily==0.1.5, langchain-tavily==0.1.6, langchain-tavily==0.2.0, langchain-tavily==0.2.1, langchain-tavily==0.2.10, langchain-tavily==0.2.11, langchain-tavily==0.2.12, langchain-tavily==0.2.13, langchain-tavily==0.2.14, langchain-tavily==0.2.15, langchain-tavily==0.2.16, langchain-tavily==0.2.2, langchain-tavily==0.2.3, langchain-tavily==0.2.4, langchain-tavily==0.2.5, langchain-tavily==0.2.6, langchain-tavily==0.2.7, langchain-tavily==0.2.8, langchain-tavily==0.2.9 and langchain==0.2.16 because these package versions have conflicting dependencies.[0m[31m
[0m[31mERROR: ResolutionImpossible: for help visit https://pip.pypa.io/en/latest/topics/dependency-resolution/#dealing-with-depe

In [11]:
import os
from google.colab import userdata
openai_api_key = userdata.get('OPENAI_API_KEY')

OPENAI_BASE_URL = "https://aibe.mygreatlearning.com/openai/v1"

os.environ["OPENAI_API_KEY"] = openai_api_key

os.environ["TAVILY_API_KEY"] = userdata.get('TAVILY_API_KEY')

### Why there is need of Disk Cache?

We use diskcache to avoid repeating the same expensive operations—like calling Tavily search and the OpenAI model—when the user runs similar queries multiple times during a session or after a notebook restart. It stores results on disk, so the app becomes faster, less expensive, and more stable, reduces API calls and rate-limit errors, and keeps the demo smooth in Colab where cells may be re-run frequently. That said, it’s not mandatory for correctness—if simplicity is the goal, an in-memory cache can be used instead.

In [3]:
# Diskcache Installation
!pip -q install -U diskcache
import sys, subprocess, pkgutil

print("diskcache found:", pkgutil.find_loader("diskcache") is not None)
!pip show diskcache

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/45.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h

  print("diskcache found:", pkgutil.find_loader("diskcache") is not None)


diskcache found: True
Name: diskcache
Version: 5.6.3
Summary: Disk Cache -- Disk and file backed persistent cache.
Home-page: http://www.grantjenks.com/docs/diskcache/
Author: Grant Jenks
Author-email: contact@grantjenks.com
License: Apache 2.0
Location: /usr/local/lib/python3.12/dist-packages
Requires: 
Required-by: 


In [4]:
!pip install -qU langchain-tavily langchain-openai langchain-community

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.8/84.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m36.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m489.1/489.1 kB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.7/64.7 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.0/51.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conf

# Core configuration + guardrails

In [30]:
import time, re, hashlib
from typing import List, Dict, Tuple, Optional
from dataclasses import dataclass
from diskcache import Cache

# LangChain Tavily tool (mandatory)
from langchain_tavily import TavilySearch


# LLM
from langchain_openai import ChatOpenAI

CACHE_DIR = "/content/tax_agent_cache"
cache = Cache(CACHE_DIR)

#DEFAULT_BASE_URL = "https://api.openai.com/v1"

def now_utc():
    return time.strftime("%Y-%m-%d %H:%M:%S UTC", time.gmtime())

# Preferred authority domains (heuristic scoring)
PREFERRED_DOMAINS = {
    "UAE": ["tax.gov.ae", "u.ae", "mof.gov.ae", "uaecabinet.ae", "adaa.gov.ae"],
    "India": ["incometax.gov.in", "cbic.gov.in", "gst.gov.in", "indiacode.nic.in", "egazette.nic.in"],
    "US": ["irs.gov", "treasury.gov", "govinfo.gov", "ecfr.gov"],
    "UK": ["gov.uk", "hmrc.gov.uk", "legislation.gov.uk"],
    "Other": []}

LOW_TRUST_HINTS = ["medium.com", "wordpress", "substack", "quora.com", "reddit.com"]

DISCLAIMER = "⚠️ **This is NOT legal or tax advice.** Verify with official authority publications."

# Responsible AI policy (what this assistant MUST do)
RESPONSIBLE_AI_BANNER = (
    "⚠️ **Disclaimer:** This is **NOT legal or tax advice**. "
    "Use this output for audit support only and verify with official authority publications.\n"
)

DISALLOWED = [
    "step-by-step filing instructions",
    "tax planning / minimization / avoidance strategies",
    "loophole exploitation guidance",
    "fabricating laws, sections, penalties, or dates",
]

STRICT_OUTPUT_FORMAT = """
Return the report with these exact sections, in order:

A) Applicable Sources
B) Key Provisions & Obligations
C) Exemptions & Thresholds
D) Penalties & Compliance Risks
E) Audit Checklist
F) Assumptions & Interpretation Limits
G) Citations"""

def domain_of(url: str) -> str:
    m = re.search(r"https?://([^/]+)/?", url)
    return (m.group(1).lower() if m else "").replace("www.", "")

def is_preferred(jurisdiction: str, dom: str) -> bool:
    prefs = PREFERRED_DOMAINS.get(jurisdiction, [])
    return any(dom.endswith(p) for p in prefs) or dom.endswith(".gov") or dom.endswith(".gov.uk")

def low_trust(dom: str) -> bool:
    return any(x in dom for x in LOW_TRUST_HINTS)

def authority_hint(jurisdiction: str, url: str) -> str:
    dom = domain_of(url)
    if is_preferred(jurisdiction, dom):
        return "Official / Government / Tax Authority (preferred)"
    if low_trust(dom):
        return "Low-trust web source (downgraded)"
    return "General web source (use with caution)"

def get_llm() -> ChatOpenAI:
    base_url = os.getenv("OPENAI_BASE_URL") or DEFAULT_BASE_URL
    return ChatOpenAI(
        model="gpt-4o-mini",
        temperature=0.1,          # stable + audit-friendly
        base_url=base_url,
        timeout=45,
        max_retries=2
    )

@dataclass
class WebHit:
    title: str
    url: str
    snippet: str
    domain: str
    authority: str


# Tavily search + ranking + caching

In [36]:
def run_search(jurisdiction: str, query: str, max_results: int):
    if not query or len(query.strip()) < 10:
        raise ValueError("Please enter a more detailed audit scenario (>=10 characters).")
    if not jurisdiction:
        raise ValueError("Please select a jurisdiction.")

    local_tool = TavilySearch(max_results=int(max_results))
    payload = local_tool.invoke({"query": f"{jurisdiction} tax law official guidance {query}"})

    results = (payload.get("results", []) or [])

    # Rank: govt/official first, then other sites; within each, rank by Tavily score
    def sort_key(r):
        url = r.get("url", "")
        dom = domain_of(url)

        official_bucket = 0 if is_preferred(jurisdiction, dom) else 1   # 0 = official first
        low_trust_penalty = 1 if low_trust(dom) else 0                  # push low-trust later
        tavily_score = float(r.get("score") or 0.0)                     # higher is better

        return (official_bucket, low_trust_penalty, -tavily_score)

    results = sorted(results, key=sort_key)

    insufficient_authority = (jurisdiction in ["UAE", "India", "US", "UK"]) and not any(
        is_preferred(jurisdiction, domain_of(r.get("url", ""))) for r in results[:5]
    )

    return results, insufficient_authority

In [37]:
def build_report(jurisdiction: str, query: str, company_context: str, strictness: float, results: list, insufficient_authority: bool):
    strict_mode = "Conservative" if strictness < 0.5 else "Broad (still evidence-based)"

    # Build evidence block from Tavily snippets only
    sources_block = []
    evidence_block = []
    for i, r in enumerate(results, start=1):
        url = r.get("url","")
        title = r.get("title") or url
        content = (r.get("content") or "").strip()
        sources_block.append(
            f"[{i}] {title}\nURL: {url}\nAuthority: {authority_hint(jurisdiction, url)}\nDate: Not found in snippet\n"
        )
        evidence_block.append(f"Source [{i}] snippet:\n{content}")

    prompt = f"""
You are an AI Tax Research & Audit Support Assistant for auditors/regulators.

Start your answer with:
"{DISCLAIMER}"
Also include: "Last verified on {now_utc()}"

User Inputs:
- Jurisdiction: {jurisdiction}
- Strictness: {strict_mode}
- Query/Audit Scenario: {query}
- Company Context: {company_context}

NON-NEGOTIABLE GUARDRAILS:
1) Use ONLY the provided snippets as evidence.
2) Do NOT invent section numbers, thresholds, penalties, dates, or authority statements.
3) If the snippets do not contain enough detail, explicitly say "Not specified in snippet" or
   "Insufficient authoritative guidance found" (especially if insufficient_authority=True).
4) Do NOT provide filing instructions or tax planning/avoidance strategies.

insufficient_authority = {insufficient_authority}

Sources (for citations):
{chr(10).join(sources_block)}

Evidence (snippets only):
{chr(10)+chr(10)}{(chr(10)+chr(10)).join(evidence_block)}

Return EXACTLY in this format:
{STRICT_OUTPUT_FORMAT}

Use inline citations [1], [2] across sections B–F and list them in G) Citations.
In F) clearly state limitation: this report is based on search snippets (no full-text retrieval).
""".strip()

    return llm.invoke(prompt).content

# Gradio App (Research + Generate Summary + Export MD)

In [42]:
import gradio as gr
import traceback

def ui_research(jurisdiction, query, company_context, strictness, max_sources):
    results, insufficient = run_search(jurisdiction, query, int(max_sources))

    preview_lines = [f"{DISCLAIMER}\n", f"Last verified: {now_utc_str()}\n"]
    if insufficient:
        preview_lines.append("⚠️ Insufficient authoritative guidance found in top sources; output will be cautious.\n")

    for i, r in enumerate(results, start=1):
        content = r.get("content", "") or ""
        preview_lines.append(
            f"**[{i}] {r.get('title','')}**\n"
            f"- {authority_hint(jurisdiction, r.get('url',''))}\n"
            f"- {r.get('url','')}\n"
            f"- Snippet: {(content[:240] + '…') if len(content)>240 else content}\n"
        )

    state = {
        "jurisdiction": jurisdiction,
        "query": query,
        "company_context": company_context or "",
        "strictness": float(strictness),
        "results": results,
        "insufficient": insufficient
    }

    # Enable generate button only after successful research
    btn_update = gr.update(interactive=bool(results))

    return state, "\n\n".join(preview_lines), "", "✅ Research completed. Now click **Generate Audit Summary**.", "", btn_update


def ui_generate(state):
    try:
        if not state or not state.get("results"):
            return "⚠️ No research data found. Please click **Research** first.", "⚠️ Missing state/results.", ""

        status = "⏳ Generating audit summary from Tavily snippets…"

        summary = build_report(
            jurisdiction=state["jurisdiction"],
            query=state["query"],
            company_context=state["company_context"],
            strictness=state["strictness"],
            results=state["results"],
            insufficient_authority=state["insufficient"]
        )

        return summary, "✅ Summary generated.", ""  # report, status, error

    except Exception:
        return "", "❌ Failed to generate summary.", "```text\n" + traceback.format_exc() + "\n```"


with gr.Blocks(title="AI Tax Research & Audit Support Assistant") as demo:
    gr.Markdown("## AI Tax Research & Audit Support Assistant (LangChain Tavily + Guardrails)")
    gr.Markdown(DISCLAIMER)

    st = gr.State({})

    with gr.Row():
        jurisdiction = gr.Dropdown(["UAE","India","US","UK","Other"], value="UAE", label="Jurisdiction")
        max_sources = gr.Slider(3, 10, value=6, step=1, label="Max Sources")

    query = gr.Textbox(label="Tax Query / Audit Scenario", lines=3)
    company_context = gr.Textbox(label="Optional Company Context", lines=2)
    strictness = gr.Slider(0.0, 1.0, value=0.2, step=0.1, label="Strictness")

    with gr.Row():
        btn_r = gr.Button("Research", variant="primary")
        btn_g = gr.Button("Generate Audit Summary", interactive=False)

    preview = gr.Markdown(label="Research Preview")
    report = gr.Markdown(label="Audit Summary")

    status = gr.Markdown(label="Status")
    error_box = gr.Markdown(label="Errors (if any)")

    # Research enables Generate (btn_g is output)
    btn_r.click(
        ui_research,
        inputs=[jurisdiction, query, company_context, strictness, max_sources],
        outputs=[st, preview, report, status, error_box, btn_g]
    )

    # Generate fills the report + status + errors
    btn_g.click(
        ui_generate,
        inputs=[st],
        outputs=[report, status, error_box]
    )

demo.launch(share=True, debug=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://15dc4eaff7df8de7bc.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7863 <> https://15dc4eaff7df8de7bc.gradio.live


