### Permits Data Preprocessing

In [3]:
import json

# JSON text is UTF-8 by specification; pin the encoding so the read does not
# depend on the platform's default locale encoding.
with open('raw_data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)  # later cells index this as a list of permit dicts

<class 'dict'>
Plumbing Permit
Electrical Permit
Driveway / Sidewalks
Mechanical Permit


In [4]:

# Show what kind of object a single raw record is.
print(type(data[0]))

# Walk the raw records once, keeping the first record seen for each distinct
# permit type description (in order of first appearance).
permit_type_desc = []
raw_data_each_type = []
for permit in data:
    type_desc = permit['permit_type_desc']
    if type_desc in permit_type_desc:
        continue  # this permit type already has a representative record
    print(type_desc)
    raw_data_each_type.append(permit)
    permit_type_desc.append(type_desc)


<class 'dict'>
Plumbing Permit
Electrical Permit
Driveway / Sidewalks
Mechanical Permit


In [5]:
import json
# Pretty-print the first raw record to inspect the fields it carries.
first_raw_record = data[0]
print(json.dumps(first_raw_record, indent=4))

{
    "permittype": "PP",
    "permit_type_desc": "Plumbing Permit",
    "permit_number": "2025-074499 PP",
    "permit_class_mapped": "Residential",
    "permit_class": "Residential",
    "work_class": "Irrigation",
    "permit_location": "13104 GEARY DR",
    "description": "Irrigation for lawn",
    "tcad_id": "0451150901",
    "applieddate": "2025-03-07T00:00:00.000",
    "issue_date": "2025-06-18T00:00:00.000",
    "day_issued": "WEDNESDAY",
    "calendar_year_issued": "2025",
    "fiscal_year_issued": "2025",
    "issued_in_last_30_days": "No",
    "issue_method": "Permit Center",
    "status_current": "Active",
    "statusdate": "2025-07-16T00:00:00.000",
    "expiresdate": "2026-01-12T00:00:00.000",
    "original_address1": "13104 GEARY DR",
    "original_city": "AUSTIN",
    "original_state": "TX",
    "original_zip": "78652",
    "council_district": "5",
    "jurisdiction": "AUSTIN LTD",
    "link": {
        "url": "https://abc.austintexas.gov/web/permit/public-search-other?

##### Data Normalization for Permit Records
- Parses and standardizes datetime fields to UTC ISO format.
- Structures raw data into nested dictionaries for permit, address, parcel, and location details.
- Applies normalization to the first 100 records for consistent downstream use.


In [6]:
from datetime import datetime, timezone

# Normalize datetime fields to UTC ISO format
def parse_datetime(value):
    """Convert an ISO-8601 timestamp string to a UTC ISO-8601 string.

    Args:
        value: Timestamp text such as ``"2025-03-07T00:00:00.000"``. A trailing
            ``Z`` is read as UTC, an explicit offset is converted to UTC, and a
            timestamp without any offset is assumed to already be in UTC.

    Returns:
        The timestamp as produced by ``datetime.isoformat()`` with a ``+00:00``
        offset, or ``None`` when ``value`` is empty, is not a string, or cannot
        be parsed.
    """
    if not value:
        return None
    try:
        text = value.strip()
        # ``fromisoformat`` only accepts a literal "Z" suffix from Python 3.11
        # on, so spell the UTC designator as an explicit offset instead.
        if text.endswith(("Z", "z")):
            text = text[:-1] + "+00:00"
        dt = datetime.fromisoformat(text)
        if dt.tzinfo is None:
            # No offset in the source text: treat the wall-clock time as UTC.
            dt = dt.replace(tzinfo=timezone.utc)
        else:
            # An offset was given: convert the instant rather than relabel it.
            dt = dt.astimezone(timezone.utc)
        return dt.isoformat()
    except (AttributeError, TypeError, ValueError, OverflowError):
        # Non-string input or unparseable text: report "no value".
        return None


def _to_float(value):
    """Return ``value`` as a float, or ``None`` when it is missing or not numeric."""
    if value is None or value == "":
        return None
    try:
        return float(value)
    except (TypeError, ValueError):
        return None


# Normalize a single record
def normalize_record(record):
    """Reshape one flat raw permit record into a nested dictionary.

    Args:
        record: Dict for a single permit, read with the raw field names used
            below (``permittype``, ``permit_number``, ``statusdate``, ...).

    Returns:
        A dict with four sections: ``permit`` (identity, status, dates, link),
        ``address``, ``parcel`` and ``location``. Fields absent from ``record``
        come back as ``None``; date fields are passed through
        ``parse_datetime``; latitude/longitude are floats or ``None``.
    """
    # The link is read as a nested object holding a "url"; the key may also be
    # present with a null (or otherwise non-dict) value, so check before use.
    link = record.get("link")
    link_url = link.get("url") if isinstance(link, dict) else None

    normalized = {}

    normalized["permit"] = {
        "type_code": record.get("permittype"),
        "type_description": record.get("permit_type_desc"),
        "number": record.get("permit_number"),
        "class": record.get("permit_class"),
        "class_mapped": record.get("permit_class_mapped"),
        "work_class": record.get("work_class"),
        "description": record.get("description"),
        "status": {
            "current": record.get("status_current"),
            "date": parse_datetime(record.get("statusdate")),
            "completed": parse_datetime(record.get("completed_date")),
            "expires": parse_datetime(record.get("expiresdate")),
        },
        "dates": {
            "applied": parse_datetime(record.get("applieddate")),
            "issued": parse_datetime(record.get("issue_date")),
            "issue_day": record.get("day_issued"),
            "calendar_year": record.get("calendar_year_issued"),
            "fiscal_year": record.get("fiscal_year_issued"),
            "recent_30_days": record.get("issued_in_last_30_days"),
        },
        "method": record.get("issue_method"),
        "project_id": record.get("project_id"),
        "master_permit": record.get("masterpermitnum"),
        "link": link_url,
    }

    normalized["address"] = {
        # Fall back to the permit location when no original address is given.
        "line1": record.get("original_address1") or record.get("permit_location"),
        "city": record.get("original_city"),
        "state": record.get("original_state"),
        "zip": record.get("original_zip"),
        "council_district": record.get("council_district"),
        "jurisdiction": record.get("jurisdiction"),
    }

    normalized["parcel"] = {
        "tcad_id": record.get("tcad_id"),
        "legal_description": record.get("legal_description"),
    }

    normalized["location"] = {
        # A malformed coordinate becomes None instead of aborting the record.
        "latitude": _to_float(record.get("latitude")),
        "longitude": _to_float(record.get("longitude")),
    }

    return normalized

# Normalize first 100 records
normalized_records = list(map(normalize_record, data[:100]))

# Sanity check: how many records were produced, and what the first looks like.
print(len(normalized_records))
print(normalized_records[0])

100
{'permit': {'type_code': 'PP', 'type_description': 'Plumbing Permit', 'number': '2025-074499 PP', 'class': 'Residential', 'class_mapped': 'Residential', 'work_class': 'Irrigation', 'description': 'Irrigation for lawn', 'status': {'current': 'Active', 'date': '2025-07-16T00:00:00+00:00', 'completed': None, 'expires': '2026-01-12T00:00:00+00:00'}, 'dates': {'applied': '2025-03-07T00:00:00+00:00', 'issued': '2025-06-18T00:00:00+00:00', 'issue_day': 'WEDNESDAY', 'calendar_year': '2025', 'fiscal_year': '2025', 'recent_30_days': 'No'}, 'method': 'Permit Center', 'project_id': '13534570', 'master_permit': '13487648', 'link': 'https://abc.austintexas.gov/web/permit/public-search-other?t_detail=1&t_selected_folderrsn=13534570'}, 'address': {'line1': '13104 GEARY DR', 'city': 'AUSTIN', 'state': 'TX', 'zip': '78652', 'council_district': '5', 'jurisdiction': 'AUSTIN LTD'}, 'parcel': {'tcad_id': '0451150901', 'legal_description': None}, 'location': {'latitude': 30.11510451, 'longitude': -97.814

In [10]:
import json
# Pretty-print the first normalized record to check the nested layout.
first_normalized_record = normalized_records[0]
print(json.dumps(first_normalized_record, indent=4))

{
    "permit": {
        "type_code": "PP",
        "type_description": "Plumbing Permit",
        "number": "2025-074499 PP",
        "class": "Residential",
        "class_mapped": "Residential",
        "work_class": "Irrigation",
        "description": "Irrigation for lawn",
        "status": {
            "current": "Active",
            "date": "2025-07-16T00:00:00+00:00",
            "completed": null,
            "expires": "2026-01-12T00:00:00+00:00"
        },
        "dates": {
            "applied": "2025-03-07T00:00:00+00:00",
            "issued": "2025-06-18T00:00:00+00:00",
            "issue_day": "WEDNESDAY",
            "calendar_year": "2025",
            "fiscal_year": "2025",
            "recent_30_days": "No"
        },
        "method": "Permit Center",
        "project_id": "13534570",
        "master_permit": "13487648",
        "link": "https://abc.austintexas.gov/web/permit/public-search-other?t_detail=1&t_selected_folderrsn=13534570"
    },
    "address": 

##### Context Generation with GPT for Semantic Search
- Uses OpenAI's GPT model to generate concise, natural-language summaries of each permit record.
- Summaries include permit type, purpose, location, key dates, and jurisdiction based on available data.
- Enriches each normalized record with a `context_summary` field for improved semantic search and embeddings.


In [None]:
from openai import OpenAI
from dotenv import load_dotenv
load_dotenv()
import os, time
import traceback

# OpenAI client; the key is read from the OPENAI_KEY environment variable
# (load_dotenv() has been called above).
openai_api_key = os.getenv('OPENAI_KEY')
openai_client = OpenAI(api_key=openai_api_key)

def context_genration(record):
    """Ask the chat model for a one-sentence summary of a permit record.

    The record is serialized to JSON and embedded in the prompt. On success
    the summary is stored on the *same* dict under ``"context_summary"`` (the
    input is mutated in place) and that dict is returned.

    Args:
        record: JSON-serializable dict describing one permit.

    Returns:
        ``record`` with ``"context_summary"`` set, or ``None`` when building
        the prompt or calling the API raised (the traceback is printed).
    """
    try:
        # Leave out a summary from an earlier run so that re-running the cell
        # sends the same prompt as the first run did.
        prompt_record = {k: v for k, v in record.items() if k != "context_summary"}

        # NOTE: despite its name, this text is sent as the *user* message below.
        system_prompt = f"""You are a summarization assistant helping to generate a concise and meaningful description of a permit record for the purpose of semantic search and embedding.

        Your goal is to write a single-sentence natural language summary that captures the key context of the permit, including:

        - The type and classification of the permit (e.g., Plumbing, Electrical, Residential, Commercial)
        - The purpose of the work (from the description and work class fields)
        - The address or location where the permit was issued
        - Important dates such as applied date, issue date, and expiration if available
        - The jurisdiction, project ID, or district info if relevant
        - The applicant's intent (e.g., construction, irrigation, remodeling, demolition)

        Be sure to include only what is provided in the data and do not fabricate information. Write in clear and neutral tone.

        Example style:
        Residential plumbing permit issued in March 2025 for lawn irrigation at 13104 GEARY DR, Austin, under jurisdiction AUSTIN LTD, expiring in January 2026.

        Now generate a similar summary for this record:
        {json.dumps(prompt_record, indent=2)}
        """

        response = openai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": system_prompt}
            ]
        )

        summary = response.choices[0].message.content
        record["context_summary"] = summary
        return record
    except Exception:
        # Best effort: report the failure and let the caller skip this record.
        # Catching Exception rather than using a bare except keeps a kernel
        # interrupt (KeyboardInterrupt) able to stop a long batch.
        traceback.print_exc()
        return None


# Process the records
enriched_records = []
total_records = len(normalized_records)
for position, record in enumerate(normalized_records, start=1):
    print(f"Processing record {position}/{total_records}...")
    enriched = context_genration(record)
    # A falsy result means the summary call failed; skip that record.
    if enriched:
        enriched_records.append(enriched)
    time.sleep(0.5)  # Be kind to the API; adjust as needed

# Save to JSON
with open("enriched_permit_records.json", "w") as out_file:
    json.dump(enriched_records, out_file, indent=2)

print("✅ All records processed and saved.")

##### Embedding and Indexing Records in ChromaDB
- Loads enriched permit records and generates text embeddings using OpenAI’s `text-embedding-3-small` model.
- Each record is stored in a ChromaDB collection with cosine similarity indexing for semantic search.
- Metadata fields such as permit type, class, location, and year are indexed alongside embeddings for efficient filtering and querying.


In [None]:
import json
import chromadb
from chromadb import PersistentClient
import uuid
from openai import OpenAI
from dotenv import load_dotenv
load_dotenv()
import os
import traceback

# OpenAI client; the key is read from the OPENAI_KEY environment variable.
openai_api_key = os.getenv('OPENAI_KEY')
openai_client = OpenAI(api_key=openai_api_key)

# Initialize ChromaDB client
chroma_client = PersistentClient(path="./permits_chroma_storage")


# Create collection with metadata indexing
collection_settings = {"hnsw:space": "cosine"}  # ensures cosine similarity
collection = chroma_client.get_or_create_collection(
    name="permits_vector_data",
    metadata=collection_settings,
)

def clean(value, fallback=""):
    """Return ``value`` unless it is ``None``, in which case return ``fallback``."""
    if value is None:
        return fallback
    return value

# Function to get embedding from OpenAI
def get_embedding(text):
    """Embed ``text`` with the ``text-embedding-3-small`` model.

    Sends ``text`` as a one-element batch through ``openai_client`` and returns
    the ``embedding`` attribute of the first item in the response.
    """
    embedding_response = openai_client.embeddings.create(
        model="text-embedding-3-small",
        input=[text],
    )
    first_item = embedding_response.data[0]
    return first_item.embedding

# Load records (replace with your actual JSON loading)
with open("enriched_permit_records.json", "r") as enriched_file:
    records = json.loads(enriched_file.read())

# Prepare and insert into ChromaDB
for count, record in enumerate(records, start=1):
    try:
        print(f"Processing record {count}")
        # The full record (as JSON text) is both the stored document and the
        # text that gets embedded.
        text_block = json.dumps(record, ensure_ascii=False)
        embedding = get_embedding(text_block)

        # Flat metadata stored next to the embedding for filtering; missing
        # values are replaced by a fallback via clean().
        # NOTE(review): calendar_year falls back to the int 0 while the sample
        # records carry it as a string ("2025") — confirm the mixed types are
        # intended.
        metadata = {
            "type_description": clean(record["permit"].get("type_description")),
            "class_mapped": clean(record["permit"].get("class_mapped")),
            "work_class": clean(record["permit"].get("work_class")),
            "calendar_year": clean(record["permit"]["dates"].get("calendar_year"), 0),
            "status": clean(record["permit"]["status"].get("current")),
            "city": clean(record["address"].get("city")),
            "zip": clean(record["address"].get("zip")),
            "latitude": clean(record["location"].get("latitude"), 0.0),
            "longitude": clean(record["location"].get("longitude"), 0.0),
        }

        # Derive the ID from the document text (the namespace is arbitrary but
        # must stay fixed) and upsert, so re-running this cell against the
        # persistent store overwrites the same entries instead of adding a
        # duplicate copy of every record under a fresh random ID.
        record_id = str(uuid.uuid5(uuid.NAMESPACE_URL, text_block))

        collection.upsert(
            documents=[text_block],
            embeddings=[embedding],
            metadatas=[metadata],
            ids=[record_id]
        )
    except Exception:
        # Best effort: report the failure and continue with the next record.
        # Exception (not a bare except) leaves kernel interrupts working.
        traceback.print_exc()

##### 🔍 Semantic Search with Optional Filters
- Defines a function to perform semantic search over embedded permit records using OpenAI embeddings and ChromaDB.
- Supports optional metadata filters (e.g., year, city, permit type) via a dynamic `where` clause.
- Returns top matching records with document content, metadata, and the cosine distance reported by ChromaDB (exposed under the `similarity_score` key; lower means more similar).


In [11]:
from chromadb import PersistentClient
from openai import OpenAI
from dotenv import load_dotenv
load_dotenv()
import os

# Initialize ChromaDB client
# NOTE(review): this opens "./final_permits_chroma_storage", whereas the
# indexing cell writes to "./permits_chroma_storage" — confirm this is the
# intended store.
search_chroma_client = PersistentClient(path="./final_permits_chroma_storage")

# Create collection with metadata indexing
search_collection_settings = {"hnsw:space": "cosine"}  # ensures cosine similarity
search_collection = search_chroma_client.get_or_create_collection(
    name="permits_vector_data",
    metadata=search_collection_settings,
)

# OpenAI client used by get_embedding; the key is read from the OPENAI_KEY
# environment variable.
openai_api_key = os.getenv('OPENAI_KEY')
openai_client = OpenAI(api_key=openai_api_key)


# Function to get embedding from OpenAI
def get_embedding(text):
    """Embed ``text`` with the ``text-embedding-3-small`` model.

    Sends ``text`` as a one-element batch through ``openai_client`` and returns
    the ``embedding`` attribute of the first item in the response.
    """
    embedding_response = openai_client.embeddings.create(
        model="text-embedding-3-small",
        input=[text],
    )
    first_item = embedding_response.data[0]
    return first_item.embedding


def _build_where_clause(filters):
    """Translate a flat ``{field: value}`` mapping into a Chroma ``where`` filter.

    Entries whose value is ``None`` or blank are dropped. Returns ``None`` when
    nothing usable remains, the bare condition when exactly one remains, and an
    ``$and`` of all conditions otherwise.
    """
    if not filters:
        return None
    conditions = [
        {key: value}
        for key, value in filters.items()
        if value is not None and str(value).strip() != ""
    ]
    if not conditions:
        return None
    if len(conditions) == 1:
        # Chroma rejects "$and" with fewer than two expressions, so a single
        # filter must be passed on its own.
        return conditions[0]
    return {"$and": conditions}


def search_permits(query: str, filters: dict = None, top_k: int = 5):
    """Run a semantic search over the indexed permit records.

    Args:
        query: Free-text query; it is embedded with ``get_embedding``.
        filters: Optional ``{metadata_field: value}`` mapping. All non-empty
            entries must match (they are AND-ed together).
        top_k: Maximum number of results requested from the collection.

    Returns:
        A list of dicts with keys ``"document"``, ``"metadata"`` and
        ``"similarity_score"``. Despite its name, ``"similarity_score"`` holds
        the distance reported by the collection, rounded to 4 decimals.

    Raises:
        RuntimeError: If embedding the query or querying the collection fails;
            the original exception is attached as the cause.
    """
    try:
        # Embed the query
        query_embedding = get_embedding(query)

        # Perform query
        results = search_collection.query(
            query_embeddings=[query_embedding],
            n_results=top_k,
            where=_build_where_clause(filters),
            include=["documents", "metadatas", "distances"]
        )

        # Format output: results are nested per query, and only one query was
        # sent, so read index 0 of each field.
        output = [
            {
                "document": doc,
                "metadata": meta,
                "similarity_score": round(dist, 4),
            }
            for doc, meta, dist in zip(
                results["documents"][0],
                results["metadatas"][0],
                results["distances"][0],
            )
        ]
        return output

    except Exception as e:
        raise RuntimeError(f"Query failed: {e}") from e


- Example query: semantic search in action (every filter in the cell below is commented out, so this run is unfiltered).
- Displays matched documents, associated metadata, and cosine distances.


In [26]:
query = "Commercial electrical and plumbing"
# Uncomment any entry to restrict the search to matching metadata values.
filters = {
    # "type_description": "Plumbing Permit",
    # "calendar_year": "2025",
    # "city": "AUSTIN"
}
# filters = None

results = search_permits(query, filters)
for match in results:
    report = f"📄 {match['document']}\n📍 Metadata: {match['metadata']}\n🎯 Distance: {match['similarity_score']}\n"
    print(report)


📄 {"permit": {"type_code": "PP", "type_description": "Plumbing Permit", "number": "2025-002284 PP", "class": "Commercial", "class_mapped": "Commercial", "work_class": "Fireline", "description": "Install Underground Fire Line", "status": {"current": "Expired", "date": "2025-07-17T00:00:00+00:00", "completed": null, "expires": "2025-07-16T00:00:00+00:00"}, "dates": {"applied": "2025-01-08T00:00:00+00:00", "issued": "2025-01-16T00:00:00+00:00", "issue_day": "THURSDAY", "calendar_year": "2025", "fiscal_year": "2025", "recent_30_days": "No"}, "method": "Permit Center", "project_id": "13451791", "master_permit": null, "link": "https://abc.austintexas.gov/web/permit/public-search-other?t_detail=1&t_selected_folderrsn=13451791"}, "address": {"line1": "9741 E US 290 HWY SVRD EB", "city": "AUSTIN", "state": "TX", "zip": "78724", "council_district": null, "jurisdiction": "AUSTIN 2 MILE ETJ"}, "parcel": {"tcad_id": "0226410208", "legal_description": "ABS 690 SUR 54 SANDERS W H ACR 2.1545"}, "locat