In [49]:
import pandas as pd
import numpy as np

import os
from IPython.display import display, Markdown
import re
import json

import chromadb
from chromadb.config import Settings
from chromadb import PersistentClient

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sentence_transformers import SentenceTransformer
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

import openai

In [50]:
from config import TOKEN_HF, OPENAI_API_KEY

In [51]:
ibc_path = './data/IBC-2018 ICC International Building Code conv_02.txt'

with open(ibc_path, 'r') as f:
    raw_text = f.read()

In [52]:
# start reading doc from a line
start_marker = '[A] 101 .1 Title. These regulations shall be known as'
start_index = raw_text.find(start_marker)
if start_index == -1:
    raise ValueError("Start marker not found in the text.")
text = raw_text[start_index:]

# remove nozy lines
lines = text.splitlines()
cleaned_lines = []
for line in lines:
    if 'EDUFIRE.IR' in line or 'Telegram EDUFIRE_IR' in line:
        continue
    if line.isupper() and re.search(r'\b[A-Z]{2,} [A-Z]{2,}', line):
        continue
    cleaned_lines.append(line)

cleaned_text = '\n'.join(cleaned_lines)

# clean: [BF ] 1705 .14 .4 .3 → [BF] 1705.14.4.3
cleaned_text = re.sub(
    r'\[([A-Z]{1,5})\s*\]\s*(\d+)\s*((?:\.\s*\d+)+)',
    lambda m: f"[{m.group(1)}] {m.group(2)}" + ''.join(f".{num}" for num in re.findall(r'\d+', m.group(3))),
    cleaned_text
)

# normalisation: [F] FIRE ALARM SYSTEM → [F] Fire Alarm System
cleaned_text = re.sub(
    r'(\[[A-Z]{1,5}\])\s+([A-Z][A-Z ]{3,})',
    lambda m: f"{m.group(1)} {m.group(2).title()}",
    cleaned_text
)

# remove artifacts
cleaned_text = cleaned_text.replace('?', 'no space')

# remove white spaces
cleaned_text = re.sub(r'(?<=\d)\s+(?=\d)', '', cleaned_text)       # 415. 11 → 415.11
cleaned_text = re.sub(r'(?<=\.)\s+(?=\d)', '', cleaned_text)       # . 1 → .1
cleaned_text = re.sub(r'(?<=\d)\s+\.(?=\d)', r'.', cleaned_text)   # 1 .3 → 1.3

# cut to paragraphs
paragraphs = re.split(
    r'(?=\[[A-Z]{1,5}\](?: \d+\.\d+(?:\.\d+)*)'
    r'|(?<=\]) [A-Z][a-z]+(?: [A-Z][a-z]+)*'
    r'|\n\d+\.\d+(?:\.\d+)+)',
    cleaned_text
)

paragraphs = [p.strip() for p in paragraphs if len(p.strip()) > 10]

# join paragraph lines
joined_paragraphs = []
for para in paragraphs:
    lines = para.splitlines()
    joined = ' '.join(line.strip() for line in lines)
    joined = re.sub(r'\s+([.,:;!?])', r'\1', joined)
    joined_paragraphs.append(joined)

joined_paragraphs[:5]


['[A] 101.1 Title. These regulations shall be known as the Building Code of [NAME OF JUR ISD ICTION ], hereinafter referred to as “this code. ”',
 '[A] 101.2 Scope. The provisions of this code shall apply to the construction, alteration, relocation, enlargement, replaceno spacement, repair, equipment, use and occupancy, location, mainno spacetenance, removal and demolition of every building or structure or any appurtenances connected or attached to such buildings or structures. Exception: Detached one- and two-family dwellings and townhouses not more than three stories above grade plane in height with a separate means of egress, and their accesno spacesory structures not more than three stories above grade plane in height, shall comply with this code or the Internano spacetional Residential Code.',
 '[A] 101.2.1 Appendices. Provisions in the appendices shall not apply unless specifically adopted.',
 '[A] 101.3 Intent. The purpose of this code is to establish the minimum requirements to

In [53]:
# split large paragraphs to smaller ones
def split_large_paragraph(para: str, max_length: int = 1000):
    if len(para) <= max_length:
        return [para]

    sentences = re.split(r'(?<=[.!?])\s+', para)
    chunks = []
    buffer = ""

    for sentence in sentences:
        if len(buffer) + len(sentence) + 1 <= max_length:
            buffer += sentence + " "
        else:
            if buffer:
                chunks.append(buffer.strip())
            buffer = sentence + " "
    
    if buffer:
        chunks.append(buffer.strip())

    return chunks

In [54]:
structured_paragraphs = []
last_prefix = None

for para in joined_paragraphs:
    numeric_match = re.search(r'\[([A-Z]{1,4})\]\s*(\d+(?:\.\d+)*)', para)
    header_match = re.match(r'\[([A-Z]{1,4})\]\s+([A-Z][a-z]+(?: [A-Z][a-z]+)*)', para)
    bare_match = re.match(r'^(\d+(?:\.\d+)+)', para)
    number_only = re.fullmatch(r'\d+(\.\d+)+\.?', para.strip())

    if number_only:
        continue

    if numeric_match:
        letter = numeric_match.group(1)
        numbers = numeric_match.group(2) # Keep the original numbers with dots
        index = f"[{letter}]{numbers}"   # Add square brackets
        last_prefix = letter
        structured_paragraphs.append((index, para, True))
    elif header_match:
        letter = header_match.group(1)
        title = header_match.group(2).strip().lower().replace(' ', '_')
        index = f"[{letter}]_{title}" # Add square brackets to the letter
        last_prefix = letter
        structured_paragraphs.append((index, para, True))
    elif bare_match:
        numbers = bare_match.group(1) # Keep the original numbers with dots
        letter = last_prefix if last_prefix else 'NOLETTER'
        index = f"[{letter}]{numbers}" # Add square brackets
        structured_paragraphs.append((index, para, True))
    else:
        structured_paragraphs.append((None, para, False))

In [55]:
structured_paragraphs[:5]

[('[A]101.1',
  '[A] 101.1 Title. These regulations shall be known as the Building Code of [NAME OF JUR ISD ICTION ], hereinafter referred to as “this code. ”',
  True),
 ('[A]101.2',
  '[A] 101.2 Scope. The provisions of this code shall apply to the construction, alteration, relocation, enlargement, replaceno spacement, repair, equipment, use and occupancy, location, mainno spacetenance, removal and demolition of every building or structure or any appurtenances connected or attached to such buildings or structures. Exception: Detached one- and two-family dwellings and townhouses not more than three stories above grade plane in height with a separate means of egress, and their accesno spacesory structures not more than three stories above grade plane in height, shall comply with this code or the Internano spacetional Residential Code.',
  True),
 ('[A]101.2.1',
  '[A] 101.2.1 Appendices. Provisions in the appendices shall not apply unless specifically adopted.',
  True),
 ('[A]101.3',


In [56]:
# index generation
indexed_dict = {}
unindexed_count = 0
last_index = None
suffix_counter = {}

for index, para, is_indexed in structured_paragraphs:
    chunks = split_large_paragraph(para)
    if is_indexed:
        last_index = index
        suffix_counter[last_index] = 0
        for i, chunk in enumerate(chunks, start=1):
            chunk_index = f"{index}_{i}" if len(chunks) > 1 else index
            while chunk_index in indexed_dict:
                chunk_index += "_dup"
            indexed_dict[chunk_index] = chunk
    else:
        if last_index:
            suffix_counter[last_index] += 1
            new_index = f"{last_index}_{chr(96 + suffix_counter[last_index])}"
            indexed_dict[new_index] = para
        else:
            fallback_index = f"UNINDEXED_{unindexed_count}"
            indexed_dict[fallback_index] = para
            unindexed_count += 1

with open('./data/ibc_fully_indexed.json', 'w', encoding='utf-8') as f:
    json.dump(indexed_dict, f, indent=2, ensure_ascii=False)

print(f"total paragraphs indexed: {len(indexed_dict)}")
for k in list(indexed_dict.keys())[:5]:
    print(f"{k}:\n{indexed_dict[k]}\n")

total paragraphs indexed: 4685
[A]101.1:
[A] 101.1 Title. These regulations shall be known as the Building Code of [NAME OF JUR ISD ICTION ], hereinafter referred to as “this code. ”

[A]101.2:
[A] 101.2 Scope. The provisions of this code shall apply to the construction, alteration, relocation, enlargement, replaceno spacement, repair, equipment, use and occupancy, location, mainno spacetenance, removal and demolition of every building or structure or any appurtenances connected or attached to such buildings or structures. Exception: Detached one- and two-family dwellings and townhouses not more than three stories above grade plane in height with a separate means of egress, and their accesno spacesory structures not more than three stories above grade plane in height, shall comply with this code or the Internano spacetional Residential Code.

[A]101.2.1:
[A] 101.2.1 Appendices. Provisions in the appendices shall not apply unless specifically adopted.

[A]101.3:
[A] 101.3 Intent. The pu

In [57]:
indexed_dict['[A]101.4']

'[A] 101.4 Referenced codes. The other codes listed in Secno spacetions 101.4.1 through 101.4.7 and referenced elsewhere in this code shall be considered to be part of the requirements of this code to the prescribed extent of each such reference.'

In [58]:
# prepare docs
documents = list(indexed_dict.values())
ids = [f"ibc_{k}" for k in indexed_dict.keys()]
metadatas = [{"index": k} for k in indexed_dict.keys()]

# SentenceTransformer
model = SentenceTransformer("all-MiniLM-L6-v2")
embedding_function = SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")

# connect to ChromaDB
client = PersistentClient(path="./data_base/chroma_db_ibc_04")

collection = client.get_or_create_collection(
    name="ibc_2018",
    embedding_function=embedding_function
)

# add collection
collection.add(
    documents=documents,
    metadatas=metadatas,
    ids=ids
)

print(f"{len(documents)} loaded paragraphs to 'ibc_2018'.")

4685 loaded paragraphs to 'ibc_2018'.


In [59]:
def search_ibc(query: str, top_k: int = 10):
    results = collection.query(
        query_texts=[query],
        n_results=top_k
    )

    docs = results["documents"][0]
    metas = results["metadatas"][0]
    ids = results["ids"][0]
    distances = results.get("distances", [[None]*top_k])[0]

    for i, (doc, meta, id_, dist) in enumerate(zip(docs, metas, ids, distances)):
        # the less distance — the closer result (from 0 to 1)
        closeness = 1 - dist if dist is not None else 0
        bar_length = int(closeness * 20)
        bar = '█' * bar_length + '░' * (20 - bar_length)

        print(f"=== Result {i+1}:")
        # print(f"Index: {meta.get('index', 'unknown')}")
        print(f"ID: {id_}")
        print(f"distance: {dist:.3f} | closeness: {closeness:.3f} {bar}")
        print(doc.strip())
        print(len(doc))
        print("-" * 80)

In [60]:
search_ibc("what is the minimum door width in apartment")

=== Result 1:
ID: ibc_[F]903.3.1.1_8_dup
distance: 0.565 | closeness: 0.435 ████████░░░░░░░░░░░░
In Group I-3, door openings to resident sleeping units that are not required to be an Accessible unit shall have a minimum clear opening w idth of 28 inches (711 mm).3. Door openings to storage closets less than 10 square feet (0.93 m 2 ) in area shall not be limited by the minimum clear opening width.4. The width of door leaves in revolving doors that comply with S ection 1010.1.4.1 shall not be limited.5. The maximum width of door leaves in powerno spaceoperated doors that comply with S ection
500
--------------------------------------------------------------------------------
=== Result 2:
ID: ibc_[F]1010.1.4.2_1
distance: 0.598 | closeness: 0.402 ████████░░░░░░░░░░░░
1010.1.4.2 shall not be limited.6. Door openings within a dwelling unit or sleepno spaceing unit shall have a minimum clear opening height of 78 inches (1981 mm).7. In dwelling and sleeping units that are not required to be

In [61]:
reranker_model_name = "cross-encoder/ms-marco-MiniLM-L6-v2"
reranker_tokenizer = AutoTokenizer.from_pretrained(reranker_model_name, use_auth_token=TOKEN_HF)
reranker_model = AutoModelForSequenceClassification.from_pretrained(reranker_model_name, use_auth_token=TOKEN_HF)

In [62]:
# adjust search function for the use in LLM 
def search_ibc(query: str, top_k: int = 10):
    results = collection.query(query_texts=[query], n_results=top_k)
    
    docs = results["documents"][0]
    metas = results["metadatas"][0]
    ids = results["ids"][0]
    distances = results.get("distances", [[None]*top_k])[0]

    final_results = []
    for doc, meta, id_, dist in zip(docs, metas, ids, distances):
        index = meta.get("index", "unknown")
        final_results.append({
            "text": doc,
            "index": index,
            "score": 1 - dist if dist is not None else None
        })

    return final_results

In [63]:
openai.api_key = OPENAI_API_KEY

In [64]:
# using open ai
def generate_ibc_answer_gpt4(query: str, top_k: int = 5):
    results = search_ibc(query, top_k=top_k)

    context = ""
    for r in results:
        context += f"(§{r['index']}) {r['text']}\n\n"

    prompt = f"""
You are a helpful assistant that answers questions using the context from the International Building Code (IBC).

Context:
{context}

Question:
{query}

Answer in a clear and human-friendly way. Use complete sentences and cite section numbers where appropriate (e.g., "according to §A1012").
"""

    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that explains building code requirements clearly."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.1,
        max_tokens=600
    )

    return response["choices"][0]["message"]["content"].strip()

In [65]:
answer = generate_ibc_answer_gpt4("minimum corridor width in apartment buildings", top_k=10)
display(Markdown(answer))

The minimum corridor width in apartment buildings, according to the International Building Code, depends on the specific occupancy group and other factors. However, as a general rule, the IBC requires a minimum corridor width of 44 inches (1118 mm) for egress purposes (§[F]1029.16.3_15). This width must be maintained and lead directly from the area to the exit without obstructions. 

For service corridors, the clear width should be not less than 5 feet (1524 mm), or 33 inches (838 mm) wider than the widest cart or truck used in the service corridor, whichever is greater (§[F]415.11.3.4).

In Group I-2, Condition 1 occupancies, where the corridor width is not less than 96 inches (2440 mm), projections for furniture are permitted under certain conditions (§[F]710.407.3.1_3).

Please note that these are general guidelines and specific requirements may vary based on the building's design, use, and local amendments to the IBC. Always consult with a local building official or a qualified architect or engineer to ensure compliance with all applicable building codes.

In [66]:
answer = generate_ibc_answer_gpt4("what is the minimum door width in apartment", top_k=10)
display(Markdown(answer))

The minimum door width in an apartment, according to the International Building Code, depends on the specific use of the door. 

For general door openings within a dwelling unit or sleeping unit, the minimum clear opening width is not specified unless the unit is required to be Accessible, Type A or Type B. In those cases, door openings required to be accessible within Type B units intended for user passage should have a minimum clear opening width of 31.75 inches (806 mm) as per §[F]903.3.1.1_8_dup.

However, for doorways with swinging doors, the clear opening width should be measured between the face of the door and the stop, with the door open 90 degrees. Where a minimum clear opening width of 32 inches (813 mm) is required and a door opening includes two door leaves without a mullion, one leaf should provide a minimum clear opening width of 32 inches (813 mm) as per §[F]903.3.1.1_7_dup.

Please note that these are general guidelines and specific requirements may vary based on the specific design and use of the apartment. Always consult with a professional or local building authority to ensure compliance with all applicable building codes.

In [67]:
# option with rerank
reranker_model_name = "cross-encoder/ms-marco-MiniLM-L6-v2"
reranker_tokenizer = AutoTokenizer.from_pretrained(reranker_model_name, use_auth_token=TOKEN_HF)
reranker_model = AutoModelForSequenceClassification.from_pretrained(reranker_model_name, use_auth_token=TOKEN_HF)
reranker_model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
reranker_model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 384, padding_idx=0)
      (position_embeddings): Embedding(512, 384)
      (token_type_embeddings): Embedding(2, 384)
      (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-5): 6 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=384, out_features=384, bias=True)
              (key): Linear(in_features=384, out_features=384, bias=True)
              (value): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=384, out_features=384, bias=True)
              (LayerNorm): LayerNorm((384,), eps=1e-12, e

In [68]:
def rerank(query: str, docs: list[str], top_k: int = 5) -> list[tuple[str, float]]:
    pairs = [(query, doc) for doc in docs]
    
    with torch.no_grad():
        inputs = reranker_tokenizer(
            [q for q, d in pairs],
            [d for q, d in pairs],
            padding=True,
            truncation=True,
            return_tensors="pt"
        ).to(device)

        logits = reranker_model(**inputs).logits
        scores = logits.squeeze(-1)  # [batch] instead of [batch, 1]

    scored = [(doc, float(score)) for doc, score in zip(docs, scores)]
    scored.sort(key=lambda x: x[1], reverse=True)
    return scored[:top_k]

In [69]:
def search_ibc(query: str, top_k: int = 10, rerank_k: int = 5, use_rerank: bool = True):
    results = collection.query(query_texts=[query], n_results=top_k)
    
    docs = results["documents"][0]
    metas = results["metadatas"][0]
    ids = results["ids"][0]
    distances = results.get("distances", [[None]*top_k])[0]

    raw_results = []
    for doc, meta, id_, dist in zip(docs, metas, ids, distances):
        index = meta.get("index", "unknown")
        raw_results.append({
            "text": doc,
            "index": index,
            "score": 1 - dist if dist is not None else None
        })

    if use_rerank:
        reranked = rerank(query, [r["text"] for r in raw_results], top_k=rerank_k)
        final_results = []
        for doc, score in reranked:
            for r in raw_results:
                if r["text"] == doc:
                    final_results.append({
                        "text": doc,
                        "index": r["index"],
                        "score": score
                    })
                    break
        return final_results
    else:
        return raw_results[:rerank_k]

In [70]:
results = search_ibc("width of exit stair", top_k=20, rerank_k=5, use_rerank=True)

for res in results:
    print(f"§{res['index']} | score={res['score']:.3f}")
    print(res['text'][:300], '...\n')

§[F]1012.1009.3 | score=6.124
1012.1009.3 Stairways. In order to be considered part of an accesno spacesible means of egress, a stairway between stories shall comno spaceply with Sections 1009.3.1 through 1009.3.3.1009.3.1 Exit acess stairways. Exit access stairways that connect levels in the same story are not permitted as part ...

§[F]1029.16.3_7 | score=4.517
In Group F occupancies where exit access stairways serve fewer than three stories and such stairways are not open to the public, and where the top of the guard also serves as a handrail, the top of the guard shall be not less than 34 inches (864 mm) and not more than 38 inches (965 mm) measured vert ...

§[F]903.3.1.1_1_dup_dup_dup_dup_dup_dup | score=3.861
903.3.1.1 or 903.3.1.2, or separated from the egress path in accordance with the requireno spacements for the enclosure of interior exit stairno spaceways or ramps.1.4. Where a required interior exit stairway or ramp and an exit access stairway or ramp serve the same floor 

In [71]:
def generate_ibc_answer_gpt4(query: str, top_k: int = 5):
    results = search_ibc(query, top_k=top_k, rerank_k=top_k, use_rerank=True)
    context = ""
    for r in results:
        context += f"(§{r['index']}) {r['text']}\n\n"

    prompt = f"""
You are an expert assistant answering questions about the International Building Code (IBC). 
Use only the provided context to answer precisely and clearly.

Context:
{context}

Question:
{query}

Answer clearly in natural language. Use complete sentences and cite sections (e.g., "according to §A1012") when possible.
"""

    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a helpful assistant knowledgeable in architectural codes."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.2,
        max_tokens=600
    )

    return response["choices"][0]["message"]["content"].strip()

In [78]:
answer = generate_ibc_answer_gpt4("width of stairway", top_k=5)
display(Markdown(answer))

The width of the stairway depends on the specific circumstances. According to §[F]5.2.1_3, stairways serving an occupant load of less than 50 should have a width of not less than 36 inches (914 mm). However, if an incline platform lift or stairway chairlift is installed on stairways serving occupancies in Group R-3, or within dwelling units in occupancies in Group R-2, a clear passage width not less than 20 inches (508 mm) should be provided. 

In the case of stairways being part of an accessible means of egress, they should have a clear width of 48 inches (1219 mm) minimum between handrails as per §[F]1012.1009.3. However, this requirement is not necessary in buildings equipped with an automatic sprinkler system installed in accordance with Section 903.3.1.1.

For spiral stairways, as per §[F]1011.10.1011.6_5, the minimum stairway clear width at and below the handrail should be 26 inches (660 mm). 

Lastly, according to §[F]903.3.1.1_1_dup_dup_dup_dup, stairways that are part of the means of egress for the exterior area for assisted rescue should provide a minimum clear width of 48 inches (1220 mm) between handrails, unless the building is equipped with an automatic sprinkler system installed in accordance with Section 903.3.1.1 or 903.3.1.2.

In [73]:
answer = generate_ibc_answer_gpt4("width of apartment door", top_k=5)
display(Markdown(answer))

The width of an apartment door, as per the International Building Code, depends on the specific requirements of the unit. If the apartment is in Group I-2 and the door is used for the movement of beds, it should provide a minimum clear opening width of 41 1/2 inches (1054 mm) as per §[F]903.3.1.1_7_dup. 

If the apartment is not required to be an Accessible, Type A, or Type B unit, the minimum clear opening widths do not apply to interior egress doors as per §[F]1010.1.4.2_1. 

However, door openings required to be accessible within Type B units intended for user passage should have a minimum clear opening width of 31.75 inches (806 mm) as per the same section. 

In Group I-3, door openings to resident sleeping units that are not required to be an Accessible unit should have a minimum clear opening width of 28 inches (711 mm) as per §[F]903.3.1.1_8_dup. 

Please note that these are the minimum requirements and the actual door width can be larger based on the design and functional requirements of the apartment.