In [2]:
import os
import glob
from qdrant_db import QdrantDatabase

In [7]:
import clean

# Convert every LaTeX source in bmad_doc/ into a text file with the same base
# name in clean_bmad_doc/, using clean.clean_latex_file for the conversion.

# Make sure the output directory exists before writing into it;
# clean_latex_file is not shown here, so do not rely on it to create it.
os.makedirs("clean_bmad_doc", exist_ok=True)

# glob returns paths in an unspecified order; sort them so reruns process the
# files in the same sequence.
tex_files = sorted(glob.glob("bmad_doc/*.tex"))
for file in tex_files:
    # "bmad_doc/elements.tex" -> "elements"
    filename_base = os.path.splitext(os.path.basename(file))[0]

    # Create the output file path
    output_file = os.path.join("clean_bmad_doc", f"{filename_base}.txt")

    # Process the file
    clean.clean_latex_file(file, output_file)

In [1]:
def _unique_key(existing, title):
    """Return `title`, or `title (n)` with the smallest n >= 2 that is not yet a key of `existing`."""
    if title not in existing:
        return title
    suffix = 2
    while f"{title} ({suffix})" in existing:
        suffix += 1
    return f"{title} ({suffix})"


def chunk_document(text):
    """
    Chunks a document based on its chapter-section-subsection hierarchy.

    The text is split on '\n' and scanned line by line. Lines starting with
    '###chapter ', '###section ' or '###subsection ' open a new node at that
    level; every other line is appended, unchanged, to the content list of the
    most recently opened node (or to the document-level content if no chapter
    has been opened yet).

    Nodes are stored in dictionaries keyed by title. If a title repeats among
    siblings, the later node is stored under "<title> (2)", "<title> (3)", ...
    so the earlier node and its content are not overwritten; the node's own
    "title" field always holds the title exactly as written.

    Args:
        text (str): The document text to chunk

    Returns:
        dict: A nested dictionary of the form
            {"content": [...], "chapters": {key: {"title", "content",
            "sections": {key: {"title", "content",
            "subsections": {key: {"title", "content"}}}}}}}

    Raises:
        ValueError: If a section marker appears before any chapter, or a
            subsection marker appears before any section of the current chapter.
    """
    document = {"content": [], "chapters": {}}

    chapter = None   # node of the chapter currently being filled
    section = None   # node of the section currently being filled
    sink = document["content"]  # list that plain content lines are appended to

    for line_no, line in enumerate(text.split('\n'), start=1):
        if line.startswith('###chapter '):
            title = line[len('###chapter '):].strip()
            chapter = {"title": title, "content": [], "sections": {}}
            # A new chapter closes any open section (and with it any subsection).
            section = None
            chapters = document["chapters"]
            chapters[_unique_key(chapters, title)] = chapter
            sink = chapter["content"]
        elif line.startswith('###section '):
            if chapter is None:
                raise ValueError(f"Section defined before any chapter at line {line_no}: {line}")

            title = line[len('###section '):].strip()
            section = {"title": title, "content": [], "subsections": {}}
            sections = chapter["sections"]
            sections[_unique_key(sections, title)] = section
            sink = section["content"]
        elif line.startswith('###subsection '):
            if section is None:
                raise ValueError(f"Subsection defined before any section at line {line_no}: {line}")

            title = line[len('###subsection '):].strip()
            subsection = {"title": title, "content": []}
            subsections = section["subsections"]
            subsections[_unique_key(subsections, title)] = subsection
            sink = subsection["content"]
        else:
            # Plain content: keep each line separate at the deepest open level.
            sink.append(line)

    return document

def get_labeled_chunks(document):
    """
    Flatten the nested document structure into labeled text chunks.

    The structure is walked in order: document-level content first, then each
    chapter, its sections, and their subsections. Every node whose content
    list is non-empty yields one chunk; its lines are joined with '\n', so the
    original line breaks are kept.

    Args:
        document (dict): The nested document structure

    Returns:
        list: A list of (label, content) tuples
    """
    def _walk():
        # Yield (label, content_lines) for every node, empty ones included.
        yield "document", document["content"]
        for ch_key, ch_node in document["chapters"].items():
            yield f"chapter: {ch_key}", ch_node["content"]
            for sec_key, sec_node in ch_node["sections"].items():
                yield f"chapter: {ch_key} | section: {sec_key}", sec_node["content"]
                for sub_key, sub_node in sec_node["subsections"].items():
                    yield (
                        f"chapter: {ch_key} | section: {sec_key} | subsection: {sub_key}",
                        sub_node["content"],
                    )

    # Nodes without any content lines produce no chunk.
    return [(label, "\n".join(lines)) for label, lines in _walk() if lines]

In [3]:
def visualize_document_structure(document):
    """
    Render the document structure as an indented outline.

    Each chapter, section and subsection gets one heading line, indented by
    three more spaces per level. Any node with content is followed by a
    "(N lines of content)" line. Document-level content, if present, is shown
    first under a "Document" heading.

    Args:
        document (dict): The nested document structure from chunk_document()

    Returns:
        str: A formatted string representation of the document structure
    """
    def _outline():
        # Document-level content
        top = document["content"]
        if top:
            yield "Document"
            yield f"   ({len(top)} lines of content)"

        for ch_name, ch in document["chapters"].items():
            yield f"Chapter: {ch_name}"
            if ch["content"]:
                yield f"   ({len(ch['content'])} lines of content)"

            for sec_name, sec in ch["sections"].items():
                yield f"   Section: {sec_name}"
                if sec["content"]:
                    yield f"      ({len(sec['content'])} lines of content)"

                for sub_name, sub in sec["subsections"].items():
                    yield f"      Subsection: {sub_name}"
                    if sub["content"]:
                        yield f"         ({len(sub['content'])} lines of content)"

    return "\n".join(_outline())

def visualize_document_with_stats(document):
    """
    Render a summary header followed by the document outline.

    The header reports the total number of content lines across all levels
    (document, chapters, sections, subsections) and the number of chapters,
    sections and subsections. The outline below it is the output of
    visualize_document_structure().

    Args:
        document (dict): The nested document structure from chunk_document()

    Returns:
        str: A formatted string representation with statistics
    """
    top_level_lines = len(document["content"])

    # Flatten each level of the hierarchy into a plain list of nodes.
    chapter_nodes = list(document["chapters"].values())
    section_nodes = [
        sec for ch in chapter_nodes for sec in ch["sections"].values()
    ]
    subsection_nodes = [
        sub for sec in section_nodes for sub in sec["subsections"].values()
    ]

    total_content_lines = top_level_lines + sum(
        len(node["content"])
        for node in chapter_nodes + section_nodes + subsection_nodes
    )

    report = [
        "DOCUMENT STRUCTURE SUMMARY",
        f"Total content lines: {total_content_lines}",
        f"Chapters: {len(chapter_nodes)}",
        f"Sections: {len(section_nodes)}",
        f"Subsections: {len(subsection_nodes)}",
        "",
        "DETAILED STRUCTURE",
        "=" * 50,
        # The regular outline follows the header.
        visualize_document_structure(document),
    ]

    return "\n".join(report)

In [19]:
# Read one cleaned document from disk as UTF-8 text.
file_path = "clean_bmad_doc/elements.txt"
with open(file_path, 'r', encoding='utf-8') as file:
    document_text = file.read()
    
# Parse the text into the nested chapter/section/subsection structure.
document = chunk_document(document_text)

In [None]:
# Print the chapter/section/subsection outline of the parsed document.
print(visualize_document_structure(document))

In [22]:
# Number of labeled chunks (nodes with non-empty content) in the parsed document.
len(get_labeled_chunks(document))

59

In [2]:
# Import the loaders from langchain_community, which the rest of this notebook
# already uses; the `langchain.document_loaders` path is a deprecated alias.
from langchain_community.document_loaders import DirectoryLoader, TextLoader

# Load every .txt file matching **/*.txt under clean_bmad_doc/ as UTF-8 text.
loader = DirectoryLoader(
    "clean_bmad_doc",
    glob="**/*.txt",
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"}
)
documents = loader.load()

In [6]:
# Inspect the metadata attached to the second loaded document.
documents[1].metadata

{'source': 'clean_bmad_doc/beam-init.txt'}

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the loaded documents into chunks of up to 2048 characters with a
# 100-character overlap. Separators are listed from coarsest to finest:
# blank line, newline, space, then the empty string.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=2048,  # Adjust based on your needs
    chunk_overlap=100,
)
texts = text_splitter.split_documents(documents)

In [1]:
from langchain_huggingface import HuggingFaceEmbeddings

# Embedding model used to vectorize the text chunks.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"  # Lightweight local model
)

In [11]:
from langchain_community.vectorstores import FAISS

# Build a FAISS vector store from the split chunks using the embedding model,
# then save it locally under the name "faiss_tao".
vector_store = FAISS.from_documents(texts, embeddings)
vector_store.save_local("faiss_tao")

In [6]:
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
# Recreate the embedding model (same model name as the embeddings cell in this
# notebook) so a saved index can be loaded without rebuilding it.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
# NOTE(review): this loads "faiss_clean", whereas the build cell in this
# notebook saves to "faiss_tao" -- confirm which index is intended.
# NOTE(review): allow_dangerous_deserialization=True opts in to unsafe
# deserialization; only load index directories from a trusted source.
vector_store = FAISS.load_local("faiss_clean", embeddings,allow_dangerous_deserialization=True)

In [12]:
# Expose the vector store through the retriever interface, with k set to 5.
retriever = vector_store.as_retriever(search_kwargs={"k": 5})  # Fetch top k results

In [13]:
# Smoke test: run one query through the retriever.
query = "beam tracking"
results = retriever.invoke(query)

# Show each retrieved passage, preceded by a separator line.
for doc in results:
    print('\n!--------------------\n', doc.page_content, sep='\n')



!--------------------

track_start, track_end 
"track_start" and "track_end" are used when it is desired to only track the beam
through part of the root lattice branch. "track_start" gives the starting element name or
index. Tracking will start at the exit end of this element so the beam \em will not be tracked
through this element. The tracking will end at the exit end of the lattice element with name or
index "track_end". The default, if "track_start" is not given, is to start at
the beginning of the branch The default for "track_end" is the end of the root branch if the 
branch has an open geometry or beam tracking is beginning at the start of the branch. For a root
branch with a closed geometry and with the beam starting in the middle, the tracking will wrap 
around from the branch end to the beginning of the branch and will end up just before the starting point.

After initialization, the "set beam_init"  command can be used to set
"track_start" and "track_end". Note: Deprecated 