In [1]:
# Import required libraries
import xml.etree.ElementTree as ET

# USLM XML file used for this walkthrough.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
xml_path = "/Users/vaibhavnakrani/usa_laws/xml_uscAll@118-157/usc01.xml"

# Register the USLM namespace and keep a prefix map for find()/findall() queries.
ET.register_namespace('uslm', 'http://xml.house.gov/schemas/uslm/1.0')
ns = {'uslm': 'http://xml.house.gov/schemas/uslm/1.0'}

# Load and parse the XML document.
tree = ET.parse(xml_path)
root = tree.getroot()

# Root of the simplified output tree built by this cell and the following ones.
new_root = ET.Element("uscode")

# Container for everything extracted from the section.
main_content = ET.SubElement(new_root, "mainContent")

# First <section> found anywhere in the document. Later cells reuse `section`,
# `ns` and `main_content`, so this cell has to run before them.
section = root.find('.//uslm:section', ns)

# Build the title line from the section's direct <num> and <heading> children.
# Only the leading .text of each element is read; a missing section, <num> or
# <heading> would raise AttributeError here.
num = section.find('uslm:num', ns).text
heading = section.find('uslm:heading', ns).text
title = ET.SubElement(main_content, "title")
title.text = f"{num} {heading}"

# Show the extracted title line.
print("Title:", title.text)

Title: § 1.  Words denoting number, gender, and so forth


In [2]:
# Create the <text> element that will hold the section's statutory text.
text = ET.SubElement(main_content, "text")

# Collect every paragraph under the section's first <content> element.
# itertext() is used instead of p.text so that text sitting inside or after
# inline child elements is kept, and a paragraph whose .text is None no longer
# raises AttributeError. Whitespace-only paragraphs are skipped.
content_elem = section.find('.//uslm:content', ns)
text_parts = []
if content_elem is not None:
    for p in content_elem.findall('.//uslm:p', ns):
        paragraph = ''.join(p.itertext()).strip()
        if paragraph:
            text_parts.append(paragraph)

# One paragraph per line.
text.text = "\n".join(text_parts)

# Source credit: flatten nested markup into one string; a section without a
# <sourceCredit> element yields an empty string instead of crashing.
source_credit = ET.SubElement(main_content, "sourceCredit")
source_credit_elem = section.find('.//uslm:sourceCredit', ns)
if source_credit_elem is not None:
    source_credit.text = ''.join(source_credit_elem.itertext()).strip()
else:
    source_credit.text = ''

# Print what we extracted (text preview limited to 200 characters).
print("=== Text Content ===")
print(text.text[:200] + "...")
print("\n=== Source Credit ===")
print(source_credit.text)

=== Text Content ===
In determining the meaning of any Act of Congress, unless the context indicates otherwise—
words importing the singular include and apply to several persons, parties, or things;
words importing the pl...

=== Source Credit ===
(July 30, 1947, ch. 388, 61 Stat. 633; June 25, 1948, ch. 645, § 6, 62 Stat. 859; Oct. 31, 1951, ch. 655, § 1, 65 Stat. 710; Pub. L. 112–231, § 2(a), Dec. 28, 2012, 126 Stat. 1619.)


In [3]:
# Locate the section's <notes> container (None when the section has none).
notes_section = section.find('.//uslm:notes', ns)

# Output container for everything extracted from the notes.
editorial_notes = ET.SubElement(main_content, "editorialNotes")

# Notes are split into amendment notes and everything else.
amendments = []
misc_notes = []

if notes_section is not None:
    for note in notes_section.findall('.//uslm:note', ns):
        topic = note.get('topic', '')

        # Use itertext() so headings whose text is wrapped in an inline child
        # element are not lost. Loop-local names (note_heading, para_text) are
        # used on purpose: the earlier cells bind `heading` and `text` at
        # notebook level and this cell must not overwrite them.
        heading_elem = note.find('.//uslm:heading', ns)
        note_heading = ''
        if heading_elem is not None:
            note_heading = ''.join(heading_elem.itertext()).strip()

        # Flatten each paragraph, including text inside nested elements.
        note_paragraphs = []
        for p in note.findall('.//uslm:p', ns):
            para_text = ''.join(p.itertext()).strip()
            if para_text:
                note_paragraphs.append(para_text)

        # Notes without any paragraph text previously produced empty entries
        # (visible as blank lines in the printed output); skip them.
        if not note_paragraphs:
            continue

        full_content = '\n'.join(note_paragraphs)
        if note_heading:
            entry = f"=== {note_heading} ===\n{full_content}"
        else:
            entry = full_content

        if topic == 'amendments':
            amendments.append(entry)
        else:
            misc_notes.append(entry)

# Add amendments section if we found any
if amendments:
    amendments_elem = ET.SubElement(editorial_notes, "amendments")
    amendments_elem.text = '\n\n'.join(amendments)
    print("\n=== Amendments ===")
    print(amendments_elem.text[:500] + "...")

# Add miscellaneous notes if we found any
if misc_notes:
    misc_elem = ET.SubElement(editorial_notes, "miscellaneous")
    misc_elem.text = '\n\n'.join(misc_notes)
    print("\n=== Miscellaneous Notes ===")
    print(misc_elem.text + "...")


=== Amendments ===
=== Amendments ===
2012—Pub. L. 112–231, in fifth clause after opening clause, struck out “and ‘lunatic’ ” before “shall include every” and “lunatic,” before “insane person,”.
1951—Act Oct. 31, 1951, substituted, in fourth clause after opening clause, “used” for “use”.
1948—Act June 25, 1948, included “tense”, “whoever”, “signature”, “subscription”, “writing” and a broader definition of “person”....

=== Miscellaneous Notes ===




=== Short Title of 2022 Amendment ===
Pub. L. 117–228, § 1, Dec. 13, 2022, 136 Stat. 2305, provided that: “This Act [enacting section 1738C of Title 28, Judiciary and Judicial Procedure, amending section 7 of this title, repealing section 1738C of Title 28, and enacting provisions set out as notes under section 7 of this title] may be cited as the ‘Respect for Marriage Act’.”

=== Short Title of 2012 Amendment ===
Pub. L. 112–231, § 1, Dec. 28, 2012, 126 Stat. 1619, provided that: “This Act [amending this section and sections 92a, 215, an

In [29]:
# Import required libraries
import xml.etree.ElementTree as ET
from xml.dom import minidom
import re

def parse_and_save_xml(xml_path, output_path=None):
    """Convert one USLM title file into a simplified, pretty-printed XML file.

    The output has a <usCode> root holding a <title> element (the title number
    taken from the file name) and one <section> per USLM section. Each section
    gets a <content> element (number, heading, selected paragraphs and source
    credit joined by blank lines) and, when the section has notes,
    <amendments> and <notes> children.

    Args:
        xml_path: Path to a file named like ``usc01.xml`` or ``usc05A.xml``.
        output_path: Where to write the result. Defaults to
            ``uscode_title<N>_complete.xml`` in the current directory, so each
            title gets its own file instead of all titles overwriting
            ``uscode_title1_complete.xml``.

    Returns:
        The path the XML was written to.

    Raises:
        ValueError: If no title number can be extracted from ``xml_path``.
    """
    def element_text(elem):
        """Return all text inside ``elem``, nested elements included, stripped."""
        return ''.join(elem.itertext()).strip()

    # Extract title number from path (leading zeros removed).
    match = re.search(r'usc(\d+[A-Za-z]?)\.xml', xml_path)
    if match is None:
        raise ValueError(f"Cannot extract a title number from path: {xml_path}")
    title_number = match.group(1).lstrip('0')

    if output_path is None:
        output_path = f"uscode_title{title_number}_complete.xml"

    ET.register_namespace('uslm', 'http://xml.house.gov/schemas/uslm/1.0')
    ns = {'uslm': 'http://xml.house.gov/schemas/uslm/1.0'}
    root = ET.parse(xml_path).getroot()

    # Single root for all sections, tagged with the title number.
    new_root = ET.Element("usCode")
    info = ET.SubElement(new_root, "title")
    info.text = title_number

    sections = root.findall('.//uslm:section', ns)
    print(f"Found {len(sections)} sections")

    for idx, section in enumerate(sections, 1):
        section_elem = ET.SubElement(new_root, "section")
        content = ET.SubElement(section_elem, "content")
        section_text = []

        # Section number and heading (direct children only).
        section_num = section.find('uslm:num', ns)
        section_heading = section.find('uslm:heading', ns)
        if section_num is not None:
            section_text.append(f"Number: {element_text(section_num)}")
        if section_heading is not None:
            section_text.append(f"Title: {element_text(section_heading)}")

        # Introductory paragraph, then the remaining content paragraphs.
        # element_text() keeps text that follows inline child elements, which
        # reading only p.text would silently drop.
        intro_para = section.find('.//uslm:content//uslm:p[@style="-uslm-lc:I11"]', ns)
        if intro_para is not None:
            intro = element_text(intro_para)
            if intro:
                section_text.append(intro)

        for p in section.findall('.//uslm:content//uslm:p[@style="-uslm-lc:I12"]', ns):
            paragraph = element_text(p)
            if paragraph:
                section_text.append(paragraph)

        # Source credit.
        source_elem = section.find('.//uslm:sourceCredit', ns)
        if source_elem is not None:
            section_text.append('Source:')
            section_text.append(element_text(source_elem))

        content.text = '\n\n'.join(section_text)

        notes_section = section.find('.//uslm:notes', ns)
        if notes_section is not None:
            # One <amend> per paragraph of every amendments note.
            amendments = ET.SubElement(section_elem, "amendments")
            for para in notes_section.findall('.//uslm:note[@topic="amendments"]//uslm:p', ns):
                amend = ET.SubElement(amendments, "amend")
                amend.text = element_text(para)

            # One <note> per miscellaneous note: heading line, then paragraphs.
            notes = ET.SubElement(section_elem, "notes")
            for note in notes_section.findall('.//uslm:note[@topic="miscellaneous"]', ns):
                note_text = []
                heading = note.find('.//uslm:heading', ns)
                if heading is not None:
                    heading_text = element_text(heading)
                    if heading_text:
                        note_text.append(heading_text)
                for p in note.findall('.//uslm:p', ns):
                    text_content = element_text(p)
                    if text_content:  # Only keep paragraphs with actual content
                        note_text.append(text_content)
                if note_text:
                    n = ET.SubElement(notes, "note")
                    n.text = '\n'.join(note_text)

        print(f"Processed section {idx}/{len(sections)}")

    # Pretty-print and save.
    xml_str = ET.tostring(new_root, encoding='unicode')
    formatted_xml = minidom.parseString(xml_str).toprettyxml(indent="  ")

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(formatted_xml)

    print("\nProcessing complete!")
    print(f"All sections saved to: {output_path}")
    return output_path

import os

# Directory holding the downloaded USLM title files.
folder_path = "/Users/vaibhavnakrani/usa_laws/xml_uscAll@118-157"

# Walk the directory in name order and convert every usc*.xml file.
for filename in sorted(os.listdir(folder_path)):
    if not (filename.startswith('usc') and filename.endswith('.xml')):
        continue
    xml_path = os.path.join(folder_path, filename)
    parse_and_save_xml(xml_path)

Found 53 sections
Processed section 1/53
Processed section 2/53
Processed section 3/53
Processed section 4/53
Processed section 5/53
Processed section 6/53
Processed section 7/53
Processed section 8/53
Processed section 9/53
Processed section 10/53
Processed section 11/53
Processed section 12/53
Processed section 13/53
Processed section 14/53
Processed section 15/53
Processed section 16/53
Processed section 17/53
Processed section 18/53
Processed section 19/53
Processed section 20/53
Processed section 21/53
Processed section 22/53
Processed section 23/53
Processed section 24/53
Processed section 25/53
Processed section 26/53
Processed section 27/53
Processed section 28/53
Processed section 29/53
Processed section 30/53
Processed section 31/53
Processed section 32/53
Processed section 33/53
Processed section 34/53
Processed section 35/53
Processed section 36/53
Processed section 37/53
Processed section 38/53
Processed section 39/53
Processed section 40/53
Processed section 41/53
Process

In [None]:
# Import required libraries
import xml.etree.ElementTree as ET
from xml.dom import minidom
import re
import os
from pathlib import Path

def process_xml_file(xml_path):
    """Convert one USLM title file into a simplified <usCode> element.

    The returned element holds a <title> child (the title number taken from
    the file name) and one <section> per USLM section. Each section gets a
    <content> element (number, heading, selected paragraphs and source credit
    joined by blank lines) and, when the section has notes, <amendments> and
    <notes> children.

    Args:
        xml_path: Path to a file named like ``usc01.xml`` or ``usc05A.xml``
            (an optional single-letter suffix after the digits is accepted).

    Returns:
        The populated ``xml.etree.ElementTree.Element`` named ``usCode``.

    Raises:
        ValueError: If no title number can be extracted from ``xml_path``.
    """
    def element_text(elem):
        """Return all text inside ``elem``, nested elements included, stripped."""
        return ''.join(elem.itertext()).strip()

    # Extract title number from path; the optional letter keeps file names
    # such as usc05A.xml from failing the match.
    match = re.search(r'usc(\d+[A-Za-z]?)\.xml', xml_path)
    if match is None:
        raise ValueError(f"Cannot extract a title number from path: {xml_path}")
    title_number = match.group(1).lstrip('0')  # Remove leading zeros

    ET.register_namespace('uslm', 'http://xml.house.gov/schemas/uslm/1.0')
    ns = {'uslm': 'http://xml.house.gov/schemas/uslm/1.0'}
    root = ET.parse(xml_path).getroot()

    # usCode element for this title, tagged with the title number.
    us_code = ET.Element("usCode")
    title_info = ET.SubElement(us_code, "title")
    title_info.text = title_number

    sections = root.findall('.//uslm:section', ns)
    print(f"Processing Title {title_number}: Found {len(sections)} sections")

    for idx, section in enumerate(sections, 1):
        section_elem = ET.SubElement(us_code, "section")
        content = ET.SubElement(section_elem, "content")
        section_text = []

        # Section number and heading (direct children only).
        section_num = section.find('uslm:num', ns)
        section_heading = section.find('uslm:heading', ns)
        if section_num is not None:
            section_text.append(f"Number: {element_text(section_num)}")
        if section_heading is not None:
            section_text.append(f"Title: {element_text(section_heading)}")

        # Introductory paragraph, then the remaining content paragraphs.
        # element_text() keeps text that follows inline child elements, which
        # reading only p.text would silently drop.
        intro_para = section.find('.//uslm:content//uslm:p[@style="-uslm-lc:I11"]', ns)
        if intro_para is not None:
            intro = element_text(intro_para)
            if intro:
                section_text.append(intro)

        for p in section.findall('.//uslm:content//uslm:p[@style="-uslm-lc:I12"]', ns):
            paragraph = element_text(p)
            if paragraph:
                section_text.append(paragraph)

        # Source credit.
        source_elem = section.find('.//uslm:sourceCredit', ns)
        if source_elem is not None:
            section_text.append('Source:')
            section_text.append(element_text(source_elem))

        content.text = '\n\n'.join(section_text)

        notes_section = section.find('.//uslm:notes', ns)
        if notes_section is not None:
            # One <amend> per paragraph of every amendments note.
            amendments = ET.SubElement(section_elem, "amendments")
            for para in notes_section.findall('.//uslm:note[@topic="amendments"]//uslm:p', ns):
                amend = ET.SubElement(amendments, "amend")
                amend.text = element_text(para)

            # One <note> per miscellaneous note: heading line, then paragraphs.
            # (A leftover debug print that referenced an undefined `title`
            # variable used to sit here and has been removed.)
            notes = ET.SubElement(section_elem, "notes")
            for note in notes_section.findall('.//uslm:note[@topic="miscellaneous"]', ns):
                note_text = []
                heading = note.find('.//uslm:heading', ns)
                if heading is not None:
                    heading_text = element_text(heading)
                    if heading_text:
                        note_text.append(heading_text)
                for p in note.findall('.//uslm:p', ns):
                    text_content = element_text(p)
                    if text_content:  # Skip empty paragraphs
                        note_text.append(text_content)
                if note_text:
                    n = ET.SubElement(notes, "note")
                    n.text = '\n'.join(note_text)

        print(f"Processed section {idx}/{len(sections)}")

    return us_code


# Every title's <usCode> element is collected under this one root.
complete_root = ET.Element("completeUSCode")

# Directory holding the downloaded USLM title files.
folder_path = "/Users/vaibhavnakrani/usa_laws/xml_uscAll@118-157"

# Convert the usc*.xml files in name order and attach each result.
for filename in sorted(os.listdir(folder_path)):
    if not (filename.startswith('usc') and filename.endswith('.xml')):
        continue
    xml_path = os.path.join(folder_path, filename)
    us_code_element = process_xml_file(xml_path)
    complete_root.append(us_code_element)

# Serialise, then re-parse with minidom purely to get indented output.
xml_str = ET.tostring(complete_root, encoding='unicode')
formatted_xml = minidom.parseString(xml_str).toprettyxml(indent="  ")

# All titles go into a single file in the working directory.
output_file = 'complete_uscode.xml'
with open(output_file, 'w', encoding='utf-8') as f:
    f.write(formatted_xml)

print("\nProcessing complete!", f"All titles saved to: {output_file}", sep="\n")

### Entire section XML

In [None]:
import xml.etree.ElementTree as ET

xml_path = "/Users/vaibhavnakrani/usa_laws/uscode_title1_complete.xml"
tree = ET.parse(xml_path)
root = tree.getroot()
sections = root.findall('section')

for idx, section in enumerate(sections, 1):
    # Raw XML of the whole section.
    section_str = ET.tostring(section, encoding='unicode', method='xml')
    print(f"\n=== Section {idx} ===")
    print(section_str)

    # Text-only view; skipped when the section has no (or empty) <content>.
    content = section.find('content')
    if content is None or not content.text:
        continue
    print(f"\n=== Section {idx} Content Only ===")
    print(content.text)

### Full Text combined

In [10]:
xml_path = "/Users/vaibhavnakrani/usa_laws/uscode_title1_complete.xml"
tree = ET.parse(xml_path)
root = tree.getroot()
sections = root.findall('section')

for idx, section in enumerate(sections, 1):
    # Pieces of text gathered for this section, in output order.
    all_text = []

    # Main content comes first.
    content = section.find('content')
    if content is not None and content.text:
        all_text.append(content.text)

    # Then every non-empty <amend> under <amendments>.
    amendments = section.find('amendments')
    if amendments is not None:
        all_text.extend(amend.text for amend in amendments.findall('amend') if amend.text)

    # Then every non-empty <note> under <notes>.
    notes = section.find('notes')
    if notes is not None:
        all_text.extend(note.text for note in notes.findall('note') if note.text)

    # Blank line between pieces.
    full_text = '\n\n'.join(all_text)

    print(f"\n=== Section {idx} Complete Text ===")
    print(full_text)
    print("\n" + "=" * 50)


=== Section 1 Complete Text ===
Number: § 1.

Title:  Words denoting number, gender, and so forth

In determining the meaning of any Act of Congress, unless the context indicates otherwise—

words importing the singular include and apply to several persons, parties, or things;

words importing the plural include the singular;

words importing the masculine gender include the feminine as well;

words used in the present tense include the future as well as the present;

the words “insane” and “insane person” shall include every idiot, insane person, and person non compos mentis;

the words “person” and “whoever” include corporations, companies, associations, firms, partnerships, societies, and joint stock companies, as well as individuals;

“officer” includes any person authorized by law to perform the duties of the office;

“signature” or “subscription” includes a mark when the person making the same intended it as such;

“oath” includes affirmation, and “sworn” includes affirmed;

“wr

### Parsing XHTML

In [11]:
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup

def get_note_content(start_elem, section_boundary):
    """Collect note text following ``start_elem`` until the next note head.

    Walks forward with ``find_next()`` and gathers the stripped text of every
    ``<p>`` whose class starts with ``note-body`` and every ``<h4>`` carrying
    the ``note-sub-head`` class. The walk stops at the next ``<h4>`` with the
    ``note-head`` class, at ``section_boundary``, or at the end of the document.

    Args:
        start_elem: Node to start after (its own text is not collected).
        section_boundary: Node at which to stop, or None for no boundary.

    Returns:
        A ``(text, stop_node)`` tuple: the collected pieces joined with single
        spaces, and the node the walk stopped on (None at end of document).
    """
    texts = []
    current = start_elem.find_next()

    # Test against None and by identity: an empty string node is falsy and
    # would otherwise end the walk early, and `!=` on parsed tags compares
    # their content rather than asking "is this the boundary node?".
    while current is not None and current is not section_boundary:
        if current.name == 'p':
            if any(cls.startswith('note-body') for cls in current.get('class', [])):
                texts.append(current.get_text().strip())
        elif current.name == 'h4':
            classes = current.get('class', [])
            if 'note-sub-head' in classes:
                texts.append(current.get_text().strip())
            if 'note-head' in classes:
                break

        current = current.find_next()

    return ' '.join(texts), current

def parse_and_save_xhtml(html_path, output_file='uscode_parsed.xml'):
    """Parse one US Code HTML title file and write its sections as XML.

    Every ``<h3 class="section-head">`` starts a section. Between one section
    head and the next, paragraphs whose class starts with ``statutory-body``
    become the section content, ``source-credit`` paragraphs become <source>
    elements, and each ``<h4 class="note-head">`` starts a note whose body is
    collected by ``get_note_content``.

    Args:
        html_path: Path to the HTML file (read as UTF-8).
        output_file: Path of the XML file to write.

    Returns:
        None. The result is written to ``output_file``.
    """
    with open(html_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file.read(), 'html.parser')

    root = ET.Element("usCode")

    # All section heads up front, so each section knows where the next begins.
    section_heads = soup.find_all('h3', class_='section-head')

    for i, section_head in enumerate(section_heads):
        # Header looks like "§<number>. <title>"; split on the first period.
        # A header without a period is reported and skipped instead of
        # aborting the whole file with a ValueError.
        header_text = section_head.get_text().strip()
        number, separator, title = header_text.replace('§', '').partition('.')
        if not separator:
            print(f"Warning: Could not parse section header: {header_text}")
            continue

        section = ET.SubElement(root, "section")
        ET.SubElement(section, "number").text = number.strip()
        ET.SubElement(section, "title").text = title.strip()

        # Next section head is the boundary; None for the last section.
        next_section = section_heads[i + 1] if i + 1 < len(section_heads) else None

        content_texts = []
        notes_list = []
        next_elem = section_head.find_next()

        # Identity / None checks: an empty string node is falsy and must not
        # end the walk, and the boundary test means "this exact node".
        while next_elem is not None and next_elem is not next_section:
            if next_elem.name == 'p':
                classes = next_elem.get('class', [])
                if any(cls.startswith('statutory-body') for cls in classes):
                    content_texts.append(next_elem.get_text().strip())
                elif 'source-credit' in classes:
                    source = ET.SubElement(section, "source")
                    source.text = next_elem.get_text().strip()
            elif next_elem.name == 'h4' and 'note-head' in next_elem.get('class', []):
                note_type = next_elem.get_text().strip()
                # get_note_content returns the node it stopped on; resume
                # from there without advancing again.
                note_text, next_elem = get_note_content(next_elem, next_section)
                if note_text:  # Only add if we got some content
                    notes_list.append((note_type, note_text))
                continue

            next_elem = next_elem.find_next()

        # <content> is appended after the walk, so it follows any <source>
        # elements created above.
        if content_texts:
            content = ET.SubElement(section, "content")
            content.text = ' '.join(content_texts)

        # Notes wrapper with one <note type="..."> per collected note.
        if notes_list:
            notes_wrapper = ET.SubElement(section, "notes")
            for note_type, note_text in notes_list:
                note = ET.SubElement(notes_wrapper, "note")
                note.set('type', note_type)
                note.text = note_text

    # Save XML file
    tree = ET.ElementTree(root)
    ET.indent(tree, space="  ")
    tree.write(output_file, encoding='utf-8', xml_declaration=True)

# Example usage: convert a single HTML title file, writing to the function's
# default output file. NOTE(review): absolute, machine-specific path.
if __name__ == "__main__":
    html_path = "/Users/vaibhavnakrani/usa_laws/PRELIMusc54.htm"
    parse_and_save_xhtml(html_path)

In [None]:
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
import os
import re
import chardet

def detect_encoding(file_path):
    """Detect a file's text encoding with chardet.

    Args:
        file_path: Path of the file to inspect (read fully, in binary mode).

    Returns:
        The encoding name reported by chardet, or ``'utf-8'`` when chardet
        reports no encoding. Without that fallback a ``None`` result would be
        passed to ``open()``, which silently uses the platform default.
    """
    with open(file_path, 'rb') as file:
        raw_data = file.read()
    result = chardet.detect(raw_data)
    return result['encoding'] or 'utf-8'

def get_note_content(start_elem, section_boundary):
    """Collect note text following ``start_elem`` until the next note head.

    Walks forward with ``find_next()`` and gathers the stripped text of every
    ``<p>`` whose class starts with ``note-body`` and every ``<h4>`` carrying
    the ``note-sub-head`` class. The walk stops at the next ``<h4>`` with the
    ``note-head`` class, at ``section_boundary``, or at the end of the document.

    Args:
        start_elem: Node to start after (its own text is not collected).
        section_boundary: Node at which to stop, or None for no boundary.

    Returns:
        A ``(text, stop_node)`` tuple: the collected pieces joined with single
        spaces, and the node the walk stopped on (None at end of document).
    """
    texts = []
    current = start_elem.find_next()

    # Test against None and by identity: an empty string node is falsy and
    # would otherwise end the walk early, and `!=` on parsed tags compares
    # their content rather than asking "is this the boundary node?".
    while current is not None and current is not section_boundary:
        tag_name = current.name
        if tag_name in ('p', 'h4'):
            classes = current.get('class', [])
            is_note_body = tag_name == 'p' and any(cls.startswith('note-body') for cls in classes)
            is_sub_head = tag_name == 'h4' and 'note-sub-head' in classes
            if is_note_body or is_sub_head:
                texts.append(current.get_text().strip())
            if tag_name == 'h4' and 'note-head' in classes:
                break

        current = current.find_next()

    return ' '.join(texts), current

def parse_html_file(html_path):
    """Parse a single HTML title file into a list of <section> elements.

    The file is decoded with the encoding reported by ``detect_encoding``;
    if that fails, UTF-8 and then a list of other encodings are tried in
    order. Every ``<h3 class="section-head">`` starts a section; within it,
    ``statutory-body`` paragraphs become <content>, ``source-credit``
    paragraphs become <source>, and each ``<h4 class="note-head">`` starts a
    note whose body is collected by ``get_note_content``.

    Args:
        html_path: Path to the HTML file.

    Returns:
        A list of ``xml.etree.ElementTree.Element`` objects named ``section``.
        Empty when the file could not be decoded. Sections whose header
        cannot be split into number and title are skipped with a warning.
    """
    # Detected encoding first, then UTF-8, then the remaining fallbacks.
    # NOTE(review): 'latin1' accepts any byte sequence, so the entries after
    # it are never reached; they are kept from the original list.
    candidate_encodings = [
        detect_encoding(html_path),
        'utf-8', 'utf-16', 'utf-16le', 'utf-16be', 'latin1', 'iso-8859-1', 'cp1252',
    ]

    html_text = None
    for enc in candidate_encodings:
        if not enc:
            continue
        try:
            with open(html_path, 'r', encoding=enc) as file:
                html_text = file.read()
            break
        except (UnicodeError, LookupError):
            # UnicodeError (not just UnicodeDecodeError): the 'utf-16' codec
            # raises the base class when the file has no byte-order mark.
            # LookupError: the detector may name a codec Python does not have.
            continue

    if html_text is None:
        print(f"Failed to decode {html_path} with any encoding")
        return []

    soup = BeautifulSoup(html_text, 'html.parser')
    sections = []
    section_heads = soup.find_all('h3', class_='section-head')

    for i, section_head in enumerate(section_heads):
        section = ET.Element("section")

        # Header looks like "§<number>. <title>"; split on the first period.
        text = section_head.get_text().strip()
        try:
            number, title = text.replace('§', '').split('.', 1)
        except ValueError:
            print(f"Warning: Could not parse section header: {text}")
            continue

        ET.SubElement(section, "number").text = number.strip()
        ET.SubElement(section, "title").text = title.strip()

        # Next section head is the boundary; None for the last section.
        next_section = section_heads[i + 1] if i + 1 < len(section_heads) else None

        content_texts = []
        notes_list = []
        next_elem = section_head.find_next()

        # Identity / None checks: an empty string node is falsy and must not
        # end the walk, and the boundary test means "this exact node".
        while next_elem is not None and next_elem is not next_section:
            if next_elem.name == 'p':
                classes = next_elem.get('class', [])
                if any(cls.startswith('statutory-body') for cls in classes):
                    content_texts.append(next_elem.get_text().strip())
                elif 'source-credit' in classes:
                    source = ET.SubElement(section, "source")
                    source.text = next_elem.get_text().strip()
            elif next_elem.name == 'h4' and 'note-head' in next_elem.get('class', []):
                note_type = next_elem.get_text().strip()
                # get_note_content returns the node it stopped on; resume
                # from there without advancing again.
                note_text, next_elem = get_note_content(next_elem, next_section)
                if note_text:
                    notes_list.append((note_type, note_text))
                continue

            next_elem = next_elem.find_next()

        # <content> is appended after the walk, so it follows any <source>.
        if content_texts:
            content_elem = ET.SubElement(section, "content")
            content_elem.text = ' '.join(content_texts)

        if notes_list:
            notes_wrapper = ET.SubElement(section, "notes")
            for note_type, note_text in notes_list:
                note = ET.SubElement(notes_wrapper, "note")
                note.set('type', note_type)
                note.text = note_text

        sections.append(section)

    return sections

def parse_all_files(directory_path, output_file='complete_uscode.xml'):
    """Combine every ``PRELIMusc<N>.htm`` file in a directory into one XML file.

    Files are processed in ascending title-number order. Each becomes a
    ``<title number="N">`` element under a ``<usCode>`` root, filled with the
    sections returned by ``parse_html_file``. A failure while processing one
    file is printed and does not stop the remaining files.

    Args:
        directory_path: Directory containing the HTML title files.
        output_file: Path of the combined XML file to write.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    root = ET.Element("usCode")

    # File names must start with PRELIMusc<digits>.htm; the digits are the title number.
    title_pattern = re.compile(r'PRELIMusc(\d+)\.htm')

    # (title number, file name) pairs in ascending order.
    candidates = (
        (title_pattern.match(name), name)
        for name in os.listdir(directory_path)
        if name.endswith('.htm')
    )
    html_files = sorted((int(found.group(1)), name) for found, name in candidates if found)

    for title_num, filename in html_files:
        file_path = os.path.join(directory_path, filename)
        print(f"Processing Title {title_num}: {filename}")

        try:
            # The <title> element is attached before parsing, so it stays in
            # the tree (possibly empty) even if parsing fails.
            title_elem = ET.SubElement(root, "title", {"number": str(title_num)})
            title_elem.extend(parse_html_file(file_path))
            print(f"Successfully processed Title {title_num}")
        except Exception as e:
            print(f"Error processing {filename}: {str(e)}")
            continue

    # Write the combined tree, indented, with an XML declaration.
    try:
        tree = ET.ElementTree(root)
        ET.indent(tree, space="  ")
        tree.write(output_file, encoding='utf-8', xml_declaration=True)
        print(f"\nComplete US Code saved to {output_file}")
    except Exception as e:
        print(f"Error saving XML file: {str(e)}")

# Example usage: convert every matching HTML file in the download directory,
# writing to the function's default output file.
# NOTE(review): absolute, machine-specific path.
if __name__ == "__main__":
    directory_path = "/Users/vaibhavnakrani/usa_laws/htm_uscAll@118-157"
    parse_all_files(directory_path)

In [1]:
from dataclasses import dataclass
import xml.etree.ElementTree as ET

@dataclass
class SectionData:
    """One US Code section prepared for indexing.

    Attributes:
        title: The title number this section belongs to.
        xml_content: Raw XML of the section.
        extracted_text: Formatted text of the section.
    """

    title: str
    xml_content: str
    extracted_text: str

# Load and parse the combined XML produced from the HTML files.
xml_path = "/Users/vaibhavnakrani/usa_laws/uscode/complete_uscode.xml"
tree = ET.parse(xml_path)
root = tree.getroot()


def _child_text(parent, tag, default):
    """Return the text of ``parent``'s direct child ``tag``.

    Falls back to ``default`` when the child is missing or has no text, so the
    formatted output never contains the literal string "None".
    """
    child = parent.find(tag)
    if child is None or child.text is None:
        return default
    return child.text


# Process all sections
all_sections = []
title_section_counts = {}  # Sections per title number

# Only <title number="..."> elements are title containers. Each section also
# has a <title> child (its heading); a plain ".//title" search matches those
# too and records a bogus None entry in title_section_counts.
for title in root.findall(".//title[@number]"):
    title_num = title.get('number')
    sections = title.findall('section')
    title_section_counts[title_num] = len(sections)

    for section in sections:
        # Raw XML for this section, stored as metadata later on.
        xml_string = ET.tostring(section, encoding='unicode', method='xml')

        # Extract the text fields, with placeholders for anything missing.
        number = _child_text(section, 'number', "N/A")
        section_title = _child_text(section, 'title', "N/A")
        content_text = _child_text(section, 'content', "No content available")
        source_text = _child_text(section, 'source', "No source available")

        # Format the text that will be embedded.
        extracted_text = (
            f"Title: {title_num} Section: {number} {section_title}\n"
            f"Content: {content_text}\n"
            f"Source: {source_text}"
        )

        section_data = SectionData(
            title=title_num,
            xml_content=xml_string,
            extracted_text=extracted_text
        )
        all_sections.append(section_data)

In [2]:
from langchain_voyageai import VoyageAIEmbeddings
from langchain_community.vectorstores import LanceDB
import os
from dotenv import load_dotenv
# Pull variables from a local .env file into the process environment.
load_dotenv()

# API key for the embedding service (None when the variable is not set).
voyage_api_key = os.environ.get("VOYAGE_API_KEY")

# Embedding model used for both indexing and querying.
embeddings = VoyageAIEmbeddings(
    voyage_api_key=voyage_api_key,
    model="voyage-3-lite",
    show_progress_bar=True,
    truncation=True,
)

# LanceDB table that stores the section vectors.
vector_store = LanceDB(
    embedding=embeddings,
    table_name="usa_code",
    uri="/Users/vaibhavnakrani/usa_laws/uscode/lance",
    distance="cosine",
)

In [3]:
from langchain.schema import Document

def _section_to_document(section_data):
    """Wrap one SectionData record in a Document.

    The formatted text becomes the page content; the title number and raw
    section XML travel along as metadata.
    """
    return Document(
        page_content=section_data.extracted_text,
        metadata={
            "title": section_data.title,
            "xml_content": section_data.xml_content,
        },
    )


# One Document per collected section, in the same order as all_sections.
documents = list(map(_section_to_document, all_sections))

In [5]:
# Count tokens
# import voyageai

# vo = voyageai.Client()
# # This will automatically use the environment variable VOYAGE_API_KEY.
# # Alternatively, you can use vo = voyageai.Client(api_key="<your secret key>")

# texts = [ 
#     "The Mediterranean diet emphasizes fish, olive oil, and vegetables, believed to reduce chronic diseases.",
#     "Photosynthesis in plants converts light energy into glucose and produces essential oxygen."
# ]

# total_tokens = vo.count_tokens(all_text, model="voyage-3-lite")
# print(total_tokens)

voyage-law-2 = 41 million tokens  
voyage-3-lite = 38 million tokens  

In [None]:
# await vector_store.aadd_documents(documents)

In [5]:
def search_us_code(query: str, k: int = 4):
    """
    Run a semantic-similarity search over the indexed US Code sections.

    Relies on the notebook-level ``vector_store`` created in the setup cell.

    Args:
        query: The search query
        k: Number of results to return

    Returns:
        List of dicts, one per hit, with the keys 'score', 'title',
        'xml_content' and 'text'
    """
    hits = vector_store.similarity_search_with_score(query, k=k)

    return [
        {
            'score': score,
            'title': doc.metadata['title'],
            'xml_content': doc.metadata['xml_content'],
            'text': doc.page_content,
        }
        for doc, score in hits
    ]

# Smoke-test the search with a sample question. The bare `results` on the last
# line makes the notebook display the returned list.
query = "What are the definitions related to National Parks?"
results = search_us_code(query)
results

[{'score': 0.776908278465271,
  'title': '54',
  'xml_content': '<section>\n      <number>100102</number>\n      <title>Definitions</title>\n      <source>(Pub. L. 113–287, §3, Dec. 19, 2014, 128 Stat. 3096.)</source>\n      <content>In this title: (1) Director.—The term "Director" means the Director of the National Park Service. (2) National park system.—The term "National Park System" means the areas of land and water described in section 100501 of this title. (3) Secretary.—The term "Secretary" means the Secretary of the Interior. (4) Service.—The term "Service" means the National Park Service. (5) System.—The term "System" means the National Park System. (6) System unit.—The term "System unit" means one of the areas described in section 100501 of this title.</content>\n      <notes>\n        <note type="Definitions">Pub. L. 114–289, §2, Dec. 16, 2016, 130 Stat. 1482, provided that: "In this Act [see Short Title of 2016 Amendment note set out under section 100101 of this title]: "(1

In [1]:
def search_us_code(query: str, k: int = 4):
    """
    Search the US Code using semantic similarity.

    The query is embedded explicitly and the vector store is searched by
    vector. Relies on the notebook-level ``embeddings`` and ``vector_store``
    objects, so the setup cell that creates them must have been run first
    (otherwise this raises NameError).

    Args:
        query: The search query
        k: Number of results to return

    Returns:
        List of dicts containing score, title, xml_content and text
    """
    # First, convert query to embeddings
    query_embedding = embeddings.embed_query(query)

    # Now search using the embedding vector
    results = vector_store.similarity_search_by_vector_with_relevance_scores(
        query_embedding,
        k=k
    )

    formatted_results = []
    for doc, score in results:
        formatted_results.append({
            'score': score,
            'title': doc.metadata['title'],
            'xml_content': doc.metadata['xml_content'],
            # The documented return value and the code that prints the
            # results both expect the matched text to be present.
            'text': doc.page_content,
        })

    return formatted_results

# Test the function
query = "What are the definitions related to National Parks?"
results = search_us_code(query)

# Print results
for i, result in enumerate(results, 1):
    print(f"\nResult {i} (Score: {result['score']:.4f})")
    print(f"Title: {result['title']}")
    print("-" * 40)
    # Not every version of search_us_code returns a 'text' entry; fall back to
    # the raw section XML instead of raising KeyError.
    preview = result.get('text') or result['xml_content']
    print(preview[:200] + "...")  # Print first 200 chars of the text

NameError: name 'embeddings' is not defined