In [None]:
# make the soup (load doc)

from bs4 import BeautifulSoup

# Location of the saved HTML document to process
file_path = 'sf_charter_04182024.html'

# Read the entire document into memory as UTF-8 text
with open(file_path, mode='r', encoding='utf-8') as html_file:
    content = html_file.read()

# Build the parse tree, using lxml as the parser backend
soup = BeautifulSoup(markup=content, features='lxml')

In [None]:
########################
### CHOP UP THE SOUP ###
########################
# Remove the content sections we don't want for now.
# Hacking and slashing; not necessarily an endorsed approach.
#
# NOTE: this cell mutates `soup` in place. Once a section has been removed its
# anchor no longer exists, so re-running this cell without first rebuilding
# `soup` raises the ValueError described below.


def _anchor_div(soup, name, attrs):
    """Return the <div> that anchors a section of the document.

    Finds the first element matching ``name``/``attrs``. When that element is
    itself a <div> it is returned unchanged; otherwise its nearest enclosing
    <div> is returned.

    Raises:
        ValueError: if no element matches, or the match has no enclosing <div>.
    """
    match = soup.find(name, attrs)
    if match is None:
        raise ValueError(
            f"No <{name}> element matching {attrs!r} was found "
            "(was this section already removed from the soup?)"
        )
    if match.name == 'div':
        return match
    enclosing_div = match.find_parent('div')
    if enclosing_div is None:
        raise ValueError(
            f"The <{name}> element matching {attrs!r} is not inside any <div>"
        )
    return enclosing_div


def _top_level_div(div):
    """Return the outermost <div> ancestor of ``div``, or ``div`` itself if it has none."""
    # find_parents() walks upward, so the last entry is the outermost <div>
    div_ancestors = div.find_parents('div')
    return div_ancestors[-1] if div_ancestors else div


def _remove_section(soup, name, attrs, and_following=False):
    """Delete the top-level <div> that contains the matched anchor element.

    Args:
        soup: parsed document to modify in place.
        name: tag name of the anchor element to look up.
        attrs: attribute filter identifying the anchor element.
        and_following: when True, also delete every <div> sibling that comes
            after the top-level <div>.

    Raises:
        ValueError: if the anchor element cannot be located.
    """
    node = _top_level_div(_anchor_div(soup, name, attrs))
    if not and_following:
        node.decompose()
        return
    while node is not None:
        # Look up the next sibling first: decompose() detaches the node
        following = node.find_next_sibling('div')
        node.decompose()
        node = following


############################
### REMOVE CHARTER INTRO ###
############################
# Anchored on the <a> element with title="Charter"
_remove_section(soup, 'a', {'title': 'Charter'})

##############################
### REMOVE MUNICIPAL CODES ###
##############################
# Anchored on the div with id="rid-0-0-0-52172"
# Even more hacky because there's no unique title
_remove_section(soup, 'div', {'id': 'rid-0-0-0-52172'})

######################
### REMOVE PREFACE ###
######################
# Anchored on the div with id="rid-0-0-0-8"
# Even more hacky because there's no unique title
_remove_section(soup, 'div', {'id': 'rid-0-0-0-8'})

#########################
### REMOVE APPENDICES ###
#########################
# Anchored on the <a> element with title="Charter Appendices";
# that top-level div and every div sibling after it are removed
_remove_section(soup, 'a', {'title': 'Charter Appendices'}, and_following=True)

#########################
### REMOVE ARTICLE 18 ###
#########################
# Anchored on the <a> element with title="Article XVIII";
# that top-level div and every div sibling after it are removed
_remove_section(soup, 'a', {'title': 'Article XVIII'}, and_following=True)

In [None]:
#########################
### CLEAN UP THE SOUP ###
#########################
# Strip out HTML components that are not wanted in the output

# Elements discarded together with everything nested inside them:
# every <head>, every <annotationdrawer>, every <div class="clearfix">
structural_removals = (
    ('head', {}),
    ('annotationdrawer', {}),
    ('div', {'class_': 'clearfix'}),
)
for tag_name, filters in structural_removals:
    for node in soup.find_all(tag_name, **filters):
        node.decompose()

# Strip the 'style' attribute from every element that carries one
for styled in soup.find_all(style=True):
    del styled['style']

### Delete all tables ###
# This removes the table of contents of each Article, which is the goal.
# It also removes 4 tables (16.116, 16.117, 16.118, 16.119); accepted loss.
table_removals = (
    ('div', {'class_': 'xsl-table'}),
    ('scrolltable', {}),
)
for tag_name, filters in table_removals:
    for node in soup.find_all(tag_name, **filters):
        node.decompose()

### Remove links ###
# Swap each <a class="Web"> for its plain text content
for web_link in soup.find_all('a', class_='Web'):
    web_link.replace_with(web_link.get_text())

# Discard each <link class="Jump"> element outright
# NOTE(review): an earlier comment described this step as "replace jump links
# with link text", but the code decomposes the matched <link> elements rather
# than replacing them -- confirm this is the intended result.
for jump_link in soup.find_all('link', class_="Jump"):
    jump_link.decompose()

print(soup.prettify())

In [None]:
# find unique html class values 

def load_html_and_find_classes(soup):
    """Return the set of distinct class names used by elements in ``soup``.

    Despite the name, nothing is loaded here: ``soup`` is queried with
    ``find_all(class_=True)`` and the ``'class'`` value of every match is
    merged into one set. Matches whose ``'class'`` value is empty are skipped.
    """
    return {
        class_name
        for tag in soup.find_all(class_=True)
        # `or ()` skips falsy 'class' values instead of iterating them
        for class_name in (tag['class'] or ())
    }

# Collect the distinct class names still present in the trimmed document
unique_classes = load_html_and_find_classes(soup)
# Print in sorted order: the iteration order of a set of strings is not stable
# across kernel restarts, which makes the saved output differ for no reason
print("Unique HTML classes found:", sorted(unique_classes))

In [None]:
# Inventory the distinct values of the 'title' attribute in the document

# Gather every element that carries a 'title' attribute
elements_with_title = soup.find_all(attrs={"title": True})

# Deduplicate the attribute values, then order them with a plain string sort
unique_titles = list({tag['title'] for tag in elements_with_title})
unique_titles.sort()

# How many distinct values were found
total_unique_titles = len(unique_titles)

# Report the values and their count
print("Unique 'title' attribute values found:", unique_titles)
print("Total number of unique 'title' attribute values:", total_unique_titles)

In [None]:
# Remove unwanted titles
# - empty (or whitespace-only) titles
# - anything containing "Note"

# Buckets for the rejected titles
titles_note = []
t_empty_count = 0

filtered_unique_titles = set()
for title in unique_titles:

    # Set aside titles containing the "Note" substring
    if "Note" in title:
        titles_note.append(title)
        continue

    # Count and skip titles that are empty once whitespace is stripped
    if not title.strip():
        t_empty_count += 1
        continue

    # Everything else is a good title
    filtered_unique_titles.add(title)

# Print info
# The good titles are kept in a set, so sort them for display; printing the
# set directly would show them in an arbitrary order
print("Unique good titles:", sorted(filtered_unique_titles))
print("Total unique good titles:", len(filtered_unique_titles))
print("Total empty titles:", t_empty_count)
print("Titles containing 'Note':", titles_note)
print("Total titles containing 'Note':", len(titles_note))