# Extracting Narrative from HTML to Markdown

This notebook explores extracting narrative elements from HTML files. It expects a downloaded implementation guide (IG) folder structured as `full-ig/site`, which contains the input HTML files, and writes the new Markdown files to `full-ig/markdown_output`. Update the input and output directory paths in the cells below to match your local setup before running.

In [27]:
from bs4 import BeautifulSoup
import os
from pathlib import Path
from IPython.display import display, HTML
from bs4.element import Tag
from langchain_community.document_loaders import AsyncHtmlLoader
from langchain_community.document_loaders import BSHTMLLoader
from langchain_community.document_loaders import UnstructuredHTMLLoader
from langchain_community.document_transformers import MarkdownifyTransformer
from urllib.parse import urlparse
from langchain.schema import Document
import re

## Using the LangChain Markdownify tool to convert HTML to Markdown

Create a directory to store Markdown files.

In [28]:
# Destination folder for the generated Markdown files
output_dir = '/Users/ceadams/Documents/onclaive/onclaive/us-core/markdown_output1'
# Ensure the folder exists, creating any missing parent folders along the way
os.makedirs(output_dir, exist_ok=True)

Install the necessary libraries if they are not already installed.

In [29]:
# Optional one-time setup: uncomment the lines below and run this cell if the
# packages are missing from the kernel's environment.
# %pip install --upgrade --quiet  markdownify
# %pip install -U lxml
# %pip install unstructured

In [30]:
# Exclusions applied when the caller does not pass ``exclude_patterns`` at all.
_DEFAULT_EXCLUDE_PATTERNS = (r'\.ttl\.html$', r'\.xml\.html$')

# Matches a CSS rule such as ``h2 { --heading-prefix: "1.2"``; group 1 is the
# heading tag name, group 2 is the dotted number string.
_HEADING_PREFIX_RE = re.compile(
    r'(h[0-9])\s*\{\s*--heading-prefix\s*:\s*"([0-9]+(?:\.[0-9]+)*)"'
)


def _matches_any(path_str, compiled_patterns):
    """Return True when at least one compiled pattern is found in ``path_str``."""
    return any(pattern.search(path_str) for pattern in compiled_patterns)


def _find_heading_prefix(soup):
    """Return the first ``--heading-prefix`` regex match found in any
    ``<style type="text/css">`` tag of ``soup``, or None when there is none."""
    for style_tag in soup.find_all("style", attrs={'type': 'text/css'}):
        if not style_tag.text:
            continue
        prefix_match = _HEADING_PREFIX_RE.search(style_tag.text)
        if prefix_match:
            return prefix_match
    return None


def _number_headers(soup):
    """Replace every h1-h6 in ``soup`` with a numbered Markdown heading string.

    The numbering is seeded from the ``--heading-prefix`` CSS variable. When no
    prefix is present the soup is left untouched. The soup is modified in place.

    NOTE(review): each heading element is replaced by a plain text node that
    already carries the ``#`` markers; confirm the downstream Markdownify step
    keeps that text on its own line and does not escape the markers.
    """
    prefix_match = _find_heading_prefix(soup)
    if prefix_match is None:
        return

    # Level of the heading the prefix was declared for, and the counters
    # parsed from the dotted prefix (index ``level - 1`` holds a level's counter).
    starting_header_level = int(prefix_match.group(1)[1:])
    prev_level = starting_header_level - 1
    header_list = [int(part) for part in prefix_match.group(2).split('.')]

    for header in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
        header_level = int(header.name[1:])

        if header_level > prev_level:
            # Going deeper: pad with zero counters up to the new depth.
            while len(header_list) < header_level:
                header_list.append(0)
        elif header_level < prev_level:
            # Going shallower: drop the counters of the deeper levels.
            header_list = header_list[:header_level]

        # Advance the counter that belongs to the current level.
        if len(header_list) >= header_level:
            header_list[header_level - 1] += 1
        else:
            header_list.append(1)

        header_number = ".".join(str(count) for count in header_list[:header_level])
        markdown_header = " ".join([
            "#" * header_level,
            header_number,
            header.get_text(strip=True)
        ])

        header.replace_with(markdown_header)
        prev_level = header_level


def convert_local_html_to_markdown(input_dir, output_dir="markdown_output", exclude_patterns=None):
    """
    Convert HTML files from a local directory to markdown, excluding files matching specific patterns.

    The directory tree below ``input_dir`` is searched recursively for ``*.html``
    files. Each file is written to the same relative location under
    ``output_dir`` with its last suffix replaced by ``.md``. Headings are numbered
    first when the page declares a ``--heading-prefix`` CSS variable. A failure
    on one file is reported and counted; it does not stop the run.

    Args:
        input_dir (str): Path to the directory containing HTML files
        output_dir (str): Path to save the markdown files
        exclude_patterns (list): List of regex patterns to exclude. The patterns
            are searched against the full path string of each file. ``None``
            (the default) excludes ``.ttl.html`` and ``.xml.html`` files; an
            empty list excludes nothing.

    Returns:
        None. Progress and a final summary are printed.
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Only fall back to the defaults when nothing was passed: an explicit empty
    # list means "exclude nothing" and must not be replaced by the defaults.
    if exclude_patterns is None:
        exclude_patterns = _DEFAULT_EXCLUDE_PATTERNS
    compiled_patterns = [re.compile(pattern) for pattern in exclude_patterns]

    # Get all HTML files in the directory that are not excluded
    html_files = [
        file for file in Path(input_dir).glob('**/*.html')
        if not _matches_any(str(file), compiled_patterns)
    ]

    print(f"Found {len(html_files)} HTML files to process")

    # Process each HTML file
    processed = 0
    errors = 0
    md_transformer = MarkdownifyTransformer()

    for i, html_file in enumerate(html_files):
        try:
            # Create relative path to preserve directory structure
            rel_path = html_file.relative_to(input_dir)
            output_path = Path(output_dir) / rel_path.with_suffix('.md')

            # Create parent directories if they don't exist
            output_path.parent.mkdir(parents=True, exist_ok=True)

            # Load HTML content
            with open(html_file, 'r', encoding='utf-8') as f:
                html_content = f.read()

            # Header numbering is best-effort: if it fails, convert the
            # untouched HTML instead of losing the whole file.
            try:
                soup = BeautifulSoup(html_content, 'html.parser')
                _number_headers(soup)
                processed_content = str(soup)
            except Exception as header_error:
                print(f"Warning: Header processing failed for {html_file}: {str(header_error)}")
                print("Falling back to original HTML content...")
                processed_content = html_content

            # Create a LangChain Document object with the processed HTML content
            doc = Document(page_content=processed_content)

            # Transform to Markdown
            converted_docs = md_transformer.transform_documents([doc])

            # Write to output file
            if converted_docs:
                with open(output_path, 'w', encoding='utf-8') as f:
                    f.write(converted_docs[0].page_content)
                processed += 1

            # Print progress every 10 files and after the last file
            if (i + 1) % 10 == 0 or i == len(html_files) - 1:
                print(f"Processed {i + 1}/{len(html_files)} files")

        except Exception as e:
            print(f"Error processing {html_file}: {str(e)}")
            errors += 1

    print(f"Conversion complete. Successfully processed {processed} files. Encountered {errors} errors.")

In [32]:
# Where the HTML pages are read from and where the Markdown files are written
input_directory = "/Users/ceadams/Documents/onclaive/onclaive/us-core/site"
output_directory = "/Users/ceadams/Documents/onclaive/onclaive/us-core/markdown_output1"

# Regexes for file-name endings that the conversion should skip
exclude_patterns = [
    r'\.ttl\.html$',               # files ending with .ttl.html
    r'\.xml\.html$',               # files ending with .xml.html
    r'\.json\.html$',              # files ending with .json.html
    r'\.change\.history\.html$',   # files ending with .change.history.html
    r'\.profile\.history\.html$',  # files ending with .profile.history.html
    r'\-example\.html$',           # files ending with -example.html
    r'\-examples\.html$'           # files ending with -examples.html
]

# Convert every remaining HTML file under the input directory
convert_local_html_to_markdown(input_directory, output_directory, exclude_patterns)

Found 2476 HTML files to process
Processed 10/2476 files
Processed 20/2476 files
Processed 30/2476 files
Processed 40/2476 files
Processed 50/2476 files
Processed 60/2476 files
Processed 70/2476 files
Processed 80/2476 files
Processed 90/2476 files
Processed 100/2476 files
Processed 110/2476 files
Processed 120/2476 files
Processed 130/2476 files
Processed 140/2476 files
Processed 150/2476 files
Processed 160/2476 files
Processed 170/2476 files
Processed 180/2476 files
Processed 190/2476 files
Processed 200/2476 files
Processed 210/2476 files
Processed 220/2476 files
Processed 230/2476 files
Processed 240/2476 files
Processed 250/2476 files
Processed 260/2476 files
Processed 270/2476 files
Processed 280/2476 files
Processed 290/2476 files
Processed 300/2476 files
Processed 310/2476 files
Processed 320/2476 files
Processed 330/2476 files
Processed 340/2476 files
Processed 350/2476 files
Processed 360/2476 files
Processed 370/2476 files
Processed 380/2476 files
Processed 390/2476 files
P

The converted Markdown files are written to the output directory configured above (`us-core/markdown_output1` in this run).