# Extracting Narrative from HTML to Markdown

This notebook explores extracting narrative elements from HTML files. It relies on a downloaded IG folder structured as `full-ig/site`, which contains the input HTML files, and it writes the new Markdown files to `full-ig/markdown_output`. The input and output directory paths in the code cells below are machine-specific and need to be updated before running.

In [13]:
from bs4 import BeautifulSoup
import os
from pathlib import Path
from IPython.display import display, HTML
from bs4.element import Tag
from langchain_community.document_loaders import AsyncHtmlLoader
from langchain_community.document_loaders import BSHTMLLoader
from langchain_community.document_loaders import UnstructuredHTMLLoader
from langchain_community.document_transformers import MarkdownifyTransformer
from urllib.parse import urlparse
from langchain.schema import Document
import re

## LangChain tool (Markdownify) to convert HTML to Markdown

Create a directory to store Markdown files.

In [14]:
# Destination folder for the generated Markdown files
output_dir = '/Users/ceadams/Documents/onclaive/onclaive/full-ig/markdown_output'
# Ensure the folder (and any missing parent folders) exists; no error if it is already there
os.makedirs(output_dir, exist_ok=True)

Install the necessary libraries if they are not already installed (uncomment the lines in the next cell to run them).

In [15]:
# Uncomment the lines below to install any packages missing from the kernel's environment.
# %pip install --upgrade --quiet markdownify
# %pip install -U lxml
# %pip install unstructured

In [20]:
def convert_local_html_to_markdown(input_dir, output_dir="markdown_output", exclude_patterns=None):
    """
    Convert HTML files from a local directory tree to Markdown files.

    Every ``*.html`` file found recursively under ``input_dir`` is read as
    UTF-8, wrapped in a ``Document``, converted with ``MarkdownifyTransformer``
    and written below ``output_dir`` at the same relative path, with the final
    ``.html`` suffix replaced by ``.md``. A failure on one file is printed and
    counted; it does not stop the run.

    Args:
        input_dir (str): Path to the directory containing HTML files
        output_dir (str): Path to save the markdown files
        exclude_patterns (list): Regex patterns to exclude. A file is skipped
            when any pattern matches (``re.search``) its path string, which
            includes ``input_dir``. ``None`` (the default) excludes
            ``.ttl.html`` and ``.xml.html`` files; an empty list excludes
            nothing.

    Returns:
        None. Progress and a final summary are printed to stdout.
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Only fall back to the defaults when the caller passed nothing at all;
    # an explicit empty list means "do not exclude anything".
    if exclude_patterns is None:
        exclude_patterns = [r'\.ttl\.html$', r'\.xml\.html$']
    compiled_patterns = [re.compile(pattern) for pattern in exclude_patterns]

    # Collect the HTML files to convert. Sorting makes the processing order
    # (and therefore the printed log) reproducible across runs; is_file()
    # skips directories that merely happen to be named "*.html".
    html_files = sorted(
        path
        for path in Path(input_dir).glob('**/*.html')
        if path.is_file()
        and not any(pattern.search(str(path)) for pattern in compiled_patterns)
    )
    total = len(html_files)

    print(f"Found {total} HTML files to process")

    # Process each HTML file
    processed = 0
    errors = 0
    md_transformer = MarkdownifyTransformer()

    for position, html_file in enumerate(html_files, start=1):
        try:
            # Mirror the input directory structure below output_dir
            rel_path = html_file.relative_to(input_dir)
            output_path = Path(output_dir) / rel_path.with_suffix('.md')

            # Create parent directories if they don't exist
            output_path.parent.mkdir(parents=True, exist_ok=True)

            # Load HTML content
            with open(html_file, 'r', encoding='utf-8') as f:
                html_content = f.read()

            # Wrap the raw HTML in a Document and transform it to Markdown
            doc = Document(page_content=html_content)
            converted_docs = md_transformer.transform_documents([doc])

            # Write to output file (nothing is written if the transformer
            # returned no documents)
            if converted_docs:
                with open(output_path, 'w', encoding='utf-8') as f:
                    f.write(converted_docs[0].page_content)
                processed += 1

        except Exception as e:
            # Deliberate best-effort: report the failing file and keep going
            print(f"Error processing {html_file}: {str(e)}")
            errors += 1

        # Report progress every 10 files and after the last one. This sits
        # outside the try block so a failing file cannot suppress the report.
        if position % 10 == 0 or position == total:
            print(f"Processed {position}/{total} files")

    print(f"Conversion complete. Successfully processed {processed} files. Encountered {errors} errors.")

In [21]:
# Source folder (downloaded site) and destination folder (Markdown output)
input_directory = "/Users/ceadams/Documents/onclaive/onclaive/full-ig/site"
output_directory = "/Users/ceadams/Documents/onclaive/onclaive/full-ig/markdown_output"

# Regexes checked against each file path; any match skips that file
exclude_patterns = [
    r'\.ttl\.html$',   # paths ending in .ttl.html
    r'\.xml\.html$',   # paths ending in .xml.html
    r'\.json\.html$',  # paths ending in .json.html
]

# Run the conversion over the whole input tree
convert_local_html_to_markdown(input_directory, output_directory, exclude_patterns)

Found 456 HTML files to process
Processed 10/456 files
Processed 20/456 files
Processed 30/456 files
Processed 40/456 files
Processed 50/456 files
Processed 60/456 files
Processed 70/456 files
Processed 80/456 files
Processed 90/456 files
Processed 100/456 files
Processed 110/456 files
Processed 120/456 files
Processed 130/456 files
Processed 140/456 files
Processed 150/456 files
Processed 160/456 files
Processed 170/456 files
Processed 180/456 files
Processed 190/456 files
Processed 200/456 files
Processed 210/456 files
Processed 220/456 files
Processed 230/456 files
Processed 240/456 files
Processed 250/456 files
Processed 260/456 files
Processed 270/456 files
Processed 280/456 files
Processed 290/456 files
Processed 300/456 files
Processed 310/456 files
Processed 320/456 files
Processed 330/456 files
Processed 340/456 files
Processed 350/456 files
Processed 360/456 files
Processed 370/456 files
Processed 380/456 files
Processed 390/456 files
Processed 400/456 files
Processed 410/456

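The converted Markdown files are written to the `full-ig/markdown_output` folder (the `output_directory` set above), mirroring the directory structure of the input folder.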