# Extracting items from HTML

This notebook extracts elements from HTML files and writes the contents into markdown files. It uses BeautifulSoup to parse the HTML and extract the desired including text, tables, and list elements as well as includes image placeholders. The markdowns are created so that narrative content within the IG can be preserved and concatanated to serve as context for the LLMs. The content is saved in a single markdown file for a specific HTML file. 

In [8]:
from bs4 import BeautifulSoup
import os
from pathlib import Path
from IPython.display import display, HTML
from bs4.element import Tag

We define a class `ContentExtractor` to to extract content from HTML files. This class has methods to extract text, tables, and list elements from the HTML. The extracted content is then formatted as markdown and written to a file. The class has methods that also check to see if elements have been processed to avoid duplicates. From images, there is a method (`_extract_images`) to pull the src and alt text and format as markdown image with additional source info.

In [9]:
class ContextExtractor:
    def __init__(self):
        """Initialize the context extractor"""
        self.processed_elements = set()
        
    def _has_been_processed(self, element):
        """Check if an element has already been processed"""
        if not isinstance(element, Tag):
            return False
        return element.get('data-processed') == 'true'
    
    def _mark_processed(self, element):
        """Mark an element as processed"""
        if isinstance(element, Tag):
            element['data-processed'] = 'true'
            self.processed_elements.add(element)

    def _extract_images(self, element):
        """
        Extract image information including src and alt text
        
        Args:
            element: BeautifulSoup element containing images
        Returns:
            list: Formatted image information in Markdown
        """
        if self._has_been_processed(element):
            return []
            
        images = []
        for img in element.find_all('img', recursive=False):
            src = img.get('src', '')
            alt = img.get('alt', '')
            if src:
                # Format as Markdown image with additional source info
                images.append(f"![{alt}]({src})")
                images.append(f"*Image source: {src}*")
                images.append(f"*Image description: {alt}*")
                images.append("")  # Add blank line after each image
                
        self._mark_processed(element)
        return images

    def _extract_list_items(self, list_element, level=0, parent_type=None):
        """Extract list items with improved nested list handling"""
        if self._has_been_processed(list_element):
            return []
            
        items = []
        for item in list_element.find_all('li', recursive=False):
            if not self._has_been_processed(item):
                # Get direct text content of the li element (excluding nested list text)
                item_text = ''
                for content in item.children:
                    if isinstance(content, Tag):
                        if content.name not in ['ul', 'ol']:
                            if content.name == 'img':
                                # Handle images within list items
                                image_info = self._extract_images(content.parent)
                                items.extend([f"{'    ' * level}{line}" for line in image_info])
                            else:
                                item_text += content.get_text(strip=True) + ' '
                    else:
                        item_text += content.strip() + ' '
                item_text = item_text.strip()
                
                # Format the list item
                prefix = '    ' * level
                if list_element.name == 'ol':
                    items.append(f"{prefix}1. {item_text}")
                else:
                    items.append(f"{prefix}- {item_text}")
                
                # Handle nested lists
                nested_lists = item.find_all(['ul', 'ol'], recursive=False)
                for nested_list in nested_lists:
                    nested_items = []
                    for nested_item in nested_list.find_all('li', recursive=False):
                        nested_text = nested_item.get_text(strip=True)
                        if nested_list.name == 'ol':
                            nested_items.append(f"{prefix}    * {nested_text}")
                        else:
                            nested_items.append(f"{prefix}    * {nested_text}")
                    items.extend(nested_items)
                
                self._mark_processed(item)
        
        self._mark_processed(list_element)
        return items

    def _extract_table(self, table):
        """Extract table content in Markdown format"""
        if self._has_been_processed(table):
            return ""
            
        rows = []
        headers = []
        
        header_row = table.find('thead') or table.find('tr')
        if header_row:
            headers = [cell.get_text(strip=True) for cell in header_row.find_all(['th', 'td'])]
        
        for row in table.find_all('tr')[1:] if headers else table.find_all('tr'):
            row_data = [cell.get_text(strip=True) for cell in row.find_all(['td', 'th'])]
            if any(row_data):
                rows.append(row_data)
        
        table_str = []
        if headers:
            table_str.append("| " + " | ".join(headers) + " |")
            table_str.append("|" + "|".join([" --- " for _ in headers]) + "|")
        
        for row in rows:
            if headers:
                row.extend([''] * (len(headers) - len(row)))
            table_str.append("| " + " | ".join(row) + " |")
        
        self._mark_processed(table)
        return "\n".join(table_str)

    def extract_context(self, html_content):
        """Extract content with improved list and image handling"""
        soup = BeautifulSoup(html_content, 'html.parser')
        self.processed_elements.clear()
        context_elements = []
        
        # Remove script and style elements
        for script in soup(['script', 'style', 'nav', 'footer']):
            script.decompose()
        
        # Process headers and their content
        for header in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
            if not self._has_been_processed(header):
                level = int(header.name[1])
                header_text = header.get_text().strip()
                
                if header_text:
                    context_elements.append(f"\n{'#' * level} {header_text}\n")
                    self._mark_processed(header)
                    
                    # Process content until next header
                    next_element = header.find_next()
                    while next_element and not next_element.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
                        if not self._has_been_processed(next_element):
                            if next_element.name == 'p':
                                # Handle images within paragraphs
                                if next_element.find('img'):
                                    image_info = self._extract_images(next_element)
                                    context_elements.extend(image_info)
                                else:
                                    text = next_element.get_text().strip()
                                    if text:
                                        context_elements.append(text)
                                        context_elements.append("")
                            elif next_element.name in ['ul', 'ol']:
                                list_items = self._extract_list_items(next_element)
                                if list_items:
                                    context_elements.extend(list_items)
                                    context_elements.append("")
                            elif next_element.name == 'table':
                                table_content = self._extract_table(next_element)
                                if table_content:
                                    context_elements.append(table_content)
                                    context_elements.append("")
                        
                        next_element = next_element.find_next()

        # Process any remaining top-level images
        for img_container in soup.find_all('p'):
            if img_container.find('img') and not self._has_been_processed(img_container):
                image_info = self._extract_images(img_container)
                if image_info:
                    context_elements.extend(image_info)
        
        # Clean up repeated empty lines
        cleaned_elements = []
        prev_empty = False
        for element in context_elements:
            if element.strip() == "":
                if not prev_empty:
                    cleaned_elements.append(element)
                    prev_empty = True
            else:
                cleaned_elements.append(element)
                prev_empty = False
        
        return cleaned_elements

    def save_context(self, context_elements, output_file):
        """Save context to Markdown file"""
        with open(output_file, 'w', encoding='utf-8') as f:
            for element in context_elements:
                f.write(f"{element}\n")

    def process_html_file(self, input_file, output_file):
        """Process HTML file to Markdown"""
        try:
            output_file = os.path.splitext(output_file)[0] + '.md'
            
            with open(input_file, 'r', encoding='utf-8') as f:
                html_content = f.read()
            
            context_elements = self.extract_context(html_content)
            self.save_context(context_elements, output_file)
            self._display_summary(input_file, output_file, len(context_elements))
            
            return context_elements
            
        except Exception as e:
            display(HTML(f'<div style="color: red;">Error processing file: {str(e)}</div>'))
            return []

    def process_directory(self, input_dir, output_dir):
        """Process directory of HTML files"""
        Path(output_dir).mkdir(parents=True, exist_ok=True)
        
        for file in Path(input_dir).glob('*.html'):
            output_file = Path(output_dir) / f"{file.stem}.md"
            self.process_html_file(str(file), str(output_file))

    def _display_summary(self, input_file, output_file, num_elements):
        """Display processing summary"""
        summary_html = f"""
        <div style="background-color: #f0f0f0; padding: 10px; border-radius: 5px; margin: 10px 0;">
            <p><strong>Processed file:</strong> {os.path.basename(input_file)}</p>
            <p><strong>Output saved to:</strong> {os.path.basename(output_file)}</p>
            <p><strong>Extracted elements:</strong> {num_elements}</p>
        </div>
        """
        display(HTML(summary_html))

`ContentExtractor` is called by creating an instance of the class and then calling the `process_html_file` or `process_directory` method. The `process_html_file` method takes two arguments: the path to the input HTML file and the desired name of the output Markdown file. The `process_directory` method takes two arguments: the path to the input directory containing HTML files and the path to the output directory where the Markdown files will be saved.

In [10]:
extractor = ContextExtractor()
context = extractor.process_html_file(
    input_file='/Users/amathur/Documents/ONCLAIVE/onclaive-aanchalwip/PlanNet/site/index.html',
    output_file='context'
)

In [11]:
for i, element in enumerate(context[:5]):  # Show first 5 elements
    print(f"\nElement {i+1}:")
    print(element[:200] + "..." if len(element) > 200 else element)


Element 1:

## Home


Element 2:
| Official URL:http://hl7.org/fhir/us/davinci-pdex-plan-net/ImplementationGuide/hl7.fhir.us.davinci-pdex-plan-net | Version:1.1.0 |
| --- | --- |
| Active
          
            as of 2022-04-04 | Com...

Element 3:


Element 4:

### PDEX Payer Network Implementation Guide


Element 5:

#### Introduction

