In [None]:
# Relative path to the sample HTML document used by the parsing examples in this notebook.
FILE_NAME = 'data/html/sample-html.html'


# methods for HTML parsing and chunking 
1. BeautifulSoup
2. unstructured.io
3. langchain 

# BeautifulSoup

In [None]:
from bs4 import BeautifulSoup
import re

# Read the raw markup as UTF-8; the file only needs to stay open for the read.
with open(FILE_NAME, encoding='utf-8') as f:
    html = f.read()

# Parse the markup with the 'html.parser' backend and print the
# re-indented document produced by prettify().
soup = BeautifulSoup(html, 'html.parser')
print(soup.prettify())

# unstructured.io

In [None]:
from unstructured.partition.auto import partition

# Partition the HTML file into a list of elements and print that list.
# `elements` is reused by the inspection and chunking cells that follow.
elements = partition(filename=FILE_NAME)
print(elements)

In [None]:
# Print a one-line preview (index, category, length, first 100 characters)
# for every element produced by partition().
print("Number of elements: ", len(elements))
for i, element in enumerate(elements):
    if element.category == 'Table':
        # Prefer the HTML rendering of a table. Fall back to the plain text
        # when no HTML is attached, so len() below never receives None.
        chunk_text = element.metadata.text_as_html or element.text
    elif element.category == 'Title':
        # Mark titles with a Markdown-style heading prefix.
        chunk_text = "# " + element.text
    else:
        chunk_text = element.text
    print(f'element {i} ({element.category}): Chunk len ({len(chunk_text)}) {chunk_text[:100]}...')
    

In [None]:
from unstructured.chunking.title import chunk_by_title
from unstructured.cleaners.core import clean

# Chunking limits (character counts) forwarded to chunk_by_title below.
MAX_CHARACTERS = 1500
# NOTE(review): this soft limit is larger than MAX_CHARACTERS; a soft limit
# above the hard maximum presumably has no effect - confirm against the
# unstructured chunking documentation and lower it if a soft limit is wanted.
NEW_AFTER_N_CHARS = 2000
COMBINE_UNDER_N_CHARS = 100

# Group the partitioned elements into title-delimited chunks.
chunks = chunk_by_title(
    elements,
    multipage_sections=True,
    max_characters=MAX_CHARACTERS,
    new_after_n_chars=NEW_AFTER_N_CHARS,
    combine_text_under_n_chars=COMBINE_UNDER_N_CHARS,
)

# Collect the text of every chunk and print a short preview of each one.
chunks_list = []
for i, chunk in enumerate(chunks):
    if chunk.category == 'Table':
        # Prefer the HTML rendering of a table. Fall back to the plain text
        # when no HTML is attached, so len() below never receives None.
        chunk_text = chunk.metadata.text_as_html or chunk.text
    else:
        chunk_text = chunk.text
    chunks_list.append(chunk_text)
    print(f'Chunk {i} ({chunk.category}): Chunk len ({len(chunk_text)}) {chunk_text[:100]}...')

# Join with a blank line so the last word of one chunk is not fused with the
# first word of the next; the whitespace cleanup below is expected to
# normalise the separator.
out_text = '\n\n'.join(chunks_list)

cleaned_text = clean(out_text, extra_whitespace=True)
print(cleaned_text)
print(f'Number of chunks: {len(chunks)}')

# langchain

## HTML Loader

In [None]:
from langchain_community.document_loaders import UnstructuredHTMLLoader

# Load the sample file with UnstructuredHTMLLoader and print what load() returns.
loader = UnstructuredHTMLLoader(file_path=FILE_NAME)
data = loader.load()
print(data)

## Loading HTML with BeautifulSoup4

In [None]:
from langchain_community.document_loaders import BSHTMLLoader

# Load the sample file with BSHTMLLoader, opening it explicitly as UTF-8.
loader_bs4 = BSHTMLLoader(file_path=FILE_NAME, open_encoding='utf-8')
data_bs4 = loader_bs4.load()

# Bare last expression so the notebook displays the loaded documents.
data_bs4

## Loading HTML with AzureAIDocumentIntelligenceLoader

In [None]:
# Setup (uncomment and run once if the packages are missing):
# %pip install --upgrade --quiet  langchain langchain-community azure-ai-documentintelligence

import os
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader
from dotenv import load_dotenv

# Read variables from a .env file (if one is present) into the environment.
load_dotenv()

endpoint = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")
key = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ADMIN_KEY")

# os.getenv returns None for unset variables; stop here with a clear message
# instead of handing None to the loader.
missing_settings = [
    name
    for name, value in (
        ("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT", endpoint),
        ("AZURE_DOCUMENT_INTELLIGENCE_ADMIN_KEY", key),
    )
    if not value
]
if missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_settings)
    )

# Analyse the sample file with the "prebuilt-layout" model.
loader = AzureAIDocumentIntelligenceLoader(
    api_endpoint=endpoint, api_key=key, file_path=FILE_NAME, api_model="prebuilt-layout"
)

documents = loader.load()
print(documents)