# Convert HTML to Markdown

This notebook converts raw HTML files from `data/raw` to Markdown format using docling and saves them to `data/processed`.

In [2]:
from pathlib import Path
from docling.document_converter import DocumentConverter

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Define directories
raw_dir = Path("../data/raw/health_education")
processed_dir = Path("../data/processed/health_education")
processed_dir.mkdir(parents=True, exist_ok=True)

In [4]:
# Initialize docling converter
converter = DocumentConverter()

In [5]:
def convert_html_to_markdown(html_path, output_path):
    """
    Converts an HTML file to Markdown using docling.
    
    Args:
        html_path: Path to the input HTML file
        output_path: Path where to save the Markdown file
    """
    try:
        print(f"Converting: {html_path}")
        
        # Convert HTML to document
        result = converter.convert(str(html_path))
        
        # Get markdown content
        markdown_content = result.document.export_to_markdown()
        
        # Create output directory if it doesn't exist
        output_path.parent.mkdir(parents=True, exist_ok=True)
        
        # Save markdown file
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(markdown_content)
        
        print(f"✓ Saved to: {output_path}")
        print(f"  Size: {len(markdown_content)} characters\n")
        return True
    except Exception as e:
        print(f"✗ Error converting {html_path}: {e}\n")
        import traceback
        traceback.print_exc()
        return False

In [6]:
# Find all HTML files in the raw directory
html_files = list(raw_dir.rglob("*.html"))
print(f"Found {len(html_files)} HTML files to convert\n")

Found 4 HTML files to convert



In [7]:
# Convert all HTML files to Markdown
results = []

for html_path in html_files:
    # Calculate relative path from raw_dir
    relative_path = html_path.relative_to(raw_dir)
    
    # Create corresponding output path with .md extension
    output_path = processed_dir / relative_path.with_suffix('.md')
    
    success = convert_html_to_markdown(html_path, output_path)
    
    results.append({
        "input": str(html_path),
        "output": str(output_path),
        "success": success
    })

2025-11-15 16:27:59,935 - INFO - detected formats: [<InputFormat.HTML: 'html'>]
2025-11-15 16:28:00,071 - INFO - Going to convert document batch...
2025-11-15 16:28:00,073 - INFO - Initializing pipeline for SimplePipeline with options hash 995a146ad601044538e6a923bea22f4e
2025-11-15 16:28:00,088 - INFO - Loading plugin 'docling_defaults'
2025-11-15 16:28:00,094 - INFO - Registered picture descriptions: ['vlm', 'api']
2025-11-15 16:28:00,096 - INFO - Processing document hypertension_medlineplus_overview.html


Converting: ../data/raw/health_education/conditions/hypertension/hypertension_medlineplus_overview.html


2025-11-15 16:28:00,254 - INFO - Finished converting document hypertension_medlineplus_overview.html in 0.32 sec.
2025-11-15 16:28:00,458 - INFO - detected formats: [<InputFormat.HTML: 'html'>]
2025-11-15 16:28:00,551 - INFO - Going to convert document batch...
2025-11-15 16:28:00,553 - INFO - Processing document diabetes_medlineplus_overview.html


✓ Saved to: ../data/processed/health_education/conditions/hypertension/hypertension_medlineplus_overview.md
  Size: 25783 characters

Converting: ../data/raw/health_education/conditions/diabetes/diabetes_medlineplus_overview.html


2025-11-15 16:28:00,713 - INFO - Finished converting document diabetes_medlineplus_overview.html in 0.26 sec.
2025-11-15 16:28:00,917 - INFO - detected formats: [<InputFormat.HTML: 'html'>]
2025-11-15 16:28:00,973 - INFO - Going to convert document batch...
2025-11-15 16:28:00,974 - INFO - Processing document hypertension_medlineplus_treatment.html
2025-11-15 16:28:01,052 - INFO - Finished converting document hypertension_medlineplus_treatment.html in 0.14 sec.


✓ Saved to: ../data/processed/health_education/conditions/diabetes/diabetes_medlineplus_overview.md
  Size: 28650 characters

Converting: ../data/raw/health_education/treatments/hypertension/hypertension_medlineplus_treatment.html


2025-11-15 16:28:01,133 - INFO - detected formats: [<InputFormat.HTML: 'html'>]
2025-11-15 16:28:01,191 - INFO - Going to convert document batch...
2025-11-15 16:28:01,194 - INFO - Processing document diabetes_medlineplus_treatment.html
2025-11-15 16:28:01,290 - INFO - Finished converting document diabetes_medlineplus_treatment.html in 0.16 sec.


✓ Saved to: ../data/processed/health_education/treatments/hypertension/hypertension_medlineplus_treatment.md
  Size: 12851 characters

Converting: ../data/raw/health_education/treatments/diabetes/diabetes_medlineplus_treatment.html
✓ Saved to: ../data/processed/health_education/treatments/diabetes/diabetes_medlineplus_treatment.md
  Size: 13890 characters



In [8]:
# Summary of conversions
print("=" * 60)
print("CONVERSION SUMMARY")
print("=" * 60)

successful = sum(1 for r in results if r["success"])
total = len(results)

print(f"\nTotal: {total} files")
print(f"Successful: {successful}")
print(f"Failed: {total - successful}")

print("\n" + "-" * 60)
print("Converted files:")
print("-" * 60)

for result in results:
    status = "✓" if result["success"] else "✗"
    print(f"{status} {result['output']}")

CONVERSION SUMMARY

Total: 4 files
Successful: 4
Failed: 0

------------------------------------------------------------
Converted files:
------------------------------------------------------------
✓ ../data/processed/health_education/conditions/hypertension/hypertension_medlineplus_overview.md
✓ ../data/processed/health_education/conditions/diabetes/diabetes_medlineplus_overview.md
✓ ../data/processed/health_education/treatments/hypertension/hypertension_medlineplus_treatment.md
✓ ../data/processed/health_education/treatments/diabetes/diabetes_medlineplus_treatment.md


In [9]:
# Verify that markdown files were created correctly
print("\n" + "=" * 60)
print("FILE VERIFICATION")
print("=" * 60)

for result in results:
    if result["success"]:
        output_path = Path(result["output"])
        if output_path.exists():
            size = output_path.stat().st_size
            print(f"✓ {output_path.name}: {size:,} bytes")
        else:
            print(f"✗ {output_path.name}: File not found")


FILE VERIFICATION
✓ hypertension_medlineplus_overview.md: 25,783 bytes
✓ diabetes_medlineplus_overview.md: 28,650 bytes
✓ hypertension_medlineplus_treatment.md: 12,851 bytes
✓ diabetes_medlineplus_treatment.md: 13,890 bytes
