# Website Content Extraction

This notebook extracts all content from the old website HTML and organizes it into:
- Individual markdown files per section
- Downloaded images organized by section folders

## Sections to Extract:
1. Hero section
2. Before/After with handles
3. Before/After stats only
4. Client testimonials/results
5. How it works
6. FAQ
7. Pricing/Packages


## Setup & Dependencies


In [1]:
# Import required libraries
from bs4 import BeautifulSoup
import requests
from pathlib import Path
import urllib.parse
import re
import json
from collections import defaultdict

# Setup directories
base_dir = Path('extracted_content')
images_dir = base_dir / 'images'

# Create directory structure
base_dir.mkdir(exist_ok=True)
images_dir.mkdir(exist_ok=True)

# Image tracking
image_manifest = defaultdict(list)

print("✓ Directories created")
print(f"  - Base: {base_dir.absolute()}")
print(f"  - Images: {images_dir.absolute()}")


✓ Directories created
  - Base: /Users/marclamy/Documents - Local/Code/external/pblaunch/python/extracted_content
  - Images: /Users/marclamy/Documents - Local/Code/external/pblaunch/python/extracted_content/images




## Load & Parse HTML


In [2]:
# Load the HTML file
html_file = Path('old-website.html')

with open(html_file, 'r', encoding='utf-8') as f:
    html_content = f.read()

# Parse with BeautifulSoup
soup = BeautifulSoup(html_content, 'lxml')

print(f"✓ HTML loaded and parsed")
print(f"  - File size: {len(html_content):,} characters")
print(f"  - Total elements: {len(soup.find_all())}")


✓ HTML loaded and parsed
  - File size: 640,176 characters
  - Total elements: 1770


## Helper Functions


In [3]:
def download_image(url, section_name, img_index):
    """Download an image from URL and save it to the appropriate section folder"""
    if not url or url.startswith('data:'):
        # Skip data URLs (base64 encoded images)
        return None
    
    try:
        # Create section folder
        section_folder = images_dir / section_name
        section_folder.mkdir(exist_ok=True)
        
        # Get file extension from URL or default to jpg
        parsed = urllib.parse.urlparse(url)
        ext = Path(parsed.path).suffix or '.jpg'
        if ext not in ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg']:
            ext = '.jpg'
        
        # Create filename
        filename = f"{section_name}-{img_index}{ext}"
        filepath = section_folder / filename
        
        # Download if URL is complete
        if url.startswith('http'):
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            
            with open(filepath, 'wb') as f:
                f.write(response.content)
            
            rel_path = f"images/{section_name}/{filename}"
            image_manifest[section_name].append(rel_path)
            return rel_path
        else:
            print(f"  ⚠ Skipping relative URL: {url[:100]}")
            return None
            
    except Exception as e:
        print(f"  ✗ Error downloading {url[:100]}: {str(e)}")
        return None

def extract_text(element):
    """Extract and clean text from an element"""
    if not element:
        return ""
    return element.get_text(strip=True, separator=' ')

def find_section_by_text(soup, text_snippet):
    """Find a section containing specific text"""
    elements = soup.find_all(string=re.compile(text_snippet, re.IGNORECASE))
    if elements:
        # Return the parent section
        for elem in elements:
            parent = elem.find_parent(['div', 'section'])
            if parent:
                return parent
    return None

print("✓ Helper functions defined")


✓ Helper functions defined


## Extract Hero Section


In [4]:
print("Extracting Hero Section...")

# Find hero section by looking for main heading
hero_section = find_section_by_text(soup, "GROW & SELL ON SOCIAL MEDIA")

hero_content = {
    'heading': '',
    'subheading': '',
    'description': '',
    'cta': '',
    'images': []
}

if hero_section:
    # Extract headings
    headings = hero_section.find_all(['h1', 'h2', 'h3'])
    if headings:
        hero_content['heading'] = extract_text(headings[0])
        if len(headings) > 1:
            hero_content['subheading'] = extract_text(headings[1])
    
    # Extract description/paragraph text
    paragraphs = hero_section.find_all('p')
    if paragraphs:
        hero_content['description'] = '\n\n'.join([extract_text(p) for p in paragraphs if extract_text(p)])
    
    # Extract CTA buttons/links
    buttons = hero_section.find_all(['a', 'button'])
    cta_texts = [extract_text(btn) for btn in buttons if extract_text(btn)]
    if cta_texts:
        hero_content['cta'] = cta_texts[0]
    
    # Download images
    images = hero_section.find_all('img')
    print(f"  Found {len(images)} images")
    for idx, img in enumerate(images, 1):
        img_url = img.get('src') or img.get('data-src')
        if img_url:
            downloaded = download_image(img_url, 'hero', idx)
            if downloaded:
                hero_content['images'].append(downloaded)
                print(f"  ✓ Downloaded: {downloaded}")

print(f"✓ Hero section extracted")
print(f"  - Heading: {hero_content['heading'][:50]}..." if hero_content['heading'] else "  - No heading found")


Extracting Hero Section...
  Found 0 images
✓ Hero section extracted
  - Heading: GROW & SELL ON SOCIAL MEDIA...


## Extract Before/After Sections


In [5]:
print("Extracting Before/After Sections...")

# Look for sections with "CLIENT BEFORE & AFTER" or similar text
before_after_section = find_section_by_text(soup, "CLIENT BEFORE.*AFTER")

before_after_handles = {
    'title': '',
    'items': [],
    'images': []
}

before_after_stats = {
    'title': '',
    'items': [],
    'images': []
}

# Find all images that look like Instagram screenshots
all_images = soup.find_all('img')
instagram_images = []

print(f"  Scanning {len(all_images)} total images for Instagram screenshots...")

# Download all images from potential before/after sections
img_counter_handles = 1
img_counter_stats = 1

for img in all_images:
    img_url = img.get('src') or img.get('data-src')
    if not img_url:
        continue
    
    # Check if this looks like an Instagram screenshot or profile image
    # Look at surrounding text for context
    parent_text = extract_text(img.find_parent())
    
    # Heuristic: if near text with "followers", "posts", "following", it's likely before/after
    if any(keyword in parent_text.lower() for keyword in ['follower', 'posts', 'following', '@']):
        # If it has @ symbol or handle visible, it's the "with handles" type
        if '@' in parent_text or any(word.startswith('@') for word in parent_text.split()):
            downloaded = download_image(img_url, 'before-after-handles', img_counter_handles)
            if downloaded:
                before_after_handles['images'].append(downloaded)
                print(f"  ✓ Before/After (with handles): {downloaded}")
                img_counter_handles += 1
        else:
            # Just stats, no handle
            downloaded = download_image(img_url, 'before-after-stats', img_counter_stats)
            if downloaded:
                before_after_stats['images'].append(downloaded)
                print(f"  ✓ Before/After (stats only): {downloaded}")
                img_counter_stats += 1

# If we found a specific section, extract its title
if before_after_section:
    section_title = before_after_section.find(['h1', 'h2', 'h3'])
    if section_title:
        before_after_handles['title'] = extract_text(section_title)
        before_after_stats['title'] = extract_text(section_title)

print(f"✓ Before/After sections extracted")
print(f"  - With handles: {len(before_after_handles['images'])} images")
print(f"  - Stats only: {len(before_after_stats['images'])} images")


Extracting Before/After Sections...
  Scanning 84 total images for Instagram screenshots...
✓ Before/After sections extracted
  - With handles: 0 images
  - Stats only: 0 images


## Extract Client Results & Testimonials


In [6]:
print("Extracting Client Results & Testimonials...")

# Look for section with "LEAD RESULTS" or testimonial indicators
client_results_section = find_section_by_text(soup, "CLIENT.*LEAD.*RESULTS")
if not client_results_section:
    client_results_section = find_section_by_text(soup, "TESTIMONIAL")

client_results = {
    'title': '',
    'testimonials': [],
    'images': []
}

# Extract title
if client_results_section:
    title_elem = client_results_section.find(['h1', 'h2', 'h3'])
    if title_elem:
        client_results['title'] = extract_text(title_elem)
    
    # Find testimonial text blocks
    # Look for text blocks that look like testimonials (longer paragraphs, quotes, etc)
    text_blocks = client_results_section.find_all(['p', 'blockquote', 'div'])
    for block in text_blocks:
        text = extract_text(block)
        # If it's a substantial text block (testimonials tend to be longer)
        if len(text) > 50 and any(keyword in text.lower() for keyword in ['thank', 'appreciate', 'great', 'helped', 'working', 'business']):
            client_results['testimonials'].append(text)
    
    # Download testimonial screenshot images
    images = client_results_section.find_all('img')
    print(f"  Found {len(images)} images in testimonials section")
    for idx, img in enumerate(images, 1):
        img_url = img.get('src') or img.get('data-src')
        if img_url:
            downloaded = download_image(img_url, 'client-results', idx)
            if downloaded:
                client_results['images'].append(downloaded)
                print(f"  ✓ Downloaded testimonial: {downloaded}")

# Also scan for testimonial-looking images throughout the page
# (screenshots of messages, comments, etc.)
print("  Scanning for additional testimonial screenshots...")
for img in all_images:
    img_url = img.get('src') or img.get('data-src')
    if not img_url:
        continue
    
    # Check surrounding context for testimonial keywords
    parent = img.find_parent()
    if parent:
        parent_text = extract_text(parent).lower()
        # Look for message/testimonial indicators
        if any(keyword in parent_text for keyword in ['sms', 'message', 'dm', 'thank you', 'appreciation']):
            # Avoid duplicates
            section_name = 'client-results'
            if img_url not in [img.get('src') or img.get('data-src') for img in (client_results_section.find_all('img') if client_results_section else [])]:
                idx = len(client_results['images']) + 1
                downloaded = download_image(img_url, section_name, idx)
                if downloaded and downloaded not in client_results['images']:
                    client_results['images'].append(downloaded)
                    print(f"  ✓ Additional testimonial: {downloaded}")

print(f"✓ Client results extracted")
print(f"  - Text testimonials: {len(client_results['testimonials'])}")
print(f"  - Image testimonials: {len(client_results['images'])}")


Extracting Client Results & Testimonials...
  Found 0 images in testimonials section
  Scanning for additional testimonial screenshots...
✓ Client results extracted
  - Text testimonials: 0
  - Image testimonials: 0


In [7]:
print("Extracting How It Works Section...")

# Look for "how it works" or process section
how_it_works_section = find_section_by_text(soup, "HOW.*IT.*WORKS")
if not how_it_works_section:
    how_it_works_section = find_section_by_text(soup, "OUR.*PROCESS")
if not how_it_works_section:
    how_it_works_section = find_section_by_text(soup, "WHAT.*WE.*DO")

how_it_works = {
    'title': '',
    'steps': [],
    'description': '',
    'images': []
}

if how_it_works_section:
    # Extract title
    title_elem = how_it_works_section.find(['h1', 'h2', 'h3'])
    if title_elem:
        how_it_works['title'] = extract_text(title_elem)
    
    # Extract step-by-step content
    # Look for numbered items, list items, or sections with step indicators
    lists = how_it_works_section.find_all(['ol', 'ul'])
    for lst in lists:
        items = lst.find_all('li')
        for item in items:
            step_text = extract_text(item)
            if step_text:
                how_it_works['steps'].append(step_text)
    
    # If no list, look for headings + paragraphs pattern
    if not how_it_works['steps']:
        all_headings = how_it_works_section.find_all(['h3', 'h4', 'h5'])
        for heading in all_headings:
            heading_text = extract_text(heading)
            # Find following paragraph
            next_sibling = heading.find_next_sibling(['p', 'div'])
            if next_sibling:
                desc = extract_text(next_sibling)
                if desc:
                    how_it_works['steps'].append(f"{heading_text}: {desc}")
                else:
                    how_it_works['steps'].append(heading_text)
            else:
                how_it_works['steps'].append(heading_text)
    
    # Extract general description
    paragraphs = how_it_works_section.find_all('p')
    desc_parts = [extract_text(p) for p in paragraphs if extract_text(p) and len(extract_text(p)) > 30]
    how_it_works['description'] = '\n\n'.join(desc_parts)
    
    # Download images
    images = how_it_works_section.find_all('img')
    print(f"  Found {len(images)} images")
    for idx, img in enumerate(images, 1):
        img_url = img.get('src') or img.get('data-src')
        if img_url:
            downloaded = download_image(img_url, 'how-it-works', idx)
            if downloaded:
                how_it_works['images'].append(downloaded)
                print(f"  ✓ Downloaded: {downloaded}")

print(f"✓ How It Works extracted")
print(f"  - Steps: {len(how_it_works['steps'])}")
print(f"  - Images: {len(how_it_works['images'])}")


Extracting How It Works Section...
  Found 0 images
✓ How It Works extracted
  - Steps: 0
  - Images: 0


In [8]:
print("Extracting FAQ Section...")

# Look for FAQ section
faq_section = find_section_by_text(soup, "FAQ")
if not faq_section:
    faq_section = find_section_by_text(soup, "FREQUENTLY.*ASKED")
if not faq_section:
    faq_section = find_section_by_text(soup, "QUESTIONS")

faq = {
    'title': '',
    'items': [],
    'images': []
}

if faq_section:
    # Extract title
    title_elem = faq_section.find(['h1', 'h2', 'h3'])
    if title_elem:
        faq['title'] = extract_text(title_elem)
    
    # Extract Q&A pairs
    # Look for accordion/collapsible patterns, dt/dd pairs, or heading+paragraph
    
    # Method 1: dt/dd (definition list)
    dts = faq_section.find_all('dt')
    dds = faq_section.find_all('dd')
    if dts and dds:
        for dt, dd in zip(dts, dds):
            question = extract_text(dt)
            answer = extract_text(dd)
            if question and answer:
                faq['items'].append({'question': question, 'answer': answer})
    
    # Method 2: Headings followed by paragraphs
    if not faq['items']:
        headings = faq_section.find_all(['h3', 'h4', 'h5', 'h6'])
        for heading in headings:
            question = extract_text(heading)
            # Find the next paragraph or div
            answer_elem = heading.find_next_sibling(['p', 'div'])
            if answer_elem:
                answer = extract_text(answer_elem)
                if question and answer and len(answer) > 10:
                    faq['items'].append({'question': question, 'answer': answer})
    
    # Method 3: Divs with question/answer classes
    if not faq['items']:
        faq_items = faq_section.find_all(['div', 'article'], class_=re.compile('faq|question|accordion', re.I))
        for item in faq_items:
            # Try to find question and answer within
            q_elem = item.find(class_=re.compile('question|q|title', re.I))
            a_elem = item.find(class_=re.compile('answer|a|content', re.I))
            if q_elem and a_elem:
                question = extract_text(q_elem)
                answer = extract_text(a_elem)
                if question and answer:
                    faq['items'].append({'question': question, 'answer': answer})
    
    # Download images
    images = faq_section.find_all('img')
    print(f"  Found {len(images)} images")
    for idx, img in enumerate(images, 1):
        img_url = img.get('src') or img.get('data-src')
        if img_url:
            downloaded = download_image(img_url, 'faq', idx)
            if downloaded:
                faq['images'].append(downloaded)
                print(f"  ✓ Downloaded: {downloaded}")

print(f"✓ FAQ extracted")
print(f"  - Q&A pairs: {len(faq['items'])}")
print(f"  - Images: {len(faq['images'])}")


Extracting FAQ Section...
  Found 0 images
✓ FAQ extracted
  - Q&A pairs: 0
  - Images: 0


In [9]:
print("Extracting Pricing Section...")

# Look for pricing/packages section
pricing_section = find_section_by_text(soup, "PRICING")
if not pricing_section:
    pricing_section = find_section_by_text(soup, "PACKAGES")
if not pricing_section:
    pricing_section = find_section_by_text(soup, "SOCIAL MEDIA MANAGEMENT")

pricing = {
    'title': '',
    'packages': [],
    'description': '',
    'images': []
}

if pricing_section:
    # Extract title
    title_elem = pricing_section.find(['h1', 'h2', 'h3'])
    if title_elem:
        pricing['title'] = extract_text(title_elem)
    
    # Look for pricing cards or package structures
    # Common patterns: divs with price, features list, package names
    
    # Look for elements containing price indicators ($, /month, etc)
    price_elements = pricing_section.find_all(string=re.compile(r'\$|/month|per month|price', re.I))
    
    package_containers = []
    for elem in price_elements:
        container = elem.find_parent(['div', 'article', 'section'])
        if container and container not in package_containers:
            package_containers.append(container)
    
    # Extract package details
    for container in package_containers:
        package = {
            'name': '',
            'price': '',
            'features': [],
            'description': ''
        }
        
        # Get package name (usually a heading)
        name_elem = container.find(['h3', 'h4', 'h5'])
        if name_elem:
            package['name'] = extract_text(name_elem)
        
        # Get price
        price_text = container.find(string=re.compile(r'\$\d+', re.I))
        if price_text:
            package['price'] = price_text.strip()
        
        # Get features (usually in a list)
        features_list = container.find(['ul', 'ol'])
        if features_list:
            items = features_list.find_all('li')
            package['features'] = [extract_text(item) for item in items if extract_text(item)]
        
        # Get description
        paragraphs = container.find_all('p')
        if paragraphs:
            package['description'] = ' '.join([extract_text(p) for p in paragraphs if extract_text(p)])
        
        if package['name'] or package['price']:
            pricing['packages'].append(package)
    
    # Extract general description
    paragraphs = pricing_section.find_all('p')
    desc_parts = [extract_text(p) for p in paragraphs if extract_text(p) and len(extract_text(p)) > 30]
    if desc_parts:
        pricing['description'] = '\n\n'.join(desc_parts[:2])  # First 2 paragraphs
    
    # Download images
    images = pricing_section.find_all('img')
    print(f"  Found {len(images)} images")
    for idx, img in enumerate(images, 1):
        img_url = img.get('src') or img.get('data-src')
        if img_url:
            downloaded = download_image(img_url, 'pricing', idx)
            if downloaded:
                pricing['images'].append(downloaded)
                print(f"  ✓ Downloaded: {downloaded}")

print(f"✓ Pricing extracted")
print(f"  - Packages: {len(pricing['packages'])}")
print(f"  - Images: {len(pricing['images'])}")


Extracting Pricing Section...
  Found 0 images
✓ Pricing extracted
  - Packages: 0
  - Images: 0


In [10]:
print("Generating Markdown Files...")

def write_markdown(filename, content):
    """Write content to a markdown file"""
    filepath = base_dir / filename
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(content)
    print(f"  ✓ Created: {filename}")
    return filepath

# 1. Hero Section
hero_md = f"""# {hero_content.get('heading', 'Hero Section')}

{hero_content.get('subheading', '')}

## Description

{hero_content.get('description', '')}

## Call to Action

{hero_content.get('cta', '')}

## Images

"""

for img_path in hero_content.get('images', []):
    hero_md += f"![Hero Image]({img_path})\n\n"

write_markdown('hero.md', hero_md)

# 2. Before/After with Handles
ba_handles_md = f"""# {before_after_handles.get('title', 'Client Before & After Results')}

## Instagram Profile Growth (With Handles)

This section shows before and after Instagram profiles with visible handles and metrics.

## Images

"""

for img_path in before_after_handles.get('images', []):
    ba_handles_md += f"![Before/After Profile]({img_path})\n\n"

if not before_after_handles.get('images'):
    ba_handles_md += "*No images found for this section. Check the website manually.*\n\n"

write_markdown('before-after-handles.md', ba_handles_md)

# 3. Before/After Stats Only
ba_stats_md = f"""# {before_after_stats.get('title', 'Client Growth Statistics')}

## Instagram Profile Growth (Stats Only)

This section shows before and after follower counts without visible handles.

## Images

"""

for img_path in before_after_stats.get('images', []):
    ba_stats_md += f"![Before/After Stats]({img_path})\n\n"

if not before_after_stats.get('images'):
    ba_stats_md += "*No images found for this section. Check the website manually.*\n\n"

write_markdown('before-after-stats.md', ba_stats_md)

print("✓ Before/After sections written")


Generating Markdown Files...
  ✓ Created: hero.md
  ✓ Created: before-after-handles.md
  ✓ Created: before-after-stats.md
✓ Before/After sections written


In [11]:
# 4. Client Results & Testimonials
testimonials_md = f"""# {client_results.get('title', 'Client Results & Testimonials')}

## Testimonial Texts

"""

if client_results.get('testimonials'):
    for idx, testimonial in enumerate(client_results['testimonials'], 1):
        testimonials_md += f"### Testimonial {idx}\n\n"
        testimonials_md += f"> {testimonial}\n\n"
else:
    testimonials_md += "*Note: Testimonial text will be extracted from screenshots below.*\n\n"

testimonials_md += "## Testimonial Screenshots\n\n"
testimonials_md += "*These images contain client feedback, messages, and results. Transcribe text from these images later.*\n\n"

for img_path in client_results.get('images', []):
    testimonials_md += f"![Client Testimonial]({img_path})\n\n"

if not client_results.get('images'):
    testimonials_md += "*No testimonial images found.*\n\n"

write_markdown('client-results.md', testimonials_md)

# 5. How It Works
how_it_works_md = f"""# {how_it_works.get('title', 'How It Works')}

## Description

{how_it_works.get('description', '')}

## Process Steps

"""

if how_it_works.get('steps'):
    for idx, step in enumerate(how_it_works['steps'], 1):
        how_it_works_md += f"{idx}. {step}\n\n"
else:
    how_it_works_md += "*No process steps found. Content may need to be extracted manually.*\n\n"

how_it_works_md += "## Images\n\n"

for img_path in how_it_works.get('images', []):
    how_it_works_md += f"![Process Image]({img_path})\n\n"

write_markdown('how-it-works.md', how_it_works_md)

print("✓ Client results and How It Works written")


  ✓ Created: client-results.md
  ✓ Created: how-it-works.md
✓ Client results and How It Works written


In [12]:
# 6. FAQ
faq_md = f"""# {faq.get('title', 'Frequently Asked Questions')}

"""

if faq.get('items'):
    for idx, item in enumerate(faq['items'], 1):
        faq_md += f"## {item.get('question', f'Question {idx}')}\n\n"
        faq_md += f"{item.get('answer', '')}\n\n"
else:
    faq_md += "*No FAQ items found. Content may need to be extracted manually from the HTML.*\n\n"

if faq.get('images'):
    faq_md += "## Images\n\n"
    for img_path in faq['images']:
        faq_md += f"![FAQ Image]({img_path})\n\n"

write_markdown('faq.md', faq_md)

# 7. Pricing
pricing_md = f"""# {pricing.get('title', 'Pricing & Packages')}

## Overview

{pricing.get('description', '')}

## Packages

"""

if pricing.get('packages'):
    for pkg in pricing['packages']:
        pricing_md += f"### {pkg.get('name', 'Package')}\n\n"
        
        if pkg.get('price'):
            pricing_md += f"**Price:** {pkg['price']}\n\n"
        
        if pkg.get('description'):
            pricing_md += f"{pkg['description']}\n\n"
        
        if pkg.get('features'):
            pricing_md += "**Features:**\n\n"
            for feature in pkg['features']:
                pricing_md += f"- {feature}\n"
            pricing_md += "\n"
        
        pricing_md += "---\n\n"
else:
    pricing_md += "*No pricing packages found. Content may need to be extracted manually.*\n\n"

if pricing.get('images'):
    pricing_md += "## Images\n\n"
    for img_path in pricing['images']:
        pricing_md += f"![Pricing Image]({img_path})\n\n"

write_markdown('pricing.md', pricing_md)

print("✓ FAQ and Pricing written")


  ✓ Created: faq.md
  ✓ Created: pricing.md
✓ FAQ and Pricing written


In [13]:
print("\n" + "="*60)
print("EXTRACTION COMPLETE!")
print("="*60 + "\n")

# Save image manifest
manifest_path = base_dir / 'image_manifest.json'
with open(manifest_path, 'w', encoding='utf-8') as f:
    json.dump(dict(image_manifest), f, indent=2)
print(f"✓ Image manifest saved: {manifest_path}\n")

# List all created files
print("📁 Created Files:")
print("-" * 60)

markdown_files = list(base_dir.glob('*.md'))
for md_file in sorted(markdown_files):
    size = md_file.stat().st_size
    print(f"  ✓ {md_file.name} ({size:,} bytes)")

print(f"\n📷 Downloaded Images by Section:")
print("-" * 60)

for section, images in sorted(image_manifest.items()):
    print(f"  {section}: {len(images)} images")
    for img in images:
        print(f"    - {img}")

print(f"\n📊 Statistics:")
print("-" * 60)
print(f"  Total markdown files: {len(markdown_files)}")
print(f"  Total images downloaded: {sum(len(imgs) for imgs in image_manifest.values())}")
print(f"  Total sections: {len(image_manifest)}")

print(f"\n📂 Output Location:")
print("-" * 60)
print(f"  {base_dir.absolute()}")

print("\n" + "="*60)
print("Next Steps:")
print("="*60)
print("1. Review the markdown files for completeness")
print("2. Check downloaded images")
print("3. Manually transcribe text from screenshot images")
print("4. Use this content to create the new website design")
print("="*60)



EXTRACTION COMPLETE!

✓ Image manifest saved: extracted_content/image_manifest.json

📁 Created Files:
------------------------------------------------------------
  ✓ 1 - hero.md (125 bytes)
  ✓ 2 - client before and after result.md (0 bytes)
  ✓ before-after-handles.md (214 bytes)
  ✓ before-after-stats.md (200 bytes)
  ✓ client-results.md (278 bytes)
  ✓ faq.md (87 bytes)
  ✓ hero.md (108 bytes)
  ✓ how-it-works.md (133 bytes)
  ✓ pricing.md (137 bytes)

📷 Downloaded Images by Section:
------------------------------------------------------------

📊 Statistics:
------------------------------------------------------------
  Total markdown files: 9
  Total images downloaded: 0
  Total sections: 0

📂 Output Location:
------------------------------------------------------------
  /Users/marclamy/Documents - Local/Code/external/pblaunch/python/extracted_content

Next Steps:
1. Review the markdown files for completeness
2. Check downloaded images
3. Manually transcribe text from screenshot

## Download ALL Images (Fallback)


In [14]:
print("Downloading ALL images as fallback...")
print("This ensures we capture all images even if section detection missed them.\n")

# Get all unique image URLs
all_img_tags = soup.find_all('img')
all_image_urls = set()

for img in all_img_tags:
    img_url = img.get('src') or img.get('data-src')
    if img_url and img_url.startswith('http'):
        all_image_urls.add(img_url)

print(f"Found {len(all_image_urls)} unique image URLs")

# Download all images to an 'all-images' folder
all_images_section = 'all-images'
all_images_folder = images_dir / all_images_section
all_images_folder.mkdir(exist_ok=True)

downloaded_count = 0
failed_count = 0

for idx, img_url in enumerate(sorted(all_image_urls), 1):
    try:
        # Get file extension
        parsed = urllib.parse.urlparse(img_url)
        ext = Path(parsed.path).suffix or '.jpg'
        if ext not in ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg']:
            ext = '.jpg'
        
        filename = f"image-{idx:03d}{ext}"
        filepath = all_images_folder / filename
        
        # Download
        print(f"  [{idx}/{len(all_image_urls)}] Downloading {filename}...", end=' ')
        response = requests.get(img_url, timeout=15)
        response.raise_for_status()
        
        with open(filepath, 'wb') as f:
            f.write(response.content)
        
        rel_path = f"images/{all_images_section}/{filename}"
        image_manifest[all_images_section].append({
            'filename': filename,
            'path': rel_path,
            'url': img_url,
            'size': len(response.content)
        })
        downloaded_count += 1
        print(f"✓ ({len(response.content):,} bytes)")
        
    except Exception as e:
        failed_count += 1
        print(f"✗ Error: {str(e)}")

print(f"\n✓ Image download complete!")
print(f"  Successfully downloaded: {downloaded_count}")
print(f"  Failed: {failed_count}")
print(f"  Location: {all_images_folder.absolute()}")


Downloading ALL images as fallback...
This ensures we capture all images even if section detection missed them.

Found 83 unique image URLs
  [1/83] Downloading image-001.png... ✓ (2,362 bytes)
  [2/83] Downloading image-002.png... ✓ (2,342 bytes)
  [3/83] Downloading image-003.png... ✓ (3,772 bytes)
  [4/83] Downloading image-004.png... ✓ (5,416 bytes)
  [5/83] Downloading image-005.png... ✓ (12,536 bytes)
  [6/83] Downloading image-006.jpeg... ✓ (13,358 bytes)
  [7/83] Downloading image-007.jpeg... ✓ (10,978 bytes)
  [8/83] Downloading image-008.jpeg... ✓ (12,106 bytes)
  [9/83] Downloading image-009.jpeg... ✓ (88,632 bytes)
  [10/83] Downloading image-010.jpeg... ✓ (38,720 bytes)
  [11/83] Downloading image-011.png... ✓ (27,204 bytes)
  [12/83] Downloading image-012.jpeg... ✓ (25,842 bytes)
  [13/83] Downloading image-013.jpeg... ✓ (48,906 bytes)
  [14/83] Downloading image-014.jpeg... ✓ (12,706 bytes)
  [15/83] Downloading image-015.jpeg... ✓ (25,558 bytes)
  [16/83] Downloading im

## Extract ALL Text Content (Comprehensive)


In [None]:
print("Extracting ALL text content comprehensively...")

# Create a comprehensive text extraction
comprehensive_md = "# Complete Website Content\n\n"
comprehensive_md += "This file contains ALL text content from the website, extracted in order.\n\n"
comprehensive_md += "---\n\n"

# Extract all major content sections
main_content = soup.find('body')

if main_content:
    # Get all text-containing elements in order
    for element in main_content.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'li', 'blockquote']):
        text = extract_text(element)
        if text and len(text) > 5:  # Skip very short or empty elements
            tag_name = element.name
            
            if tag_name in ['h1', 'h2', 'h3']:
                # Major headings
                level = int(tag_name[1])
                comprehensive_md += f"\n{'#' * level} {text}\n\n"
            elif tag_name in ['h4', 'h5', 'h6']:
                # Minor headings
                level = int(tag_name[1])
                comprehensive_md += f"\n{'#' * level} {text}\n\n"
            elif tag_name == 'li':
                # List items
                comprehensive_md += f"- {text}\n"
            elif tag_name == 'blockquote':
                # Blockquotes
                comprehensive_md += f"\n> {text}\n\n"
            else:
                # Regular paragraphs
                comprehensive_md += f"{text}\n\n"

write_markdown('complete-content.md', comprehensive_md)
print("✓ Created complete-content.md with all text content")


In [None]:
# Create an all-images reference markdown file
all_images_md = """# All Website Images

This file contains all images downloaded from the website in order.
Use this to manually categorize images into their appropriate sections.

## Images

"""

if all_images_section in image_manifest:
    for img_data in image_manifest[all_images_section]:
        all_images_md += f"### {img_data['filename']}\n\n"
        all_images_md += f"![{img_data['filename']}]({img_data['path']})\n\n"
        all_images_md += f"- **URL:** {img_data['url']}\n"
        all_images_md += f"- **Size:** {img_data['size']:,} bytes\n\n"
        all_images_md += "---\n\n"

write_markdown('all-images.md', all_images_md)
print("✓ Created all-images.md reference file")
