# ProjectSight File Downloader

This notebook demonstrates how to use **Playwright** to:
1. Authenticate with a web application
2. Apply filters to narrow down results
3. Extract full submittal data from virtualized grids
4. Download all attached files
5. Maintain an idempotent manifest (safe to re-run)

## Playwright Key Concepts

| Concept | Description |
|---------|-------------|
| `Page` | A browser tab - your main interaction point |
| `Frame` | An iframe within a page (ProjectSight uses these) |
| `Locator` | A way to find elements - preferred over raw selectors |
| `evaluate()` | Run JavaScript in the browser context |
| `wait_for_*` | Wait for conditions before proceeding |

In [1]:
# Core imports
from playwright.async_api import async_playwright, Page, Frame, Locator
from pathlib import Path
from dotenv import load_dotenv
from datetime import datetime
import os
import re
import json
import pandas as pd
from dataclasses import dataclass, field
from typing import Optional

# Load environment variables
load_dotenv(Path('/home/pdcur/samsung-project/.env'))


def windows_to_wsl_path(windows_path: str) -> Path:
    """
    Convert a Windows path to its WSL2 equivalent.

    - An empty value yields ``Path('.')``.
    - A path that already starts with ``/`` is returned unchanged.
    - A drive-letter path is mapped under ``/mnt/<lowercase drive>``.
    - Anything else (e.g. a relative Windows path) is returned with its
      backslashes converted to forward slashes.

    Examples:
        C:\\Users\\name\\folder -> /mnt/c/Users/name/folder
        D:\\Data -> /mnt/d/Data
        data\\raw -> data/raw
    """
    if not windows_path:
        return Path('.')

    # Already a Unix path? Return as-is
    if windows_path.startswith('/'):
        return Path(windows_path)

    # Convert backslashes to forward slashes
    normalized = windows_path.replace('\\', '/')

    # Match drive letter pattern (C:/ or C:)
    match = re.match(r'^([A-Za-z]):/?(.*)$', normalized)
    if match:
        drive = match.group(1).lower()
        rest = match.group(2)
        return Path(f'/mnt/{drive}/{rest}')

    # No drive letter: keep the normalised separators so that a relative
    # Windows path is not treated as one file name containing backslashes.
    return Path(normalized)


# Configuration
@dataclass
class Config:
    """Scraper settings: credentials, target project, filter and output locations."""
    # Credentials come from the environment (read once, when the class is defined)
    username: str = os.getenv('PROJECTSIGHT_USERNAME_2', '')
    password: str = os.getenv('PROJECTSIGHT_PASSWORD_2', '')
    org_id: str = '4540f425-f7b5-4ad8-837d-c270d5d09490'
    project_id: int = 3

    # CSI spec section used to filter the submittals grid
    spec_section_filter: str = '05 12 00 - Structural Steel'

    # Saved browser session and application base URL
    session_path: Path = Path.home() / '.projectsight_session.json'
    base_url: str = 'https://prod.projectsightapp.trimble.com'

    # Derived in __post_init__; every output lives in a per-filter folder
    output_dir: Path = field(init=False)
    manifest_path: Path = field(init=False)
    submittals_csv_path: Path = field(init=False)
    downloads_dir: Path = field(init=False)

    def __post_init__(self):
        # "05 12 00 - Structural Steel" -> "05_12_00"
        csi_code = self.spec_section_filter.split(' - ')[0]
        folder_name = csi_code.replace(' ', '_')

        # WINDOWS_DATA_DIR is a Windows-style path; translate it for WSL
        data_root = windows_to_wsl_path(os.getenv('WINDOWS_DATA_DIR', ''))
        filter_dir = data_root / 'raw' / 'projectsight' / 'submittals' / folder_name

        self.output_dir = filter_dir
        self.manifest_path = filter_dir / 'manifest.json'
        self.submittals_csv_path = filter_dir / 'submittals.csv'
        self.downloads_dir = filter_dir / 'files'

    @property
    def submittals_url(self) -> str:
        """URL of the submittals list for the configured organisation and project."""
        url = f"{self.base_url}/web/app/Project?listid=-4045&orgid={self.org_id}&projid={self.project_id}"
        return url

# Single shared configuration instance used by the functions below
config = Config()

# Ensure output directories exist (no-op when they are already present)
config.output_dir.mkdir(parents=True, exist_ok=True)
config.downloads_dir.mkdir(parents=True, exist_ok=True)

# Echo the effective settings so the run can be sanity-checked
print(f"Username: {config.username}")
print(f"Filter: {config.spec_section_filter}")
print(f"Output directory: {config.output_dir}")

Username: pcuriel@mxi.pro
Filter: 05 12 00 - Structural Steel
Output directory: /mnt/c/Users/pdcur/OneDrive - MXI/Samsung - Samsung/Dashboard/Data/raw/projectsight/submittals/05_12_00


## 1. Manifest System (Idempotency)

The manifest tracks:
- **Processed submittals**: Which rows we've already scanned for attachments
- **Downloaded files**: Which files we've already downloaded (by FileID)
- **Filter applied**: What filter was used (for verification)

This allows safe re-runs - we skip already-processed items.

In [2]:
@dataclass
class Manifest:
    """
    Tracks processing state for idempotent runs.

    Keys of ``downloaded_files`` are always stored as strings. JSON object
    keys are strings, so a numeric file ID would otherwise no longer match
    after a save/load round trip and the file would be downloaded again.
    """
    filter_applied: str = ''  # Track which filter was used
    processed_submittals: set = field(default_factory=set)  # submittal IDs already scanned
    downloaded_files: dict = field(default_factory=dict)  # str(file_id) -> download metadata
    attachments: list = field(default_factory=list)  # attachment dicts, each with a 'file_id'
    last_updated: str = ''  # ISO timestamp of the last save()

    @classmethod
    def load(cls, path: Path) -> 'Manifest':
        """Load manifest from disk, or create an empty one if *path* does not exist."""
        if path.exists():
            data = json.loads(path.read_text())
            return cls(
                filter_applied=data.get('filter_applied', ''),
                processed_submittals=set(data.get('processed_submittals', [])),
                downloaded_files={
                    str(file_id): meta
                    for file_id, meta in data.get('downloaded_files', {}).items()
                },
                attachments=data.get('attachments', []),
                last_updated=data.get('last_updated', '')
            )
        return cls()

    def save(self, path: Path) -> None:
        """Save manifest to disk, replacing any previous file in one step."""
        self.last_updated = datetime.now().isoformat()
        data = {
            'filter_applied': self.filter_applied,
            'processed_submittals': list(self.processed_submittals),
            'downloaded_files': self.downloaded_files,
            'attachments': self.attachments,
            'last_updated': self.last_updated
        }
        # Write to a sibling temp file and swap it in, so an interrupted
        # write cannot leave a truncated manifest behind.
        tmp_path = path.with_name(path.name + '.tmp')
        tmp_path.write_text(json.dumps(data, indent=2))
        tmp_path.replace(path)

    def is_submittal_processed(self, submittal_id: int) -> bool:
        """Return True if *submittal_id* was already scanned for attachments."""
        return submittal_id in self.processed_submittals

    def mark_submittal_processed(self, submittal_id: int) -> None:
        """Record *submittal_id* as scanned."""
        self.processed_submittals.add(submittal_id)

    def is_file_downloaded(self, file_id: str) -> bool:
        """Return True if *file_id* (string or number) is recorded as downloaded."""
        # The raw-key check keeps entries added by older code visible.
        return file_id in self.downloaded_files or str(file_id) in self.downloaded_files

    def mark_file_downloaded(self, file_id: str, metadata: dict) -> None:
        """Record *file_id* as downloaded, storing *metadata* plus a timestamp."""
        self.downloaded_files[str(file_id)] = {
            **metadata,
            'downloaded_at': datetime.now().isoformat()
        }

    def add_attachment(self, attachment: dict) -> None:
        """Add attachment if not already tracked (matched on 'file_id')."""
        if not any(a['file_id'] == attachment['file_id'] for a in self.attachments):
            self.attachments.append(attachment)

    def get_pending_downloads(self) -> list[dict]:
        """Get attachments that haven't been downloaded yet."""
        return [a for a in self.attachments if not self.is_file_downloaded(a['file_id'])]

    def reset_downloads(self) -> None:
        """Reset download tracking only - keeps processed submittals and attachments."""
        self.downloaded_files = {}
        print(f"Reset downloads. {len(self.attachments)} attachments now pending.")


# Load or create manifest (an empty Manifest is returned when the file is absent)
manifest = Manifest.load(config.manifest_path)
# Summarise the loaded state; "pending" = known attachments not yet downloaded
print(f"Manifest loaded:")
print(f"  - Filter: {manifest.filter_applied or '(none)'}")
print(f"  - Processed submittals: {len(manifest.processed_submittals)}")
print(f"  - Known attachments: {len(manifest.attachments)}")
print(f"  - Downloaded files: {len(manifest.downloaded_files)}")
print(f"  - Pending downloads: {len(manifest.get_pending_downloads())}")

Manifest loaded:
  - Filter: 05 12 00 - Structural Steel
  - Processed submittals: 276
  - Known attachments: 947
  - Downloaded files: 20
  - Pending downloads: 927


## 2. Browser & Authentication Functions

In [3]:
async def launch_browser(headless: bool = False) -> tuple:
    """
    Launch Chromium with a 1920x1080 viewport, reusing the saved session if one exists.

    If any setup step fails, the browser (when already started) and the
    Playwright driver are shut down before the error is re-raised, so a
    failed launch does not leave processes behind.

    Args:
        headless: Run the browser without a visible window.

    Returns: (playwright, browser, context, page)
    """
    pw = await async_playwright().start()
    browser = None
    try:
        browser = await pw.chromium.launch(headless=headless)

        context_options = {"viewport": {"width": 1920, "height": 1080}}
        if config.session_path.exists():
            # Restore cookies/storage written by a previous successful login
            context_options["storage_state"] = str(config.session_path)
            print("Loading saved session...")

        context = await browser.new_context(**context_options)
        page = await context.new_page()
    except Exception:
        if browser is not None:
            await browser.close()
        await pw.stop()
        raise

    return pw, browser, context, page


async def login_if_needed(page: Page, context) -> bool:
    """
    Navigate to ProjectSight and login if session expired.

    The session counts as expired when, two seconds after navigation, the
    URL contains ``id.trimble.com`` or ``sign_in``. After a successful
    login the browser storage state is written to ``config.session_path``.

    Args:
        page: Page used for navigation and form input.
        context: Browser context whose storage state is saved after login.

    Returns True if login was performed.
    """
    await page.goto(config.submittals_url, wait_until='domcontentloaded')
    await page.wait_for_timeout(2000)

    needs_login = 'id.trimble.com' in page.url or 'sign_in' in page.url

    if not needs_login:
        print("Already authenticated!")
        return False

    print("Login required...")

    # Dismiss cookie banner (best effort: the banner is not always shown).
    # Catch Exception rather than using a bare except so that
    # KeyboardInterrupt and task cancellation still propagate.
    try:
        await page.get_by_role('button', name='Reject All').click(timeout=3000)
    except Exception:
        pass

    # Username
    await page.fill('#username-field', config.username)
    await page.keyboard.press('Tab')
    await page.get_by_role('button', name='Next').click()

    # Password
    await page.wait_for_selector('input[name="password"]', timeout=5000)
    await page.fill('input[name="password"]', config.password)
    await page.keyboard.press('Tab')
    await page.get_by_role('button', name='Sign in').click()

    await page.wait_for_url('**projectsight**', timeout=15000)
    print("Logged in!")

    # Persist the session so the next run can skip the login form
    await context.storage_state(path=str(config.session_path))
    return True


async def get_content_frame(page: Page) -> Frame:
    """Return the ``fraMenuContent`` iframe of *page*; raise RuntimeError if it is missing."""
    content = page.frame(name='fraMenuContent')
    if content:
        return content
    raise RuntimeError("Content frame not found")

## 3. Filter Functions

### Applying Spec Section Filter
The Submittals page has a search panel with filters. We need to:
1. Open the Spec Section dropdown
2. Navigate to the correct CSI division (e.g., "05 - Metals")
3. Select the specific section (e.g., "05 12 00 - Structural Steel")

**Playwright Tip:** Use `Locator` objects for robust element finding. They auto-wait and retry.

In [4]:
async def apply_spec_section_filter(page: Page, frame: Frame, spec_section: str) -> None:
    """
    Apply a Spec Section (CSI code) filter to the submittals grid, then
    switch the grid to list view.

    Args:
        page: The Page object (needed for frame locator)
        frame: The content frame (not used by this function; kept so
            existing callers keep working)
        spec_section: The CSI code to filter by (e.g., "05 12 00 - Structural Steel")

    Playwright Tips:
        - `page.locator().content_frame` gets a FrameLocator for iframe content
        - `get_by_role()` finds elements by accessibility role (more robust)
        - `get_by_title()` finds elements by their title attribute
    """
    print(f"Applying filter: {spec_section}")

    # Get frame locator for cleaner syntax
    frame_locator = page.locator('iframe[name="fraMenuContent"]').content_frame

    # 1. Click the Spec Section input to open the dropdown
    spec_input = frame_locator.locator('#ucSearchPanel_ctl38_txtCSICodeLookupInput')
    await spec_input.click(timeout=5000)
    await page.wait_for_timeout(500)

    # 2. Parse the CSI code to get the division (first 2 digits)
    # e.g., "05 12 00" -> division "05" (Metals)
    csi_code = spec_section.split(' - ')[0]  # "05 12 00"
    division = csi_code.split()[0]  # "05"

    # CSI Division codes map to container IDs
    # Division 05 (Metals) -> container ID 73003
    division_container_ids = {
        '03': '73001',  # Concrete
        '05': '73003',  # Metals
        '07': '73005',  # Thermal & Moisture Protection
        '09': '73007',  # Finishes
        '22': '73018',  # Plumbing
        '23': '73019',  # HVAC
        '26': '73022',  # Electrical
        # Add more as needed
    }

    container_id = division_container_ids.get(division)
    if container_id:
        # Try to expand the division if not already expanded. Best effort:
        # the click times out when the row is already expanded. Catch
        # Exception (not a bare except) so interrupts/cancellation propagate.
        try:
            expand_btn = frame_locator.locator(
                f'#ucSearchPanel_ctl38_CSICodePopupTreeContainer_{container_id}'
            ).get_by_title('Expand Row')
            await expand_btn.click(timeout=2000)
            print(f"Expanded division {division}")
        except Exception:
            print(f"Division {division} already expanded or not found")

    await page.wait_for_timeout(500)

    # 3. Click on the specific spec section
    await frame_locator.get_by_role('gridcell', name=spec_section).click(timeout=5000)
    print(f"Selected: {spec_section}")

    # 4. Wait for filter to apply and switch to list view
    await page.wait_for_timeout(1000)

    # Switch to list view for easier parsing
    await frame_locator.locator('#imgSwitchToListView').click(timeout=2000)
    print("Switched to list view")

    # Wait for grid to reload with filtered data
    await page.wait_for_timeout(2000)


async def get_current_filter_count(frame: Frame) -> int:
    """
    Count the records held by the grid's data source (i.e. after filtering).

    Evaluates to 0 when the ``#ugDataView`` grid widget is not found.
    """
    record_count = await frame.evaluate('''
        () => {
            const grid = $("#ugDataView").data("igGrid");
            return grid ? grid.dataSource.data().length : 0;
        }
    ''')
    return record_count

## 4. Data Extraction Functions

In [5]:
async def get_all_submittals_full(frame: Frame) -> list[dict]:
    """
    Read every record, with all of its fields, from the grid's data source.

    Keys starting with ``$`` or ``_`` and function-valued properties are
    dropped from each record. The in-page script throws if the grid widget
    cannot be found.

    NOTE: Only what the grid currently holds is returned, so an active
    filter limits the result to the filtered records.
    """
    rows = await frame.evaluate('''
        () => {
            const grid = $("#ugDataView").data("igGrid");
            if (!grid) throw new Error("Grid not found");
            
            return grid.dataSource.data().map(row => {
                const clean = {};
                for (const [key, value] of Object.entries(row)) {
                    if (key.startsWith('$') || key.startsWith('_')) continue;
                    if (typeof value === 'function') continue;
                    clean[key] = value;
                }
                return clean;
            });
        }
    ''')
    return rows


async def scroll_to_row(frame: Frame, row_index: int) -> None:
    """Ask the virtualized grid to scroll so that row *row_index* is rendered."""
    scroll_script = '(idx) => $("#ugDataView").data("igGrid").virtualScrollTo(idx)'
    await frame.evaluate(scroll_script, row_index)


async def dismiss_modal_if_present(page: Page) -> bool:
    """
    Hide modal overlays inside the content frame that could block clicks.

    The in-page script hides modal-like elements, clicks close-like
    buttons and dispatches an Escape keydown event. When something was
    dismissed, the function pauses 300 ms before returning.

    Returns True if a modal was dismissed; False when nothing was
    dismissed or the content frame does not exist.
    """
    content = page.frame(name='fraMenuContent')
    if not content:
        return False

    was_dismissed = await content.evaluate('''
        () => {
            // Try to find and hide modal overlays
            const modals = document.querySelectorAll('.modalBackgroundDiv, .modal-backdrop, [class*="modal"]');
            let dismissed = false;
            for (const modal of modals) {
                if (modal.style.display !== 'none') {
                    modal.style.display = 'none';
                    dismissed = true;
                }
            }
            
            // Also try clicking any close buttons
            const closeButtons = document.querySelectorAll('.modal-close, .close-modal, [class*="close"]');
            for (const btn of closeButtons) {
                try { btn.click(); dismissed = true; } catch(e) {}
            }
            
            // Press Escape to close any modal
            document.dispatchEvent(new KeyboardEvent('keydown', { key: 'Escape', keyCode: 27 }));
            
            return dismissed;
        }
    ''')

    if not was_dismissed:
        return was_dismissed

    # Short pause after something was dismissed
    await page.wait_for_timeout(300)
    return was_dismissed


async def click_row_and_wait_for_sidebar(page: Page, submittal_id: int, timeout_ms: int = 5000, initial_delay_ms: int = 1000) -> bool:
    """
    Click a row using Playwright's native click (not JavaScript .click()).
    This properly triggers Angular's event handlers.

    Click strategy: normal click, then a forced click after dismissing
    modals, then a JavaScript click as a last resort.

    Args:
        page: Playwright page object
        submittal_id: The submittal ID to click
        timeout_ms: Max time to wait for sidebar to load
        initial_delay_ms: Time to wait after click before checking sidebar (default 1 second)

    Returns:
        False if the row could not be clicked or the content frame is
        missing. True once the row was clicked - including when the sidebar
        did not signal readiness within ``timeout_ms``, so callers can still
        attempt extraction.
    """
    # First, dismiss any modal that might be blocking
    await dismiss_modal_if_present(page)

    # Use Playwright's locator to click - this simulates a real user click
    frame_locator = page.locator('iframe[name="fraMenuContent"]').content_frame
    row_locator = frame_locator.locator(f'tr[data-id="{submittal_id}"]')

    try:
        # Try normal click first
        await row_locator.click(timeout=2000)
    except Exception:
        # If blocked by modal, try to dismiss and retry
        await dismiss_modal_if_present(page)
        await page.wait_for_timeout(500)

        try:
            # Retry with force click (bypasses actionability checks)
            await row_locator.click(timeout=2000, force=True)
        except Exception:
            # Last resort: use JavaScript click
            frame = page.frame(name='fraMenuContent')
            if frame is None:
                # page.frame() returns None when the iframe is gone
                return False
            clicked = await frame.evaluate('''
                (id) => {
                    const row = document.querySelector(`tr[data-id="${id}"]`);
                    if (row) { row.click(); return true; }
                    return false;
                }
            ''', submittal_id)
            if not clicked:
                return False

    # Wait for sidebar to start loading before polling
    await page.wait_for_timeout(initial_delay_ms)

    # Wait for sidebar to load by polling for .singleLinkedItem elements
    frame = page.frame(name='fraMenuContent')
    if frame is None:
        return False
    poll_interval = 300  # Increased from 200ms
    max_polls = timeout_ms // poll_interval

    for _ in range(max_polls):
        # Check if attachments have loaded or sidebar is ready
        sidebar_ready = await frame.evaluate('''
            () => {
                // Check for attachment items
                const items = document.querySelectorAll('.singleLinkedItem');
                if (items.length > 0) return 'has_attachments';

                // Check if detail panel exists and has content
                const panel = document.querySelector('.detailPanelInner, .rightPanelItem');
                if (panel && panel.children.length > 2) return 'panel_loaded';

                return false;
            }
        ''')

        if sidebar_ready:
            # Give Angular a moment to finish rendering
            await page.wait_for_timeout(300)
            return True

        await page.wait_for_timeout(poll_interval)

    # Timeout - return True anyway to try extraction
    return True


# Backwards-compatible wrapper around click_row_and_wait_for_sidebar
async def click_row(frame: Frame, submittal_id: int, page: Page) -> bool:
    """Click the row for *submittal_id* so its details sidebar loads; *frame* is ignored."""
    sidebar_loaded = await click_row_and_wait_for_sidebar(page, submittal_id)
    return sidebar_loaded


async def extract_attachments(frame: Frame, submittal_id: int, submittal_number: str) -> list[dict]:
    """
    Collect the file attachments shown in the currently loaded sidebar.

    The in-page script looks for ``.singleLinkedItem`` elements (with
    fallback selectors), reads each element's Angular scope for the file
    metadata, and de-duplicates by file ID. Every returned dict is then
    tagged with *submittal_id* and *submittal_number*.
    """
    found = await frame.evaluate('''
        () => {
            // Try multiple selectors - the UI might use different classes
            let items = document.querySelectorAll('.singleLinkedItem');
            
            // If no items found, try alternative selectors
            if (items.length === 0) {
                items = document.querySelectorAll('[ng-repeat*="attachment"]');
            }
            if (items.length === 0) {
                items = document.querySelectorAll('.linked-item, .attachment-item, .file-item');
            }
            
            const results = [];
            const seen = new Set();
            
            for (const item of items) {
                try {
                    // Try to get Angular scope
                    const scope = angular.element(item).scope();
                    
                    // The attachment might be in different scope properties
                    const att = scope?.attachment || scope?.file || scope?.item;
                    if (!att) continue;
                    
                    // FileID might be named differently
                    const fileId = att.FileID || att.fileId || att.Id || att.id;
                    if (!fileId || seen.has(fileId)) continue;
                    seen.add(fileId);
                    
                    // Find field label by looking at parent context
                    let fieldLabel = null;
                    let current = item;
                    while (current && !fieldLabel) {
                        let sibling = current.previousElementSibling;
                        while (sibling) {
                            const text = sibling.textContent?.trim();
                            if (text === 'Description' || text === 'Approved docs') {
                                fieldLabel = text;
                                break;
                            }
                            sibling = sibling.previousElementSibling;
                        }
                        current = current.parentElement;
                    }
                    
                    results.push({
                        file_id: fileId,
                        file_name: att.FileName || att.fileName || att.Name || att.name || 'unknown',
                        file_size: att.FileSize || att.fileSize || att.Size || 0,
                        file_type: att.FileType || att.fileType || att.Type || 'unknown',
                        field_label: fieldLabel || 'unknown'
                    });
                } catch(e) {
                    // Silently continue on error
                }
            }
            return results;
        }
    ''')

    # Tag each attachment with the submittal it belongs to
    for record in found:
        record.update(submittal_id=submittal_id, submittal_number=submittal_number)

    return found


async def debug_sidebar(frame: Frame, page: Page) -> dict:
    """
    Debugging aid: report what the sidebar contains after a row click.

    Runs one inspection script inside the iframe (*frame*) and a smaller
    one on the top-level *page*, and returns both reports under the keys
    ``'iframe'`` and ``'main_page'``.
    """
    # Inspect the iframe document
    iframe_report = await frame.evaluate('''
        () => {
            const debug = {
                location: 'iframe',
                singleLinkedItems: document.querySelectorAll('.singleLinkedItem').length,
                ngRepeatAttachment: document.querySelectorAll('[ng-repeat*="attachment"]').length,
                detailPanelExists: !!document.querySelector('.detailPanelInner, .detail-panel'),
                rightPanelItems: document.querySelectorAll('.rightPanelItem').length,
                modalVisible: !!document.querySelector('.modalBackgroundDiv[style*="block"], .modalBackgroundDiv:not([style*="none"])'),
            };
            
            // Get the detail panel HTML snippet for inspection
            const panel = document.querySelector('.detailPanelInner, .rightPanelItem');
            if (panel) {
                debug.panelChildCount = panel.children.length;
                debug.panelTextSnippet = panel.textContent?.substring(0, 300);
            }
            
            // Get relevant classes
            const allElements = document.querySelectorAll('*');
            const classSet = new Set();
            for (const el of allElements) {
                for (const cls of el.classList) {
                    if (cls.match(/attach|file|linked|item|doc|single|detail|panel/i)) {
                        classSet.add(cls);
                    }
                }
            }
            debug.relevantClasses = Array.from(classSet).slice(0, 25);
            
            // Check for Angular scope on singleLinkedItem
            const items = document.querySelectorAll('.singleLinkedItem');
            if (items.length > 0) {
                try {
                    const scope = angular.element(items[0]).scope();
                    debug.scopeKeys = scope ? Object.keys(scope).filter(k => !k.startsWith('$')).slice(0, 10) : [];
                    if (scope?.attachment) {
                        debug.attachmentKeys = Object.keys(scope.attachment);
                    }
                } catch(e) {
                    debug.scopeError = e.message;
                }
            }
            
            return debug;
        }
    ''')

    # Inspect the top-level document (outside the iframe)
    top_level_report = await page.evaluate('''
        () => {
            const debug = {
                location: 'main_page',
                singleLinkedItems: document.querySelectorAll('.singleLinkedItem').length,
                ngRepeatAttachment: document.querySelectorAll('[ng-repeat*="attachment"]').length,
            };
            
            return debug;
        }
    ''')

    return dict(iframe=iframe_report, main_page=top_level_report)

## 5. Download Functions

In [6]:
import zipfile
import tempfile

def _extract_zip_flat(zip_path: Path, downloads_dir: Path, batch_num: int) -> list[dict]:
    """Extract every file of *zip_path* directly into *downloads_dir*, ignoring folders."""
    import shutil  # local import: not part of the notebook's import cell

    extracted_files = []
    with zipfile.ZipFile(zip_path, 'r') as zf:
        for zip_info in zf.infolist():
            if zip_info.is_dir():
                continue

            # Get just the filename (ignore any directory structure in zip)
            original_name = Path(zip_info.filename).name
            stem = Path(original_name).stem
            suffix = Path(original_name).suffix

            # Handle duplicate filenames by adding suffix
            dest_path = downloads_dir / original_name
            counter = 1
            while dest_path.exists():
                dest_path = downloads_dir / f"{stem}_{counter}{suffix}"
                counter += 1

            # Stream the member to disk instead of loading it fully into memory
            with zf.open(zip_info) as src, open(dest_path, 'wb') as dst:
                shutil.copyfileobj(src, dst)

            extracted_files.append({
                'original_name': original_name,
                'saved_as': dest_path.name,
                'size': zip_info.file_size,
                'batch': batch_num
            })
    return extracted_files


async def download_files_batch(
    frame: Frame,
    file_ids: list[str],
    page: Page,
    downloads_dir: Path,
    batch_num: int = 0,
    timeout_ms: int = 120000  # 2 minutes per batch
) -> tuple[bool, list[dict]]:
    """
    Trigger download for a batch of files, unzip, and extract individual files.

    Uses Playwright's expect_download() to verify the download actually happens.
    The downloaded zip is saved to a temporary file, which is always removed
    afterwards (also when saving or extraction fails). Files are extracted
    flat into ``downloads_dir``, which is created if missing; name clashes
    get a ``_<n>`` suffix.

    Args:
        frame: The content frame
        file_ids: List of file IDs to download
        page: Playwright page object
        downloads_dir: Directory to save extracted files
        batch_num: Batch number for logging
        timeout_ms: Max time to wait for download

    Returns:
        ``(True, extracted_files)`` on success, where each entry has the keys
        ``original_name``, ``saved_as``, ``size`` and ``batch``.
        ``(False, message)`` on failure - note the second element is then an
        error string, not a list. An empty ``file_ids`` returns ``(True, [])``.
    """
    if not file_ids:
        return True, []

    tmp_path = None
    try:
        # Use expect_download to capture the actual download
        async with page.expect_download(timeout=timeout_ms) as download_info:
            # Trigger the download
            await frame.evaluate('''
                (ids) => DMSSystem.InitiateSelectedFilesRequest([], ids, [])
            ''', file_ids)

        # Get the download object
        download = await download_info.value

        # Save to a temp file first
        with tempfile.NamedTemporaryFile(suffix='.zip', delete=False) as tmp:
            tmp_path = Path(tmp.name)

        await download.save_as(tmp_path)

        # Verify file exists and has content
        if not tmp_path.exists() or tmp_path.stat().st_size == 0:
            return False, "Downloaded file is empty or missing"

        downloads_dir.mkdir(parents=True, exist_ok=True)
        return True, _extract_zip_flat(tmp_path, downloads_dir, batch_num)

    except Exception as e:
        error_msg = str(e)
        if "Timeout" in error_msg:
            return False, "Timeout waiting for download"
        return False, f"Error: {error_msg[:100]}"
    finally:
        # Clean up temp zip file on every exit path
        if tmp_path is not None:
            tmp_path.unlink(missing_ok=True)

## 6. Main Workflow Functions

In [7]:
async def collect_attachments_incremental(
    frame: Frame,
    page: Page,
    submittals: list[dict],
    manifest: Manifest,
    progress_interval: int = 50,
    save_interval: int = 100,
    verbose: bool = False
) -> int:
    """
    Scan submittals for attachments, skipping already-processed ones.
    Uses Playwright's native click to properly trigger Angular events.

    For each unprocessed submittal: locate its row in the grid data, scroll
    to it, click it, read the sidebar attachments into the manifest and mark
    the submittal processed. The manifest is saved to
    ``config.manifest_path`` every ``save_interval`` processed submittals
    and once more at the end.

    Args:
        frame: The content frame holding the grid and sidebar
        page: Playwright page object
        submittals: Grid records; each needs 'submittalregid', optionally 'number'
        manifest: Manifest updated in place
        progress_interval: Print a progress line every N processed submittals
        save_interval: Save the manifest every N processed submittals
        verbose: Print a line per submittal (the first 5 are always printed)

    Returns the number of submittals processed in this run.
    """
    new_processed = 0
    total_attachments_this_run = 0

    # Fix the progress denominator up front: the manifest grows while we
    # work, so recomputing it per iteration would make the total shrink.
    total_pending = len(
        {sub['submittalregid'] for sub in submittals} - manifest.processed_submittals
    )
    last_reported = -1  # avoids repeating the same progress line for skipped rows

    for sub in submittals:
        submittal_id = sub['submittalregid']
        submittal_number = sub.get('number', str(submittal_id))

        if manifest.is_submittal_processed(submittal_id):
            continue

        if new_processed % progress_interval == 0 and new_processed != last_reported:
            print(f"Progress: {new_processed}/{total_pending} | "
                  f"Attachments found: {total_attachments_this_run}")
            last_reported = new_processed

        # Find row index in current data
        row_index = await frame.evaluate('''
            (sid) => $("#ugDataView").data("igGrid").dataSource.data().findIndex(r => r.submittalregid === sid)
        ''', submittal_id)

        if row_index < 0:
            if verbose:
                print(f"  [{submittal_number}] Row not found in grid, skipping")
            continue

        # Scroll to row
        await scroll_to_row(frame, row_index)
        await page.wait_for_timeout(200)

        # Click row using Playwright's native click (triggers Angular properly)
        if not await click_row_and_wait_for_sidebar(page, submittal_id):
            if verbose:
                print(f"  [{submittal_number}] Failed to click row")
            continue

        # Extract attachments
        attachments = await extract_attachments(frame, submittal_id, submittal_number)

        if verbose or (new_processed < 5):  # Always show first 5 for debugging
            print(f"  [{submittal_number}] Found {len(attachments)} attachments")

        for att in attachments:
            manifest.add_attachment(att)
            total_attachments_this_run += 1

        manifest.mark_submittal_processed(submittal_id)
        new_processed += 1

        if new_processed % save_interval == 0:
            manifest.save(config.manifest_path)

    manifest.save(config.manifest_path)
    print(f"\nDone! Processed {new_processed} new submittals")
    print(f"Total attachments found this run: {total_attachments_this_run}")

    return new_processed


def save_submittals_csv(submittals: list[dict], path: Path) -> None:
    """Write submittal records to a CSV file with the key columns first.

    The columns listed in ``priority_cols`` are moved to the front (in that
    order, and only those actually present in the data); every remaining
    column follows in its original order. The DataFrame index is not written.

    Args:
        submittals: One dict per submittal; keys become CSV columns.
        path: Destination CSV file. Missing parent directories are created
            so a fresh output folder does not make the export fail.
    """
    path = Path(path)
    df = pd.DataFrame(submittals)

    priority_cols = [
        'submittalregid', 'number', 'subject', 'specsection',
        'workflowstatename', 'responsiblecompanyname', 'authorcompanyname',
        'datecreated', 'datedue', 'lastmodified'
    ]
    # Only keep the priority columns that exist, so sparse data does not
    # raise a KeyError during the reorder.
    leading_cols = [c for c in priority_cols if c in df.columns]
    trailing_cols = [c for c in df.columns if c not in priority_cols]
    df = df[leading_cols + trailing_cols]

    # to_csv does not create directories; do it here.
    path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(path, index=False)
    print(f"Saved {len(df)} submittals to {path}")

## 7. Run the Extraction

### Step 1: Launch Browser & Login

In [10]:
# Launch browser and login
# launch_browser is called with headless=False (presumably a visible browser
# window — confirm against launch_browser). The four returned handles are kept
# as notebook globals: later cells use `page`, and the cleanup cell closes
# `browser` and stops `pw`.
pw, browser, context, page = await launch_browser(headless=False)
# login_if_needed receives both the page and its context. The recorded output
# of this cell ("Loading saved session...", "Already authenticated!") is from
# a run in which no fresh login was required.
await login_if_needed(page, context)

# Get content frame
# `frame` is the handle passed to the filter, extraction, scan and download
# cells that follow.
frame = await get_content_frame(page)
print("Browser ready!")

Loading saved session...
Already authenticated!
Browser ready!


### Step 2: Apply Filter

In [11]:
# Fixed 10-second pause before touching the filter UI (Playwright's
# wait_for_timeout takes milliseconds).
# NOTE(review): presumably this gives the page time to finish loading after
# login — confirm, and consider waiting on a concrete element instead.
await page.wait_for_timeout(10000)

# Apply the spec section filter
# The filter text comes from config.spec_section_filter.
await apply_spec_section_filter(page, frame, config.spec_section_filter)

# Verify filter was applied
# The count is only printed for a manual check; nothing here fails the cell
# if the number looks wrong.
count = await get_current_filter_count(frame)
print(f"\nFiltered results: {count} submittals")

# Update manifest with filter info
# Record which filter this manifest belongs to and save it to
# config.manifest_path straight away.
manifest.filter_applied = config.spec_section_filter
manifest.save(config.manifest_path)

Applying filter: 05 12 00 - Structural Steel
Expanded division 05
Selected: 05 12 00 - Structural Steel
Switched to list view

Filtered results: 276 submittals


### Step 3: Extract Submittal Data

In [12]:
# Get filtered submittal data
# `submittals` is used as a list of dicts: the first record's keys are printed
# below, and the list is passed to save_submittals_csv and later to the
# attachment scan cell.
submittals = await get_all_submittals_full(frame)
print(f"Total submittals (filtered): {len(submittals)}")
# Field names are taken from the first record only; 'N/A' is printed when the
# list is empty.
print(f"Fields available: {list(submittals[0].keys()) if submittals else 'N/A'}")

# Save to CSV
# Destination is config.submittals_csv_path.
save_submittals_csv(submittals, config.submittals_csv_path)

Total submittals (filtered): 276
Fields available: ['submittalregid', 'is_selected', 'number', 'revision', 'specsection', 'specsubsection', 'subject', 'workflowstatename', 'datedue', 'openassignmentscontactnames', 'datecreated', 'dateresolved', 'authorcompanyname', 'authorcontactname', 'resolutioncompanyname', 'resolutioncontactname', 'responsiblecompanyname', 'location', 'type', 'importance', 'submittalpackage', 'startdate', 'actualdate', 'returneddate', 'scheduleddeliverydate', 'actualdeliverydate', 'workflowtemplatename', 'workflowstepname', 'stepduedate', 'lastmodifiedby', 'lastmodified', 'locationid', 'specsectionid', 'workflowstate', 'authorcompanyid', 'authorcontactid', 'responsiblecompanyid', 'typeid', 'importanceid', 'submittalpackageid', 'details', 'resolution', 'workflowstatecolor', 'islocked', 'guid', 'projectid', 'openassignmentscompanynames', 'udf_submissionref', 'udf_drawingpackagenum', 'udf_scopeofwork', 'udf_scopeofwork_displayvalue', 'udf_purposeofsubmission', 'udf_pu

### Step 4: Scan for Attachments

In [14]:
# Scan for attachments (incremental - skips already processed)
# collect_attachments_incremental returns the number of submittals newly
# processed in this run; submittals already marked as processed in the
# manifest are skipped. Progress is printed every `progress_interval` and the
# manifest is saved every `save_interval` newly processed submittals, plus
# once at the end.
new_count = await collect_attachments_incremental(
    frame, 
    page, 
    submittals,
    manifest,
    progress_interval=50,
    save_interval=100
)

# The summary reads cumulative totals from the manifest, so it covers earlier
# runs too (the recorded output shows 0 new submittals but 276 processed).
print(f"\nSummary:")
print(f"  Filter: {manifest.filter_applied}")
print(f"  Total submittals processed: {len(manifest.processed_submittals)}")
print(f"  Total attachments found: {len(manifest.attachments)}")
print(f"  Pending downloads: {len(manifest.get_pending_downloads())}")


Done! Processed 0 new submittals
Total attachments found this run: 0

Summary:
  Filter: 05 12 00 - Structural Steel
  Total submittals processed: 276
  Total attachments found: 947
  Pending downloads: 947


### Step 5: Review Before Downloading

In [None]:
# Uncomment to reset downloads if needed
# manifest.reset_downloads()
# manifest.save(config.manifest_path)

In [None]:
# Review pending downloads
# Read-only check before downloading: nothing in this cell modifies the
# manifest.
pending = manifest.get_pending_downloads()
print(f"Pending downloads: {len(pending)}")

# Show sample
# Only the first five pending attachments are listed.
# NOTE(review): the `:,` format and the sum below assume every record has a
# numeric 'file_size' — confirm that extract_attachments never stores None.
for att in pending[:5]:
    print(f"  [{att['submittal_number']}] {att['file_name']} ({att['file_size']:,} bytes)")

# Total size estimate
# Sum of the 'file_size' values, reported in MB and GB using 1024-based units.
total_bytes = sum(att['file_size'] for att in pending)
print(f"\nTotal size: {total_bytes / 1024 / 1024:.2f} MB ({total_bytes / 1024 / 1024 / 1024:.2f} GB)")

### Step 6: Download Files

In [None]:
# Export attachments with submittal linkage
# Build a DataFrame with one row per entry in manifest.attachments. The next
# cell reuses `attachments_df` to write the CSV, so run this cell first.
attachments_df = pd.DataFrame(manifest.attachments)
print(f"Columns: {list(attachments_df.columns)}")
print(f"\nSample data:")
# Last expression of the cell, so the notebook renders the first 10 rows.
# The four selected columns must exist; with an empty manifest.attachments
# this selection raises a KeyError.
attachments_df[['submittal_number', 'file_name', 'file_id', 'field_label']].head(10)

In [None]:
# Save attachments linkage to CSV
# Uses `attachments_df` built in the preceding cell and writes it (without
# the index) to attachments.csv inside config.output_dir.
attachments_path = config.output_dir / 'attachments.csv'
attachments_df.to_csv(attachments_path, index=False)
print(f"Saved {len(attachments_df)} attachments to {attachments_path}")

# Show summary by submittal
# groupby(...).size() counts attachments per submittal_number; describe()
# then prints summary statistics of those per-submittal counts.
print(f"\nAttachments per submittal:")
print(attachments_df.groupby('submittal_number').size().describe())

In [None]:
# Download files in small batches, unzip and extract individual files
BATCH_SIZE = 10  # Files per batch

pending = manifest.get_pending_downloads()
total_batches = (len(pending) + BATCH_SIZE - 1) // BATCH_SIZE

print(f"Downloading {len(pending)} files in {total_batches} batches of {BATCH_SIZE}...")
print(f"Files will be extracted to: {config.downloads_dir}\n")

successful_batches = 0
total_extracted = 0
failed_batches = []

for batch_num in range(total_batches):
    start_idx = batch_num * BATCH_SIZE
    batch = pending[start_idx:start_idx + BATCH_SIZE]
    
    file_ids = [att['file_id'] for att in batch]
    
    print(f"Batch {batch_num + 1}/{total_batches}: {len(file_ids)} files...", end=" ")
    
    success, result = await download_files_batch(
        frame, file_ids, page, 
        downloads_dir=config.downloads_dir,
        batch_num=batch_num + 1,
        timeout_ms=180000  # 3 minutes per batch
    )
    
    if success:
        extracted_files = result
        total_extracted += len(extracted_files)
        
        # Mark each file as downloaded with extraction info
        for att, extracted in zip(batch, extracted_files):
            manifest.mark_file_downloaded(att['file_id'], {
                'file_name': att['file_name'],
                'file_size': att['file_size'],
                'submittal_id': att['submittal_id'],
                'saved_as': extracted['saved_as'],
                'batch': batch_num + 1
            })
        
        manifest.save(config.manifest_path)
        successful_batches += 1
        print(f"✓ Extracted {len(extracted_files)} files")
    else:
        failed_batches.append({
            'batch': batch_num + 1,
            'file_ids': file_ids,
            'error': result
        })
        print(f"✗ {result}")
    
    # Small delay between batches
    await page.wait_for_timeout(2000)

print(f"\n{'='*50}")
print(f"Download Summary:")
print(f"  Successful batches: {successful_batches}/{total_batches}")
print(f"  Total files extracted: {total_extracted}")
print(f"  Files marked downloaded: {len(manifest.downloaded_files)}")
print(f"  Remaining: {len(manifest.get_pending_downloads())}")
if failed_batches:
    print(f"\nFailed batches ({len(failed_batches)}):")
    for fb in failed_batches[:5]:
        print(f"  Batch {fb['batch']}: {fb['error']}")

### Step 7: Export Summary

In [None]:
# Export attachments manifest to CSV
# Rebuilds the DataFrame from the current manifest.attachments and writes it
# to config.output_dir / 'attachments.csv' — the same path the earlier
# linkage-export cell writes, so that file is overwritten.
attachments_df = pd.DataFrame(manifest.attachments)
attachments_path = config.output_dir / 'attachments.csv'
attachments_df.to_csv(attachments_path, index=False)
print(f"Saved {len(attachments_df)} attachments to {attachments_path}")

# Show summary by field type
# Guarded so the cell still runs when there is no 'field_label' column (for
# example when no attachments have been recorded).
if 'field_label' in attachments_df.columns:
    print("\nAttachments by type:")
    print(attachments_df['field_label'].value_counts())

## 8. Cleanup

In [None]:
# Close browser
# `browser` and `pw` are the handles returned by launch_browser in the
# launch cell: close the browser first, then stop the Playwright instance.
await browser.close()
await pw.stop()
print("Browser closed")

## Utilities

In [None]:
# Reset downloads only (keeps submittals and attachments)
# Uncomment to use:
# manifest.reset_downloads()
# manifest.save(config.manifest_path)
# print(f"Pending downloads: {len(manifest.get_pending_downloads())}")

# DANGER: Reset FULL manifest (uncomment to use)
# if config.manifest_path.exists():
#     config.manifest_path.unlink()
#     print("Manifest deleted")
#     manifest = Manifest()
#     print("Fresh manifest created")

## Quick Reference

### Changing the Filter
Edit `spec_section_filter` in the Config class:
```python
spec_section_filter: str = '05 12 00 - Structural Steel'
# Other examples:
# '03 30 00 - Cast-in-Place Concrete'
# '23 00 00 - HVAC'
# '26 00 00 - Electrical'
```

### Output Structure
```
raw/projectsight/submittals/
└── 05_12_00/                    # Filter-specific folder
    ├── submittals.csv           # Full submittal data
    ├── attachments.csv          # All discovered files
    ├── manifest.json            # Progress tracking
    └── files/                   # Extracted files (auto-unzipped)
        ├── drawing_001.pdf
        ├── drawing_001_1.pdf    # Duplicate renamed
        └── spec_sheet.pdf
```

### Manifest Structure
```json
{
  "filter_applied": "05 12 00 - Structural Steel",
  "processed_submittals": [567, 568, ...],
  "downloaded_files": {
    "file_id_123": {
      "file_name": "original.pdf",
      "saved_as": "original.pdf",
      "batch": 1,
      "downloaded_at": "2024-01-15T10:30:00"
    }
  },
  "attachments": [...],
  "last_updated": "2024-01-15T10:30:00"
}
```

### Resetting Downloads
To re-download all files without re-scanning submittals:
```python
manifest.reset_downloads()
manifest.save(config.manifest_path)
```