# ProjectSight File Downloader (UI Approach)

This notebook downloads files from ProjectSight using **UI interactions** instead of JavaScript APIs.

## Why This Approach?

| Aspect | JavaScript API | UI Clicks (this notebook) |
|--------|---------------|---------------------------|
| Complexity | Complex - need to find internal APIs | Simple - just click links |
| Reliability | May break if API changes | More stable - UI rarely changes |
| Learning | Requires understanding Angular internals | Uses standard Playwright patterns |
| Speed | Faster (batch downloads) | Slower (one file at a time) |

## What We Keep

We still use JavaScript for **reading data** (fast, no scrolling needed):
```javascript
grid.dataSource.data()  // All 8000+ records instantly available
```

## What Changes

Downloads use **UI clicks + `expect_download()`** instead of calling `DMSSystem.InitiateSelectedFilesRequest()`.

In [None]:
# Core imports
from playwright.async_api import async_playwright, Page, Frame, Download
from pathlib import Path
from dotenv import load_dotenv
from datetime import datetime
import os
import re
import json
import pandas as pd
from dataclasses import dataclass, field

# Load environment variables
load_dotenv(Path('/home/pdcur/samsung-project/.env'))


def windows_to_wsl_path(windows_path: str) -> Path:
    """
    Convert a Windows path to its WSL2 equivalent.

    Drive-letter paths are mapped onto the ``/mnt/<drive>`` mount point.
    Paths that already start with ``/`` are returned unchanged, and an empty
    value yields the current directory. Paths without a drive letter (for
    example relative paths) only have their backslashes converted.

    Examples:
        C:\\Users\\name\\folder -> /mnt/c/Users/name/folder
        D:\\Data -> /mnt/d/Data
        data\\raw -> data/raw
    """
    if not windows_path:
        return Path('.')

    # Already a Unix path? Return as-is
    if windows_path.startswith('/'):
        return Path(windows_path)

    # Convert backslashes to forward slashes
    path = windows_path.replace('\\', '/')

    # Match drive letter pattern (C:/ or C:)
    match = re.match(r'^([A-Za-z]):[/]?(.*)$', path)
    if match:
        drive = match.group(1).lower()
        rest = match.group(2)
        return Path(f'/mnt/{drive}/{rest}')

    # No drive letter: return the slash-normalized form. Returning the raw
    # input here would keep the backslashes, which POSIX treats as ordinary
    # filename characters rather than separators.
    return Path(path)


@dataclass
class Config:
    """Settings shared by every cell: credentials, target project, filter and paths."""

    # Credentials come from the environment (read once, when the class is defined)
    username: str = os.getenv('PROJECTSIGHT_USERNAME_2', '')
    password: str = os.getenv('PROJECTSIGHT_PASSWORD_2', '')

    # Organization / project used to build the submittals URL
    org_id: str = '4540f425-f7b5-4ad8-837d-c270d5d09490'
    project_id: int = 3

    # Spec section the submittal list is filtered by
    spec_section_filter: str = '05 12 00 - Structural Steel'

    # Saved browser session and site root
    session_path: Path = Path.home() / '.projectsight_session.json'
    base_url: str = 'https://prod.projectsightapp.trimble.com'

    # Output locations, derived from the filter in __post_init__
    output_dir: Path = field(init=False)
    manifest_path: Path = field(init=False)
    downloads_dir: Path = field(init=False)

    def __post_init__(self):
        """Derive the output folder, manifest path and downloads folder."""
        # Text before the first ' - ' with spaces turned into underscores,
        # e.g. '05 12 00 - Structural Steel' -> '05_12_00'
        code_part = self.spec_section_filter.partition(' - ')[0]
        filter_slug = code_part.replace(' ', '_')

        # WINDOWS_DATA_DIR holds a Windows-style path; translate it for WSL
        windows_root = os.getenv('WINDOWS_DATA_DIR', '')
        root = windows_to_wsl_path(windows_root)
        target = root.joinpath('raw', 'projectsight', 'submittals_ui', filter_slug)

        self.output_dir = target
        self.manifest_path = target / 'manifest.json'
        self.downloads_dir = target / 'files'

    @property
    def submittals_url(self) -> str:
        """URL of the submittals list for the configured organization and project."""
        return f"{self.base_url}/web/app/Project?listid=-4045&orgid={self.org_id}&projid={self.project_id}"

# Build the run configuration and make sure both output folders exist
config = Config()
for _required_dir in (config.output_dir, config.downloads_dir):
    _required_dir.mkdir(parents=True, exist_ok=True)

# Echo the effective settings so the run is easy to sanity-check
print(f"Username: {config.username}")
print(f"Filter: {config.spec_section_filter}")
print(f"Downloads: {config.downloads_dir}")

## Manifest (Progress Tracking)

Tracks which files we've already downloaded so we can safely re-run.

In [None]:
@dataclass
class Manifest:
    """Tracks download progress for safe re-runs.

    The manifest is persisted as JSON. ``processed_submittals`` is written as
    a list and converted back into a set on load.
    """
    filter_applied: str = ''
    processed_submittals: set = field(default_factory=set)
    downloaded_files: dict = field(default_factory=dict)  # file_id -> metadata
    last_updated: str = ''

    @classmethod
    def load(cls, path: Path) -> 'Manifest':
        """Return the manifest stored at *path*, or an empty one if the file does not exist."""
        if path.exists():
            data = json.loads(path.read_text(encoding='utf-8'))
            return cls(
                filter_applied=data.get('filter_applied', ''),
                processed_submittals=set(data.get('processed_submittals', [])),
                downloaded_files=data.get('downloaded_files', {}),
                last_updated=data.get('last_updated', '')
            )
        return cls()

    def save(self, path: Path) -> None:
        """Stamp ``last_updated`` and write the manifest to *path* as JSON.

        The JSON is first written to a sibling ``<name>.tmp`` file and then
        moved over *path*, so an interruption in the middle of a write cannot
        leave a truncated manifest behind.
        """
        self.last_updated = datetime.now().isoformat()
        payload = json.dumps({
            'filter_applied': self.filter_applied,
            'processed_submittals': list(self.processed_submittals),
            'downloaded_files': self.downloaded_files,
            'last_updated': self.last_updated
        }, indent=2)

        tmp_path = path.with_name(path.name + '.tmp')
        tmp_path.write_text(payload, encoding='utf-8')
        tmp_path.replace(path)

# Resume from the manifest of a previous run (an empty manifest if none exists)
manifest = Manifest.load(config.manifest_path)
print(
    f"Previously processed: {len(manifest.processed_submittals)} submittals",
    f"Previously downloaded: {len(manifest.downloaded_files)} files",
    sep="\n",
)

## Browser & Login

Standard Playwright pattern:
1. Launch browser (visible so you can watch)
2. Load saved session if available
3. Login if session expired

In [None]:
async def launch_browser(headless: bool = False):
    """
    Launch Chromium and open a page, reusing the saved session when one exists.

    Args:
        headless: Run the browser without a visible window when True.

    Returns: (playwright, browser, context, page)

    If anything fails after Playwright has been started, the browser (when it
    was launched) and the Playwright driver are shut down before the error is
    re-raised, so a failed call does not leave orphaned processes behind.
    """
    pw = await async_playwright().start()
    browser = None
    try:
        browser = await pw.chromium.launch(headless=headless)

        # Context options - viewport size and saved cookies
        context_options = {"viewport": {"width": 1920, "height": 1080}}
        if config.session_path.exists():
            context_options["storage_state"] = str(config.session_path)
            print("Loading saved session...")

        context = await browser.new_context(**context_options)
        page = await context.new_page()
    except BaseException:
        # Nothing is handed back to the caller on failure, so clean up here
        try:
            if browser is not None:
                await browser.close()
        finally:
            await pw.stop()
        raise

    return pw, browser, context, page


async def login_if_needed(page: Page, context) -> bool:
    """
    Navigate to ProjectSight and login if needed.

    Args:
        page: Page used for navigation and for filling in the login form.
        context: Browser context whose storage state is saved to
            ``config.session_path`` after a successful login.

    Returns:
        True if a login was performed, False if the page was already
        authenticated.

    Raises:
        RuntimeError: If a login is required but no username/password is
            configured.

    Playwright Tips:
    - page.goto() navigates to a URL
    - page.url gives current URL (useful for checking redirects)
    - page.fill() types into input fields
    - page.get_by_role() finds elements by accessibility role
    """
    await page.goto(config.submittals_url, wait_until='domcontentloaded')
    await page.wait_for_timeout(2000)

    # Check if redirected to login page
    if 'id.trimble.com' not in page.url and 'sign_in' not in page.url:
        print("Already authenticated!")
        return False

    print("Login required...")

    # Fail fast with a clear message instead of timing out on the login form
    if not config.username or not config.password:
        raise RuntimeError(
            "Login required but ProjectSight credentials are not configured "
            "(PROJECTSIGHT_USERNAME_2 / PROJECTSIGHT_PASSWORD_2)"
        )

    # Dismiss cookie banner if present
    try:
        await page.get_by_role('button', name='Reject All').click(timeout=3000)
    except Exception:
        # Best effort: the banner is optional, so a failed click is not an error.
        # `Exception` (not a bare except) lets cancellation/interrupts through.
        pass

    # Enter username
    await page.fill('#username-field', config.username)
    await page.keyboard.press('Tab')  # Blur field to enable Next button
    await page.get_by_role('button', name='Next').click()

    # Enter password
    await page.wait_for_selector('input[name="password"]', timeout=5000)
    await page.fill('input[name="password"]', config.password)
    await page.keyboard.press('Tab')
    await page.get_by_role('button', name='Sign in').click()

    # Wait for redirect back to ProjectSight
    await page.wait_for_url('**projectsight**', timeout=15000)
    print("Logged in!")

    # Save session for next time
    await context.storage_state(path=str(config.session_path))
    return True

## Apply Filter

Interacts with the Spec Section dropdown to filter submittals.

In [None]:
async def apply_spec_section_filter(page: Page, spec_section: str) -> None:
    """
    Apply a CSI code filter using the search panel dropdown.

    Args:
        page: Page showing the submittals list.
        spec_section: Full spec section label, e.g. '05 12 00 - Structural Steel'.
            The first two digits select which division to expand in the tree.

    Playwright Tips:
    - page.locator('iframe').content_frame gives a FrameLocator
    - FrameLocator lets you chain locators inside the iframe
    - get_by_role('gridcell', name='...') finds grid cells by text
    """
    print(f"Applying filter: {spec_section}")

    # Get frame locator for the content iframe
    frame = page.locator('iframe[name="fraMenuContent"]').content_frame

    # 1. Open the Spec Section dropdown
    await frame.locator('#ucSearchPanel_ctl38_txtCSICodeLookupInput').click(timeout=5000)
    await page.wait_for_timeout(500)

    # 2. Get CSI division (first 2 digits) to expand the right section
    csi_code = spec_section.split(' - ')[0]  # "05 12 00"
    division = csi_code.split()[0]  # "05"

    # Division container IDs (found by inspecting the DOM)
    division_ids = {
        '03': '73001', '05': '73003', '07': '73005', '09': '73007',
        '22': '73018', '23': '73019', '26': '73022'
    }

    container_id = division_ids.get(division)
    if container_id:
        try:
            expand_btn = frame.locator(f'#ucSearchPanel_ctl38_CSICodePopupTreeContainer_{container_id}').get_by_title('Expand Row')
            await expand_btn.click(timeout=2000)
            print(f"Expanded division {division}")
        except Exception:
            # Best effort: the expand button is absent when the division is
            # already open. `Exception` (not a bare except) keeps cancellation
            # and interrupts from being swallowed.
            print(f"Could not expand division {division} (it may already be expanded)")
    else:
        # No known container for this division; the section click below only
        # succeeds if the section is already visible in the tree.
        print(f"No known tree container for division {division}; skipping expand")

    await page.wait_for_timeout(500)

    # 3. Click on the specific section
    await frame.get_by_role('gridcell', name=spec_section).click(timeout=5000)
    print(f"Selected: {spec_section}")

    # 4. Switch to list view (easier to work with)
    await page.wait_for_timeout(1000)
    await frame.locator('#imgSwitchToListView').click(timeout=2000)
    print("Switched to list view")

    await page.wait_for_timeout(2000)

## Data Extraction (Efficient JavaScript)

We use JavaScript to get ALL submittal data instantly - no scrolling needed.

**Why this works:** The Infragistics grid loads all data client-side, even though it only renders visible rows. We can access the full dataset via `grid.dataSource.data()`.

In [None]:
async def get_all_submittals(page: Page) -> list[dict]:
    """
    Read every submittal record from the grid's data source in one call.

    The script runs inside the 'fraMenuContent' iframe, reads
    ``grid.dataSource.data()`` and returns one plain dict per row with
    function values and keys starting with ``$`` or ``_`` removed.

    Raises:
        RuntimeError: If the content iframe is not present.

    Playwright Tips:
    - page.frame(name='...') gets a Frame object for iframes
    - frame.evaluate() runs JavaScript in the browser
    - The JavaScript returns data that Playwright converts to Python
    """
    # Script evaluated in the iframe; throws in the browser if the grid is missing
    extract_rows_js = '''
        () => {
            const grid = $("#ugDataView").data("igGrid");
            if (!grid) throw new Error("Grid not found");
            
            // Get all rows and clean up internal properties
            return grid.dataSource.data().map(row => {
                const clean = {};
                for (const [key, value] of Object.entries(row)) {
                    // Skip internal properties (start with $ or _)
                    if (key.startsWith('$') || key.startsWith('_')) continue;
                    if (typeof value === 'function') continue;
                    clean[key] = value;
                }
                return clean;
            });
        }
    '''

    content_frame = page.frame(name='fraMenuContent')
    if not content_frame:
        raise RuntimeError("Content frame not found")

    rows = await content_frame.evaluate(extract_rows_js)
    return rows


async def scroll_to_row(page: Page, row_index: int) -> None:
    """
    Scroll the virtualized grid to show a specific row.

    Args:
        page: Page containing the 'fraMenuContent' iframe.
        row_index: Index passed to the grid's ``virtualScrollTo``.

    Raises:
        RuntimeError: If the content iframe is not present.
    """
    frame = page.frame(name='fraMenuContent')
    if not frame:
        # Same error as get_all_submittals, instead of an AttributeError on None
        raise RuntimeError("Content frame not found")
    await frame.evaluate('(idx) => $("#ugDataView").data("igGrid").virtualScrollTo(idx)', row_index)


async def click_row_by_id(page: Page, submittal_id: int) -> bool:
    """
    Click a row to load its details in the sidebar.

    The row is looked up in the DOM by its ``data-id`` attribute, so it must
    currently be rendered by the grid.

    Returns True if the row was found and clicked.

    Raises:
        RuntimeError: If the content iframe is not present.
    """
    frame = page.frame(name='fraMenuContent')
    if not frame:
        # Same error as get_all_submittals, instead of an AttributeError on None
        raise RuntimeError("Content frame not found")

    clicked = await frame.evaluate('''
        (id) => {
            const row = document.querySelector(`tr[data-id="${id}"]`);
            if (row) { row.click(); return true; }
            return false;
        }
    ''', submittal_id)

    if clicked:
        await page.wait_for_timeout(500)  # Wait for sidebar to load
    return clicked

## UI-Based Download (The Simple Approach)

Instead of calling JavaScript APIs, we:
1. Find file links in the sidebar
2. Click each link
3. Use `expect_download()` to capture the download

**Key Playwright Concept: `expect_download()`**

```python
async with page.expect_download() as download_info:
    await link.click()  # This triggers a download
download = await download_info.value
await download.save_as(path)  # Save the file
```

This pattern captures browser downloads and lets you save them to a specific path.

In [None]:
async def get_file_links_in_sidebar(page: Page) -> list[dict]:
    """
    List the files attached to the submittal currently shown in the sidebar.

    Each '.singleLinkedItem' element is inspected through its Angular scope;
    items without an attachment, or with a FileID already seen, are skipped.

    Returns list of {file_id, file_name, file_size} dicts (file_size is 0 when
    the attachment has no size).

    Raises:
        RuntimeError: If the content iframe is not present.
    """
    frame = page.frame(name='fraMenuContent')
    if not frame:
        # Same error as get_all_submittals, instead of an AttributeError on None
        raise RuntimeError("Content frame not found")

    # Get file info from Angular scope (for metadata)
    file_info = await frame.evaluate('''
        () => {
            const items = document.querySelectorAll('.singleLinkedItem');
            const results = [];
            const seen = new Set();
            
            for (const item of items) {
                try {
                    const scope = angular.element(item).scope();
                    const att = scope?.attachment;
                    if (!att || seen.has(att.FileID)) continue;
                    seen.add(att.FileID);
                    
                    results.push({
                        file_id: att.FileID,
                        file_name: att.FileName,
                        file_size: att.FileSize || 0
                    });
                } catch(e) {}
            }
            return results;
        }
    ''')

    return file_info


async def download_file_by_clicking(page: Page, file_id: str, save_path: Path) -> bool:
    """
    Download a file by clicking its link in the sidebar.

    This is the KEY function - uses UI interaction instead of JavaScript APIs.

    Args:
        page: Page containing the 'fraMenuContent' iframe.
        file_id: FileID of the attachment, as returned by
            get_file_links_in_sidebar.
        save_path: Destination path for the downloaded file.

    Returns:
        True if the file was downloaded and saved; False if no matching link
        was found or the click/download/save step raised.

    Raises:
        RuntimeError: If the content iframe is not present.

    Playwright Tips:
    - expect_download() waits for a download to start
    - download.save_as() saves to a specific path
    - The link is located by position: JavaScript finds the index of the
      '.singleLinkedItem' whose Angular scope has this file ID, then a
      locator picks the first <a> inside the item at that index
    """
    frame = page.frame(name='fraMenuContent')
    if not frame:
        # Same error as get_all_submittals, instead of an AttributeError on None
        raise RuntimeError("Content frame not found")

    # Find the clickable link for this file
    # The file links are <a> tags inside .singleLinkedItem elements
    # We need to find the right one by checking Angular scope

    # First, get the index of the item with this file_id
    item_index = await frame.evaluate('''
        (fileId) => {
            const items = document.querySelectorAll('.singleLinkedItem');
            for (let i = 0; i < items.length; i++) {
                try {
                    const scope = angular.element(items[i]).scope();
                    if (scope?.attachment?.FileID === fileId) return i;
                } catch(e) {}
            }
            return -1;
        }
    ''', file_id)

    if item_index < 0:
        print(f"  File link not found for {file_id}")
        return False

    # Get the clickable link within that item
    frame_locator = page.locator('iframe[name="fraMenuContent"]').content_frame
    file_link = frame_locator.locator('.singleLinkedItem').nth(item_index).locator('a').first

    try:
        # Use expect_download to capture the file
        async with page.expect_download(timeout=30000) as download_info:
            await file_link.click()

        download = await download_info.value
        await download.save_as(save_path)
        return True

    except Exception as e:
        # Report and continue with the next file rather than aborting the run
        print(f"  Download failed: {e}")
        return False

## Main Workflow

Combines everything into a simple loop. For each submittal row:
1. Click the row to load the sidebar
2. Find the file links
3. Click each link to download

In [None]:
async def download_files_for_submittal(
    page: Page, 
    submittal_id: int, 
    submittal_number: str,
    manifest: Manifest
) -> int:
    """
    Download all files for a single submittal using UI clicks.

    The submittal must already be selected so that its files are listed in the
    sidebar. Files already recorded in the manifest are skipped; each
    successful download is recorded in ``manifest.downloaded_files``.

    Args:
        page: Page showing the submittal's sidebar.
        submittal_id: ID stored with each downloaded file's metadata.
        submittal_number: Used as the filename prefix and stored in metadata.
        manifest: Progress tracker, updated in place (not saved here).

    Returns number of new files downloaded.
    """
    # Get file links from sidebar
    files = await get_file_links_in_sidebar(page)

    if not files:
        return 0

    downloaded = 0
    for file_info in files:
        file_id = file_info['file_id']
        file_name = file_info['file_name']

        # JSON object keys are always strings, so a manifest reloaded from disk
        # is keyed by str(file_id). Use the string form as the key (and still
        # honor a raw key) so a non-string ID is not re-downloaded every run.
        manifest_key = str(file_id)
        if manifest_key in manifest.downloaded_files or file_id in manifest.downloaded_files:
            continue

        # Create safe filename: SubmittalNumber_FileName
        # Besides path separators, replace characters Windows does not allow
        # in filenames, since the download folder may live on a Windows drive.
        safe_name = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '_', f"{submittal_number}_{file_name}")
        save_path = config.downloads_dir / safe_name

        print(f"  Downloading: {file_name}")

        success = await download_file_by_clicking(page, file_id, save_path)

        if success:
            manifest.downloaded_files[manifest_key] = {
                'file_name': file_name,
                'submittal_id': submittal_id,
                'submittal_number': submittal_number,
                'saved_as': str(save_path),
                'downloaded_at': datetime.now().isoformat()
            }
            downloaded += 1

    return downloaded

---

## Run the Extraction

### Step 1: Launch Browser

In [None]:
# Launch browser and login
# Start a visible browser; keep all four handles, the cleanup cell needs pw and browser
pw, browser, context, page = await launch_browser(headless=False)
# Opens the submittals page and signs in only if redirected to the login page
await login_if_needed(page, context)
print("Browser ready!")

### Step 2: Apply Filter

In [None]:
# Apply spec section filter
# Select the configured spec section in the search panel and switch to list view
await apply_spec_section_filter(page, config.spec_section_filter)

# Update manifest
# Record which filter this output folder belongs to and persist it right away
manifest.filter_applied = config.spec_section_filter
manifest.save(config.manifest_path)

### Step 3: Get Submittal Data

In [None]:
# Get all submittals (instant - no scrolling)
submittals = await get_all_submittals(page)
print(f"Found {len(submittals)} submittals")

# Save to CSV
df = pd.DataFrame(submittals)
csv_path = config.output_dir / 'submittals.csv'
df.to_csv(csv_path, index=False)
print(f"Saved to {csv_path}")

# Show sample
df[['number', 'subject', 'specsection']].head()

### Step 4: Download Files (UI Approach)

This is the main loop. For each submittal:
1. Scroll to make the row visible
2. Click the row to load sidebar
3. Click each file link to download

In [None]:
# Process submittals and download files
total_downloaded = 0
save_interval = 10  # Save manifest every N submittals

for i, sub in enumerate(submittals):
    submittal_id = sub['submittalregid']
    submittal_number = sub.get('number', str(submittal_id))
    
    # Skip if already processed
    if submittal_id in manifest.processed_submittals:
        continue
    
    # Progress update
    if i % 10 == 0:
        print(f"Processing {i+1}/{len(submittals)}: {submittal_number}")
    
    # Scroll to row and click it
    await scroll_to_row(page, i)
    await page.wait_for_timeout(200)
    
    if not await click_row_by_id(page, submittal_id):
        continue
    
    # Download files using UI clicks
    downloaded = await download_files_for_submittal(page, submittal_id, submittal_number, manifest)
    total_downloaded += downloaded
    
    # Mark as processed
    manifest.processed_submittals.add(submittal_id)
    
    # Periodic save
    if i % save_interval == 0:
        manifest.save(config.manifest_path)

# Final save
manifest.save(config.manifest_path)

print(f"\nDone!")
print(f"Total submittals processed: {len(manifest.processed_submittals)}")
print(f"Total files downloaded: {len(manifest.downloaded_files)}")
print(f"New downloads this run: {total_downloaded}")

### Step 5: Review Downloads

In [None]:
# List downloaded files
# Collect everything currently in the downloads folder
downloaded_files = [*config.downloads_dir.glob('*')]
print(f"Files in downloads folder: {len(downloaded_files)}")

# Preview at most ten entries together with their size in kilobytes
preview_entries = downloaded_files[:10]
for f in preview_entries:
    size_kb = f.stat().st_size / 1024
    print(f"  {f.name} ({size_kb:.1f} KB)")

## Cleanup

In [None]:
# Close browser
await browser.close()
await pw.stop()
print("Browser closed")

---

## Comparison: JavaScript vs UI Approach

| Aspect | JavaScript (other notebook) | UI Clicks (this notebook) |
|--------|----------------------------|---------------------------|
| **Download code** | `DMSSystem.InitiateSelectedFilesRequest()` | `page.expect_download()` + click |
| **Batch downloads** | Yes (50+ files at once) | No (one at a time) |
| **File naming** | System generates zip names | You control filenames |
| **Speed** | Fast | Slower |
| **Complexity** | Need to find internal APIs | Standard Playwright |
| **Reliability** | May break if APIs change | More stable |

## Key Playwright Patterns Used

```python
# 1. Working with iframes
frame = page.frame(name='fraMenuContent')  # Get Frame object
frame_locator = page.locator('iframe').content_frame  # Get FrameLocator

# 2. Running JavaScript in browser
result = await frame.evaluate('() => someJsCode()')  # Returns data to Python

# 3. Capturing downloads
async with page.expect_download() as download_info:
    await link.click()
download = await download_info.value
await download.save_as('/path/to/file.pdf')

# 4. Finding elements
locator = frame_locator.locator('.class')  # CSS selector
locator = frame_locator.get_by_role('button', name='Click')  # By role
locator = frame_locator.locator('selector').nth(0)  # First match
```