In [None]:
import requests
from bs4 import BeautifulSoup
import json
import time
from datetime import datetime

def _download_page(url, request_headers, max_retries, timeout):
    """Download ``url`` with retries and exponential backoff.

    Returns the first response whose status passes ``raise_for_status()``.
    Raises ``Exception`` when every attempt fails, or immediately on a
    404/403 response (retrying those is pointless).
    """
    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        backoff = 2 ** attempt  # 1s, 2s, 4s, ...
        try:
            resp = requests.get(url, headers=request_headers, timeout=timeout)
            resp.raise_for_status()
            return resp
        except requests.exceptions.Timeout as e:
            if is_last_attempt:
                raise Exception(f"Request timed out after {max_retries} attempts") from e
            print(f"⚠ Timeout on attempt {attempt + 1}/{max_retries}, retrying in {backoff} seconds...")
        except requests.exceptions.ConnectionError as e:
            if is_last_attempt:
                raise Exception(f"Connection error after {max_retries} attempts: {e}") from e
            print(f"⚠ Connection error on attempt {attempt + 1}/{max_retries}, retrying in {backoff} seconds...")
        except requests.exceptions.HTTPError as e:
            # A Response object is falsy whenever its status is 4xx/5xx, so the
            # status must be read via an explicit ``is not None`` check rather
            # than a truthiness test on the response.
            status = e.response.status_code if e.response is not None else None
            if status == 404:
                raise Exception(f"Page not found (404). URL may have changed: {url}") from e
            if status == 403:
                raise Exception("Access forbidden (403). Website may be blocking requests.") from e
            if is_last_attempt:
                raise Exception(f"HTTP error after {max_retries} attempts: {e}") from e
            shown_status = status if status is not None else 'unknown'
            print(f"⚠ HTTP error {shown_status} on attempt {attempt + 1}/{max_retries}, retrying...")
        except requests.exceptions.RequestException as e:
            if is_last_attempt:
                raise Exception(f"Request failed after {max_retries} attempts: {e}") from e
            print(f"⚠ Request error on attempt {attempt + 1}/{max_retries}, retrying in {backoff} seconds...")
        # Only reached when an attempt failed and another one remains.
        time.sleep(backoff)

    # Reached only when max_retries <= 0, i.e. no attempt was made at all.
    raise Exception("Failed to get response after all retry attempts")


def _find_holdings_table(soup):
    """Return the <table> that looks like the holdings table, or None."""
    tables = soup.find_all('table')

    # Strategy 1: a table whose cell text mentions ticker/symbol and weight.
    for table in tables:
        try:
            cell_text = ' '.join(
                cell.get_text(strip=True).lower()
                for cell in table.find_all(['th', 'td'])
            )
            if ('ticker' in cell_text or 'symbol' in cell_text) and 'weight' in cell_text:
                return table
        except Exception as e:
            print(f"⚠ Warning: Error checking table: {e}")
            continue

    # Strategy 2 (fallback): a table that contains a well-known ticker.
    common_tickers = ['NVDA', 'GOOGL', 'AAPL', 'MSFT', 'TSLA']
    for table in tables:
        try:
            table_text = table.get_text()
            if any(ticker in table_text for ticker in common_tickers):
                return table
        except Exception:
            continue

    return None


def _extract_column_names(table):
    """Return the column names from <thead>, or from the first row if absent."""
    thead = table.find('thead')
    if thead is not None:
        return [cell.get_text(strip=True) for cell in thead.find_all(['th', 'td'])]
    first_row = table.find('tr')
    if first_row is not None:
        return [cell.get_text(strip=True) for cell in first_row.find_all(['th', 'td'])]
    return []


def _extract_holdings(table, column_names):
    """Turn each data row of ``table`` into a dict keyed by ``column_names``."""
    ticker_keys = ['Ticker', 'ticker', 'Symbol', 'symbol']
    tbody = table.find('tbody')
    rows = tbody.find_all('tr') if tbody is not None else table.find_all('tr')[1:]

    holdings = []
    for row in rows:
        try:
            cells = [cell.get_text(strip=True) for cell in row.find_all(['td', 'th'])]
            if len(cells) < len(column_names):
                continue  # Short row (spacer, footer, ...): not a holding.
            values = cells[:len(column_names)]  # Extra trailing cells are ignored.
            if values == column_names:
                # The header row can live inside <tbody> when there is no
                # <thead>; it must not be reported as a holding.
                continue
            holding = dict(zip(column_names, values))
            # Basic validation - ensure we have a ticker
            if any(holding.get(key) for key in ticker_keys):
                holdings.append(holding)
        except Exception as e:
            print(f"⚠ Warning: Error processing row: {e}")
            continue
    return holdings


def fetch_pelosi_holdings(max_retries: int = 3, timeout: int = 10):
    """
    Fetch Nancy Pelosi's current holdings from PelosiTracker with error handling.

    The page is downloaded with retries (exponential backoff), the holdings
    table is located heuristically, and each data row is converted to a dict
    keyed by the table's column names. All values are the raw cell strings.

    Args:
        max_retries: Maximum number of download attempts
        timeout: Request timeout in seconds

    Returns:
        List of holding dictionaries (one per table row that has a ticker)

    Raises:
        ValueError: If the holdings table, its headers, or any holdings
            cannot be found in the page.
        Exception: If the download fails (timeout, connection error, HTTP
            error, 404/403) or the HTML cannot be parsed.
    """
    url = "https://pelositracker.app/portfolios/nancy-pelosi"

    # Browser-like headers to avoid being blocked
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Connection': 'keep-alive',
    }

    resp = _download_page(url, request_headers, max_retries, timeout)

    # Parse HTML with error handling
    try:
        soup = BeautifulSoup(resp.text, 'html.parser')
    except Exception as e:
        raise Exception(f"Failed to parse HTML: {e}") from e

    holdings_table = _find_holdings_table(soup)
    if holdings_table is None:
        raise ValueError("Could not find holdings table on PelosiTracker page. Website structure may have changed.")

    try:
        column_names = _extract_column_names(holdings_table)
    except Exception as e:
        raise Exception(f"Failed to extract table headers: {e}") from e
    if not column_names:
        raise ValueError("Could not extract table headers. Table structure may have changed.")

    try:
        holdings = _extract_holdings(holdings_table, column_names)
    except Exception as e:
        raise Exception(f"Failed to extract table rows: {e}") from e
    if not holdings:
        raise ValueError("No holdings found in table. Data may be empty or structure changed.")

    return holdings

def convert_weight_to_float(weight_str):
    """Convert a weight such as '19%' (or a plain number) to a float.

    Args:
        weight_str: A number, or a string holding a number with an optional
            percent sign and surrounding whitespace (e.g. ' 19 % ').

    Returns:
        The weight as a float (e.g. 19.0), or None when the value cannot be
        parsed or is not a finite number ('nan', 'inf', ...). Non-finite
        values are rejected because NaN compares False against every bound,
        so it would pass a ``<= 0`` check and end up in the output file.
    """
    import math  # Local import so this function does not rely on file-level imports.

    try:
        if isinstance(weight_str, (int, float)):
            value = float(weight_str)
        else:
            # Drop the percent sign, then any whitespace left around the number.
            value = float(str(weight_str).replace('%', '').strip())
    except (ValueError, OverflowError):
        return None

    return value if math.isfinite(value) else None

def _first_present(holding, names):
    """Return the first non-empty value in ``holding`` stored under ``names``.

    The exact key spellings are tried first, in the given order. If none of
    them holds a value, keys are compared ignoring case and surrounding
    whitespace, so a column titled e.g. 'TICKER' is still recognised.
    Returns None when nothing is found.
    """
    for name in names:
        value = holding.get(name)
        if value:
            return value

    wanted = {name.lower() for name in names}
    for key, value in holding.items():
        if value and isinstance(key, str) and key.strip().lower() in wanted:
            return value
    return None


def generate_allocation_from_holdings(holdings):
    """Convert holdings list to current_allocation.json format.

    Args:
        holdings: Iterable of dicts, each with a ticker under 'Ticker' or
            'Symbol' and a weight under 'Weight' or 'Allocation' (key case
            is not significant).

    Returns:
        Dict mapping ticker to its weight as a positive float. Holdings with
        no ticker, no weight, or an unparseable / non-positive weight are
        skipped with a printed warning. A ticker that appears more than once
        keeps the weight of its last occurrence.
    """
    allocations = {}

    for holding in holdings:
        # Handle different possible key names
        ticker = _first_present(holding, ('Ticker', 'ticker', 'Symbol', 'symbol'))
        weight = _first_present(holding, ('Weight', 'weight', 'Allocation', 'allocation'))

        if not ticker:
            print(f"⚠ Warning: Holding missing ticker: {holding}")
            continue

        if weight is None:
            print(f"⚠ Warning: Holding {ticker} missing weight, skipping")
            continue

        # Convert weight to float
        weight_float = convert_weight_to_float(weight)
        if weight_float is None or weight_float <= 0:
            print(f"⚠ Warning: Skipping {ticker} with invalid weight: {weight}")
            continue

        allocations[ticker] = weight_float

    return allocations

# Main execution with comprehensive error handling
def _append_error_log(message, log_path="scraping_error.log"):
    """Append a timestamped line to the error log, best-effort.

    A failure to write the log must never replace the error being reported,
    so I/O problems are downgraded to a printed warning.
    """
    try:
        with open(log_path, "a", encoding="utf-8", errors="replace") as log_file:
            log_file.write(f"{datetime.now()}: {message}\n")
    except OSError as log_error:
        print(f"⚠ Warning: could not write to {log_path}: {log_error}")


try:
    print("🔄 Fetching Nancy Pelosi's holdings from PelosiTracker...")
    holdings = fetch_pelosi_holdings()

    print(f"✓ Successfully scraped {len(holdings)} holdings")
    if holdings:
        print(f"✓ Sample holding: {holdings[0]}")

    # Generate allocation file for execute_orders.py
    print("\n🔄 Generating allocation file...")
    allocations = generate_allocation_from_holdings(holdings)

    if not allocations:
        raise ValueError("No valid allocations found after processing holdings")

    allocation_data = {
        "data_as_of_date": datetime.now().strftime("%Y-%m-%d"),
        "allocations": allocations
    }

    allocation_file = "current_allocation.json"
    try:
        with open(allocation_file, "w", encoding="utf-8") as f:
            json.dump(allocation_data, f, indent=2)
    except OSError as e:
        raise Exception(f"Failed to write allocation file: {e}") from e
    print(f"✓ Generated {allocation_file} with {len(allocations)} allocations")
    print(f"✓ Total allocation: {sum(allocations.values()):.1f}%")

except ValueError as e:
    # Validation problems: page structure changed or no usable data.
    print(f"✗ Validation Error: {e}")
    _append_error_log(f"Validation Error - {e}")
    raise

except Exception as e:
    # Anything else (network, parsing, file I/O): log with the error type.
    print(f"✗ Error: {e}")
    _append_error_log(f"{type(e).__name__} - {e}")
    raise


🔄 Fetching Nancy Pelosi's holdings from PelosiTracker...
✓ Successfully saved 12 holdings to nancy_pelosi_current_holdings_from_pelositracker.json
✓ Sample holding: {'Ticker': 'NVDA', 'Last Price': '$190.05', 'Weight': '19%'}
