In [None]:
import asyncio
from playwright.async_api import async_playwright
import json
import random
import time
import os
import sys

# Decide whether output should be refreshed with IPython's clear_output or by
# clearing the terminal. Having IPython installed is not enough on its own: a
# plain `python` run on a machine that happens to have IPython must still be
# treated as a terminal, so an active interactive shell is required as well.
try:
    from IPython import get_ipython
    from IPython.display import clear_output
    IN_NOTEBOOK = get_ipython() is not None
except ImportError:
    IN_NOTEBOOK = False

class ScraperStatus:
    """Single-screen progress dashboard for a scraper run.

    Holds a handful of counters plus the latest action/phase text, and
    redraws the whole dashboard (clearing previous output first) every time
    ``update`` is called.
    """

    def __init__(self):
        self.start_time = time.time()      # wall-clock start, used for the runtime display
        self.current_click = 0             # number of "Load More" clicks completed so far
        self.total_articles = 0            # number of article links collected so far
        self.last_action = "Starting..."   # free-text description of the latest step
        self.current_phase = "Initialization"
        self.consecutive_clicks = 0        # clicks since the last simulated break

    def update(self, action=None, phase=None, click=None, articles=None, consecutive=None):
        """Overwrite the supplied fields and redraw the dashboard.

        ``action`` and ``phase`` are ignored when falsy (``None`` or an empty
        string); the numeric fields are ignored only when ``None`` so that an
        explicit ``0`` still resets them.
        """
        if action:
            self.last_action = action
        if phase:
            self.current_phase = phase
        if click is not None:
            self.current_click = click
        if articles is not None:
            self.total_articles = articles
        if consecutive is not None:
            self.consecutive_clicks = consecutive
        self.display()

    @staticmethod
    def _format_runtime(runtime):
        """Return *runtime* (seconds) formatted as zero-padded ``HH:MM:SS``."""
        hours, remainder = divmod(runtime, 3600)
        minutes, seconds = divmod(remainder, 60)
        return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}"

    def _clicks_per_minute(self, runtime):
        """Return completed clicks per minute over *runtime* seconds.

        Returns ``0.0`` for a non-positive runtime instead of dividing by
        zero (possible on coarse clocks right after start-up).
        """
        if runtime <= 0:
            return 0.0
        return self.current_click / (runtime / 60)

    def display(self):
        """Clear the previous output and print the current dashboard."""
        if IN_NOTEBOOK:
            # wait=True defers clearing until new output arrives, avoiding flicker.
            clear_output(wait=True)
        else:
            os.system('cls' if os.name == 'nt' else 'clear')

        runtime = time.time() - self.start_time

        print("🕷️  Bloomberg Economics Scraper - Enhanced Human-like Version")
        print("=" * 70)
        print(f"⏱️  Runtime: {self._format_runtime(runtime)}")
        print(f"🎯  Current Phase: {self.current_phase}")
        print(f"🔄  Clicks Completed: {self.current_click}")
        print(f"📊  Total Articles: {self.total_articles}")
        print(f"🔗  Consecutive Clicks: {self.consecutive_clicks}")
        print("=" * 70)
        print(f"📝  Current Action: {self.last_action}")
        print("=" * 70)

        # Rate statistics are only meaningful once at least one click happened.
        if self.current_click > 0:
            avg_articles_per_click = self.total_articles / self.current_click
            print(f"Avg Articles/Click: {avg_articles_per_click:.1f}")
            print(f"Performance: {self._clicks_per_minute(runtime):.1f} clicks/minute")

        environment = "Jupyter Notebook" if IN_NOTEBOOK else "Terminal"
        print(f"\nEnvironment: {environment}")
        print("💡 Tip: The scraper is running human-like behaviors to avoid detection")
        if not IN_NOTEBOOK:
            print("Press Ctrl+C to stop gracefully")
        print()

def log_action(status, action, phase=None):
    """Record the current action (and optionally a new phase) and redraw the status."""
    fields = {"action": action, "phase": phase}
    status.update(**fields)

def log_progress(status, click, articles, action="Progress Update"):
    """Record click/article counters together with an action label and redraw the status."""
    fields = {"action": action, "click": click, "articles": articles}
    status.update(**fields)

async def human_like_mouse_movement(page, target_x, target_y, current_x=None, current_y=None):
    """Move the mouse to (target_x, target_y) along a more human-looking path.

    ``current_x``/``current_y`` are only used to estimate the travel distance
    and to place the intermediate waypoint; if either is missing, a random
    starting point is assumed. Short hops (<= 200 px) are a single
    interpolated move; longer ones go via a jittered waypoint at roughly 60%
    of the way, with a brief pause there.
    """
    if current_x is None or current_y is None:
        current_x = random.randint(200, 800)
        current_y = random.randint(200, 600)

    dx = target_x - current_x
    dy = target_y - current_y
    distance = (dx ** 2 + dy ** 2) ** 0.5

    if not distance > 200:
        # Short hop: one move, with step count proportional to distance (min 5).
        await page.mouse.move(target_x, target_y, steps=max(5, int(distance / 10)))
        return

    # Long trip: overshoot-free waypoint with a little random wobble.
    waypoint_x = current_x + dx * 0.6 + random.randint(-50, 50)
    waypoint_y = current_y + dy * 0.6 + random.randint(-30, 30)

    first_leg_steps = random.randint(15, 25)
    await page.mouse.move(waypoint_x, waypoint_y, steps=first_leg_steps)

    pause_ms = random.randint(100, 300)
    await page.wait_for_timeout(pause_ms)

    second_leg_steps = random.randint(10, 20)
    await page.mouse.move(target_x, target_y, steps=second_leg_steps)

async def simulate_reading_behavior(page):
    """Imitate a person reading/browsing: run one or two randomly chosen behaviours.

    Each behaviour is followed by a short random pause (200-500 ms).
    """

    async def scroll_down():
        # One or two small downward wheel ticks with a pause after each.
        for _ in range(random.randint(1, 2)):
            await page.mouse.wheel(0, random.randint(150, 300))
            await page.wait_for_timeout(random.randint(300, 800))

    async def scroll_up():
        await page.mouse.wheel(0, random.randint(-400, -200))
        await page.wait_for_timeout(random.randint(400, 1000))

    async def hover_article():
        await hover_on_random_article(page)

    async def quick_scan():
        await quick_scan_page(page)

    async def pause_thinking():
        await page.wait_for_timeout(random.randint(1000, 2500))

    behaviors = [
        'scroll_down',
        'scroll_up',
        'hover_article',
        'quick_scan',
        'pause_thinking',
    ]
    handlers = {
        'scroll_down': scroll_down,
        'scroll_up': scroll_up,
        'hover_article': hover_article,
        'quick_scan': quick_scan,
        'pause_thinking': pause_thinking,
    }

    how_many = random.randint(1, 2)
    for chosen in random.sample(behaviors, how_many):
        await handlers[chosen]()
        await page.wait_for_timeout(random.randint(200, 500))

async def hover_on_random_article(page):
    """Hover over one of the last ten article links to mimic reading a title.

    This is purely cosmetic behaviour, so ordinary errors (detached element,
    missing section, ...) are deliberately ignored. Only ``Exception`` is
    caught: a bare ``except`` would also swallow ``asyncio.CancelledError``
    and ``KeyboardInterrupt`` and thereby block cancellation / Ctrl+C.
    """
    try:
        articles = await page.query_selector_all('section#archive_story_list a')
        if not articles:
            return

        # Prefer the most recently loaded entries (the tail of the list).
        article = random.choice(articles[-10:])
        box = await article.bounding_box()
        if not box:
            # No bounding box is available for this element; nothing to hover.
            return

        x = box['x'] + box['width'] / 2
        y = box['y'] + box['height'] / 2
        await human_like_mouse_movement(page, x, y)
        await page.wait_for_timeout(random.randint(800, 1500))
    except Exception:
        # Best-effort decoration only; a failed hover must not stop the scrape.
        pass

async def quick_scan_page(page):
    """Flick the mouse over two to four random points, pausing briefly at each."""
    glances_left = random.randint(2, 4)
    while glances_left > 0:
        glance_x = random.randint(200, 1000)
        glance_y = random.randint(300, 700)
        glide_steps = random.randint(5, 12)
        await page.mouse.move(glance_x, glance_y, steps=glide_steps)

        dwell_ms = random.randint(200, 600)
        await page.wait_for_timeout(dwell_ms)
        glances_left -= 1

async def random_mouse_movements(page):
    """Wander the mouse to two to four random spots, with a short pause after each."""
    hops_remaining = random.randint(2, 4)
    while hops_remaining:
        dest_x = random.randint(100, 1200)
        dest_y = random.randint(200, 800)
        await human_like_mouse_movement(page, dest_x, dest_y)

        rest_ms = random.randint(200, 500)
        await page.wait_for_timeout(rest_ms)
        hops_remaining -= 1

async def check_page_sections(page):
    """Glance at one of three fixed page spots, chosen at random, and linger briefly."""
    # Fixed (x, y) viewport coordinates to "look at".
    candidate_spots = (
        (200, 100),
        (1000, 300),
        (150, 500),
    )

    spot_x, spot_y = random.choice(candidate_spots)
    await human_like_mouse_movement(page, spot_x, spot_y)

    linger_ms = random.randint(500, 1000)
    await page.wait_for_timeout(linger_ms)

async def simulate_headline_reading(page):
    """Rest the mouse near the start of one of the first five headlines.

    Headlines are whatever matches ``h1, h2, h3, .headline``. The behaviour is
    cosmetic, so ordinary errors are ignored; only ``Exception`` is caught so
    that ``asyncio.CancelledError`` and ``KeyboardInterrupt`` still propagate.
    """
    try:
        headlines = await page.query_selector_all('h1, h2, h3, .headline')
        if not headlines:
            return

        headline = random.choice(headlines[:5])
        box = await headline.bounding_box()
        if not box:
            # No bounding box is available for this element; nothing to point at.
            return

        # Aim slightly inside the left edge, vertically centred on the text.
        await human_like_mouse_movement(page, box['x'] + 10, box['y'] + box['height']/2)
        await page.wait_for_timeout(random.randint(800, 1500))
    except Exception:
        # Best-effort decoration only; a failed hover must not stop the scrape.
        pass

async def simulate_human_click_preparation(page, button):
    """Perform human-like lead-up actions before clicking *button*.

    Runs some reading behaviour, scrolls the button into view, makes two
    "searching" mouse moves near it, then settles on its centre.

    Returns:
        A 3-tuple ``(success, x, y)``. On success ``x``/``y`` are the centre
        of the button; if the button has no bounding box the result is
        ``(False, None, None)``. The failure case always has the same shape
        as the success case so callers can unpack it unconditionally
        (previously a bare ``False`` was returned, which made
        ``success, x, y = ...`` raise ``TypeError``).
    """
    await simulate_reading_behavior(page)

    await button.scroll_into_view_if_needed()
    await page.wait_for_timeout(random.randint(500, 1000))

    box = await button.bounding_box()
    if not box:
        return False, None, None

    button_x = box['x'] + box['width'] / 2
    button_y = box['y'] + box['height'] / 2

    # Two approach points: a wide miss first, then a closer one.
    search_areas = [
        (button_x + random.randint(-100, 100), button_y + random.randint(-50, 50)),
        (button_x + random.randint(-50, 50), button_y + random.randint(-25, 25)),
    ]

    for search_x, search_y in search_areas:
        await human_like_mouse_movement(page, search_x, search_y)
        await page.wait_for_timeout(random.randint(300, 600))

    # Final approach starts from the last search point so the path length is realistic.
    await human_like_mouse_movement(page, button_x, button_y, search_areas[-1][0], search_areas[-1][1])

    await page.wait_for_timeout(random.randint(400, 800))

    return True, button_x, button_y

async def perform_human_like_click(page, x, y):
    """Click near (x, y) with a small positional jitter and a randomised press duration."""
    # Offsets of at most 2 px in each axis so repeated clicks never land on the exact same pixel.
    offset_x = random.randint(-2, 2)
    offset_y = random.randint(-2, 2)
    click_x, click_y = x + offset_x, y + offset_y

    await page.mouse.move(click_x, click_y)

    # `delay` is the time between mouse-down and mouse-up, in milliseconds.
    await page.mouse.click(click_x, click_y, delay=random.randint(80, 200))

async def collect_and_save_links(page, click_count, status,
                                 filename="/Users/wangbaihui/bloomberg_econ_links.json"):
    """Collect every article link currently on the page and overwrite one JSON file with them.

    Args:
        page: Playwright page to read links from.
        click_count: Number of "Load More" clicks completed; stored in the file.
        status: Status object forwarded to ``log_progress``.
        filename: Destination path. Defaults to the previously hard-coded
            location so existing callers behave as before.

    Returns:
        The number of distinct article URLs written.
    """
    # Fetch all matching hrefs in a single in-page call instead of one
    # round-trip per anchor; `[href^=...]` is the CSS equivalent of the
    # `startswith('/news/articles')` filter.
    hrefs = await page.eval_on_selector_all(
        'a[href^="/news/articles"]',
        'anchors => anchors.map(a => a.getAttribute("href"))',
    )
    urls = {"https://www.bloomberg.com" + href for href in hrefs if href}

    data = {
        "last_updated": time.strftime("%Y-%m-%d %H:%M:%S"),
        "clicks_completed": click_count,
        "total_articles": len(urls),
        "articles": sorted(urls)
    }

    # The file is rewritten in full on every call, so it always holds the latest snapshot.
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)

    log_progress(status, click_count, len(urls), f"💾 Saved {len(urls)} articles to {filename}")

    return len(urls)

async def _accept_terms_if_present(page, status):
    """Click an "Accept" button if one appears within five seconds; otherwise carry on."""
    log_action(status, "Looking for terms and conditions...", "Terms & Conditions")
    try:
        accept_button = await page.wait_for_selector('button:has-text("Accept")', timeout=5000)
        if accept_button:
            log_action(status, "Found Accept button, accepting...")

            await page.wait_for_timeout(random.randint(1000, 2000))

            box = await accept_button.bounding_box()
            if box:
                button_x = box['x'] + box['width'] / 2
                button_y = box['y'] + box['height'] / 2
                await human_like_mouse_movement(page, button_x, button_y)
                await page.wait_for_timeout(random.randint(300, 600))

                await perform_human_like_click(page, button_x, button_y)
                await page.wait_for_timeout(random.randint(1000, 2000))
                log_action(status, "Terms and conditions accepted")
    except Exception:
        # Best effort: a missing banner (timeout) or a failed click must not
        # abort the run. `Exception` rather than a bare except so that
        # cancellation and Ctrl+C still propagate.
        log_action(status, "No Accept button found, continuing...")


async def _click_load_more_repeatedly(page, status, max_clicks):
    """Click "Load More" up to *max_clicks* times, saving links after every click.

    Stops early when the button no longer appears, when click preparation
    fails, or when any step raises. Returns the number of clicks performed.
    """
    # Local import keeps this block self-contained; playwright is already a
    # dependency of this file.
    from playwright.async_api import TimeoutError as PlaywrightTimeoutError

    load_more_selector = 'button[aria-label="more stories"]'
    consecutive_clicks = 0
    actual_clicks = 0

    for i in range(max_clicks):
        try:
            log_action(status, f"Preparing for Load More click {i+1}/{max_clicks}...")
            status.update(consecutive=consecutive_clicks)

            # After a couple of back-to-back clicks, mix in some reading behaviour.
            if consecutive_clicks >= 2:
                log_action(status, "Adding human behavior...")

                await simulate_reading_behavior(page)
                await page.wait_for_timeout(random.randint(1000, 2000))

                if random.random() < 0.3:
                    action = random.choice(['End', 'PageDown'])
                    log_action(status, f"Keyboard navigation: {action}")
                    await page.keyboard.press(action)
                    await page.wait_for_timeout(random.randint(800, 1500))

            if consecutive_clicks >= 4:
                log_action(status, "Quick exploration...")

                # Lambdas defer the call so only the chosen behaviour runs.
                exploration = random.choice([
                    lambda: page.mouse.wheel(0, random.randint(-400, -200)),
                    lambda: quick_scan_page(page),
                ])
                await exploration()
                await page.wait_for_timeout(random.randint(800, 1500))

            log_action(status, "Searching for Load More button...")
            try:
                button = await page.wait_for_selector(load_more_selector, timeout=10000)
            except PlaywrightTimeoutError:
                # wait_for_selector raises on timeout rather than returning
                # None; treat "button never appeared" as normal completion
                # instead of letting it fall through to the failure handler.
                button = None
            if not button:
                log_action(status, "No more Load More button found", "Completion")
                break

            log_action(status, "Preparing click...")
            prepared = await simulate_human_click_preparation(page, button)
            # Tolerate a non-tuple failure value so unpacking can never raise here.
            if not isinstance(prepared, tuple) or not prepared[0]:
                log_action(status, "Failed to prepare click ")
                break
            _, x, y = prepared

            old_count = len(await page.query_selector_all('section#archive_story_list a'))

            log_action(status, f"Clicking Load More {i+1}...")
            await perform_human_like_click(page, x, y)
            consecutive_clicks += 1
            actual_clicks = i + 1

            log_action(status, "Waiting for new content...")
            try:
                await page.wait_for_function(
                    f'document.querySelectorAll("section#archive_story_list a").length > {old_count}',
                    timeout=15000
                )
                log_action(status, "New content loaded")
            except Exception:
                # Links are collected below regardless, so a slow load only costs time.
                log_action(status, "Timeout waiting for content, continuing...")

            log_action(status, "Post-click behavior...")
            await page.wait_for_timeout(random.randint(800, 1500))

            await page.mouse.wheel(0, random.randint(100, 200))
            await page.wait_for_timeout(random.randint(500, 1000))

            await collect_and_save_links(page, actual_clicks, status)

            # Take a break after a random run of 3-6 clicks; the threshold is
            # re-drawn on every iteration.
            if consecutive_clicks >= random.randint(3, 6):
                log_action(status, f"Taking break after {consecutive_clicks} clicks...", "Rest Period")
                consecutive_clicks = 0

                break_duration = random.randint(2, 4)

                for break_activity in range(break_duration):
                    log_action(status, f"Break activity {break_activity + 1}/{break_duration}")
                    await simulate_reading_behavior(page)
                    await page.wait_for_timeout(random.randint(1500, 3000))

                if random.random() < 0.2:
                    log_action(status, "Quick navigation...")
                    await page.keyboard.press('Control+Home')
                    await page.wait_for_timeout(random.randint(2000, 4000))
                    await page.keyboard.press('Control+End')
                    await page.wait_for_timeout(random.randint(1000, 2000))

                log_action(status, "Break completed, resuming...", "Article Collection")
                status.update(consecutive=0)

            await page.wait_for_timeout(random.randint(1500, 3000))

        except Exception as e:
            log_action(status, f"Load More click {i+1} failed: {str(e)[:50]}...")
            try:
                await page.screenshot(path=f"click_error_{i+1}.png", full_page=True)
            except Exception:
                # The screenshot is diagnostic only; losing it must not
                # prevent the already-loaded links from being saved.
                log_action(status, "Could not capture error screenshot")
            await collect_and_save_links(page, actual_clicks, status)
            break

    return actual_clicks


async def _run_session(browser, status):
    """Open the listing page in *browser*, explore it, and harvest links until done."""
    context = await browser.new_context(
        user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 13_5_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.122 Safari/537.36",
        viewport={'width': 1280, 'height': 800},
        locale='en-US',
        timezone_id='America/New_York',
        extra_http_headers={
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
        }
    )

    # Script injected into every page before its own scripts run; it overrides
    # several navigator properties and the permissions query.
    await context.add_init_script("""
                // 移除 webdriver 属性
                Object.defineProperty(navigator, 'webdriver', {
                    get: () => undefined
                });
                
                // 模拟真实的插件
                Object.defineProperty(navigator, 'plugins', {
                    get: () => [1, 2, 3, 4, 5]
                });
                
                // 模拟真实的语言设置
                Object.defineProperty(navigator, 'languages', {
                    get: () => ['en-US', 'en']
                });
                
                // 覆盖权限查询
                const originalQuery = window.navigator.permissions.query;
                window.navigator.permissions.query = (parameters) => (
                    parameters.name === 'notifications' ?
                        Promise.resolve({ state: Notification.permission }) :
                        originalQuery(parameters)
                );
            """)

    page = await context.new_page()

    # NOTE(review): the log text says "Economics" but the URL is /politics —
    # confirm which section is actually intended.
    log_action(status, "Loading Bloomberg Economics page...", "Page Loading")
    await page.goto("https://www.bloomberg.com/politics")
    await page.wait_for_timeout(random.randint(2000, 3000))

    log_action(status, "Quick page exploration...", "Initial Exploration")

    for initial_round in range(random.randint(1, 2)):
        log_action(status, f"Initial browsing round {initial_round + 1}")
        await simulate_reading_behavior(page)
        await page.wait_for_timeout(random.randint(1000, 2000))

    log_action(status, "Getting familiar with page layout...")
    await check_page_sections(page)
    await page.wait_for_timeout(random.randint(800, 1500))

    await _accept_terms_if_present(page, status)

    log_action(status, "Starting content exploration...", "Content Exploration")
    await simulate_reading_behavior(page)
    await page.wait_for_timeout(random.randint(1000, 2000))

    log_action(status, "Collecting initial articles...", "Data Collection")
    initial_count = await collect_and_save_links(page, 0, status)
    log_action(status, f"Initial page loaded with {initial_count} articles")

    status.update(phase="Article Collection")
    actual_clicks = await _click_load_more_repeatedly(page, status, max_clicks=300)

    log_action(status, "Creating final summary...", "Completion")
    final_count = await collect_and_save_links(page, actual_clicks, status)
    log_action(status, f"Scraping completed! Final: {final_count} articles after {actual_clicks} clicks")


async def scrape_bloomberg():
    """Run the whole scrape: launch a visible Chromium, harvest links, clean up.

    Launches a headed browser, loads the listing page, optionally accepts a
    consent banner, then repeatedly clicks "Load More" (up to 300 times) with
    human-like behaviour in between, saving the collected article links after
    every click. Errors and Ctrl+C are reported on the status display rather
    than raised; the browser is closed on every exit path.
    """
    status = ScraperStatus()

    try:
        async with async_playwright() as p:
            log_action(status, "Launching browser...", "Browser Setup")

            browser = await p.chromium.launch(headless=False)
            try:
                await _run_session(browser, status)
            finally:
                # Close explicitly even when the session fails part-way.
                await browser.close()

    except KeyboardInterrupt:
        log_action(status, "Scraper stopped by user (Ctrl+C)", "User Stopped")
        return
    except Exception as e:
        log_action(status, f"Unexpected error: {str(e)}", "Error")
        return

# Top-level `await` is only valid where the host already provides a running
# event loop for the cell (e.g. IPython/Jupyter); in a plain Python script this
# line is a SyntaxError and `asyncio.run(scrape_bloomberg())` would be needed.
await scrape_bloomberg()

🕷️  Bloomberg Economics Scraper - Enhanced Human-like Version
⏱️  Runtime: 00:16:52
🎯  Current Phase: Completion
🔄  Clicks Completed: 51
📊  Total Articles: 494
🔗  Consecutive Clicks: 4
📝  Current Action: Scraping completed! Final: 494 articles after 51 clicks ✅
📈  Avg Articles/Click: 9.7
⚡  Performance: 3.0 clicks/minute

💻 Environment: Jupyter Notebook
💡 Tip: The scraper is running human-like behaviors to avoid detection

