In [None]:
# Scroll to the bottom over and over so additional job listings can load;
# finish once a scroll no longer changes the document height.
# NOTE(review): `page` is not created in this cell - it has to exist in the
# kernel already (e.g. from a previously executed cell).
last_height = page.evaluate("document.body.scrollHeight")
page_still_growing = True
while page_still_growing:
    # Jump to the current bottom of the document
    page.evaluate("window.scrollTo(0, document.body.scrollHeight);")

    # Pause so any newly requested content has time to arrive (tunable)
    page.wait_for_timeout(1000)

    # Same height as before the scroll means nothing more was added
    new_height = page.evaluate("document.body.scrollHeight")
    page_still_growing = new_height != last_height
    last_height = new_height

In [None]:
from playwright.sync_api import sync_playwright
import json

def main():
    """Scrape job listings from the We Work Remotely home page and print them as JSON.

    Each listing card yields its title, company, location, job type and links;
    the listing's own page is then opened in a separate tab to read the
    description. A failure on a single listing is printed and that listing is
    skipped. A failure while loading the home page is printed and results in an
    empty list being printed.
    """
    base_url = "https://weworkremotely.com"
    listing_selector = "section.jobs article li.new-listing-container"
    jobs = []

    def text_of(parent, selector, default="Unknown"):
        """Return the stripped inner text of the first match, or `default` if none."""
        element = parent.query_selector(selector)
        return element.inner_text().strip() if element else default

    def link_of(parent, selector):
        """Return the first match's href as an absolute URL, or "Unknown"."""
        element = parent.query_selector(selector)
        # get_attribute() gives None when the anchor has no href; treat that
        # like a missing anchor rather than failing on None.startswith().
        href = element.get_attribute("href") if element else None
        if href is None:
            return "Unknown"
        return f"{base_url}{href}" if href.startswith("/") else href

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False)  # Headed browser, handy for debugging
        page = browser.new_page()

        try:
            # Load the home page with a generous timeout, then wait for listings
            page.goto(base_url, timeout=60000, wait_until="domcontentloaded")
            page.wait_for_selector(listing_selector, timeout=60000)

            for job_element in page.query_selector_all(listing_selector):
                try:
                    title = text_of(job_element, ".new-listing__header__title")
                    company_link = link_of(job_element, "div.tooltip--flag-logo a")
                    apply_link = link_of(job_element, "a[href^='/listings']")
                    company = text_of(job_element, ".new-listing__company-name")
                    location = text_of(job_element, ".new-listing__company-headquarters")
                    job_type = text_of(job_element, ".new-listing__categories__category")

                    # Read the description from the listing's own page. The tab
                    # is closed in `finally` so a failed navigation (timeout,
                    # invalid link) does not leave it open.
                    new_page = browser.new_page()
                    try:
                        new_page.goto(apply_link, timeout=60000, wait_until="domcontentloaded")
                        description = text_of(
                            new_page,
                            ".lis-container__job__content",
                            default="No description available",
                        )
                    finally:
                        new_page.close()

                    jobs.append({
                        "title": title,
                        "companyLink": company_link,
                        "applyLink": apply_link,
                        "company": company,
                        "location": location,
                        "jobType": job_type,
                        "description": description
                    })

                except Exception as e:
                    # Best effort: report the problem and move on to the next listing
                    print(f"Error occurred while processing a job element: {str(e)}")
                    continue

        except Exception as e:
            print(f"Error occurred: {str(e)}")
            jobs = []

        finally:
            browser.close()

        # Output results in JSON format
        print(json.dumps(jobs, indent=2))

# Run the scraper only when this code is executed directly (not when imported).
if __name__ == "__main__":
    main()


In [None]:
from playwright.async_api import async_playwright
import asyncio
import json

async def scrape_jobs():
    """Scrape We Work Remotely listings with the async Playwright API and print them as JSON.

    The home page is loaded once; every listing card is then processed
    concurrently via asyncio.gather, each opening the listing's own page in a
    separate tab to read the description. A listing that fails is printed and
    left out of the result. A failure while loading the home page is printed
    and results in an empty list being printed.
    """
    base_url = "https://weworkremotely.com"
    listing_selector = "section.jobs article li.new-listing-container"
    jobs = []

    async def text_of(parent, selector, default="Unknown"):
        """Return the stripped inner text of the first match, or `default` if none."""
        element = await parent.query_selector(selector)
        return (await element.inner_text()).strip() if element else default

    async def link_of(parent, selector):
        """Return the first match's href as an absolute URL, or "Unknown"."""
        element = await parent.query_selector(selector)
        # get_attribute() gives None when the anchor has no href; treat that
        # like a missing anchor rather than failing on None.startswith().
        href = await element.get_attribute("href") if element else None
        if href is None:
            return "Unknown"
        # A non-relative href is returned as-is.
        return f"{base_url}{href}" if href.startswith("/") else href

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)  # Run in headless mode for speed
        page = await browser.new_page()

        async def process_job_element(job_element):
            """Build the record for one listing card; return None if anything fails."""
            try:
                title = await text_of(job_element, ".new-listing__header__title")
                company_link = await link_of(job_element, "div.tooltip--flag-logo a")
                apply_link = await link_of(job_element, "a[href^='/listings']")
                company = await text_of(job_element, ".new-listing__company-name")
                location = await text_of(job_element, ".new-listing__company-headquarters")
                job_type = await text_of(job_element, ".new-listing__categories__category")

                # Read details from the listing's own page. The tab is closed in
                # `finally` so a failed navigation does not leave it open.
                new_page = await browser.new_page()
                try:
                    await new_page.goto(apply_link, timeout=30000, wait_until="domcontentloaded")
                    # NOTE(review): the synchronous cells in this notebook read
                    # ".lis-container__job__content" here - confirm which
                    # selector is the intended one.
                    description = await text_of(
                        new_page,
                        ".listing-container .listing-header",
                        default="No description available",
                    )
                finally:
                    await new_page.close()

                return {
                    "title": title,
                    "companyLink": company_link,
                    "applyLink": apply_link,
                    "company": company,
                    "location": location,
                    "jobType": job_type,
                    "description": description
                }
            except Exception as e:
                print(f"Error occurred while processing a job element: {str(e)}")
                return None

        try:
            # Load the home page with a generous timeout, then wait for listings
            await page.goto(base_url, timeout=60000, wait_until="domcontentloaded")
            await page.wait_for_selector(listing_selector, timeout=60000)
            job_elements = await page.query_selector_all(listing_selector)

            # Process job elements concurrently; failed ones come back as None
            results = await asyncio.gather(
                *(process_job_element(job_element) for job_element in job_elements)
            )
            jobs = [job for job in results if job is not None]

        except Exception as e:
            print(f"Error occurred: {str(e)}")
            jobs = []

        finally:
            # Close the browser on every exit path
            await browser.close()

        # Output results in JSON format
        print(json.dumps(jobs, indent=2))

# Run the scraper when this code is executed directly.
if __name__ == "__main__":
    try:
        _running_loop = asyncio.get_running_loop()
    except RuntimeError:
        # get_running_loop() raises when no event loop is active (plain script).
        _running_loop = None

    if _running_loop is None:
        asyncio.run(scrape_jobs())
    else:
        # asyncio.run() refuses to start inside an already running loop (the
        # situation in a Jupyter kernel), so schedule the scraper on that loop.
        # The task is kept in a variable so it is not garbage-collected early.
        _scrape_task = _running_loop.create_task(scrape_jobs())


In [None]:
#scroll


from playwright.sync_api import sync_playwright
import json

def main():
    """Scrape We Work Remotely listings after scrolling the home page, and print them as JSON.

    The home page is loaded, the first listings are awaited, and the page is
    then scrolled to the bottom repeatedly until its height stops changing.
    Each listing card yields its title, company, location, job type and links;
    the listing's own page is opened in a fresh browser context to read the
    description. A failure on a single listing is printed and that listing is
    skipped. A failure while loading or scrolling the home page is printed and
    results in an empty list. The number of jobs is printed after the JSON.
    """
    base_url = "https://weworkremotely.com"
    listing_selector = "section.jobs article li.new-listing-container"
    jobs = []

    def text_of(parent, selector, default="Unknown"):
        """Return the stripped inner text of the first match, or `default` if none."""
        element = parent.query_selector(selector)
        return element.inner_text().strip() if element else default

    def link_of(parent, selector):
        """Return the first match's href as an absolute URL, or "Unknown"."""
        element = parent.query_selector(selector)
        # get_attribute() gives None when the anchor has no href; treat that
        # like a missing anchor rather than failing on None.startswith().
        href = element.get_attribute("href") if element else None
        if href is None:
            return "Unknown"
        return f"{base_url}{href}" if href.startswith("/") else href

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False)  # Headed browser, handy for debugging
        page = browser.new_page()

        try:
            # Navigate to the page with increased timeout
            page.goto(base_url, timeout=60000, wait_until="domcontentloaded")

            # Wait for the first listings BEFORE scrolling: right after
            # "domcontentloaded" the page height may still be that of the
            # unrendered page, which would end the scroll loop immediately.
            page.wait_for_selector(listing_selector, timeout=60000)

            # Scroll to the bottom until the document height stops growing
            last_height = page.evaluate("document.body.scrollHeight")
            while True:
                page.evaluate("window.scrollTo(0, document.body.scrollHeight);")

                # Give newly requested content time to load (tunable)
                page.wait_for_timeout(2000)

                new_height = page.evaluate("document.body.scrollHeight")
                if new_height == last_height:
                    # Unchanged height: nothing more was loaded
                    break
                last_height = new_height

            for job_element in page.query_selector_all(listing_selector):
                try:
                    title = text_of(job_element, ".new-listing__header__title")
                    company_link = link_of(job_element, "div.tooltip--flag-logo a")
                    apply_link = link_of(job_element, "a[href^='/listings']")
                    company = text_of(job_element, ".new-listing__company-name")
                    location = text_of(job_element, ".new-listing__company-headquarters")
                    job_type = text_of(job_element, ".new-listing__categories__category")

                    # Read the description in a separate browser context; the
                    # `with` block closes the context when it is left.
                    with browser.new_context() as new_context:
                        new_page = new_context.new_page()
                        new_page.goto(apply_link, timeout=60000, wait_until="domcontentloaded")
                        description = text_of(
                            new_page,
                            ".lis-container__job__content",
                            default="No description available",
                        )
                        new_page.close()

                    jobs.append({
                        "title": title,
                        "companyLink": company_link,
                        "applyLink": apply_link,
                        "company": company,
                        "location": location,
                        "jobType": job_type,
                        "description": description
                    })

                except Exception as e:
                    # Best effort: report the problem and move on to the next listing
                    print(f"Error occurred while processing a job element: {str(e)}")
                    continue

        except Exception as e:
            print(f"Error occurred: {str(e)}")
            jobs = []

        finally:
            browser.close()

        # Output results in JSON format
        print(json.dumps(jobs, indent=2))
        print(f"Total jobs found: {len(jobs)}")

# Run the scraper only when this code is executed directly (not when imported).
if __name__ == "__main__":
    main()


In [None]:
from playwright.sync_api import sync_playwright
import json

def main():
    """Scrape We Work Remotely listings, print them as JSON and save them to a file.

    Each listing card on the home page yields its title, company, location,
    job type and links; the listing's own page is opened in a fresh browser
    context to read the description. A failure on a single listing is printed
    and that listing is skipped. A failure while loading the home page is
    printed and results in an empty list. The result is printed, written to
    "jobs scraped from weworkremotely.json", and the job count is printed.
    """
    base_url = "https://weworkremotely.com"
    listing_selector = "section.jobs article li.new-listing-container"
    jobs = []

    def text_of(parent, selector, default="Unknown"):
        """Return the stripped inner text of the first match, or `default` if none."""
        element = parent.query_selector(selector)
        return element.inner_text().strip() if element else default

    def link_of(parent, selector):
        """Return the first match's href as an absolute URL, or "Unknown"."""
        element = parent.query_selector(selector)
        # get_attribute() gives None when the anchor has no href; treat that
        # like a missing anchor rather than failing on None.startswith().
        href = element.get_attribute("href") if element else None
        if href is None:
            return "Unknown"
        return f"{base_url}{href}" if href.startswith("/") else href

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False)  # Headed browser, handy for debugging
        page = browser.new_page()

        try:
            # Load the home page with a generous timeout, then wait for listings
            page.goto(base_url, timeout=60000, wait_until="domcontentloaded")
            page.wait_for_selector(listing_selector, timeout=60000)

            for job_element in page.query_selector_all(listing_selector):
                try:
                    title = text_of(job_element, ".new-listing__header__title")
                    company_link = link_of(job_element, "div.tooltip--flag-logo a")
                    apply_link = link_of(job_element, "a[href^='/listings']")
                    company = text_of(job_element, ".new-listing__company-name")
                    location = text_of(job_element, ".new-listing__company-headquarters")
                    job_type = text_of(job_element, ".new-listing__categories__category")

                    # Read the description in a separate browser context, asked
                    # for directly from the browser; the `with` block closes the
                    # context when it is left.
                    with browser.new_context() as new_context:
                        new_page = new_context.new_page()
                        new_page.goto(apply_link, timeout=60000, wait_until="domcontentloaded")
                        description = text_of(
                            new_page,
                            ".lis-container__job__content",
                            default="No description available",
                        )
                        new_page.close()

                    jobs.append({
                        "title": title,
                        "companyLink": company_link,
                        "applyLink": apply_link,
                        "company": company,
                        "location": location,
                        "jobType": job_type,
                        "description": description
                    })

                except Exception as e:
                    # Best effort: report the problem and move on to the next listing
                    print(f"Error occurred while processing a job element: {str(e)}")
                    continue

        except Exception as e:
            print(f"Error occurred: {str(e)}")
            jobs = []

        finally:
            browser.close()

        # Output results in JSON format
        print(json.dumps(jobs, indent=2))
        # Explicit encoding so the file does not depend on the platform default
        with open("jobs scraped from weworkremotely.json", "w", encoding="utf-8") as f:
            json.dump(jobs, f, indent=2)
        print(f"Total jobs found: {len(jobs)}")

# Run the scraper only when this code is executed directly (not when imported).
if __name__ == "__main__":
    main()


In [None]:
# grok


import json
import time
from playwright.sync_api import sync_playwright, TimeoutError
from concurrent.futures import ThreadPoolExecutor, as_completed
import logging

# Logging setup: timestamped records at INFO level and above, plus the
# module-level logger used by the functions in this cell.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

def retry_operation(operation, max_attempts=3, delay=2):
    """Call `operation`, retrying with exponential backoff when it times out.

    Only TimeoutError (as imported in this file) triggers a retry; any other
    exception raised by `operation` propagates immediately.

    Args:
        operation: Zero-argument callable to invoke.
        max_attempts: Total number of calls to make before giving up (>= 1).
        delay: Base wait in seconds; the wait doubles after every failed attempt
            (delay, 2 * delay, 4 * delay, ...).

    Returns:
        Whatever `operation` returns on its first successful call.

    Raises:
        ValueError: If `max_attempts` is less than 1 (previously the function
            returned None without ever calling `operation`).
        TimeoutError: Re-raised from the last attempt when every attempt timed out.
    """
    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")

    for attempt in range(1, max_attempts + 1):
        try:
            return operation()
        except TimeoutError:
            if attempt == max_attempts:
                # Out of attempts: bare raise keeps the original traceback
                raise
            wait_seconds = delay * (2 ** (attempt - 1))
            # Log before sleeping so the message appears when the failure happens
            logger.warning(
                f"Operation timed out (attempt {attempt}/{max_attempts}); "
                f"retrying in {wait_seconds}s"
            )
            time.sleep(wait_seconds)

def scrape_job_details(page, job_data, base_url):
    """Fill in `job_data['description']` from the job's apply page, opened in a new tab.

    Args:
        page: Page whose browser context is used to open the new tab.
        job_data: Dict with at least 'applyLink' and 'title'; updated in place.
        base_url: Not used by this function; kept so existing callers still work.

    Returns:
        The same `job_data` dict. Its 'description' is the stripped text of
        ".lis-container__job__content", "No description available" when that
        element is missing, or "Error retrieving description" when opening or
        reading the page failed (the error is logged).
    """
    # Start as None so the cleanup below is safe even when the tab could not be
    # opened; otherwise `finally` would fail on an unbound name.
    new_page = None
    try:
        # Open new tab
        new_page = page.context.new_page()

        # Navigate to apply link with retry
        retry_operation(lambda: new_page.goto(
            job_data['applyLink'],
            timeout=30000,
            wait_until="domcontentloaded"
        ))

        # Scrape description
        description = retry_operation(lambda: new_page.query_selector(
            ".lis-container__job__content"
        ))
        job_data['description'] = description.inner_text().strip() if description else "No description available"
    except Exception as e:
        logger.error(f"Error scraping details for {job_data['title']}: {str(e)}")
        job_data['description'] = "Error retrieving description"
    finally:
        if new_page is not None:
            new_page.close()
    return job_data

def main():
    """Scrape We Work Remotely listings and save them to a JSON file.

    The home page is loaded (with retries on timeout) and every listing card
    is turned into a dict with title, links, company, location and job type.
    Descriptions are then fetched one listing at a time via
    scrape_job_details. A fatal error while loading the home page is logged
    and results in an empty list. The list is written to
    "jobs_scraped_from_weworkremotely.json".
    """
    jobs = []
    base_url = "https://weworkremotely.com"
    listing_selector = "section.jobs article li.new-listing-container"

    def text_of(parent, selector, default="Unknown"):
        """Return the stripped inner text of the first match, or `default` if none."""
        element = parent.query_selector(selector)
        return element.inner_text().strip() if element else default

    def link_of(parent, selector):
        """Return the first match's href as an absolute URL, or "Unknown"."""
        element = parent.query_selector(selector)
        # get_attribute() gives None when the anchor has no href; treat that
        # like a missing anchor rather than failing on None.startswith().
        href = element.get_attribute("href") if element else None
        if href is None:
            return "Unknown"
        return f"{base_url}{href}" if href.startswith("/") else href

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)  # Headless for production
        context = browser.new_context(
            viewport={'width': 1280, 'height': 720},
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        )
        page = context.new_page()

        try:
            # Navigate to main page with retry
            logger.info("Navigating to main page")
            retry_operation(lambda: page.goto(
                base_url,
                timeout=30000,
                wait_until="domcontentloaded"
            ))

            # Wait for job listings
            retry_operation(lambda: page.wait_for_selector(
                listing_selector,
                timeout=30000
            ))

            # Extract job listings
            job_elements = page.query_selector_all(listing_selector)
            logger.info(f"Found {len(job_elements)} job listings")

            # Prepare job data from the listing cards
            for job_element in job_elements:
                try:
                    jobs.append({
                        "title": text_of(job_element, ".new-listing__header__title"),
                        "companyLink": link_of(job_element, "div.tooltip--flag-logo a"),
                        "applyLink": link_of(job_element, "a[href^='/listings']"),
                        "company": text_of(job_element, ".new-listing__company-name"),
                        "location": text_of(job_element, ".new-listing__company-headquarters"),
                        "jobType": text_of(job_element, ".new-listing__categories__category"),
                        "description": ""  # Will be filled later
                    })
                except Exception as e:
                    logger.error(f"Error processing job element: {str(e)}")
                    continue

            # Fetch the descriptions one at a time on this thread. Playwright's
            # sync API is not thread-safe, so `page`/`context` must not be
            # driven from worker threads. Looping also copes with an empty
            # `jobs` list, which a zero-sized thread pool did not.
            for job_data in jobs:
                try:
                    scrape_job_details(page, job_data, base_url)
                except Exception as e:
                    logger.error(f"Error scraping job details: {str(e)}")
                time.sleep(0.5)  # Rate limiting

        except Exception as e:
            logger.error(f"Fatal error occurred: {str(e)}")
            jobs = []

        finally:
            context.close()
            browser.close()

    # Save results
    output_file = "jobs_scraped_from_weworkremotely.json"
    with open(output_file, "w", encoding='utf-8') as f:
        json.dump(jobs, f, indent=2, ensure_ascii=False)
    logger.info(f"Saved {len(jobs)} jobs to {output_file}")

# Run the scraper only when this code is executed directly (not when imported).
if __name__ == "__main__":
    main()