In [24]:
import random 

import time


from typing import List

from bs4 import BeautifulSoup
from urllib.parse import urljoin
from urllib.parse import quote_plus
from logger import get_logger
logger = get_logger('main')

from utils.selenium_utils import ScraperConfig
from utils.common import scroll_and_wait, load_and_scroll, sleeper, save_as, soup_maker

In [25]:


def extractor(soup: BeautifulSoup, base_url: str) -> List[dict]:
    """Extract job postings from a parsed search-results page.

    Args:
        soup: Parsed HTML of a results page.
        base_url: Site root against which each job's ``href`` is resolved.

    Returns:
        A list of dicts with the keys ``title``, ``company_name``,
        ``job_type``, ``extra_data`` and ``link``. Cards that lack a title,
        a company name or a link are skipped. An empty list is returned when
        the ``mosaic-provider-jobcards`` container, or the ``<ul>`` inside
        it, is absent.
    """
    job_list = soup.find('div', id='mosaic-provider-jobcards')
    if not job_list:
        return []

    card_list = job_list.find('ul')
    if not card_list:
        # Container present but without a list: chaining .find_all() on the
        # None result would raise AttributeError outside the per-card try.
        return []

    # Skip the first empty card or header. find_all() is recursive, so nested
    # <li> elements are visited too; any without title/company/link are
    # dropped by the minimum-data check below.
    jobs = card_list.find_all('li')[1:]
    result = []

    for job in jobs:
        try:
            title_tag = job.find('h2')
            title = title_tag.text.strip() if title_tag else None

            company_tag = job.select_one('div.company_location span[data-testid="company-name"]')
            company_name = company_tag.text.strip() if company_tag else None

            # NOTE(review): the selector targets the "text-location" element,
            # so this value looks like a location rather than a job type.
            # The 'job_type' key is kept for downstream compatibility; confirm.
            job_type_tag = job.select_one('div[data-testid="text-location"] span')
            job_type = job_type_tag.text.strip() if job_type_tag else None

            extra_items = job.select('div[data-testid="jobsnippet_footer"] ul li')
            extra_data = [item.text.strip() for item in extra_items]

            link_tag = title_tag.find('a') if title_tag else None
            href = link_tag.get('href') if link_tag else None
            # urljoin (as used for pagination links) handles relative hrefs
            # with or without a leading slash and leaves absolute hrefs
            # intact, unlike plain string concatenation. A missing or empty
            # href yields no link, so the card is skipped below.
            link = urljoin(base_url, href) if href else None

            if title and company_name and link:  # Minimum data check
                result.append({
                    'title': title,
                    'company_name': company_name,
                    'job_type': job_type,
                    'extra_data': extra_data,
                    'link': link,
                })

        except Exception as e:
            # Best effort: one malformed card must not abort the whole page.
            logger.error(f"Error parsing job: {e}")

    return result


In [26]:
class IndeedScraper:
    """Selenium-driven scraper for Indeed job-search result pages.

    Uses the WebDriver exposed by a ``ScraperConfig`` to load search pages,
    and delegates HTML parsing to ``soup_maker`` and ``extractor``.
    """

    def __init__(self, config: ScraperConfig):
        """Keep the config and reuse its WebDriver (``config.driver``)."""
        self.config = config
        self.driver = config.driver
        self.url = "https://www.indeed.com"
        logger.info("🚀 IndeedScraper initialized")

    def get_search_url(self, keyword: str) -> str:
        """Return the search-results URL for ``keyword`` (URL-encoded)."""
        encoded = quote_plus(keyword)
        search_url = f"{self.url}/jobs?q={encoded}"
        logger.debug(f"🔗 Generated search URL: {search_url}")
        return search_url

    def scrape_search_results(self, keyword: str, wait_time: float = 3) -> str:
        """Load the first results page for ``keyword`` and return its HTML.

        Args:
            keyword: Free-text search query.
            wait_time: Seconds to sleep after scrolling, before the page
                source is read.

        Returns:
            The page source, or ``""`` if loading raised an exception
            (the error is logged, not propagated).
        """
        url = self.get_search_url(keyword)
        logger.info(f"🌐 Navigating to search page: {url}")
        try:
            self.driver.get(url)
            scroll_and_wait(self.driver)
            time.sleep(wait_time)
            logger.info("✅ Page loaded and ready for scraping")
            return self.driver.page_source
        except Exception as e:
            logger.error(f"❌ Error loading page for keyword '{keyword}': {e}")
            return ""

    def pagination(self, soup):
        """Return the absolute URL of the next results page, or ``None``.

        ``None`` is returned when no "Next Page" anchor with an ``href`` is
        found, or when looking it up raises (the error is logged).
        """
        logger.debug("📄 Checking for pagination...")

        try:
            # Find the "Next Page" button
            next_page = soup.find('a', attrs={"data-testid": "pagination-page-next", "aria-label": "Next Page"})

            if not next_page or not next_page.get("href"):
                logger.warning("⚠️ No next page link found")
                return None

            # Combine base URL and relative href
            next_url = urljoin(self.url, next_page["href"])

            logger.info(f"➡️ Found next page URL: {next_url}")
            return next_url

        except Exception as e:
            logger.error(f"❌ Pagination error: {e}")
            return None

    def scrape_all_pages(self, keyword: str, max_pages: int = 5) -> List[dict]:
        """Scrape up to ``max_pages`` result pages for ``keyword``.

        Args:
            keyword: Free-text search query.
            max_pages: Maximum number of result pages to parse. Values
                ``<= 0`` return an empty list without loading anything.

        Returns:
            The job dicts produced by ``extractor`` for every parsed page,
            in page order. If navigating to a later page fails, the results
            gathered so far are returned.
        """
        results = []
        if max_pages <= 0:
            return results

        response = self.scrape_search_results(keyword)
        soup = soup_maker(response)
        page = 0

        while soup and page < max_pages:
            results += extractor(soup, self.url)
            page += 1
            logger.info(f'📄 Page {page} scraped.')

            if page >= max_pages:
                # Limit reached: do not request a page that will never be parsed.
                break

            next_page = self.pagination(soup)
            if not next_page:
                break

            try:
                self.driver.get(next_page)
                time.sleep(random.uniform(2, 4))  # can swap with sleeper()
                soup = soup_maker(self.driver.page_source)
            except Exception as e:
                # Keep what was already collected instead of losing it to a
                # single failed navigation.
                logger.error(f"❌ Error loading page {page + 1} for keyword '{keyword}': {e}")
                break

        return results

    def quit(self):
        """Shut down the underlying WebDriver."""
        logger.info("🛑 Quitting WebDriver")
        self.driver.quit()


In [27]:

# Build the scraper around the WebDriver carried by a ScraperConfig.
# Alternative configuration kept for reference (ScrapeOps + selenium-wire flags):
# config = ScraperConfig(use_scrapeops=True,use_seleniumwire=True)
# NOTE(review): use_uc presumably selects undetected-chromedriver — confirm in utils.selenium_utils.
config = ScraperConfig(use_uc=True)
indeed = IndeedScraper(config)



In [29]:
# Scrape a single results page for the query (keyword typo "delveloper" fixed).
results = indeed.scrape_all_pages('software developer', max_pages=1)

In [30]:
# Hand the scraped job list to the project's save_as helper, targeting 'x.json'.
# NOTE(review): output is presumably JSON given the extension — confirm in utils.common.
save_as(results,'x.json')