Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🙏 Guidance on how to integrate Botasaurus in an existing project #56

Closed
life-Nd opened this issue Feb 6, 2024 · 1 comment
Closed

Comments

@life-Nd
Copy link

life-Nd commented Feb 6, 2024

This is a great project, but I am having issues integrating it into my existing code.
I was previously using UndetectedChromeDriver and would like to replace it with Botasaurus.
The goals are to handle sign-in, get user profiles, and complete some user flows (fill in forms, upload documents, and click buttons).
I have created classes to easily integrate each part into the program.
Here is the code for the helper class:

import subprocess
import os
from pathlib import Path
import logging
# from os import path
# import random
from time import sleep
# import undetected_chromedriver as uc
# from selenium.webdriver.chrome.options import Options
# from selenium.webdriver.chrome.service import Service
# from webdriver_manager.chrome import ChromeDriverManager


# from Tools.Bot.chrome_launcher_adapter import ChromeLauncherAdapter
# from Tools.Bot.create_stealth_driver import create_stealth_driver
from Tools.Bot.chrome_launcher_adapter import ChromeLauncherAdapter

from Tools.Bot.create_stealth_driver import create_stealth_driver
from selenium.webdriver.chrome.options import Options
from chromedriver_autoinstaller import install


from botasaurus import *
# from botasaurus_proxy_authentication import add_proxy_options


# Root logger: getLogger() is called without a name.
logger = logging.getLogger()
# Chrome command-line flags that launch_chrome() merges into every launch.
# Copied from the chrome-launcher defaults
# (https://github.com/GoogleChrome/chrome-launcher/blob/main/src/flags.ts).
# Mostly the same, but the flags that disable extensions, media devices, etc.
# are left out to avoid detection.
DEFAULT_FLAGS = [
    # Disable various background network services, including the
    # safe browsing service, upgrade detector, translate, UMA
    "--disable-background-networking",
    # Don't update the browser 'components' listed at chrome://components/
    "--disable-component-update",
    # Disables client-side phishing detection.
    "--disable-client-side-phishing-detection",
    # Disable syncing to a Google account
    "--disable-sync",
    # Disable reporting to UMA, but allows for collection
    "--metrics-recording-only",
    # Disable installation of default apps on first run
    "--disable-default-apps",
    # Disable the default browser check, do not prompt to set it as such
    "--no-default-browser-check",
    # Skip first run wizards
    "--no-first-run",
    # Disable backgrounding renders for occluded windows
    "--disable-backgrounding-occluded-windows",
    # Disable renderer process backgrounding
    "--disable-renderer-backgrounding",
    # Disable task throttling of timer tasks from background pages.
    "--disable-background-timer-throttling",
    # Disable the default throttling of IPC between renderer & browser processes.
    "--disable-ipc-flooding-protection",
    # Avoid potential instability of using Gnome Keyring or KDE wallet. crbug.com/571003 crbug.com/991424
    "--password-store=basic",
    # Use mock keychain on Mac to prevent blocking permissions dialogs
    "--use-mock-keychain",
    # Disable background tracing (aka slow reports & deep reports) to avoid 'Tracing already started'
    "--force-fieldtrials=*BackgroundTracing/default/",
    # Suppresses hang monitor dialogs in renderer processes. This flag may allow slow unload handlers on a page to prevent the tab from closing.
    "--disable-hang-monitor",
    # Reloading a page that came from a POST normally prompts the user.
    "--disable-prompt-on-repost",
    # Disables Domain Reliability Monitoring, which tracks whether the browser has difficulty contacting Google-owned sites and uploads reports to Google.
    "--disable-domain-reliability",
]




class BotasaurusChromeHandler:
    """Launch Chrome through ``ChromeLauncherAdapter`` and keep the result.

    Whatever ``ChromeLauncherAdapter.launch`` returns is stored on
    ``_driver`` and handed out by :meth:`driver`.
    """

    # Page opened when the browser starts.
    START_URL = "https://ca.yahoo.com/?p=us"
    # Remote-debugging port passed to the launcher.
    DEBUG_PORT = 9222
    # Chrome profile directory passed to the launcher.
    USER_DATA_DIR = (
        "/Users/MacUser/Library/Application Support/Google/Chrome/Profile 1"
    )

    def __init__(self):
        """Launch Chrome on ``START_URL`` and remember the launched instance."""
        print("💡 ChromeHandler init")
        # NOTE(review): fixed 5 s pause before launching; reason not visible here.
        sleep(5)
        self._driver = self.launch_chrome(self.START_URL, [])
        # NOTE(review): called without arguments and the return value is
        # discarded. If this helper returns a driver factory (as the name
        # suggests), this call has no effect on the browser launched above --
        # confirm against Tools.Bot.create_stealth_driver.
        create_stealth_driver()
        # Report the handler and URL that were actually used.
        print(f"✅ BotasaurusChromeHandler launched ➡️ (🌈 {self.START_URL})")

    def driver(self):
        """Return the instance produced by :meth:`launch_chrome`."""
        return self._driver

    def launch_chrome(self, start_url, additional_args):
        """Start Chrome via ``ChromeLauncherAdapter.launch`` and return its result.

        Args:
            start_url: URL to open on launch; when falsy, no ``startingUrl``
                option is passed to the launcher.
            additional_args: Extra Chrome flags appended after
                ``DEFAULT_FLAGS``. ``None`` or any iterable of strings is
                accepted; duplicates are dropped, first occurrence wins.

        Returns:
            Whatever ``ChromeLauncherAdapter.launch`` returns.
        """
        extra_flags = list(additional_args or [])
        # dict.fromkeys de-duplicates while preserving flag order.
        unique_flags = list(dict.fromkeys(DEFAULT_FLAGS + extra_flags))

        # Option names presumably mirror chrome-launcher's launch options
        # -- TODO confirm against ChromeLauncherAdapter.
        kwargs = {
            "ignoreDefaultFlags": True,
            "chromeFlags": unique_flags,
            "userDataDir": self.USER_DATA_DIR,
            "port": self.DEBUG_PORT,
            "headless": False,
            "autoClose": True,
        }
        if start_url:
            kwargs["startingUrl"] = start_url

        return ChromeLauncherAdapter.launch(**kwargs)
    

Here is where the helper class is used:

import re
import logging
import random
from time import sleep
from configs.configs_model import ConfigsModel
from helpers.jobs_sql import JobsSQL
from helpers.html_page_handler import HTMLPageHandler
from helpers.shared import notification
from models.job_listing import JobListingModel

from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from selenium.webdriver.remote.webelement import WebElement

from helpers.botasaurus_chrome_handler import BotasaurusChromeHandler

from botasaurus import *



# Root logger: getLogger() is called without a name.
logger = logging.getLogger()


class IndeedChromeApplier:

    def __init__(self, jobs_sql: JobsSQL, jobs: list):
        """Store the job sources and open the browser objects used later.

        Two browser-related objects are created, in this order: a
        ``BotasaurusChromeHandler`` (kept on ``self.chrome``) and a driver from
        ``bt.create_driver()`` (kept on ``self.driver`` and wrapped by
        ``self.page``).
        """
        print("💡 IndeedChromeApplier init ")
        self.jobs_sql = jobs_sql
        self.jobs = jobs
        self.chrome = BotasaurusChromeHandler()
        self.driver = bt.create_driver()
        self.page = HTMLPageHandler(driver=self.driver)

    def get_uid(self):
        """Return the ``user_id`` attribute of a freshly created ``ConfigsModel``."""
        return ConfigsModel().user_id

    # @browser
    def check_auth(self):
        """Check whether the browser session is logged in to Indeed.

        Opens the Indeed profile page and inspects the URL the browser ends up
        on. While that URL contains ``"secure"`` the user is treated as logged
        out and is asked whether to try again after logging in manually.

        Returns:
            bool: ``True`` when the final URL contains ``"profile.indeed.com"``;
            ``False`` when the user declines to retry or the browser lands on
            any other URL.
        """
        driver = self.driver
        prompt = "Please log in to Indeed.com and try again (y/n): "

        # Loop instead of recursing so repeated retries cannot grow the stack.
        while True:
            driver.get("https://profile.indeed.com/")
            sleep(2)  # give any redirect time to settle before reading the URL
            url = driver.current_url
            print(f"🟢 🔴 {url=}")
            if "secure" not in url:
                break

            print("❌ Not Logged in")
            notification(message=prompt)
            answer = input(prompt).lower()
            # Ask again until the answer contains "y" or "n" rather than
            # stalling for hours on an unrecognised reply.
            while "y" not in answer and "n" not in answer:
                answer = input(prompt).lower()
            if "y" not in answer:
                return False

        if "profile.indeed.com" in url:
            print("✅ Logged in")
            return True
        # Neither the login redirect nor the profile page: not authenticated.
        return False


    def answer_questions(self):
        """Click the commute/relocation option on the questions page.

        Waits up to 10 seconds for the label of one specific, hard-coded
        question input to become clickable, clicks it and pauses briefly.
        A timeout is reported on stdout and otherwise ignored.
        """
        commute_locator = (
            By.XPATH,
            "//label[@for='input-q_38d8e685bb4b5228c2494ac85bc44d69-0']",
        )
        waiter = WebDriverWait(self.chrome.driver(), 10)

        try:
            commute_label = waiter.until(
                EC.element_to_be_clickable(commute_locator)
            )
            commute_label.click()
            # Randomised pause after the click.
            sleep(random.uniform(0.7, 2.2))
        except TimeoutException:
            print("Failed to find or click the commute option.")


    def replace_resume(self, job_title):
        """Upload the resume generated for ``job_title`` on the resume step.

        Does nothing beyond logging unless the page title shows the resume
        step and the "Replace" button is found. Otherwise the PDF for
        ``job_title`` is sent to the page's file input and the "Continue"
        element is clicked.

        Args:
            job_title: Job title used to build the resume file name.
        """
        print("⏯️  replace_resume")
        page_title = self.chrome.driver().title
        on_resume_step = (
            "Upload or build a resume for this application" in page_title
        )
        paths = self.get_paths()
        if not on_resume_step:
            return

        print("✅ is_upload")
        # Locate the "Replace" button by its test id.
        replace_link = self.page.try_find_element(
            driver=self.chrome.driver(),
            name="Replace",
            by=By.CSS_SELECTOR,
            value='[data-testid="ResumeFileInfoCardReplaceButton-button"]',
        )
        sleep(1)
        if not replace_link:
            return

        print("✅ replace_link")
        sleep(1)

        # Wait for the file input to be present, then hand it the PDF path.
        upload_locator = (By.CSS_SELECTOR, 'input[type="file"]')
        file_input = WebDriverWait(self.chrome.driver(), 10).until(
            EC.presence_of_element_located(upload_locator)
        )
        resume_path = (
            f"{paths.output_resumes_pdf_dir}/RalphNduwimana-{job_title}.pdf"
        )
        file_input.send_keys(resume_path)
        sleep(random.uniform(0.9, 1.8))

        notification(message=f"Resume replaced by {job_title}")
        self.page.click_to_go_to_page(
            name="Continue",
            by=By.XPATH,
            value="//div[contains(text(), 'Continue')]",
        )


    def submit_application(self):
        """Click the submit button and report whether the page confirms it.

        Returns:
            bool: ``True`` when the page source contains
            ``"Application Submitted"`` two seconds after the click,
            ``False`` otherwise.
        """
        print("⏯️  review_application")
        notification(message="Reviewing application")
        sleep(1.7)
        notification(message="No cover letter required!")

        clicked = self.page.click_to_go_to_page(
            name="Submit your application",
            by=By.XPATH,
            value="//button[contains(@class, 'ia-continueButton')]",
        )
        if not clicked:
            notification("Application Submitted", code=0)
        else:
            notification("Application Submitted")

        # Give the submission two seconds to complete before checking the page.
        sleep(2)

        if "Application Submitted" not in self.chrome.driver().page_source:
            print("Application submission failed.")
            return False

        notification("Application submitted successfully!")
        return True
    def click_button(self):
        # Logic to click on buttons 
        pass
    
    def type_text(self):
        # Logic to click on buttons 
        pass

    def run(self):
        print("⏯️  IndeedChromeApplier run")
        driver = self.chrome.driver()
        authenticated = self.check_auth()
        jobs_row = self.jobs_sql.load_jobs_by_status(query_status="Generated")
        jobs_data = [job_row for job_row in jobs_row]
        print(f'✅ ✅ {str(jobs_data)[0:200]}')

        if authenticated:
            for data in jobs_data:
                if not data:
                    print(f'🚫 No Data in jobs_data')
                job_data = self.convert_tuple_to_dict(data)
                job = JobListingModel(job_data)
                url = job.jobUrl
                print(f'✅ ✅ ✅ ✅ {job.jobUrl}')
                page_loaded = self.page.go_to_page(url)
                if not page_loaded:
                    print(f"🚫 {url} not loaded")
                    # continue

                if page_loaded:
                    print('✅ page_loaded')

                    application_started = self.page.click_to_go_to_page(
                        name="Apply",
                        by=By.ID,
                        value="indeedApplyButton",
                    )
                    data = re.search(
                        "This job has expired on Indeed",
                        driver.page_source,
                    )
                    # Get True of False
                    expired = data is not None
                    print(f"📕 {expired=}")
                    # sleep(10000)
                    sleep(random.uniform(0.2, 0.5))
                    if not application_started:
                        print("🚫 Application not started")
                        sleep(1000)
                    if "indeed" not in driver.current_url:
                        print("Cannot apply on company websites (just indeed.com)")
                        sleep(10000)

                    pages = {
                        "questions": False,
                        "resume": False,
                        "review": False,
                        "work-experience": False,
                        "submitted": False,
                    }

                    try:
                        # there is a page that has not been completed
                        while (
                            False
                            in pages.values()
                        ):
                            print('')

                    except NoSuchElementException:
                        print(
                            f"❌ Failed to get page ")

    def log_in(self, username, password):
        print(f"⏯️  Starting log_in {username} {password}")
        page = self.page
        try:
            username_bar = page.try_find_element(
                name="username_bar",
                by=By.ID,
                value="session_key",
                driver=self.driver,
            )
            assert username_bar is not None
            username_bar.send_keys(f"{username}")
            password_bar = page.try_find_element(
                name="password_bar", by=By.ID, value="session_password", driver=self.chrome.driver()
            )
            assert password_bar is not None
            password_bar.send_keys(f"{password}")
            password_bar.send_keys(Keys.ENTER)
            print("✅ User logged-in")
        except NoSuchElementException:
            print("No such element found")
        except Exception:
            print("Other exception")
        print(f"⏹️  Finished log_in {username} {password}")

    def log_out(self):
        """Try to sign out through the "Sign Out" link.

        Looks up the account button (its XPath depends on whether the current
        URL contains ``"Home"``), then waits up to 10 seconds for a
        "Sign Out" link and clicks it. Failures are reported on stdout and
        never raised.
        """
        driver = self.chrome.driver()
        url = driver.current_url
        print(f"⏯️  Starting log_out from {url}")
        xpath = (
            "/html/body/div[5]/header/div/nav/ul/li[6]/div/button"
            if "Home" in url
            else "/html/body/header/div/div[2]/div/div/button"
        )
        icon_button = self.page.try_find_element(
            driver=driver,
            name="Log-Out",
            by=By.XPATH,
            value=xpath,
            element_type="button",
        )
        print(f"{icon_button=}")
        if not icon_button:
            # Report the missing button explicitly; the lookup returns a falsy
            # value rather than raising when nothing is found.
            print("Avatar button not found")
        # NOTE(review): the button is looked up but never clicked here; unless
        # try_find_element clicks it, the "Sign Out" link may never appear.

        try:
            sign_out_option = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.LINK_TEXT, "Sign Out"))
            )
            sign_out_option.click()
            print("✅ User logged-out")
        except Exception:
            # Best effort, but let KeyboardInterrupt/SystemExit propagate.
            print("Sign Out not found ")
        print(f"⏹️  Finished log_out from {url}")


I would appreciate any guidance on how to integrate Botasaurus features in my code.
Thanks in advance!!!

@life-Nd life-Nd changed the title Guidance on how to integrate Botasaurus in an existing project 🙏 Guidance on how to integrate Botasaurus in an existing project Feb 6, 2024
@Chetan11-dev
Copy link
Contributor

We do not provide dedicated support for individual problems. We recommend creating a detailed issue on Stack Overflow or in the /r/webscraping/ subreddit on Reddit, where the community can assist you.
We hope you understand.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants