In [77]:
import pandas as pd
from curl_cffi import requests as cureq
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium_stealth import stealth
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re
import os

In [4]:
from pydantic import BaseModel
from typing import List
import time

In [82]:
from zenrows import ZenRowsClient

In [87]:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36'
}

chrome_driver_path = '../Backend/utils/chromedriverX64'

class JobListing(BaseModel):
    jobLink: List[str]
    jobTitle: List[str]
    jobCompany: List[str]
    minSalary: List[str]
    maxSalary: List[str]
    jobDetails: List[str]
    jobLocation: List[str]

def extract_text(element, default='None'):
    """Helper function to extract text or return a default value."""
    return element.get_text().strip() if element else default

def extract_salary(job_salary):
    print(job_salary)
    """Helper function to split and clean salary into min and max."""
    if job_salary != 'Not Specified' and len(job_salary.split(' ')) > 2:
        min_salary = job_salary.split('-')[0].replace('$', '').strip()
        max_salary = job_salary.split('-')[1].split(' ')[1].replace('$', '').strip()
        return min_salary, max_salary
    return 'None', 'None'

def pull_job_details(resp,driver):
    job_list = {'jobLink':[],'jobTitle':[],'jobCompany':[],'minSalary':[],'maxSalary':[],'jobDetails':[],'jobLocation':[]}

    if resp:
        soup = BeautifulSoup(resp, 'html.parser')

        outer_most_point=soup.find('div',attrs={'id': 'mosaic-provider-jobcards'})

        for job in outer_most_point.find('ul'):
            a = job.find('a')
            if not a:
                continue

            href_link = a.get('href')
            job_link = 'https://www.indeed.com' + href_link
            job_list['jobLink'].append(job_link)

            time.sleep(5)
            params = {"js_render":"true"}
            temp = driver.get(job_link,params=params)

            job_salary,job_description = pull_job_desc(temp.text)

            min_salary, max_salary = extract_salary(job_salary)

            job_list['minSalary'].append(min_salary)
            job_list['maxSalary'].append(max_salary)
            job_list['jobDetails'].append(job_description)

            job_list['jobTitle'].append(
                extract_text(job.find('span', id=lambda x: x and x.startswith('jobTitle-')))
            )
            job_list['jobCompany'].append(
                extract_text(job.find('span', {'data-testid': 'company-name'}))
            )
            job_list['jobLocation'].append(
                extract_text(job.find('div', {'data-testid': 'text-location'}))
            )

    return job_list

def pull_job_desc(resp):

    salary = 'Not Specified'
    description = 'None'

    if resp:

        soup = BeautifulSoup(resp,'html.parser')
        outer_most_points = soup.find('div',class_=re.compile(r'^fastviewjob'))

        raw_salary = outer_most_points.find('div',attrs={'id':'salaryInfoAndJobType'})
        # Need another check that if not raw_salary than we need to look for -- > <button data-testid="$237,000 - $296,000 a year-tile" aria-label="Pay $237,000 - $296,000 a year missing preference" 
        
        if raw_salary:
            salary = raw_salary.get_text()
        
        raw_description = outer_most_points.find('div',attrs={'id':'jobDescriptionText'})
        
        
        if raw_description:
            description = raw_description.get_text().replace('\n','')
    
    return salary,description

def format_search(search):
    return search.replace(' ','+')

def new_session():
    session = cureq.Session(impersonate="chrome",proxy=os.getenv("stickyproxy"))
    return session

def search_api(driver, job_title: str, location:str, start_num: int):
    url = f"https://www.indeed.com/jobs?q={format_search(job_title)}&l={format_search(location)}%2C++CA&start={str(start_num)}"

    driver = ZenRowsClient("")
    params = {"js_render":"true"}
    resp = driver.get(url,params=params)
    
    return JobListing(**pull_job_details(resp.text,driver))

In [20]:
session = cureq.Session(impersonate="chrome",proxy=os.getenv("stickyproxy"))

In [6]:
def check_expired_job(session,job_url:str) -> bool:
    """Search for any div with relevant text indicating expiration"""
    response = session.get(job_url,impersonate='chrome')
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Search for any div with relevant text indicating expiration
        expired_message = soup.find(lambda tag: tag.name == "div" and 
                                    "expired" in tag.get_text())
        if expired_message:
            return True
        return False
    else:
        print(f'Unable to get link {response}')
        return False

### Use loop to pull all current job listings with a search query

In [88]:
df = pd.DataFrame()
for i in range(0,2):
    listing = search_api(new_session(),'data+engineer','mountain+view',i*10)
    df = pd.concat([df,pd.DataFrame(listing.dict())],ignore_index=True)
    time.sleep(2)

Not Specified
$146,830 - $188,100 a year -  Full-time
$130,000 - $250,000 a year -  Full-time
Not Specified
$206,000 - $258,000 a year -  Full-time
Not Specified
Not Specified
$118,900 - $205,600 a year -  Full-time
Not Specified
Not Specified
Not Specified
Not Specified
$159,520 - $169,378 a year -  Full-time
Not Specified
Contract
$97,060 - $133,458 a year -  Full-time
$145,000 - $355,000 a year
Full-time
Not Specified
$200,000 - $250,000 a year -  Full-time
$171,000 - $205,000 a year
Not Specified
$119,000 - $160,000 a year
$170,000 - $720,000 a year -  Full-time
Not Specified
$40 - $60 an hour -  Full-time, Contract
$175,800 - $312,200 a year -  Full-time
$174,300 - $295,000 a year
Not Specified
Contract


In [90]:
display(df)

Unnamed: 0,jobLink,jobTitle,jobCompany,minSalary,maxSalary,jobDetails,jobLocation
0,https://www.indeed.com/rc/clk?jk=c2d066606e0a5...,Data Engineer,FutureSoft IT,,,**Please Read**Local candidates only. This opp...,"Sunnyvale, CA 94043"
1,https://www.indeed.com/rc/clk?jk=bf7305fcc7d9d...,Data Engineer,Apple,146830.0,188100.0,"Summary Posted: Dec 10, 2024 Role Number:200...","Cupertino, CA"
2,https://www.indeed.com/rc/clk?jk=20ee22610251f...,Data Engineer,RIOS Intelligent Machines Inc.,130000.0,250000.0,About RIOSRIOS Intelligent Machines is transfo...,"Menlo Park, CA"
3,https://www.indeed.com/rc/clk?jk=eee32a1393f9e...,Vehicle Access Data Engineer,Rivian and VW Group Technology,,,About Us: Rivian and Volkswagen Group Tech...,"Palo Alto, CA 94304"
4,https://www.indeed.com/rc/clk?jk=79329e60a3a36...,"Staff Data Engineer, Audio",Rivian and VW Group Technology,206000.0,258000.0,About Us: Rivian and Volkswagen Group Tech...,"Palo Alto, CA 94304"
5,https://www.indeed.com/rc/clk?jk=5730c9f7bbab0...,Associate Data Engineer,Analog Devices,,,Come join Analog Devices (ADI) – a place whe...,"San Jose, CA 95134"
6,https://www.indeed.com/rc/clk?jk=e4390ff4d091c...,"Data Engineer, Analytics",Meta,,,"Are you passionate about Facebook’s product, ...","Menlo Park, CA 94025"
7,https://www.indeed.com/rc/clk?jk=b0476988446bb...,"Data Engineer , Amazon",Amazon.com Services LLC,118900.0,205600.0,3+ years of data engineering experienceExperie...,"Palo Alto, CA"
8,https://www.indeed.com/rc/clk?jk=8f677fdf56244...,Data Engineer,Wealthfront,,,Data is critical to Wealthfront’s success. ...,"Palo Alto, CA"
9,https://www.indeed.com/rc/clk?jk=381628cd924f3...,Data Engineer III,Walmart,,,"Company Description Fifty years ago, Sam Walto...","Sunnyvale, CA 94086 (West Murphy area)"
