In [2]:
import pandas as pd
from curl_cffi import requests as cureq
from bs4 import BeautifulSoup
import re
import os

In [3]:
from pydantic import BaseModel
from typing import List
import time

In [3]:
# Browser-like request headers passed along with the search-results request.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/129.0.0.0 Safari/537.36'
    ),
}

class JobListing(BaseModel):
    """Column-oriented container for the jobs scraped from one results page.

    Every field is a list of strings. ``pull_job_details`` appends to all of
    the lists once per job card, so index ``i`` of each field refers to the
    same posting. Missing values are stored as the string ``'None'``.
    """
    # Posting URL: 'https://www.indeed.com' followed by the card's href.
    jobLink: List[str]
    # Title text taken from the card's ``jobTitle-*`` span.
    jobTitle: List[str]
    # Company name taken from the card's ``company-name`` span.
    jobCompany: List[str]
    # Lower salary bound with '$' removed, or 'None'.
    minSalary: List[str]
    # Upper salary bound with '$' removed, or 'None'.
    maxSalary: List[str]
    # Job description text with newlines removed, or 'None'.
    jobDetails: List[str]
    # Location text taken from the card's ``text-location`` div.
    jobLocation: List[str]

def extract_text(element, default='None'):
    """Return the stripped text of ``element``, or ``default`` when it is falsy.

    ``element`` is expected to expose ``get_text()`` (for example a
    BeautifulSoup tag); ``None`` is the usual "not found" value.
    """
    if not element:
        return default
    return element.get_text().strip()

def extract_salary(job_salary):
    """Split a raw salary string into ``(min_salary, max_salary)``.

    Dollar amounts are located with a regular expression instead of splitting
    on ``-`` and spaces, so strings such as ``'$120,000 a year - Full-time'``
    or ``'Up to $150,000 a year'`` no longer raise ``IndexError`` or yield
    words like ``'Full'`` as a salary.

    Args:
        job_salary: Raw salary text, or ``'Not Specified'`` when the page had
            no salary section.

    Returns:
        A tuple of two strings with the ``$`` removed (thousands separators
        are kept as written):

        * two or more amounts -> the first two, as ``(min, max)``;
        * one amount prefixed by "Up to" -> ``('None', amount)``;
        * one amount prefixed by "From" -> ``(amount, 'None')``;
        * any other single amount -> ``(amount, amount)``;
        * no amount, empty input or ``'Not Specified'`` -> ``('None', 'None')``.
    """
    if not job_salary or job_salary == 'Not Specified':
        return 'None', 'None'

    # "$" followed by digits, optional thousands separators, optional
    # decimals and an optional K suffix (e.g. "$97,060", "$40.50", "$97K").
    amounts = [
        amount.rstrip(',')  # drop a comma that was punctuation, not a separator
        for amount in re.findall(r'\$\s*(\d[\d,]*(?:\.\d+)?[kK]?)', job_salary)
    ]
    if not amounts:
        return 'None', 'None'
    if len(amounts) >= 2:
        return amounts[0], amounts[1]

    # Exactly one amount: decide which bound it represents from the prefix.
    only_amount = amounts[0]
    lowered = job_salary.strip().lower()
    if lowered.startswith('up to'):
        return 'None', only_amount
    if lowered.startswith('from'):
        return only_amount, 'None'
    return only_amount, only_amount

def pull_job_details(resp):
    """Parse a search-results response into parallel lists of job fields.

    For every job card found under the ``mosaic-provider-jobcards`` div, the
    posting's detail page is fetched through ``pull_job_desc`` (one extra HTTP
    request per card) and the link, title, company, location, salary bounds
    and description are appended to the result.

    Args:
        resp: Response object exposing ``headers``, ``status_code`` and
            ``text``.

    Returns:
        A dict with the keys ``jobLink``, ``jobTitle``, ``jobCompany``,
        ``minSalary``, ``maxSalary``, ``jobDetails`` and ``jobLocation``, each
        mapping to a list of strings of equal length. All lists are empty when
        the response is not a 200 HTML page or the card list is not present.
    """
    job_list = {'jobLink':[],'jobTitle':[],'jobCompany':[],'minSalary':[],'maxSalary':[],'jobDetails':[],'jobLocation':[]}

    # A missing Content-Type header is treated as "not HTML" instead of
    # raising KeyError.
    content_type = resp.headers.get('Content-Type') or ''
    if resp.status_code != 200 or 'text/html' not in content_type:
        return job_list

    soup = BeautifulSoup(resp.text, 'html.parser')

    card_container = soup.find('div', attrs={'id': 'mosaic-provider-jobcards'})
    card_list = card_container.find('ul') if card_container is not None else None
    if card_list is None:
        # The expected markup is absent, so there is nothing to parse.
        return job_list

    # Only direct child *tags* are visited. Iterating the <ul> itself also
    # yields text nodes, on which ``.find('a')`` is ``str.find`` and returns
    # an int rather than a tag.
    for job in card_list.find_all(True, recursive=False):
        a = job.find('a')
        if a is None:
            continue

        href_link = a.get('href')
        if not href_link:
            # An anchor without an href cannot be turned into a job link.
            continue

        job_link = 'https://www.indeed.com' + href_link
        job_salary, job_description = pull_job_desc(job_link)
        min_salary, max_salary = extract_salary(job_salary)

        job_list['jobLink'].append(job_link)
        job_list['minSalary'].append(min_salary)
        job_list['maxSalary'].append(max_salary)
        job_list['jobDetails'].append(job_description)
        job_list['jobTitle'].append(
            extract_text(job.find('span', id=lambda x: x and x.startswith('jobTitle-')))
        )
        job_list['jobCompany'].append(
            extract_text(job.find('span', {'data-testid': 'company-name'}))
        )
        job_list['jobLocation'].append(
            extract_text(job.find('div', {'data-testid': 'text-location'}))
        )

    return job_list

def pull_job_desc(job_link):
    """Fetch a job's detail page and return ``(salary, description)``.

    Args:
        job_link: URL of the job posting to download.

    Returns:
        A tuple ``(salary, description)`` of strings. ``salary`` is the text
        of the ``salaryInfoAndJobType`` div, or ``'Not Specified'``.
        ``description`` is the text of the ``jobDescriptionText`` div with
        newlines removed, or ``'None'``. The defaults are also returned when
        the response is not a 200 HTML page or the ``fastviewjob`` container
        is missing.
    """
    # Defaults are bound up front so every exit path has values to return;
    # previously a non-200 or non-HTML response hit an UnboundLocalError.
    salary = 'Not Specified'
    description = 'None'

    resp = cureq.get(job_link, impersonate='chrome')

    # A missing Content-Type header is treated as "not HTML" instead of
    # raising KeyError.
    content_type = resp.headers.get('Content-Type') or ''
    if resp.status_code != 200 or 'text/html' not in content_type:
        return salary, description

    soup = BeautifulSoup(resp.text, 'html.parser')
    job_container = soup.find('div', class_=re.compile(r'^fastviewjob'))
    if job_container is None:
        # The expected layout is absent; report defaults rather than crash.
        return salary, description

    raw_salary = job_container.find('div', attrs={'id': 'salaryInfoAndJobType'})
    if raw_salary is not None:
        salary = raw_salary.get_text()

    raw_description = job_container.find('div', attrs={'id': 'jobDescriptionText'})
    if raw_description is not None:
        description = raw_description.get_text().replace('\n', '')

    return salary, description

def format_search(search):
    """Return ``search`` with every space replaced by ``+``."""
    return '+'.join(search.split(' '))

def new_session():
    """Create a Chrome-impersonating session routed through the ``stickyproxy`` proxy.

    The proxy URL is read from the ``stickyproxy`` environment variable;
    ``os.getenv`` yields ``None`` when the variable is not set.
    """
    proxy_url = os.getenv("stickyproxy")
    return cureq.Session(impersonate="chrome", proxy=proxy_url)

def search_api(session: cureq.Session, job_title: str, location:str, start_num: int):
    """Fetch one page of Indeed search results and parse it into a ``JobListing``.

    Args:
        session: Session used to issue the GET request.
        job_title: Search query; spaces are replaced with ``+``.
        location: Location text; spaces are replaced with ``+`` and the fixed
            suffix ``%2C++CA`` is appended in the URL.
        start_num: Value of the ``start`` query parameter.

    Returns:
        A ``JobListing`` built from the dict returned by ``pull_job_details``.

    Raises:
        Whatever ``raise_for_status()`` raises for an error response.
    """
    url = f"https://www.indeed.com/jobs?q={format_search(job_title)}&l={format_search(location)}%2C++CA&start={str(start_num)}"
    results_page = session.get(url, headers=headers)
    results_page.raise_for_status()
    parsed_fields = pull_job_details(results_page)
    return JobListing(**parsed_fields)

In [20]:
# Shared session for the cells below; new_session() builds the same
# Chrome-impersonating, proxy-backed session this line used to spell out.
session = new_session()

In [21]:
def check_expired_job(session,job_url:str) -> bool:
    """Report whether the page at ``job_url`` has a div whose text contains "expired".

    Args:
        session: Object whose ``get`` method fetches the page.
        job_url: URL of the job posting to inspect.

    Returns:
        ``True`` when such a div is found. ``False`` when none is found, and
        also when the response status is not 200 (a message is printed then).
    """
    response = session.get(job_url, impersonate='chrome')
    if response.status_code != 200:
        print(f'Unable to get link {response}')
        return False

    def _mentions_expired(tag):
        # A div matches when its text, descendants included, contains the
        # lowercase word "expired".
        return tag.name == "div" and "expired" in tag.get_text()

    page = BeautifulSoup(response.text, 'html.parser')
    return bool(page.find(_mentions_expired))

In [22]:
# Spot-check check_expired_job against a single posting URL, using the shared
# `session` created in an earlier cell.
job_url = 'https://www.indeed.com/viewjob?jk=da9280a719bdfee6&xpse=SoD267I363WQAOxjGZ0LbzkdCdPP&xfps=e44665dd-6eb2-4465-acb4-6b1aa567d129&xkcb=SoAt67M36galwHAOyD0DbzkdCdPP&vjs=3'
print(check_expired_job(session,job_url))

<div id="viewJobSSRRoot"><div class="mosaic mosaic-empty-zone" id="mosaic-aboveViewjobNav"></div><style data-emotion="css r07ztj">.css-r07ztj{box-sizing:border-box;margin:0;min-width:0;margin:1rem;overflow-wrap:break-word;}.css-r07ztj #mosaic-provider-company-info-salary{display:none;}.css-r07ztj h1{font-size:1.5rem;}</style><div class="fastviewjob jobsearch-ViewJobLayout--standalone css-r07ztj eu4oa1w0" role="main"><div class="css-amnpyw e37uo190"></div><div class="css-8ua0kf eu4oa1w0"><div class="css-1xwak0u eu4oa1w0"><div class="css-jr3hje eu4oa1w0"><form action="/jobs" class="css-z48huh e37uo190" method="get"><style data-emotion="css 1fr7b65">.css-1fr7b65{position:relative;width:100%;margin-right:0.5rem;display:-webkit-box;display:-webkit-flex;display:-ms-flexbox;display:flex;-webkit-align-items:center;-webkit-box-align:center;-ms-flex-align:center;align-items:center;-webkit-box-pack:justify;-webkit-justify-content:space-between;justify-content:space-between;position:relative;z-ind

### Use a loop to pull all current job listings for a search query

In [4]:
# Fetch each results page, collect one DataFrame per page, and concatenate
# once at the end. Concatenating inside the loop re-copies the accumulated
# frame on every iteration and starts from an empty frame.
page_frames = []
for i in range(0,1):
    # i*10 is passed as the `start` value for the search request.
    listing = search_api(new_session(),'data+engineer','mountain+view',i*10)
    page_frames.append(pd.DataFrame(listing.dict()))
    time.sleep(2)  # pause between result pages

# pd.concat raises ValueError on an empty list, so fall back to an empty frame.
df = pd.concat(page_frames, ignore_index=True) if page_frames else pd.DataFrame()

In [6]:
# Preview the first rows of the scraped listings.
display(df.head())

Unnamed: 0,jobLink,jobTitle,jobCompany,minSalary,maxSalary,jobDetails,jobLocation
0,https://www.indeed.com/rc/clk?jk=91f171f8260d5...,Data Engineer,Analog Devices,97060,133458,Come join Analog Devices (ADI) – a place whe...,"San Jose, CA 95134"
1,https://www.indeed.com/rc/clk?jk=5730c9f7bbab0...,Associate Data Engineer,Analog Devices,78200,107525,Come join Analog Devices (ADI) – a place whe...,"San Jose, CA 95134"
2,https://www.indeed.com/rc/clk?jk=6ea72a37e1fce...,AWS Data Engineer,Intellyk,119792,130830,Job Title: AWS Data EngineerLocation: Santa Cl...,"Santa Clara, CA 95050"
3,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,AWS Data Engineer,Siri InfoSolutions Inc,119792,130830,Job title:- AWS Data engineerLocation: Santa C...,"Santa Clara, CA"
4,https://www.indeed.com/rc/clk?jk=853c03f5baacd...,Analytics Engineer (L5) - Member Data Products,Netflix,170000,720000,Job Requisition ID JR30168 Job Posting...,"Los Gatos, CA"
