In [39]:
import pandas as pd
from curl_cffi import requests as cureq
from bs4 import BeautifulSoup
import re
import os

In [149]:
from pydantic import BaseModel
from typing import List
import time

In [41]:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36'
}

class JobListing(BaseModel):
    jobLink: List[str]
    jobTitle: List[str]
    jobCompany: List[str]
    minSalary: List[str]
    maxSalary: List[str]
    jobDetails: List[str]
    jobLocation: List[str]

def pull_job_details(resp):
    job_list = {'jobLink':[],'jobTitle':[],'jobCompany':[],'minSalary':[],'maxSalary':[],'jobDetails':[],'jobLocation':[]}

    if 'text/html' in resp.headers['Content-Type'] and resp.status_code == 200:
        soup = BeautifulSoup(resp.text, 'html.parser')

        outer_most_point=soup.find('div',attrs={'id': 'mosaic-provider-jobcards'})

        for i in outer_most_point.find('ul'):
            a = i.find('a')
            if not a:
                continue

            href_link = a.get('href')
            job_list['jobLink'].append(href_link)

            job_link = 'https://www.indeed.com' + href_link
            job_salary,job_description = pull_job_desc(job_link)
 
            if job_salary != 'Not Specified' and len(job_salary.split(' ')) > 2:
                job_list['minSalary'].append(job_salary.split('-')[0].replace('$',''))
                job_list['maxSalary'].append(job_salary.split('-')[1].split(' ')[1].replace('$',''))
            else:
                job_list['minSalary'].append('None')
                job_list['maxSalary'].append('None')

            job_list['jobDetails'].append(job_description)

            raw_title = i.find('span',id=lambda x: x and x.startswith('jobTitle-'))
            if raw_title:
                job_list['jobTitle'].append(raw_title.get_text())
            else:
                job_list['jobTitle'].append('None')

            raw_company = i.find('span',{'data-testid':'company-name'})
            if raw_company:
                job_list['jobCompany'].append(raw_company.get_text())
            else:
                job_list['jobCompany'].append('None')

    return job_list

def pull_job_desc(job_link):
    resp = cureq.get(job_link,impersonate='chrome')

    if 'text/html' in resp.headers['Content-Type'] and resp.status_code == 200:

        soup = BeautifulSoup(resp.text,'html.parser')
        outer_most_points = soup.find('div',class_=re.compile(r'^fastviewjob'))

        raw_salary = outer_most_points.find('div',attrs={'id':'salaryInfoAndJobType'})
        salary = 'Not Specified'
        
        if raw_salary:
            salary = raw_salary.get_text()
        
        raw_description = outer_most_points.find('div',attrs={'id':'jobDescriptionText'})
        description = 'None'
        
        if raw_description:
            description = raw_description.get_text().replace('\n','')
        
    return salary,description

def format_search(search):
    return search.replace(' ','+')

def new_session():
    session = cureq.Session(impersonate="chrome",proxy=os.getenv("stickyproxy"))
    return session

def search_api(session: cureq.Session, job_title: str, location:str, start_num: int):
    url = f"https://www.indeed.com/jobs?q={format_search(job_title)}&l={format_search(location)}%2C++CA&start={str(start_num)}"
    resp = session.get(url,headers=headers)
    resp.raise_for_status()
    return JobListing(**pull_job_details(resp))

### Use loop to pull all current job listings with a search query

In [42]:
df = pd.DataFrame()
for i in range(0,3):
    listing = search_api(new_session(),'data+engineer','mountain+view',i*10)
    df = pd.concat([df,pd.DataFrame(listing.dict())],ignore_index=True)
    time.sleep(2)

In [155]:
def clean_output(text:str):
    text = text.lower()
    text = text.replace('\n','    ')
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'[^a-z0-9\s]', '', text)

    return text

def pull_skills(resp):
    if 'text/html' in resp.headers['Content-Type'] and resp.status_code == 200:
        soup = BeautifulSoup(resp.text, 'html.parser')
        print(soup)

        # Find all button elements with 'data-testid' that ends with '-tile'
        buttons = soup.find_all('button', {'data-testid': lambda x: x and x.endswith('-tile')})
        # Extract the tile name from the 'data-testid' attribute
        tile_names = [button['data-testid'].replace('-tile', '') for button in buttons]

        for name in tile_names:
            print(name)

def pull_job_data(resp):
    if 'text/html' in resp.headers['Content-Type'] and resp.status_code == 200:
        soup = BeautifulSoup(resp.text, 'html.parser')

        desc = soup.find('div',{'id':'jobDescriptionTitle'})

        if not desc:
            print('No job description found')
            return None
        
        return desc.next_sibling.get_text()
        
def get_raw_text(resp):
    soup = BeautifulSoup(resp.text,'html.parser')
    return clean_output(soup.get_text())

def scrape_webpage_text(session: cureq.Session, job_link: str):
    url = f'https://www.indeed.com{job_link}'
    resp = session.get(url,headers=headers)
    
    return resp

### Update to pull job skills if necessary

In [156]:
for index,row in df.iterrows():
    resp = scrape_webpage_text(new_session(),row['jobLink'])
    desc = clean_output(pull_job_data(resp))
    df.loc[index, 'description'] = desc
    time.sleep(2)

In [157]:
df.head(5)

Unnamed: 0,jobLink,jobTitle,jobCompany,minSalary,maxSalary,jobDetails,description
0,/rc/clk?jk=91f171f8260d5452&bb=Eb6IIhKxqsmRoSA...,Data Engineer,Analog Devices,97060.0,133458.0,Come join Analog Devices (ADI) – a place whe...,come join analog devices adi a plac...
1,/rc/clk?jk=853c03f5baacd62c&bb=Eb6IIhKxqsmRoSA...,Analytics Engineer (L5) - Member Data Products,Netflix,170000.0,720000.0,Job Requisition ID JR30168 Job Posting...,job requisition id ...
2,/rc/clk?jk=5730c9f7bbab0b75&bb=Eb6IIhKxqsmRoSA...,Associate Data Engineer,Analog Devices,78200.0,107525.0,Come join Analog Devices (ADI) – a place whe...,come join analog devices adi a plac...
3,/rc/clk?jk=4f52cc770d2bb30d&bb=Eb6IIhKxqsmRoSA...,Data Engineer,UST Global,,,1 Opening San Jose Role de...,1 opening ...
4,/rc/clk?jk=95f7bd3fb1e18475&bb=Eb6IIhKxqsmRoSA...,Data Engineer,Stanford University,96000.0,140000.0,"(Full-time, 2 year contract, renewable)The Sta...",fulltime 2 year contract renewable the ...
