In [1]:
# Import Libraries
import requests
import urllib.request
from bs4 import BeautifulSoup
import re
import datetime
import csv
import pandas as pd

In [2]:
def get_page(url, timeout=30):
    """Download ``url`` and return its HTML parsed into a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple or None, optional
        Passed straight to ``requests.get``. Defaults to 30 seconds so a
        stalled server cannot hang the notebook; ``None`` waits indefinitely.

    Returns
    -------
    bs4.BeautifulSoup
        The response text parsed with the built-in ``html.parser``.

    Raises
    ------
    requests.HTTPError
        If the server replies with a 4xx/5xx status code.
    requests.RequestException
        For any other request failure (connection error, timeout, ...).
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of parsing the error page as if
    # it were the requested content.
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')

In [3]:
class GreenhousePage:
    """One company's job board on boards.greenhouse.io, scraped at construction.

    Creating an instance downloads ``https://boards.greenhouse.io/<slug>`` and
    fills in these attributes:

    * ``slug``      - the board identifier passed in
    * ``run_date``  - today's date formatted as ``%Y-%b-%d``
    * ``url``       - the board URL built from ``slug``
    * ``soup``      - the parsed page
    * ``title``     - ``content`` of the ``og:title`` meta tag, or None if absent
    * ``sections``  - text of the first ``<h3>`` found in each ``<section>``
    * ``jobs_data`` - list of rows; the first row is the column header
    * ``jobs``      - ``jobs_data`` as a pandas DataFrame
    """

    # Column names used for the header row of ``jobs_data`` and for ``jobs``.
    JOB_COLUMNS = ('category', 'job_title', 'page_link', 'job_id', 'location', 'run_date')

    def __init__(self, slug):
        """Fetch and parse the board identified by ``slug``.

        Raises whatever ``load_soup`` raises if the page cannot be downloaded.
        """
        self.slug = slug
        self.run_date = datetime.date.today().strftime("%Y-%b-%d")
        self.url = 'https://boards.greenhouse.io/' + self.slug
        self.load_soup()
        # A page without the og:title tag should not abort the whole scrape.
        title_tag = self.soup.find("meta", property="og:title")
        self.title = title_tag.get('content') if title_tag is not None else None
        self.sections = []
        self.get_sections()
        self.jobs_data = [list(self.JOB_COLUMNS)]
        self.load_jobs()
        self.load_dataframe()

    def load_jobs(self):
        """Rebuild ``jobs_data`` from the job links found inside ``<section>`` tags.

        A job link is an ``<a data-mapped="true">``. For each one the row holds
        the text of the nearest preceding ``<h3>`` (category), the link's first
        child (title), the absolute URL, the id taken from the href, the text
        of the next sibling ``<span>`` (location) and ``run_date``. A missing
        ``<h3>`` or ``<span>`` yields an empty string rather than an error.

        The list is rebuilt from scratch on every call, so calling this method
        repeatedly does not accumulate duplicate rows.
        """
        rows = [list(self.JOB_COLUMNS)]
        # With nested <section> tags the same <a> element is returned once per
        # enclosing section; remember elements already handled.
        seen = set()
        id_prefix = '/' + self.slug + '/jobs/'
        for sec in self.soup.find_all("section"):
            for link in sec.find_all('a', attrs={'data-mapped': 'true'}):
                if id(link) in seen:
                    continue
                seen.add(id(link))

                heading = link.find_previous("h3")
                category = heading.text if heading is not None else ''
                job_title = link.contents[0] if link.contents else ''
                href = link['href']
                page_link = 'https://boards.greenhouse.io' + href
                if href.startswith(id_prefix):
                    job_id = href[len(id_prefix):]
                else:
                    # Unexpected href shape: fall back to the last path segment.
                    job_id = href.rstrip('/').rsplit('/', 1)[-1]
                location_tag = link.find_next_sibling('span')
                location = location_tag.text if location_tag is not None else ''
                rows.append([category, job_title, page_link, job_id, location, self.run_date])
        self.jobs_data = rows

    def load_dataframe(self):
        """Build ``self.jobs`` from ``jobs_data`` (first row = column names)."""
        self.jobs = pd.DataFrame(self.jobs_data[1:], columns=self.jobs_data[0])

    def get_sections(self):
        """Store the first ``<h3>`` text of every ``<section>`` in ``self.sections``.

        Downloads the page first if no soup has been loaded yet. Sections
        without an ``<h3>`` are skipped. The list is replaced on every call.
        """
        if getattr(self, 'soup', None) is None:
            self.load_soup()
        titles = []
        for sec in self.soup.find_all("section"):
            heading = sec.find("h3")
            if heading is not None:
                titles.append(heading.text)
        self.sections = titles

    def load_soup(self, timeout=30):
        """Download ``self.url`` and store the parsed page in ``self.soup``.

        ``timeout`` is forwarded to ``requests.get``. Raises
        ``requests.HTTPError`` on a 4xx/5xx reply and other
        ``requests.RequestException`` subclasses on network failures.
        """
        response = requests.get(self.url, timeout=timeout)
        response.raise_for_status()
        self.soup = BeautifulSoup(response.text, 'html.parser')

    def print_html(self):
        """Print the parsed page, pretty-formatted."""
        print(self.soup.prettify())

    def to_csv(self, path=None):
        """Write ``self.jobs`` to a CSV file.

        Parameters
        ----------
        path : str or path-like, optional
            Destination file. Defaults to ``<run_date>.csv`` in the current
            directory. That default does not include the slug, so pass an
            explicit path when exporting several boards on the same day.
        """
        if path is None:
            path = self.run_date + '.csv'
        self.jobs.to_csv(path)

In [4]:
# Scrape two Greenhouse boards. Each is identified by its slug, the path
# component appended to https://boards.greenhouse.io/ by GreenhousePage.
# Constructing a page performs the download immediately.

gitlab_page = GreenhousePage(slug='gitlab')
bernie_page = GreenhousePage(slug='bernie2020')

In [8]:
# Export the Bernie 2020 jobs table. to_csv() names the file "<run_date>.csv"
# with no slug in it, so exporting gitlab_page on the same day would write to
# (and overwrite) this same file.
bernie_page.to_csv()

In [7]:
# Collect every job link on the GitLab board.
# Job postings are the <a data-mapped="true"> anchors on the page.
soup = gitlab_page.soup
open_jobs = []
all_links = soup.find_all('a', attrs={'data-mapped': 'true'})
for link in all_links:
    # The nearest <h3> preceding the link is taken as its category, the same
    # lookup GreenhousePage.load_jobs uses; empty when there is none.
    heading = link.find_previous('h3')
    category = heading.text if heading is not None else ''
    job_title = link.contents[0]
    page_link = 'https://boards.greenhouse.io' + link['href']
    open_jobs.append({'category': category, 'job_title': job_title, 'link': page_link})
print(f"There are currently {len(open_jobs)} open jobs")

There are currently 192 open jobs


In [8]:
# Fetch every open job posting and record the ones whose text mentions `keyword`.
keyword = 'sql'
# Escape the keyword so characters such as "+" or "." are matched literally,
# and compile once instead of on every iteration. Beautiful Soup applies a
# compiled pattern with search(), so no surrounding ".*" is needed.
keyword_pattern = re.compile(re.escape(keyword), re.IGNORECASE)
final_results = {}
for job in open_jobs:
    job_title = job['job_title']
    link = job['link']

    try:
        search_response = requests.get(link, timeout=30)
        search_response.raise_for_status()
    except requests.RequestException as err:
        # One unreachable or removed posting should not abort the whole crawl.
        print(f'Skipping {job_title} ({link}): {err}')
        continue

    search_soup = BeautifulSoup(search_response.text, 'html.parser')
    if search_soup.body is None:
        # Nothing to search in a document without a <body>.
        continue
    # find_all(string=...) returns the text nodes that match, so the count
    # below is the number of matching text nodes, not of individual occurrences.
    results = search_soup.body.find_all(string=keyword_pattern)

    if len(results) > 0:
        final_results[link] = {'job_title': job_title, 'result_count': len(results)}
        print(f'Found the word "{keyword}" {len(results)} times in {job_title} ({link})')

print(final_results)

Found the word "sql" 3 times in Operations Analyst, Development (https://boards.greenhouse.io/gitlab/jobs/4371373002)
Found the word "sql" 3 times in Backend Engineer, Database (https://boards.greenhouse.io/gitlab/jobs/4473989002)
Found the word "sql" 2 times in Backend Engineer, Search (https://boards.greenhouse.io/gitlab/jobs/4392358002)
Found the word "sql" 2 times in Senior Security Engineer, Abuse (https://boards.greenhouse.io/gitlab/jobs/4489255002)
Found the word "sql" 2 times in Finance Business Partner, Sales (https://boards.greenhouse.io/gitlab/jobs/4500567002)
Found the word "sql" 2 times in Business Systems Analyst (https://boards.greenhouse.io/gitlab/jobs/4450334002)
Found the word "sql" 5 times in Data Analyst (https://boards.greenhouse.io/gitlab/jobs/4416681002)
Found the word "sql" 5 times in Data Analyst (https://boards.greenhouse.io/gitlab/jobs/4441205002)
Found the word "sql" 5 times in Data Engineer (Product/Engineering) (https://boards.greenhouse.io/gitlab/jobs/451