In [1]:
import pandas as pd
import numpy as np
import requests
import requests_cache
import time
import lxml.html as lx
import itertools

requests_cache.install_cache("new_cache")


In [2]:
states = ["Alabama","Alaska","Arizona","Arkansas","California","Colorado",
  "Connecticut","Delaware","Florida","Georgia","Hawaii","Idaho","Illinois",
  "Indiana","Iowa","Kansas","Kentucky","Louisiana","Maine","Maryland",
  "Massachusetts","Michigan","Minnesota","Mississippi","Missouri","Montana",
  "Nebraska","Nevada","New Hampshire","New Jersey","New Mexico","New York",
  "North Carolina","North Dakota","Ohio","Oklahoma","Oregon","Pennsylvania",
  "Rhode Island","South Carolina","South Dakota","Tennessee","Texas","Utah",
  "Vermont","Virginia","Washington","West Virginia","Wisconsin","Wyoming"]
states = [state.replace(' ', '+') for state in states]
jobs = ["data+scientist", "data+analyst", "data+engineer"]

In [3]:
def scrape_front_page_indeed(base_url):
    """
    Scrape one page of Indeed by getting links to all job postings on the page, and providing the link to the next page
    Input: Url of the page you wish to scrape
    Output: If a blank string was passed for url, then we exhausted all pages so return None.
            If a regular url was passed, then a tuple with the the list of links as the first element, 
            and the link to the next page as the second element is returned 
    """
    if base_url == '':
        return None
    else:
        response = requests.get(base_url)
        response.raise_for_status()
        html = lx.fromstring(response.text)
        html.make_links_absolute(base_url)

        # get links to each posting
        links = []
        
        sponsered_tags = html.xpath("//td[contains(@id, 'resultsCol')]/div/div/a")
        sponsered_links = [tag.attrib['href'] for tag in sponsered_tags][1:]
        for link in sponsered_links:
            links.append(link)
        
        reg_tags = html.xpath("//td[contains(@id, 'resultsCol')]/div/h2/a")
        reg_links = [link.attrib['href'] for link in reg_tags]
        for link in reg_links:
            links.append(link)

        # get links to the next page
        nav_tags = html.xpath("//td[contains(@id, 'resultsCol')]/div[contains(@class, 'pagination')]/a/span[contains(@class,'pn')]/span[contains(@class,'np')]/../..")
        if len(nav_tags) == 2:
            # previous page, next page --> take next page
            next_url = nav_tags[1].attrib['href']
        elif len(nav_tags) == 0:
            # only one page of search results
            next_url = ''
        else:
            # could be either only previous or only next --> return '' if only previous, return link if next
            nav_texts = nav_tags[0].text_content()
            if 'Previous' in nav_texts:
                next_url = ''
            else:
                next_url = nav_tags[0].attrib['href']

        time.sleep(0.1)
        
        return list(set(links)), next_url

In [4]:
def get_all_links(first_page_url):
    """
    Get all the links from one search specification.
    Input: url of the first page returned from the search
    Output: list of all the links to job postings from that search
    """
    # start with scraping the first page
    links, next_url = scrape_front_page_indeed(first_page_url)
    
    # initialize posting list
    postings = [link for link in links]
    
    # recall function until you reach the last page, updating posting list each time
    while next_url != '':
        new_links, next_url = scrape_front_page_indeed(next_url)
        for link in new_links:
            postings.append(link)
    
    return list(set(postings))

In [5]:
my_dict = {}
for comb in list(itertools.product(jobs, states)):
    my_dict[comb] = get_all_links('https://www.indeed.com/jobs?q={}&l={}'.format(comb[0], comb[1]))
my_dict

ParserError: Document is empty