# Web scraping Indeed.com for job IDs, labelled by a salary threshold

In [3]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd

In [3]:
def get_ids(salary_range_divider=90000):
    """Collect Indeed job IDs for the "data scientist" / Boston search, split by salary.

    Two searches are run: one for the salary range $20000 up to
    ``salary_range_divider`` (labelled 0) and one for salaries from
    ``salary_range_divider`` upwards (labelled 1).

    Args:
        salary_range_divider: Salary that separates the two searches. Must be
            greater than 20000.

    Returns:
        A DataFrame with the columns ``job_id`` and
        ``salary_over_<salary_range_divider>`` with exact duplicate rows
        removed, or ``None`` (after printing a message) when
        ``salary_range_divider`` is 20000 or less. When no listing is found
        the DataFrame is empty but still has both columns.
    """
    # OPTIONAL: removes low values
    if salary_range_divider <= 20000:
        print("Enter a number larger than $20,000.")
        return None

    column_names = ['job_id', "salary_over_{}".format(salary_range_divider)]
    search_url = "http://www.indeed.com/jobs?q=data+scientist{}&l=Boston"
    # Viewing 100 results at a time means fewer page refreshes.
    page_url = search_url + "&limit=100&start={}"

    # Set dividing salaries
    divider_strings = [
        "+$20000-{}".format(salary_range_divider),
        "+${}".format(salary_range_divider),
    ]

    # Perform two searches, starting with the low-salary jobs
    frames = []
    for level, salary_criterion in enumerate(divider_strings):
        # Find the number of results; the response is closed as soon as it is read.
        with urlopen(search_url.format(salary_criterion)) as response:
            soup_for_count = BeautifulSoup(response.read(), 'html.parser')

        count_div = soup_for_count.find("div", attrs={"id": "searchCount"})
        if count_div is None:
            # No result counter on the page: treat this search as empty
            # instead of failing on ``None.text``.
            continue
        count_tokens = count_div.text.split()
        if not count_tokens:
            continue
        number_of_results = int(count_tokens[-1].replace(',', ''))

        # Ceiling division: an exact multiple of 100 needs no extra page.
        page_count = -(-number_of_results // 100)

        level_rows = []
        for page_number in range(page_count):
            with urlopen(page_url.format(salary_criterion, 100 * page_number)) as response:
                soup_for_results = BeautifulSoup(response.read(), 'html.parser')
            results = soup_for_results.find_all(
                'div', attrs={'data-tn-component': 'organicJob'})

            # Extract the ID for each job listing, and include the 'salary_over_X' level
            for listing in results:
                title = listing.find('h2', attrs={"class": "jobtitle"})
                job_id = title.get('id') if title is not None else None
                if job_id is None:
                    # A listing without a title heading or id cannot be identified.
                    continue
                level_rows.append([job_id, level])

        if level_rows:
            frames.append(pd.DataFrame(level_rows, columns=column_names))

    if not frames:
        return pd.DataFrame(columns=column_names)

    # DataFrame.append no longer exists in pandas 2.x; concat keeps each
    # search's own 0-based index, as the chained appends used to.
    job_ids = pd.concat(frames)

    # Remove re-posted jobs
    job_ids.drop_duplicates(inplace=True)

    return job_ids

In [4]:
# Scrape both salary bands, using $90,000 as the dividing salary.
id_dataframe = get_ids(salary_range_divider=90000)

In [10]:
# Show the scraped job IDs together with their salary-band label.
id_dataframe

Unnamed: 0,job_id,salary_over_90000
0,jl_85c4e91c561780aa,0
1,jl_e632343c455d80f9,0
2,jl_1b5a168dfc7b2712,0
3,jl_05654b2739edb3dc,0
4,jl_1a7766c45b1abbeb,0
5,jl_54c0d6b8ad9f9e3d,0
6,jl_d8ed0aa611a75e92,0
7,jl_d5e17d142783f070,0
8,jl_b3462acabd0f43c3,0
9,jl_28643ddfd474331d,0


In [17]:
def count_results(query_string=None):
    """Collect Indeed job IDs for "data scientist" / Boston postings matching a phrase.

    The phrase is sent as the ``as_phr`` parameter of the search URL, wrapped
    in encoded double quotes.

    Args:
        query_string: Phrase to search for. It is formatted into the URL as
            text, so the default ``None`` searches for the literal phrase
            "None"; callers should pass a real phrase.

    Returns:
        A DataFrame with the columns ``job_id`` and ``<query_string>`` (always
        1), with duplicate rows removed. When no listing is found the
        DataFrame is empty but still has both columns.
    """
    from urllib.parse import quote

    column_names = ['job_id', "{}".format(query_string)]

    # Escape spaces and other unsafe characters so multi-word phrases form a
    # valid URL; '+' and '%' stay untouched so pre-encoded phrases still work.
    phrase = quote("{}".format(query_string), safe="+%")

    # Find the number of results
    URL_for_count = "http://www.indeed.com/jobs?q=data+scientist&l=Boston&as_phr=%22{}%22".format(phrase)
    with urlopen(URL_for_count) as response:
        soup_for_count = BeautifulSoup(response.read(), 'html.parser')

    count_div = soup_for_count.find("div", attrs={"id": "searchCount"})
    # No result counter on the page: treat the search as empty instead of
    # failing on ``None.text``.
    count_tokens = count_div.text.split() if count_div is not None else []
    number_of_results = int(count_tokens[-1].replace(',', '')) if count_tokens else 0

    # Now loop through the pages. Viewing 100 results at a time means fewer page refreshes.
    # Ceiling division: an exact multiple of 100 needs no extra page.
    page_count = -(-number_of_results // 100)

    result_list = []
    for page_number in range(page_count):
        URL_for_results = "http://www.indeed.com/jobs?q=data+scientist&as_phr=%22{}%22&l=Boston&limit=100&start={}".format(phrase, 100 * page_number)
        with urlopen(URL_for_results) as response:
            soup_for_results = BeautifulSoup(response.read(), 'html.parser')
        results = soup_for_results.find_all('div', attrs={'data-tn-component': 'organicJob'})

        # Extract the ID for each job listing, flagged with 1 for "matches the phrase"
        for listing in results:
            title = listing.find('h2', attrs={"class": "jobtitle"})
            job_id = title.get('id') if title is not None else None
            if job_id is None:
                # A listing without a title heading or id cannot be identified.
                continue
            result_list.append([job_id, 1])

    # Build the frame once, after all pages are read. Appending the growing
    # list inside the page loop re-added every earlier page, and
    # DataFrame.append no longer exists in pandas 2.x.
    job_ids = pd.DataFrame(result_list, columns=column_names)

    # Remove re-posted jobs
    job_ids.drop_duplicates(inplace=True)

    return job_ids

In [19]:
# Find postings that mention the phrase "Python"; "PhD" is another phrase to try.
count_results(query_string="Python")

Unnamed: 0,job_id,Python
0,jl_f43cd8061406b3d7,1
1,jl_b59a64d3298e7fe8,1
2,jl_fbadb59b5f73dc18,1
3,jl_90603c7f1f0af480,1
4,jl_70351b5092814475,1
5,jl_05de118416baeef8,1
6,jl_50022587c7a4a8d9,1
7,jl_44da2bd2b0b7e145,1
8,jl_f5945f64ec7013e3,1
9,jl_6c9d0349b46d0aae,1
