# Web scraping Indeed.com for job IDs, flagged by keyword match and by salary threshold

In [31]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd

In [36]:
# query_string format: space-separated keywords, e.g. "PhD ph.d"; the keywords are OR-ed together in the search.
def count_results_by_keyword(query_string = None):
    """Return the IDs of Boston "data scientist" listings matching any keyword.

    The keywords in ``query_string`` are OR-ed together and added, in
    parentheses, to an Indeed search for "data scientist" in Boston. Every
    result page is fetched (100 listings per request) and the ``id``
    attribute of each organic listing's ``h2.jobtitle`` element is collected.

    Args:
        query_string: Space-separated keywords, e.g. ``"PhD ph.d"``.

    Returns:
        A ``pandas.DataFrame`` with two columns: ``'id'`` and a column named
        after the keywords joined by ``" OR "`` whose value is always 1.
        Duplicate rows (re-posted jobs) are removed. The frame is empty when
        the search yields no listings. If no keyword is supplied (``None``,
        empty or blank string) a message is printed and ``None`` is returned.
    """
    # Local import: the file's import cell only brings in `urlopen`.
    from urllib.parse import quote

    # split() without arguments ignores repeated/leading/trailing spaces, so
    # no empty keyword sneaks into the query.
    keywords = query_string.split() if query_string else []
    if not keywords:
        print("No keyword entered.")
        return None

    # Percent-escape each keyword so characters such as '+' or '&' cannot
    # corrupt the query string.
    query = "%20OR%20".join(quote(keyword, safe='') for keyword in keywords)
    column_name = " OR ".join(keywords)
    base_url = "http://www.indeed.com/jobs?q=data+scientist+%28{}%29&l=Boston".format(query)

    def fetch_soup(url):
        # Close the HTTP response as soon as its body has been read.
        with urlopen(url) as response:
            return BeautifulSoup(response.read(), 'html.parser')

    # Find the number of results: the last token of the counter text is the
    # total, possibly with thousands separators.
    count_div = fetch_soup(base_url).find("div", attrs={"id": "searchCount"})
    if count_div is None:
        # NOTE(review): assumes a missing counter means "no matches" rather
        # than a changed page layout -- confirm against the live site.
        number_of_results = 0
    else:
        number_of_results = int(count_div.text.split()[-1].replace(',', ''))

    # Viewing 100 results at a time means fewer page refreshes. Stepping the
    # offset directly avoids requesting an empty trailing page when the total
    # is an exact multiple of 100.
    rows = []
    for start in range(0, number_of_results, 100):
        page = fetch_soup("{}&limit=100&start={}".format(base_url, start))
        for listing in page.find_all('div', attrs={'data-tn-component': 'organicJob'}):
            title = listing.find('h2', attrs={"class": "jobtitle"})
            job_id = title.get('id') if title is not None else None
            if job_id:
                rows.append([job_id, 1])

    # Build the frame once, after all pages are read; this also yields a
    # correctly-labelled empty frame when nothing was found.
    job_ids = pd.DataFrame(rows, columns=['id', column_name])

    # Remove re-posted jobs
    return job_ids.drop_duplicates()
    #job_ids.to_csv(path_or_buf="id_and_{}.csv".format(query))
    
def count_results_by_salary(salary_range_divider = 90000):
    """Return Boston "data scientist" job IDs flagged by salary band.

    Two Indeed searches are run: one with the salary criterion
    ``$20000-<divider>`` (level 0) and one with ``$<divider>`` (level 1).
    Every result page of each search is fetched (100 listings per request)
    and the ``id`` attribute of each organic listing's ``h2.jobtitle``
    element is recorded together with the level of the search that found it.

    Args:
        salary_range_divider: Salary separating the two searches. Must be
            greater than 20000.

    Returns:
        A ``pandas.DataFrame`` with columns ``'id'`` and
        ``'salary_over_<divider>'`` (0 for the lower search, 1 for the upper
        one), with duplicate rows removed. The frame is empty when neither
        search yields listings. If ``salary_range_divider`` is 20000 or less
        a message is printed and ``None`` is returned.
    """
    # OPTIONAL: removes low values
    if salary_range_divider <= 20000:
        print("Enter a number larger than $20,000.")
        return None

    column_name = "salary_over_{}".format(salary_range_divider)

    # Set dividing salaries: index 0 is the low band, index 1 the high band.
    divider_strings = ["+$20000-{}".format(salary_range_divider), "+${}".format(salary_range_divider)]

    def fetch_soup(url):
        # Close the HTTP response as soon as its body has been read.
        with urlopen(url) as response:
            return BeautifulSoup(response.read(), 'html.parser')

    rows = []
    # Perform two searches, starting with the low-salary jobs
    for level, salary_criterion in enumerate(divider_strings):
        base_url = "http://www.indeed.com/jobs?q=data+scientist{}&l=Boston".format(salary_criterion)

        # Find the number of results: the last token of the counter text is
        # the total, possibly with thousands separators.
        count_div = fetch_soup(base_url).find("div", attrs={"id": "searchCount"})
        if count_div is None:
            # NOTE(review): assumes a missing counter means "no matches"
            # rather than a changed page layout -- confirm against the site.
            number_of_results = 0
        else:
            number_of_results = int(count_div.text.split()[-1].replace(',', ''))

        # Viewing 100 results at a time means fewer page refreshes. Stepping
        # the offset directly avoids an empty trailing request when the total
        # is an exact multiple of 100.
        for start in range(0, number_of_results, 100):
            page = fetch_soup("{}&limit=100&start={}".format(base_url, start))
            for listing in page.find_all('div', attrs={'data-tn-component': 'organicJob'}):
                title = listing.find('h2', attrs={"class": "jobtitle"})
                job_id = title.get('id') if title is not None else None
                if job_id:
                    # Include the 'salary_over_X' level alongside the ID.
                    rows.append([job_id, level])

    # Build the frame once; this also yields a correctly-labelled empty frame
    # when neither search returned anything.
    job_ids = pd.DataFrame(rows, columns=['id', column_name])

    # Remove re-posted jobs.
    # NOTE(review): whole rows are compared, so an ID returned by both
    # searches is kept once per level -- confirm whether that is intended.
    return job_ids.drop_duplicates()

In [28]:
# Run the four keyword searches in order and bind each resulting table to its
# own name; every table has an 'id' column plus one indicator column.
phd_dataframe, python_dataframe, startup_dataframe, scientist_dataframe = (
    count_results_by_keyword(keywords)
    for keywords in ("PhD ph.d", "Python", "Startup start-up", "Scientist")
)

# Salary search: flags each job by which side of $90,000 its search band is on.
salary_dataframe = count_results_by_salary(salary_range_divider=90000)

In [29]:
# Outer-merge every indicator table on the job ID so that a job found by any
# search gets a row; indicators missing for a job become NaN and are set to 0.
master_dataframe = (
    phd_dataframe
    .merge(python_dataframe, on='id', how='outer')
    .merge(startup_dataframe, on='id', how='outer')
    .merge(scientist_dataframe, on='id', how='outer')
    .merge(salary_dataframe, on='id', how='outer')
    .fillna(value=0)
)

# The NaN values introduced by the merges leave the indicator columns
# non-integer; cast everything except 'id' back to int. `DataFrame.ix` was
# removed in pandas 1.0, so the cast is done column-wise with `astype`.
master_dataframe = master_dataframe.astype(
    {column: int for column in master_dataframe.columns if column != 'id'}
)

In [30]:
# Show the merged table: one row per job ID, one indicator column per search.
master_dataframe

Unnamed: 0,id,PhD OR ph.d,Python,Startup OR start-up,Scientist,salary_over_90000
0,jl_7b62a40f3ea90c00,1,0,0,1,0
1,jl_32062be6a68c7531,1,1,0,1,1
2,jl_033222419605ed3f,1,0,0,1,1
3,jl_c70cb33067b1f007,1,1,0,1,1
4,jl_1f1198f76898781f,1,1,0,1,1
5,jl_65d62034685dd0ed,1,0,0,0,0
6,jl_25732595a7113f45,1,1,0,1,1
7,jl_e632343c455d80f9,1,1,0,1,0
8,jl_0bb820bd8ae6e87b,1,0,0,1,0
9,jl_70351b5092814475,1,1,0,1,1
