# Web Scraping for Indeed.com

#### Setup a request (using `requests`) to the URL below. Use BeautifulSoup to parse the page and extract all results (HINT: Look for div tags with class name result)

In [4]:
URL = "http://www.indeed.com/jobs?q=data+scientist+%2420%2C000&l=New+York&start=10"

In [5]:
import requests
import bs4
from bs4 import BeautifulSoup
from IPython.display import HTML
import re
import numpy as np
import pandas as pd

In [6]:
## YOUR CODE HERE

i = requests.get(URL)
i_soup = BeautifulSoup(i.content, "lxml")
i_soup

### Write 4 functions to extract each item: location, company, job, and salary.

example: 
```python
def extract_location_from_result(result):
    return result.find ...
```


- Make sure these functions are robust and can handle cases where the data/field may not be available.
- Test the functions on the results above

In [7]:
## YOUR CODE HERE

def mark_sponsored(soup_file):
    sponsored_col = []
    for result in soup_file.findAll('div', {'class' : re.compile('row.*')}):
        if len(result.findAll('span', class_='sdn')) > 0 :
            entry = 'True'
            sponsored_col.append(entry)
        else:
            entry = 'False'
            sponsored_col.append(entry)
    return sponsored_col

def extract_location_from_result(soup_file):
    location_col = []
    for d in soup_file.findAll('span', class_='location'):
        entry = d.text
        location_col.append(entry)
    return location_col

def extract_company_from_result(soup_file):
    company_col = []
    for d in soup_file.findAll('span', class_='company'):
        entry = d.text
        company_col.append(entry)
    return company_col

def extract_job_from_result(soup_file):
    job_col = []
    for result in soup_file.findAll('a', {'data-tn-element' : re.compile('job.*')}):
        entry = result.text
        job_col.append(entry)
    return job_col

def extract_salary_from_result(soup_file):
    salary_col = []
    for result in soup_file.findAll('div', {'class' : re.compile('row.*')}):
        sal = result.findAll('nobr')
        if len(sal) > 0 :
            rstring = str(sal)
            salary_col.append(rstring)
        else:
            salary_col.append(np.nan)
    return salary_col

In [8]:
type(i_soup)

bs4.BeautifulSoup

In [9]:
print mark_sponsored(i_soup)
print
print extract_job_from_result(i_soup)
print
print extract_company_from_result(i_soup)
print
print extract_location_from_result(i_soup)
print 
print extract_salary_from_result(i_soup)

['True', 'True', 'True', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'True', 'True']

[u'Machine Learning Quantitative Analyst', u'Data Engineer', u'Deep Learning Data Scientist', u'AVP Quantitative Analyst', u'Senior Data Scientist', u'Reporting Analyst', u'Experiment Data Analyst / Statistician', u'Senior Data Scientist', u'Senior Data Scientist', u'Senior Data Scientist - Verticals', u'Research Scientist', u'Quantitative Analyst', u'Data Scientist / Lead Quantitative Analyst', u'Data Scientist, Imaging Informatics', u'Data Scientist - Relocation to China (Chinese Visa required)']

[u'\n\n        Bloomberg', u'\n\n        Indeed', u'\n    WorkFusion', u'\n\n\n        Barclays\n', u'\n\n    7Park Data\n', u'\n\n\n        PlaceIQ\n', u'\n\n\n        Dow Jones\n', u'\n\n\n        Capital Group\n', u'\n\n\n        CAPCO\n', u'\n\n\n        Bloomberg\n', u'\n\n\n        Yahoo! Inc.\n', u'\n\n    StreetID\n', u'\n\n\n        Guidepoint Global\n

In [10]:
def build_jobpost_db(soup_file):

    company = extract_company_from_result(soup_file) 
    job = extract_job_from_result(soup_file)
    location = extract_location_from_result(soup_file)
    salary = extract_salary_from_result(soup_file)
    sponsored = mark_sponsored(soup_file)
    
    jobdf = pd.DataFrame({'Job' : job, 'Company' : company, 'Location' : location, 'Salary' : salary, 'Sponsored' : \
                                                                                          sponsored})
    
    return jobdf

In [11]:
build_jobpost_db(i_soup)

Unnamed: 0,Company,Job,Location,Salary,Sponsored
0,\n\n Bloomberg,Machine Learning Quantitative Analyst,"New York, NY",,True
1,\n\n Indeed,Data Engineer,"New York, NY 10036",,True
2,\n WorkFusion,Deep Learning Data Scientist,"New York, NY",,True
3,\n\n\n Barclays\n,AVP Quantitative Analyst,"New York, NY",,False
4,\n\n 7Park Data\n,Senior Data Scientist,"New York, NY",,False
5,\n\n\n PlaceIQ\n,Reporting Analyst,"New York, NY",,False
6,\n\n\n Dow Jones\n,Experiment Data Analyst / Statistician,"New York, NY 10001 (Chelsea area)",,False
7,\n\n\n Capital Group\n,Senior Data Scientist,"New York, NY 10111 (Midtown area)",,False
8,\n\n\n CAPCO\n,Senior Data Scientist,"New York, NY 10271 (Financial District area)",,False
9,\n\n\n Bloomberg\n,Senior Data Scientist - Verticals,"New York, NY",,False


#### Scaling code to collect results from multiple cities and starting points. 

In [12]:

max_results_per_city = 10

results = []

for city in set(['New+York', 'Chicago', 'San+Francisco', 'Austin', 'Denver', 'Las+Vegas', 'San+Jose', 'Boulder', 'Covallis','Seattle', 'Fort+Collins', 'Provo', 'Burlington', 'Boston']):
    for start in range(0, max_results_per_city, 10):
        URL = "http://www.indeed.com/jobs?q=data+scientist+%2420%2C000&l={city}&start={start}".format(**locals())
        # Grab the results from the request (as above)
        i = requests.get(URL)
        i_soup = BeautifulSoup(i.content, "lxml")
        # Append to the full set of results
        results.append(i_soup)


#### Use the functions you wrote above to parse out the 4 fields - location, title, company and salary. Create a dataframe from the results with those 4 columns.

In [13]:
## YOUR CODE HERE
list_of_dfs = []

for soup_file in results:
    list_of_dfs.append(build_jobpost_db(soup_file))

jobsdf = pd.concat(list_of_dfs)
jobsdf = jobsdf.reset_index()
    
jobsdf

Unnamed: 0,index,Company,Job,Location,Salary,Sponsored
0,0,\n\n Return Path,Data Scientist,"Broomfield, CO 80021",,True
1,1,\n RefactorU,Data Scientist (Curriculum Developer / Instruc...,"Boulder, CO",,True
2,2,\n\n Pearson,Machine Learning Engineer,"Centennial, CO",,True
3,3,\n\n CaliberMind\n,Senior Data Scientist,"Boulder, CO","[<nobr>$120,000 a year</nobr>]",False
4,4,\n\n RefactorU\n,Data Scientist (Curriculum Developer / Instruc...,"Boulder, CO",,False
5,5,\n\n\n Oracle\n,Senior Data Scientist - Retail Analytics,"Westminster, CO",,False
6,6,\n\n\n Google\n,"Cloud Instructor (Big Data, Machine Learning),...","Boulder, CO 80302",,False
7,7,\n\n\n University of Colorado\n,Managing Director for STROBE,"Boulder, CO","[<nobr>$100,000 - $115,000 a year</nobr>]",False
8,8,"\n\n Wiland, Inc.\n",Statistician,"Niwot, CO 80503",,False
9,9,\n\n\n Agilent\n,Analytical Services Scientist,"Boulder, CO",,False


In [28]:
## Save as csv
jobsdf.to_csv('jobScrape.csv', encoding='utf-8')

In [29]:
jobsdf

Unnamed: 0,Company,Job,Location,Salary,Sponsored
0,\n LENA Research Foundation,Sr. Research Engineer: Machine Learning and Co...,"Boulder, CO",,True
1,"\n\n Alteryx, Inc.","Product Manager, Advanced Analytics","Broomfield, CO",,True
2,\n\n CyberCoders,Data Scientist,"Boulder, CO",,True
3,\n\n\n Navigant Consulting\n,Research Analyst,"Boulder, CO",,False
4,\n\n cliexa\n,Data Scientist Intern,"Boulder, CO",,False
5,\n\n\n University of Colorado\n,Professional Research Asst,"Boulder, CO",,False
6,"\n\n Somalogic, Inc.\n",Bioinformatics Assoc. II/Sr. Assoc./Analyst/Sc...,"Boulder, CO 80301",,False
7,\n\n\n Oracle\n,"Senior Product Manager - Measurement, Oracle D...","Broomfield, CO",,False
8,\n\n\n Ibotta\n,Senior Data Scientist,"Denver, CO 80202 (Lodo area)",,False
9,\n\n\n University of Colorado\n,Research Associate,"Boulder, CO",,False
