# Indeed Job Scraper

### SETUP

In [1]:
import csv
from datetime import datetime
from splinter import Browser
from bs4 import BeautifulSoup as soup
from webdriver_manager.chrome import ChromeDriverManager
import requests

In [2]:
# Set up Splinter
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)




In [3]:
source = 'Indeed'

In [4]:
# Generate an Indeed link based on specified job title and location
def get_url(position, location):
    template = 'https://www.indeed.com/jobs?q={}&l={}'
    position = position.replace(' ', '+')
    location = location.replace(' ', '+')
    url = template.format(position, location)
    return url

In [5]:
url = get_url('data analyst', 'los angeles ca')

# to-do: get user input as variables for url

In [6]:
# Visit URL in Splinter Browser
browser.visit(url)

In [7]:
# Parse HTML data using BS4
html = browser.html
job_soup = soup(html, 'html.parser')

In [8]:
job_listings = job_soup.find_all('div', 'cardOutline')

### CREATE THE MODEL FOR SINGLE JOB LISTING

In [9]:
listing = job_listings[0]

In [10]:
link = listing.h2.a

In [11]:
job_title = link.get('title')

In [12]:
job_url = 'https://www.indeed.com' + link.get('href')

In [13]:
company_name = listing.find('span', 'companyName').text.strip()

In [14]:
location = listing.find('div', 'companyLocation').text.strip() # may need further cleanup

In [15]:
job_summary = listing.find('div', 'job-snippet').text.strip()
job_summary.replace('\n', " ")

'5% - Inform and support the executive team in ad-hoc data analysis studies. We collect data from a wide variety of public sources and need someone with a…'

In [16]:
date = listing.find('span', 'date').text.strip()
date.replace("Posted "," ")

'Posted 30+ days ago'

In [17]:
today = datetime.today().strftime('%m-%d-%Y')

In [18]:
# Check for salary info
try: 
    salary = listing.find('div', 'attribute_snippet').text.strip()
except AttributeError:
    salary = 'not listed'

In [19]:
record = (company_name, job_title, location, salary, job_summary, date, job_url, source)
record

('Spokeo',
 None,
 'Remote in Pasadena, CA 91101',
 '$102,600 - $153,000 a year',
 '5% - Inform and support the executive team in ad-hoc data analysis studies.\nWe collect data from a wide variety of public sources and need someone with a…',
 'PostedPosted 30+ days ago',
 'https://www.indeed.com/rc/clk?jk=b1a6ad99b83ceed0&fccid=8f3c0373bcfa5520&vjs=3',
 'Indeed')

### CREATE MODEL FOR ALL JOB LISTINGS

In [20]:
def get_info(listing):
    link = listing.h2.a
    job_title = link.get('title')
    job_url = 'https://www.indeed.com' + link.get('href')
    company_name = listing.find('span', 'companyName').text.strip()
    location = listing.find('div', 'companyLocation').text.strip() # may need further cleanup
    job_summary = listing.find('div', 'job-snippet').text.strip()
    job_summary.replace('\n', " ")
    date = listing.find('span', 'date').text.strip()
    date.replace("Posted "," ")
    today = datetime.today().strftime('%m-%d-%Y')
    try: 
        salary = listing.find('div', 'attribute_snippet').text.strip()
    except AttributeError:
        salary = 'not listed'
    source = 'Indeed'
        
    record = (company_name, job_title, location, salary, job_summary, date, job_url, source)
    
    return record

In [21]:
#get_info(listing)

In [23]:
records = []

for listing in job_listings:
    record = get_info(listing)
    records.append(record)

In [24]:
records

[('Spokeo',
  None,
  'Remote in Pasadena, CA 91101',
  '$102,600 - $153,000 a year',
  '5% - Inform and support the executive team in ad-hoc data analysis studies.\nWe collect data from a wide variety of public sources and need someone with a…',
  'PostedPosted 30+ days ago',
  'https://www.indeed.com/rc/clk?jk=b1a6ad99b83ceed0&fccid=8f3c0373bcfa5520&vjs=3',
  'Indeed'),
 ('TikTok',
  None,
  'Los Angeles, CA+2 locations',
  '$79,800 - $130,888 a year',
  'This new security-first division was created to bring heightened focus and governance to our data protection policies and content assurance protocols to keep U…',
  'PostedPosted 14 days ago',
  'https://www.indeed.com/rc/clk?jk=9603141b2a08a9cd&fccid=caed318a9335aac0&vjs=3',
  'Indeed'),
 ('SAG-Producers Pension Plan',
  None,
  'Burbank, CA 91505',
  '$130,732 - $196,098 a year',
  'Minimum 5 years in a quantitative data analytics role with emphasis on data preparation, analysis and presentation.',
  'PostedPosted 30+ days ago',
 