## I. SETUP

In [1]:
import csv
from datetime import datetime
from splinter import Browser
from bs4 import BeautifulSoup as soup
from webdriver_manager.chrome import ChromeDriverManager
import requests
import pandas as pd

### Set up model for query URL

In [2]:
# Generate an Indeed link based on specified job title and location
def get_url(position, location):
    template = 'https://www.indeed.com/jobs?q={}&l={}'
    position = position.replace(' ', '+')
    location = location.replace(' ', '+')
    url = template.format(position, location)
    return url

In [3]:
# Set up query URL
url = get_url('data analyst', 'los angeles ca')

# to-do: get user input as variables for url

### Activate Splinter browser

In [4]:
# Set up Splinter
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

# Visit URL in Splinter Browser
browser.visit(url)




### Set up model for storing job listing info

In [5]:
# function to create a single job listing record
def get_info(listing):
    link = listing.h2.a
    url = 'https://www.indeed.com' + link.get('href')
    job_title = link.get('span','title')
    company_name = listing.find('span', 'companyName').text.strip()
    location = listing.find('div', 'companyLocation').text.strip() # may need further cleanup
    job_summary = listing.find('div', 'job-snippet').text.strip()
    job_summary.replace('\n', " ")
    date = listing.find('span', 'date').text.strip()
    date.replace("Posted "," ")
    today = datetime.today().strftime('%m-%d-%Y')
    try: 
        salary = listing.find('div', 'attribute_snippet').text.strip()
    except AttributeError:
        salary = 'not listed'
    source = 'Indeed'
        
    record = (company_name, job_title, location, salary, job_summary, date, url, source, today)
    
    return record

## II. RUNNING THE SCRAPER

### Set up model for scraper

In [6]:
def scrape(position, location, pages):
    url = get_url(position, location)
    
    for x in range(1, pages+1): # Loops through pages
        html = browser.html
        job_soup = soup(html, 'html.parser')
        job_listings = job_soup.find_all('div', 'cardOutline')
        
        for listing in job_listings: # Goes through each listing on the page
            record = get_info(listing)
            indeed_records.append(record)
            page = x
#           print(page)
            
        browser.find_by_css('a[aria-label="Next Page"]').click()
#     print (indeed_records)

### Run Scraper

In [7]:
indeed_records = []
scrape('data analyst', 'los angeles ca', 25)

### Save data to dataframe

In [8]:
# saves all listings in indeed_records to a dataframe
records_df = pd.DataFrame(indeed_records, columns = ['Company', 'Title', 'Location', 'Salary', 'Job Summary', 'Listing Date', 'Link', 'Source', 'Date Scraped'])

In [9]:
records_df.head()

Unnamed: 0,Company,Title,Location,Salary,Job Summary,Listing Date,Link,Source,Date Scraped
0,Dogtra Company,title,"Garden Grove, CA 92841","$70,000 - $80,000 a year",Coordinate with other departments such as Sale...,EmployerActive 2 days ago,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,Indeed,04-01-2023
1,Spokeo,title,"Remote in Pasadena, CA 91101","$102,600 - $153,000 a year",5% - Inform and support the executive team in ...,PostedPosted 30+ days ago,https://www.indeed.com/rc/clk?jk=b1a6ad99b83ce...,Indeed,04-01-2023
2,TikTok,title,"Los Angeles, CA","$79,800 - $130,888 a year",This new security-first division was created t...,PostedPosted 15 days ago,https://www.indeed.com/rc/clk?jk=9603141b2a08a...,Indeed,04-01-2023
3,Kaiser Permanente,title,"Pasadena, CA","$85,800 - $110,990 a year",Prepares data for analytic efforts by consolid...,PostedPosted 4 days ago,https://www.indeed.com/rc/clk?jk=15566ec483dce...,Indeed,04-01-2023
4,SAG-Producers Pension Plan,title,"Burbank, CA 91505","$130,732 - $196,098 a year",Minimum 5 years in a quantitative data analyti...,PostedPosted 30+ days ago,https://www.indeed.com/rc/clk?jk=7d11d33788237...,Indeed,04-01-2023


## III. EXPORT

In [11]:
records_df.to_csv('J:\Documents\Projects\jobscraper\indeed_listings.csv', index=False)