In [2]:
import selenium
from selenium import webdriver

In [98]:
import pandas as pd
from pandas import DataFrame
import time
import re

## Prepping before the scrape
- load the search page with the query pre-filled in the URL
- pre-load the job postings by expanding the results list (starting with about 1,000 posts)
- define helper functions

In [19]:
# Start a Chrome session through the chromedriver binary at the given path.
# Make sure to download the correct chromedriver and update the path to its
# directory before running this cell.
wd = webdriver.Chrome(executable_path='./chromedriver.exe')
# Search URL with the query pre-filled through its parameters
# (keywords=Data Analyst, location=San Francisco Bay Area, plus filter flags).
url = 'https://www.linkedin.com/jobs/search?keywords=Data%20Analyst&location=San%20Francisco%20Bay%20Area&locationId=&geoId=90000084&sortBy=R&f_TPR=&f_E=2%2C3&position=1&pageNum=0'
# Load the results page, then maximize the browser window.
wd.get(url)
wd.maximize_window()

In [83]:
# Convert the job-count text shown in the page header into an int.
# Every non-digit character is stripped instead of slicing off the last
# character: the slice presumably targeted a trailing suffix such as the "+"
# in "4,000+", but it would also cut a real digit from a count displayed
# without a suffix (e.g. "523" would become 52).
no_of_jobs = wd.find_element_by_css_selector('h1>span').get_attribute('innerText')
no_of_jobs_str = re.sub(r'\D', '', no_of_jobs)
# int() still raises ValueError if the header contains no digits at all.
no_of_jobs_int = int(no_of_jobs_str)
no_of_jobs_int

4000

In [84]:
# Expand the listing until roughly TARGET_POSTS postings have been requested.
TARGET_POSTS = 1000      # just want about 1000 postings; use no_of_jobs_int to load every result
POSTS_PER_LOAD = 25      # divisor used by the original loop bound -- presumably posts per load, TODO confirm
LOAD_WAIT_SECONDS = 5    # pause after each attempt so new postings can load

for _ in range(TARGET_POSTS // POSTS_PER_LOAD):
    # Scroll to the bottom of the page, then try the explicit "show more" button.
    wd.execute_script('window.scrollTo(0,document.body.scrollHeight);')
    try:
        wd.find_element_by_class_name('infinite-scroller__show-more-button.infinite-scroller__show-more-button').click()
    except Exception:
        # Best effort: the button may be absent or not clickable on this pass.
        # Catching Exception (not a bare except) keeps Ctrl-C able to stop the loop.
        pass
    time.sleep(LOAD_WAIT_SECONDS)

In [100]:
# filtering out unwanted job posts
def jobIsNotWanted(title):
    """Return True when a posting with this title should be skipped.

    A title is wanted only if it contains 'analy' (case-insensitive) and none
    of the seniority markers 'senior', 'sr.' or 'lead'.
    """
    lowered = title.lower()
    # Not an analyst/analytics style title at all.
    if 'analy' not in lowered:
        return True
    # Analyst title: unwanted only when it carries a seniority marker.
    return any(marker in lowered for marker in ('senior', 'sr.', 'lead'))

In [65]:
# click the show more button to expand the job post description html panel 
def clickShowMore(section):
    """Click the first 'Show more' button found inside `section`, if any.

    Parameters
    ----------
    section : element exposing ``find_elements_by_tag_name``
        Container that is searched for ``button`` elements.

    Returns
    -------
    None
        The function only has the side effect of clicking a button.

    Notes
    -----
    A button that raises while being read or clicked is skipped and the next
    one is tried. Only ``Exception`` subclasses are suppressed, so
    KeyboardInterrupt and SystemExit still propagate.
    """
    # An empty result simply means the loop body never runs.
    for button in section.find_elements_by_tag_name('button'):
        try:
            if button.text == 'Show more':
                button.click()
                break
        except Exception:
            # Best effort: move on to the next candidate button.
            continue

## Scraping process:
- define the desired fields (id, title, company, location, description).
- grab all job postings and iterate over each post.
- grab and fill the defined fields.
- skip any posting without a populated description panel or with a missing "Show more" button, since its data is incomplete.

In [104]:
# Accumulators filled by the scraping loop: four parallel lists (one entry per
# kept posting) and a dict mapping each posting id to its description text.
job_id, job_title, company_name, location = [], [], [], []
description = dict()

In [105]:
##### use this block if list expanded to total posting before hand
# Walk every posting in the results list, open its detail panel, and record
# id, title, company, location and description for the postings worth keeping.
job_listing = wd.find_element_by_class_name('jobs-search__results-list')
jobs = job_listing.find_elements_by_tag_name('li')

for job in jobs:
    # click on each li ele to populate the job card
    job.click()
    time.sleep(0.1)

    section = wd.find_element_by_xpath('/html/body/div[1]/div/section')
    clickShowMore(section)

    # The posting id is whatever follows 'JobId=' in the current address.
    # A separate name is used so the search `url` defined earlier is not overwritten.
    url_parts = wd.current_url.split('JobId=')
    if len(url_parts) < 2:
        # No id in the address: nothing to key this posting on, so skip it.
        continue
    posting_id = url_parts[1]

    title_text = job.find_element_by_css_selector('div>h3').text
    desc = section.text

    # Skip postings that are already recorded, not wanted, or have no populated
    # description panel. These checks are combined with `or`: the earlier `and`
    # only skipped a duplicate/unwanted posting when its description was also empty.
    if posting_id in job_id or jobIsNotWanted(title_text) or len(desc) <= 1:
        continue

    # Read the remaining fields BEFORE appending anything, so a failed lookup
    # cannot leave the parallel lists with different lengths.
    company_text = job.find_element_by_css_selector('div>h4').text
    location_text = job.find_element_by_class_name('job-search-card__location').text

    job_id.append(posting_id)
    job_title.append(title_text)
    company_name.append(company_text)
    location.append(location_text)
    description[posting_id] = desc

    time.sleep(1)

## Converting data to DF and read/write to .csv files
- save to files so the data can be worked on at a later time.
- avoid repeating the scraping process if the remaining steps cannot be completed in one sitting.

In [108]:
# Gather the scraped columns under their CSV header names, build the frame,
# and write it to jobs.csv without the index column.
job_dict = dict(
    id=job_id,
    title=job_title,
    company_name=company_name,
    location=location,
)

job_df = pd.DataFrame(job_dict)
job_df.to_csv('jobs.csv', index=False)

In [109]:
# One row per posting: its id and the description text captured for it.
desc_columns = {
    'job_id': list(description.keys()),
    'description': list(description.values()),
}
job_desc_df = pd.DataFrame(desc_columns)
job_desc_df.to_csv('jobs_desc.csv', index=False)

# Scrubbing and handling null values
- separate the city and state fields
- find the zipcode
- remove duplicates, if any, and remove all nulls from both the job and job_desc csv files

In [73]:
# Reload the saved descriptions so this step works on a fresh kernel;
# `jobs_desc_df` was not defined anywhere above. Reading the CSV back also
# turns empty description cells into NaN (pandas' default), which is what
# dropna() below removes. Ids are kept as text.
jobs_desc_df = pd.read_csv('jobs_desc.csv', dtype={'job_id': str})

# drop nulls from desc csv
non_null_jobs_desc = jobs_desc_df.dropna(how='any').copy()
# Keep each surviving row's original index label as an explicit column.
desc_id = non_null_jobs_desc.index
non_null_jobs_desc['desc_id'] = desc_id
non_null_jobs_desc.to_csv('filtered_job_desc.csv', index=False)

486


In [75]:
# Keep only the jobs whose id still has a description after the null filter.
# The jobs table is reloaded from jobs.csv because `jobs_df` (and the
# `temp_jobs_df` the old loop referenced) was never defined above.
jobs_df = pd.read_csv('jobs.csv')

# Compare ids as text on both sides so a dtype difference between the two
# frames cannot make every comparison fail.
job_id_from_desc = non_null_jobs_desc['job_id'].astype(str).values
has_description = jobs_df['id'].astype(str).isin(job_id_from_desc)

# One boolean mask replaces the row-by-row in-place drop; .copy() gives an
# independent frame that later cells can add columns to. Index labels of the
# surviving rows are unchanged.
new_jobs_df = jobs_df[has_description].copy()

In [32]:
from uszipcode import SearchEngine, SimpleZipcode, Zipcode

# zipcode finder helper function
def getZipcode(city_name):
    """Return the zipcode of the first match for a California city, or None.

    Parameters
    ----------
    city_name : str
        City to look up; the state is fixed to 'CA'.

    Returns
    -------
    The ``zipcode`` attribute of the first search result, or None when the
    input is not a non-empty string or the search returns no results.
    """
    # Missing values (None, NaN, blank text) cannot be looked up.
    if not isinstance(city_name, str) or not city_name.strip():
        return None

    # Build the search engine once and reuse it on later calls instead of
    # constructing a new one per lookup.
    search = getattr(getZipcode, '_search_engine', None)
    if search is None:
        search = SearchEngine()
        getZipcode._search_engine = search

    matches = search.by_city_and_state(city_name, 'CA')
    if len(matches) < 1:
        return None
    return matches[0].zipcode

In [24]:
# separate city name and state
city_state = new_jobs_df['location'].values
city = []
state = []

for city_name in city_state:
    if isinstance(city_name, str) and ', CA' in city_name:
        # e.g. 'San Jose, CA' -> ['San Jose', 'CA']
        location_parts = city_name.split(', ')
    else:
        # Anything else (a region name, a missing value) falls back to the
        # default city. A fresh list is built here: the old code assigned into
        # `location_parts` by index, which fails with NameError when the first
        # row takes this branch and otherwise edits the previous row's list.
        location_parts = ['San Francisco', 'CA']
    city.append(location_parts[0])
    state.append(location_parts[1])

new_jobs_df['city'] = city
new_jobs_df['state'] = state

In [36]:
#extracting zipcode
# Look up by the separated 'city' column: getZipcode takes a city name and
# fixes the state to 'CA' itself, so the combined 'location' text
# (e.g. 'San Jose, CA') is not the right argument.
job_cities = new_jobs_df['city'].values
zipcode_by_city = {}   # each distinct city is looked up only once
location_zipcode = []

# The loop variable is deliberately NOT named `location`, which would replace
# the scraped `location` list with a string.
for job_city in job_cities:
    if job_city not in zipcode_by_city:
        zipcode_by_city[job_city] = getZipcode(job_city)
    location_zipcode.append(zipcode_by_city[job_city])

new_jobs_df['zipcode'] = location_zipcode

In [40]:
# Save the filtered jobs table to a new csv; index=False leaves the
# DataFrame index out of the file.
new_jobs_df.to_csv('filtered_job.csv', index = False)

# Analyzing job description for tools required and experience levels
- check for and create a csv file listing the tools each job mentions:
    - SQL, Python, R, Tableau, Power BI, Excel, Looker, PowerPoint, Google Sheets/G Suite, Jupyter, etc...
- what experience level does each job require?

In [None]:
# move on to Text Analysis NLP(NLTK) project 