In [10]:
from selenium import webdriver
from bs4 import BeautifulSoup
from os import walk
import pymongo
import requests
import time
import json
import re

def getdriver ():
    path=r'chromedriver.exe'
    driver = webdriver.Chrome(executable_path=path)
    driver.implicitly_wait(10)
    driver.set_script_timeout(120)
    driver.set_page_load_timeout(30)
    return driver

def writePage(fname,content):
    with open(fname, "w", encoding="utf-8") as file:
        file.write(str(content))

def read_file (name):
    HTMLFile = open(name, "rb")
    htmlfiledata = HTMLFile.read()
    return BeautifulSoup(htmlfiledata, 'lxml')

def connect_mongodb (db_name, collection_name):
    client = pymongo.MongoClient("mongodb://localhost:27017")
    db = client[db_name]
    collection = db[collection_name]
    return collection

def loadWebsiteData (url):
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        page = requests.get(url,headers=headers)
        # Create a beautifulsoup object
        return BeautifulSoup(page.text, 'lxml')
    except :
        print("Error connecting to website")

def scroll_and_save(url,role):
    files_list = []
    driver = getdriver()
    driver.get(url)
    time.sleep(2)  
    scroll_pause_time = 1 
    screen_height = driver.execute_script("return window.screen.height;")
    i = 1
    link_num = 1
    while True:
        driver.execute_script("window.scrollTo(0, {screen_height}*{i});".format(screen_height=screen_height, i=i))  
        i += 1
        time.sleep(scroll_pause_time)
        scroll_height = driver.execute_script("return document.body.scrollHeight;") 
        try:
            button = driver.find_elements("xpath","//*[contains(text(), 'See more jobs')]")
            button[0].click()
        except:
            print("Button not found")
        if (screen_height) * i > scroll_height:
            break
    writePage(role+'_linkedin_search_results',driver.page_source)
    links = driver.find_elements("css selector","a.base-card__full-link.absolute")
    
    for link in links:
        fname = role+'_'+str(link_num)
        files_list.append(fname)
        time.sleep(2)
        writePage(fname,loadWebsiteData(link.get_attribute('href')))
        link_num = link_num + 1

    print(len(links))
    driver.close()
    return files_list

def parse_and_save(files_list):
    job_posting_list = []
    collection = connect_mongodb ('job_postings', 'job_postings_data')
    for file in files_list :
        job_soup = read_file("ba/"+file)
        role = job_soup.select("h1")[0].text if len(job_soup.select("h1"))>0 else 'NA'
        company = job_soup.select("a.topcard__org-name-link")[0].text.strip() if len(job_soup.select("a.topcard__org-name-link"))>0 else 'NA'
        location =  job_soup.select("span.topcard__flavor.topcard__flavor--bullet")[0].text.strip() if len(job_soup.select("span.topcard__flavor.topcard__flavor--bullet"))>0 else 'NA'
        info = job_soup.select("div.show-more-less-html__markup")[0].text.strip() if len(job_soup.select("div.show-more-less-html__markup"))>0 else 'NA'

        role_code = '3' #'ds'=1,'DA'=2,'BA'=3

        fields = job_soup.select("h3.description__job-criteria-subheader")
        values = job_soup.select("span.description__job-criteria-text.description__job-criteria-text--criteria")

        extra_info = {}

        js = job_soup.findAll('script',type='application/ld+json')
        if(len(js)>0):
            skills = []
            education = []
            salary_est = 'NA'
            years_exp = 'NA'

            json_script = json.loads(js[0].string)
            if 'skills' in json_script.keys():
                skills = json_script['skills']
                extra_info['Skills']=skills

            if 'educationRequirements' in json_script.keys():
                education = json_script['educationRequirements']['credentialCategory'] if 'credentialCategory' in json_script['educationRequirements'].keys() else 'NA'
                extra_info['Education']=education

            if 'experienceRequirements' in json_script.keys():
                if(isinstance(json_script['experienceRequirements'], str)):
                    if(json_script['experienceRequirements']=='no requirements'):
                        years_exp = 0
                    else:
                        years_exp='NA'
                else:
                    years_exp = json_script['experienceRequirements']['monthsOfExperience']/12 if 'monthsOfExperience' in json_script['experienceRequirements'].keys() else 'NA'
                extra_info['Years of Experience']= years_exp

            if 'baseSalary' in json_script.keys():
                salaryinfo = json_script['baseSalary']['value']
                salary_est = '$'+str(salaryinfo['minValue']) +' - '+ '$'+str(salaryinfo['maxValue'])+'/'+salaryinfo['unitText']
                extra_info['Salary Estimate']=salary_est

        for i in range(0,len(fields)):
            extra_info[fields[i].text.strip()] = values[i].text.strip()

        posting = {'Job Role':role,'Role Code':role_code,'Company':company,'Location':location,'Job Description':info,'Additional Details':extra_info}

        print(file)
        print(json.dumps(posting,indent = 4))

        job_posting_list.append(posting)

    #Remaining postings
    collection.insert_many(job_posting_list)

In [12]:
datascience_url = "https://www.linkedin.com/jobs/search?keywords=data%20scientist&location=california&geoId=&trk=public_jobs_jobs-search-bar_search-submit&position=1&pageNum=0"
dataanalyst_url = "https://www.linkedin.com/jobs/search/?currentJobId=3489147354&geoId=102095887&keywords=Data%20Analyst&location=California%2C%20United%20States&refresh=true"
businessanalyst_url = "https://www.linkedin.com/jobs/search?keywords=business%20analyst&location=california&geoId=&trk=public_jobs_jobs-search-bar_search-submit&position=1&pageNum=0"

parse_and_save(scroll_and_save(datascience_url, 'ba'))
parse_and_save(scroll_and_save(dataanalyst_url, 'da'))
parse_and_save(scroll_and_save(businessanalyst_url, 'ba'))

TypeError: __init__() got an unexpected keyword argument 'executable_path'

### Glassdoor

### Go to all 30 search pages and save job postings

In [14]:
def get_jobs_from_glassdoor(url, prefix = 'data', n=20000):    
    pause_time = .2 
    n_per_page = 30 # default 
    driver = getdriver()
    driver_temp = getdriver()  
    driver.get(url)
    
    time.sleep(pause_time)
    for page in range(1,n_per_page+1): 
        ## Click 'x' on log-in popup        
        try:
            dummy_job = driver.find_element("css selector", "a.jobLink.css-1rd3saf.eigr9kq2").click()  
        except ElementClickInterceptedException:
            pass    

        time.sleep(pause_time)
        try:
            x_button = driver.find_element("xpath", "//*[@id='JAModal']/div/div[2]/span").click() 
        except NoSuchElementException:
            pass
        
        ## Get job links
        time.sleep(pause_time)
        job_buttons = driver.find_elements("css selector", "a.jobLink.css-1rd3saf.eigr9kq2")
        
        ## Save pages        
        i = 0
        for button in job_buttons:          
            i += 1
            
            time.sleep(pause_time)            
            try:
                button.click()
            except:
                print(f"try again(button.click): {prefix}_page_{page}_job_{i}")
                time.sleep(pause_time*5)
                button.click()
                
            detail_page_url = button.get_attribute("href")           

            time.sleep(pause_time)            
            try:
                driver_temp.get(detail_page_url)
            except:
                print(f"try again(driver_temp.get): {prefix}_page_{page}_job_{i}")
                time.sleep(pause_time*5)
                driver_temp.get(detail_page_url)
            
            time.sleep(pause_time)            
            f_nm = f"{prefix}_page_{page}_job_{i}.html"
            print(f_nm)        
            writePage(f_nm, driver_temp.page_source)    
            
            # for small size test
            c = (page-1)*n_per_page + i            
            if c >= n: #
                return
        
        ## Go to next page after n_per_page jobs
        time.sleep(pause_time)
        try:
            next_button = driver.find_element("css selector", "button.nextButton.css-1hq9k8.e13qs2071").click()
        except:
            print(f"try again(next_button): {prefix}_page_{page}_job_{i}")
            time.sleep(pause_time*5)
            next_button = driver.find_element("css selector", "button.nextButton.css-1hq9k8.e13qs2071").click()
            

    driver.quit()
    driver_temp.quit()        

In [None]:
datascience_url = "https://www.glassdoor.com/Job/california-data-science-jobs-SRCH_IL.0,10_IS2280_KO11,27.htm"
dataanalyst_url = "https://www.glassdoor.com/Job/california-data-analyst-jobs-SRCH_IL.0,10_IS2280_KO11,27.htm"
businessanalyst_url = "https://www.glassdoor.com/Job/california-business-analyst-jobs-SRCH_IL.0,10_IS2280_KO11,27.htm"

get_jobs_from_glassdoor(businessanalyst_url, prefix = 'ba')
get_jobs_from_glassdoor(datascience_url, prefix = 'ba')
get_jobs_from_glassdoor(dataanalyst_url, prefix = 'da')

### Parse through saved pages and extract information

In [None]:
def extract_job_info(file_list, verbose = True):
    job_posting_dict = []
    for file in file_list:
        print(f"reading: {file} ...")
        job_soup = read_file("ba/"+file)
           
        role_code = '3' #'ds'=1,'DA'=2,'BA'=3
        
        ## Find <script> tag
        def _has_no_attrs(tag):
            return tag.name == 'script' and not tag.attrs     
        js = job_soup.find(_has_no_attrs).text.replace("window.appCache=", "").replace(";", "")        
        json_script = json.loads(js)
                
        ## Extract
        try:
            role = job_soup.select_one("div[data-test='job-title']").find(text = True)
        except:
            role = -1
            
        try:
            company = job_soup.select_one("div[data-test='employer-name']").find(text = True)
        except:
            company = -1     
            
        try:
            location = job_soup.select_one("span[data-test='location']").find(text = True)
        except:
            location = -1  
            
        try:
            salary_est = job_soup.select_one("span.small.css-10zcshf.e1v3ed7e1").find(text = True, recursive = False).strip()
        except:
            salary_est = -1
        
        try:
            education = json_script["initialState"]["jlData"]["header"]["indeedJobAttribute"]["educationLabel"]
        except:
            education = -1
            
        try:
            skill = json_script["initialState"]["jlData"]["header"]["indeedJobAttribute"]["skillsLabel"]
        except:
            skill = -1
        
        try:
            years_exp = json_script["initialState"]["jlData"]["header"]["indeedJobAttribute"]["yearsOfExperienceLabel"]
        except:
            years_exp = -1
            
        try:
            info = re.sub("<.*?>", " ", json_script["initialState"]["jlData"]["job"]["description"])
        except:
            info = -1

        if verbose:
            print(f"\n<< File Name: {file} >>")
            print(f" - Role: {role}")
            print(f" - Company: {company}")
            print(f" - Location: {location}")
            print(f" - Salary Estimate: {salary_est}")
            print(f" - Education: {education}")
            print(f" - Skills: {skill}")
            print(f" - Years of Experience: {years_exp}")
            print(f" - Job Description: {info}")
        
        extra_info = {'Salary Estimate':salary_est,
                      'Education':education,
                      'Skills':skill,
                      'Years of Experience':years_exp}
                      
        posting = {'Job Role':role,
                   'Role Code':role_code,
                   'Company':company,
                   'Location':location,
                   'Job Description':info,
                   'Additional Details':extra_info}
        
        print(json.dumps(posting,indent = 4))   
        job_posting_dict.append(posting)
        
    return job_posting_dict

In [None]:
file_list = []
## Change path
for (dirpath, dirnames, filenames) in walk('C:/Users/Sripriya Srinivasan/Downloads/job-recommender-system/ba'):
    file_list.extend(filenames)
    break

# file_list is a list of saved html files from search results
job_posting_dict = extract_job_info(file_list, verbose = False)

In [None]:
## Store in mongoDB
collection = connect_mongodb ('job_postings', 'job_postings_data')
collection.insert_many(job_posting_dict)