In [57]:
import csv
import os.path
import pickle
import re
import sys
import warnings
from collections import Counter # Keep track of our term counts
#from collections import OrderedDict
from time import sleep # To prevent overwhelming the server between connections

warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

import numpy as np
import pandas as pd # For converting results to a dataframe and bar chart plots
#from bs4 import BeautifulSoup # For HTML parsing
from nltk.corpus import stopwords # Filter out stopwords, such as 'the', 'or', 'and'
from selenium import webdriver
from selenium.common.exceptions import ElementClickInterceptedException
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common import action_chains, keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


### Glassdoor Scraping
Inspired by code created by Diego De Lazzari.
Modified for Python 3, with minor edits for the current Glassdoor version, by ourselves.

In [58]:
def init_driver(driver_path='/Users/truongnghi/Downloads/chromedriver-2'):
    '''Start a Chrome session configured for scraping and return the driver.

    Parameters
    ----------
    driver_path : str, optional
        Location of the chromedriver executable. The default is the path
        this notebook was originally written with; pass your own path when
        running on another machine.

    Returns
    -------
    The Chrome WebDriver instance, maximized, with a 20 second implicit wait.
    '''
    chrome_options = webdriver.ChromeOptions()
    for flag in ('--disable-extensions',
                 '--profile-directory=Default',
                 "--incognito",
                 "--disable-plugins-discovery",
                 "--start-maximized"):
        chrome_options.add_argument(flag)

    browser = webdriver.Chrome(driver_path, options=chrome_options)
    browser.maximize_window()
    # Element lookups poll for up to 20 seconds before raising.
    browser.implicitly_wait(20)

    return browser

##############################################################################

def save_obj(obj, name ):
    """Pickle `obj` into the file `<name>.pkl`, overwriting any existing file."""
    target_path = name + '.pkl'
    with open(target_path, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

###############################################################################

def load_obj(name ):
    """Return the object unpickled from the file `<name>.pkl`."""
    source_path = name + '.pkl'
    with open(source_path, 'rb') as handle:
        restored = pickle.load(handle)
    return restored
###############################################################################

def get_pause():
    """Pick a random pause length, 4 or 5, used as a sleep duration between requests."""
    candidate_seconds = range(4, 6)
    chosen = np.random.choice(candidate_seconds)
    return chosen

###############################################################################\


def searchJobs(browser, jobName, city=None, jobDict = None, link=None):
    '''Run a Glassdoor job search and collect the listings from up to 20 result pages.

    Parameters
    ----------
    browser : selenium WebDriver already showing a page with the search form.
    jobName : str
        Text typed into the keyword box ("sc.keyword").
    city : str, optional
        Text typed into the location box ("sc.location"). When None the box
        is cleared and left empty.
    jobDict : dict, optional
        Existing records keyed by the listing's data-id; updated in place.
        A fresh dict is used when None.
    link : list, optional
        Existing list of (data-id, href) tuples; extended in place.
        A fresh list is used when None.

    Returns
    -------
    (jobDict, link)
        jobDict maps data-id -> the field list returned by do_new_stuff with
        the listing href appended as the last item.

    Errors raised while processing a page are printed and the loop moves on
    to its next iteration (best effort), as in the original implementation.
    '''
    # The None defaults used to crash on jobDict.keys() / link += ...
    if jobDict is None:
        jobDict = {}
    if link is None:
        link = []

    job = browser.find_element_by_id("sc.keyword")  #job title, keywords, or company
    location = browser.find_element_by_id("sc.location") #location search
    sleep(3)
    job.send_keys(jobName)  #type in job name in search
    sleep(2)
    #location form is already populated.
    location.clear()
    if city is not None:
        location.send_keys(city) #type in location name in search

    sleep(2)
    browser.find_element_by_class_name('gd-btn-mkt').click()

    sleep(5)

    for _ in range(20):
        try:
            # Every listing carries a data-id attribute, used as the dict key.
            jobPosting = browser.find_elements_by_class_name('jl')
            sleep(get_pause())

            # (data-id, WebElement) pairs for listings not seen before.
            newPost = [(posting.get_attribute('data-id'), posting) for posting in jobPosting]
            newPost = [pair for pair in newPost if pair[0] not in jobDict]

            if newPost:
                # do_new_stuff returns (data-id, [field, ...]) for each pair.
                print('starting do_new_stuff')
                jobData = list(map(do_new_stuff, newPost))
                print("I'm out of do_new_stuff.")

                # Resolve every link BEFORE touching jobDict: if a lookup
                # fails, no link-less record is left behind that would be
                # skipped as "already seen" on the next pass.
                link_lst = [(job_id, element.find_element_by_tag_name('a').get_attribute('href'))
                            for job_id, element in newPost]
                href_by_id = dict(link_lst)

                print('updating jobDict')
                for job_id, fields in jobData:
                    jobDict[job_id] = list(fields) + [href_by_id[job_id]]

                # update link list. This will be used in get_data part.
                print('Adding to link')
                link += link_lst

            # Move on to the next page of results.
            browser.find_element_by_xpath('//*[@id="FooterPageNav"]/div/ul/li[7]/a').click()

        except Exception as e:
            print(type(e),e)

    return jobDict, link

###############################################################################

def text_cleaner(text):
    '''
    Reduce a block of text to the list of distinct lower-cased terms it contains.

    Inputs: text -- a string (possibly multi-line) to clean
    Outputs: list of unique str terms with English stop words removed, in no
             particular order; None when the text cannot be decoded with the
             unicode_escape codec (for example a malformed backslash escape).

    Only the characters a-z, A-Z, '.', '+' and '3' survive, so terms such as
    "c++" and "d3.js" are kept.
    '''
    print('starting text_cleaner')
    stopws = set(stopwords.words("english"))

    lines = (line.strip() for line in text.splitlines()) # break into lines
    chunks = (phrase.strip() for line in lines for phrase in line.split("  ")) # break multi-headlines into a line each

    # Drop blank chunks; every chunk is followed by a space so words never fuse.
    joined = ''.join(chunk + ' ' for chunk in chunks if chunk)

    # Strip escape sequences and non-ASCII characters, then decode back to str.
    # Leaving the text as bytes (as before) made every token a bytes object,
    # which never compares equal to the str stop words, so nothing was filtered.
    try:
        ascii_text = (joined.encode('utf-8')
                            .decode('unicode_escape')
                            .encode('ascii', 'ignore')
                            .decode('ascii'))
    except UnicodeDecodeError:
        return None

    # Now get rid of any terms that aren't words (include 3 for d3.js, + for C++)
    ascii_text = re.sub(r"[^a-zA-Z.+3]", " ", ascii_text)

    words = ascii_text.lower().split()
    words = [w for w in words if w not in stopws]

    # Only the presence of a term matters, not how often it occurs.
    return list(set(words))


##############################################################################
def string_from_text(pattern, tmp_txt):
    """Return the first line of `tmp_txt` containing `pattern`, minus its first word.

    The remaining whitespace-separated words of that line are concatenated
    without separators. Raises IndexError when no line contains `pattern`.
    """
    hits = []
    for row in tmp_txt.split('\n'):
        if pattern in row:
            words_after_first = row.split()[1:]
            hits.append(''.join(words_after_first))
    return hits[0]

##############################################################################

def do_stuff(a):
    """Map a (job_id, element) pair to (job_id, first four lines of element.text)."""
    job_id = a[0]
    leading_lines = a[1].text.split('\n')[:4]
    return (job_id, leading_lines)

##############################################################################

def do_new_stuff(a):
    """Parse one job card into (job_id, [rating, position, company, city, sal_low, sal_high]).

    Parameters
    ----------
    a : (job_id, element) pair; only ``element.text`` is read.

    Returns
    -------
    tuple
        job_id unchanged, followed by a list holding:
        rating   -- the single "d.d" number in the text, else ''
        position -- the single lower-cased line containing 'sci', 'ana' or
                    'eng'; otherwise the second line of the text, lower-cased
        company  -- second text line starting at a word character, else ''
        city     -- fourth such line, else ''
        sal_low, sal_high -- ints from the two "<digits>K" tokens, else NaN

    Cards with fewer lines than expected yield '' for the missing fields
    instead of raising IndexError.
    """
    print("I'm in do_new_stuff")
    if len(a) ==0:
        print('object is empty')

    tmp = a[1].text

    # A rating is only trusted when exactly one "d.d" number is present.
    raw_rating = re.findall(r'\d\.\d', tmp)
    print('raw_rating = ',raw_rating)
    rating = raw_rating[0] if len(raw_rating) == 1 else ''

    # A salary range is only trusted when exactly two "<digits>K" tokens exist.
    raw_sal_range = re.findall(r'\d+K', tmp)
    print('raw_sal_range = ',raw_sal_range)
    if len(raw_sal_range) == 2:
        sal_low = int(raw_sal_range[0].replace('K',''))
        sal_high = int(raw_sal_range[1].replace('K',''))
    else:
        sal_low = np.nan
        sal_high = np.nan

    # Newline-terminated lines, each starting at its first word character.
    word_lines = re.findall(r'\w.+\n', tmp)
    raw_company = word_lines[1] if len(word_lines) > 1 else ''
    company = raw_company.replace('\n','')
    print('raw_company = ',raw_company)
    raw_city = word_lines[3] if len(word_lines) > 3 else ''
    city_updated = raw_city.replace('\n','')
    print('raw_city= ',raw_city)

    raw_position = re.findall('(.+sci.+|.+ana.+|.+eng.+)',tmp.lower())
    print('raw_position = ',raw_position)
    if len(raw_position) == 1:
        position = raw_position[0]
    else:
        all_lines = tmp.split('\n')
        position = all_lines[1].lower() if len(all_lines) > 1 else ''

    print('Will go out of do_new_stuff.')
    return (a[0],[rating,position,company,city_updated,sal_low,sal_high])

In [59]:
# Resume from the results of earlier runs when they exist.
# Each pickle is handled on its own and only a *missing* file is replaced by
# an empty container. Previously any error while loading either file reset
# both files, wiping data that had already been scraped.
try:
    jobDict = load_obj('glassDoorDict')
except FileNotFoundError:
    jobDict = {}
    save_obj(jobDict, 'glassDoorDict')

try:
    link = load_obj('glassDoorlink')
except FileNotFoundError:
    link = []
    save_obj(link, 'glassDoorlink')

print('len(jobDict) = '+str(len(jobDict))+ ', len(link) = '+str(len(link)))
# Page opened at the start of every search iteration.
website = "https://www.glassdoor.com/blog/tag/job-search/"
browser = init_driver()

len(jobDict) = 0, len(link) = 0


In [60]:
#Get Top 100 cities
MajorCities = pd.read_csv("Major Cities_updated.csv", encoding='utf-8')
# The scraping loop reads `city_list`; this cell used to assign `city_temp`
# and then display the undefined name `citi_list`.
# NOTE(review): the slice [1:100] skips the first row and yields at most 99
# names -- confirm that is intended for a "top 100" list.
city_list = MajorCities['City'][1:100].to_list()
city_temp = city_list  # old name kept in case other cells still use it
city_list

['Chicago',
 'Houston',
 'Phoenix',
 'Philadelphia',
 'San Antonio',
 'San Diego',
 'Dallas',
 'San Jose',
 'Austin',
 'Jacksonville',
 'Fort Worth',
 'Columbus',
 'San Francisco',
 'Charlotte',
 'Indianapolis',
 'Seattle',
 'Denver',
 'Washington',
 'Boston',
 'El Paso',
 'Detroit',
 'Nashville',
 'Portland',
 'Memphis',
 'Oklahoma City',
 'Las Vegas',
 'Louisville',
 'Milwaukee',
 'Albuquerque',
 'Tucson',
 'Fresno',
 'Mesa',
 'Sacramento',
 'Atlanta',
 'Kansas City',
 'Colorado Springs',
 'Miami',
 'Raleigh',
 'Omaha',
 'Long Beach',
 'Virginia Beach',
 'Oakland',
 'Minneapolis',
 'Tulsa',
 'Arlington',
 'Tampa',
 'New Orleans',
 'Wichita',
 'Cleveland',
 'Bakersfield',
 'Aurora',
 'Anaheim',
 'Honolulu',
 'Santa Ana',
 'Riverside',
 'Corpus Christi',
 'Lexington',
 'Stockton',
 'Henderson',
 'Saint Paul',
 'St. Louis',
 'Cincinnati',
 'Pittsburgh',
 'Greensboro',
 'Anchorage',
 'Plano',
 'Lincoln',
 'Orlando',
 'Irvine',
 'Newark',
 'Toledo',
 'Durham',
 'Chula Vista',
 'Fort Wayne

In [None]:
# Run up to 101 searches, each for a randomly chosen job title in a city that
# has not been searched yet during this run. Results are pickled after every
# search so an interrupted run loses at most one iteration.

jobName_lst = ['Data Analyst']
# Copy of the master list, created once: cities are removed from it as they
# are visited. It used to be rebuilt from city_list on every iteration, so a
# visited city could be drawn again.
city_lst = list(city_list)

iter_num = 0
# Also stop once every city has been visited (np.random.choice fails on an empty list).
while iter_num <101 and len(city_lst) > 0: # default 1 ####&&&&
    print('Starting iteration number {}'.format(iter_num))
    sleep(get_pause())
    browser.get(website)

    # Initialize cities and jobs
    jobName = np.random.choice(jobName_lst)
    city = np.random.choice(city_lst)
    city_lst = [element for element in city_lst if element != city]

    print('jobName = '+jobName+ ', city = '+city)

    # search for jobs (short description)
    try:
        # searchJobs updates jobDict and link in place and returns them
        update_jobDict, update_link = searchJobs(browser, jobName, city, jobDict, link)
        sleep(get_pause())
    except Exception as e:
        print(type(e),e)
        sys.exit("Error message")

    print('len(update_jobDict) = '+str(len(update_jobDict))+ ', len(update_link) = '+str(len(update_link)))

    # save dictionary and link
    save_obj(update_jobDict, 'glassDoorDict')
    save_obj(update_link, 'glassDoorlink')

    iter_num += 1

browser.close()

Starting iteration number 0
jobName = Data Analyst, city = Scottsdale
starting do_new_stuff
I'm in do_new_stuff
raw_rating =  ['3.7']
raw_sal_range =  ['50K', '85K']
raw_company =  HonorHealth

raw_city=  Scottsdale, AZ

raw_position =  ['data integrity analyst']
Will go out of do_new_stuff.
I'm in do_new_stuff
raw_rating =  ['3.3']
raw_sal_range =  []
raw_company =  NCU (Northcentral University)

raw_city=  Scottsdale, AZ

raw_position =  ['business data analyst']
Will go out of do_new_stuff.
I'm in do_new_stuff
raw_rating =  ['2.8']
raw_sal_range =  ['48K', '72K']
raw_company =  Blackhawk Network

raw_city=  Phoenix, AZ

raw_position =  ['business data analyst ii']
Will go out of do_new_stuff.
I'm in do_new_stuff
raw_rating =  ['4.1']
raw_sal_range =  []
raw_company =  Corbins Electric

raw_city=  Phoenix, AZ

raw_position =  ['data analyst/programmer']
Will go out of do_new_stuff.
I'm in do_new_stuff
raw_rating =  []
raw_sal_range =  []
raw_company =  Indeed Prime

raw_city=  Phoeni

In [56]:
# Display the scraped records: data-id -> list of fields for that listing.
jobDict

{'3346816932': ['3.0',
  'data analyst',
  'Daughters of Charity Health Centers',
  'New Orleans, LA',
  nan,
  nan,
  'https://www.glassdoor.com/partner/jobListing.htm?pos=101&ao=389273&s=58&guid=0000016d258037f99cfd348d5b7ede6c&src=GD_JOB_AD&t=SR&extid=1&exst=OL&ist=&ast=OL&vt=w&slr=true&ea=1&cs=1_057e15ff&cb=1568292223678&jobListingId=3346816932'],
 '3332469151': ['',
  'louisiana std/hiv program',
  'Louisiana STD/HIV Program',
  'New Orleans, LA',
  nan,
  nan,
  'https://www.glassdoor.com/partner/jobListing.htm?pos=102&ao=389273&s=58&guid=0000016d258037f99cfd348d5b7ede6c&src=GD_JOB_AD&t=SR&extid=1&exst=OL&ist=&ast=OL&vt=w&slr=true&ea=1&cs=1_0ecdc6d9&cb=1568292223679&jobListingId=3332469151'],
 '3299511982': ['2.7',
  'louisiana childrens medical co',
  'Louisiana Childrens Medical Co',
  'New Orleans, LA',
  36,
  62,
  'https://www.glassdoor.com/partner/jobListing.htm?pos=103&ao=4113&s=58&guid=0000016d258037f99cfd348d5b7ede6c&src=GD_JOB_AD&t=SR&extid=1&exst=OL&ist=&ast=OL&vt=w&s

In [54]:
import numpy as np
import pandas as pd

# Column names, in the order the fields are stored for each job id.
headers = ['Rating','Position','Company','City','Min Salary','Max Salary','Link']

# The dict gives one column per job id; transpose to get one row per job.
overall = pd.DataFrame(update_jobDict).transpose()
overall.columns = headers

# Remove newline characters from the two free-text columns.
for text_column in ('Company', 'City'):
    overall[text_column] = overall[text_column].str.replace('\n','')

In [55]:
# Display the table of scraped jobs, one row per job id.
overall

Unnamed: 0,Rating,Position,Company,City,Min Salary,Max Salary,Link
3346816932,3.0,data analyst,Daughters of Charity Health Centers,"New Orleans, LA",,,https://www.glassdoor.com/partner/jobListing.h...
3332469151,,louisiana std/hiv program,Louisiana STD/HIV Program,"New Orleans, LA",,,https://www.glassdoor.com/partner/jobListing.h...
3299511982,2.7,louisiana childrens medical co,Louisiana Childrens Medical Co,"New Orleans, LA",36.0,62.0,https://www.glassdoor.com/partner/jobListing.h...
3324078198,3.3,data analyst supporting the usao with security...,FSA,"New Orleans, LA",47.0,74.0,https://www.glassdoor.com/partner/jobListing.h...
