In [94]:
#########################
## Mark Bjerregaard     #
## MARKBJ@UMICH.EDU     #
#########################

import time
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import json
from bs4 import BeautifulSoup
import requests
import time
import json
import re
from datetime import date


'''
Linkedin job scrape script
Scrapes jobs using selenium and chromedriver available here: https://chromedriver.chromium.org/

Steps:
1. Open the LinkedIn log-in page
2. Spot the cookies pop-up and accept cookies
3. Fill in the e-mail address and password fields and click login
4. Click on "Jobs" in the navigation section at the top
5. Search for the job position "Data Analyst"
6. Scroll to the end of the page, collecting links on the way
7. At the end of the page, go to the next page and keep collecting links
8. After all links are collected, go to each link
9. Click the "see more" button to expand the job description text
10. Scrape the desired data


Items to be scraped 
1. Job title 
2. Company name
3. Company location
4. job description
5. work method (hybrid, remote, on-site)
6. Post date
'''
''

def open_cache(cache_name):
    '''
    Opens the cache file if it exists and loads the JSON into a dictionary, which it then returns.
    If the cache file doesn't exist, cannot be read, or does not hold valid JSON,
    a new empty cache dictionary is returned instead.
    Parameters
    ---
    cache_name: str
        path of the JSON cache file to read
    Returns
    ---
    The opened cache
    '''
    try:
        # the context manager closes the handle even when reading or parsing fails
        with open(cache_name, 'r', encoding='utf8') as cache_file:
            cache_dict = json.load(cache_file)
    except (OSError, ValueError):
        # OSError: missing/unreadable file; ValueError: malformed JSON or invalid UTF-8.
        # Anything else (e.g. KeyboardInterrupt) is deliberately allowed to propagate.
        cache_dict = {}
    return cache_dict

def save_cache(cache_dict, cache_name):
    '''
    Saves the current state of the cache to disk as indented, UTF-8 encoded JSON.
    Parameters
    ---
    cache_dict: dict
        the dictionary to save
    cache_name: str
        path of the JSON file to (over)write
    Returns
    ---
    None
    '''
    # serialise before opening so a non-serialisable cache cannot truncate the existing file
    dumped_json_cache = json.dumps(cache_dict, indent=4, ensure_ascii=False)
    # the context manager closes the handle even if the write fails
    with open(cache_name, 'w', encoding='utf8') as cache_file:
        cache_file.write(dumped_json_cache)


In [95]:
import time
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import json
from cache_functions import save_cache, open_cache

def job_scrape(driver):
    '''
    Collects the job links shown on the current page and adds them to the link cache.
    Input: chromedriver (current webpage)
    output: dict mapping each job link to its running index (the full link cache,
            including links collected on earlier pages/runs)
    '''
    links_cache = 'link_cache.json'
    link_dict = open_cache(links_cache)

    # default=0 keeps the very first run (empty cache) from raising ValueError
    link_count = max(link_dict.values(), default=0)
    for job in driver.find_elements(By.CLASS_NAME, 'job-box'):
        link = job.get_attribute('href')
        # skip elements without an href and links that are already indexed
        if not link or link in link_dict:
            continue
        link_count += 1
        link_dict[link] = link_count

    save_cache(link_dict, links_cache)

    return link_dict


def Crawler(pages, search_area='Copenhagen Metropolitan Area', search_position='Data Analyst'):
    '''
    Drives a Chrome browser through the graduateland.com job search and stores
    the job links of every visited result page via job_scrape().
    Input:
        pages: number of result pages to visit (page 1 is always scraped)
        search_area: text typed into the area filter
        search_position: keywords typed into the position search box
    output: 0 when the crawl has finished
    '''
    # Chrome driver setup & browser open
    options = Options()
    options.add_argument('start-maximized')
    driver = webdriver.Chrome(service=Service('chromedriver.exe'), options=options)
    try:
        driver.implicitly_wait(10)

        # Open page
        driver.get('https://graduateland.com/jobs')
        time.sleep(2)

        # Filtering on area
        area_input_xpath = '//*[@id="search-filters"]/div[46]/div/div[1]/input'
        driver.find_element(By.XPATH, area_input_xpath).send_keys(search_area)
        driver.find_element(By.XPATH, area_input_xpath).click()
        driver.find_element(By.XPATH, '//*[@id="search-filters"]/div[46]/div/div[2]/label[2]').click()

        # Entering position filter keywords
        driver.find_element(By.XPATH, '//*[@id="job-search-form"]/div[1]/div[1]/div[1]/input').send_keys(search_position, Keys.ENTER)
        time.sleep(3)

        # initializing scrape loop at page 1
        page = 1
        print('scraping job links, page:', page)
        job_scrape(driver)

        # Page iterator
        for i in range(1, pages):
            # handling XPATH variations based on page numbers:
            # the position of the "next" anchor shifts over the first pages, then stays at 7
            if i < 3:
                index = i * 2
            elif i < 6:
                index = i + 2
            else:
                # set the anchor index, not the loop counter, so the
                # periodic pause below still triggers on every 20th page
                index = 7
            # Clicking next page button
            driver.find_element(By.XPATH, '//*[@id="timeline"]/div[3]/div/a[{}]'.format(index)).click()
            page += 1
            print('scraping job links, page:', page)
            time.sleep(3)
            # Scraping job post links from page
            job_scrape(driver)
            # long pause every 20 pages
            if i % 20 == 0:
                time.sleep(60)
    finally:
        # always release the browser and the chromedriver process, even on errors
        driver.quit()
    return 0

# Entry point: start a crawl over 50 result pages when this code runs as the main module.
if __name__ == '__main__':
    Crawler(pages=50)

In [347]:
from bs4 import BeautifulSoup
import requests
import time
import json
import re
from datetime import date

def Offline_test(url):
    '''
    Checks whether a job posting has been taken offline.
    The page counts as offline when it contains a <section class="job-offline"> element;
    in that case the url is also removed from the link cache on disk.
    Input: url of the job posting
    output: True if the job is offline, otherwise False
    '''
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    if soup.find('section', class_='job-offline') is None:
        return False

    links_cache = 'link_cache.json'
    link_dict = open_cache(links_cache)
    # the url may not be in the cache (or no longer be); only rewrite the file when it is
    if url in link_dict:
        del link_dict[url]
        save_cache(link_dict, links_cache)
    return True

# Exceptions that simply mean "this element is not on the page".
_PARSE_ERRORS = (AttributeError, IndexError, KeyError, TypeError)


def _extract_or_none(getter):
    '''Runs getter() and returns its value, or None when the looked-up element is missing.'''
    try:
        return getter()
    except _PARSE_ERRORS:
        return None


def _attribute_extract(soup):
    '''
    Returns the cleaned texts of the <p> elements inside <div class="content-description">.
    A paragraph containing triple-newline separators becomes a list of its parts,
    any other paragraph becomes a single string. Returns [] when the div is missing.
    '''
    container = soup.find('div', class_='content-description')
    if container is None:
        return []

    cleaned = []
    for paragraph in container.find_all('p'):
        text = re.sub('  ', '', paragraph.text.strip())
        parts = re.sub('\\n\\n\\n', '___', text).split('___')
        if len(parts) != 1:
            cleaned.append([re.sub('\\n', '', part).lstrip() for part in parts])
        else:
            cleaned.append(re.sub('\\n', ' ', parts[0]))
    return cleaned


def Scraper(url, vert_dict, store_cache):
    '''
    Scrapes one job posting and returns its attributes as a dictionary.
    If url already has a (non-empty) entry in vert_dict, that entry is returned
    and no request is made. Otherwise the page is downloaded and parsed, the result
    is stored in vert_dict[url] and, if store_cache is true, vert_dict is written
    to 'vertex_cache.json'.
    Input:
        url: link to the job posting
        vert_dict: dict used as cache, mapping url -> attribute dict (mutated in place)
        store_cache: whether to persist vert_dict after scraping
    output: dict with the keys link, title, job_desc, expiration, company, industry,
            followers, logo, location, category, job_type, skills, language, date_scraped.
            A value is None when the corresponding element was not found.
    '''
    vertex_cache = 'vertex_cache.json'

    cached = vert_dict.get(url)
    if cached:
        return cached

    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    attr_dict = {'link': url}
    # Title
    title_ = _extract_or_none(lambda: soup.find('div', class_='job-title').find('h1').text)
    attr_dict['title'] = title_
    # Job description
    attr_dict['job_desc'] = _extract_or_none(
        lambda: soup.find('article', class_='box-item job-content').text.lstrip())
    # Expiration (key spelled correctly so that Vertex, which reads 'expiration', finds it)
    attr_dict['expiration'] = _extract_or_none(
        lambda: soup.find('span', class_='text-warning').text.strip())

    # Headline element
    headline = soup.find('div', class_='headline')
    # Company: taken from the headline, else from the "... at <company>" part of the title
    company_ = _extract_or_none(lambda: headline.find('h2').text.strip())
    if company_ is None:
        company_ = _extract_or_none(lambda: re.findall(r'(?<=\sat).(.*)', title_)[-1])
    attr_dict['company'] = company_
    # Industry
    attr_dict['industry'] = _extract_or_none(lambda: headline.find_all('p')[0].text.strip())
    # Followers (digits only)
    attr_dict['followers'] = _extract_or_none(
        lambda: re.sub('[^0-9]', '', headline.find_all('p')[1].text.strip()))
    # logo
    attr_dict['logo'] = _extract_or_none(
        lambda: soup.find('div', class_='company-item').find('img')['src'])

    # Attributes (location, category, job_type, skills, language), in page order
    attributes = _attribute_extract(soup)
    for position, key in enumerate(('location', 'category', 'job_type', 'skills', 'language')):
        attr_dict[key] = attributes[position] if position < len(attributes) else None

    attr_dict['date_scraped'] = date.today().strftime("%d-%m-%Y")

    vert_dict[url] = attr_dict
    if store_cache:
        save_cache(vert_dict, vertex_cache)
    return attr_dict


class Vertex:
    '''
    Graph node for one job posting.
    The attributes are filled from the dictionary returned by Scraper(url, cache, store_cache);
    `neighbors` maps other Vertex objects to the list of words shared with them.
    '''
    def __init__(self, url, cache=None, store_cache=True):
        '''
        Input:
            url: link to the job posting
            cache: dict mapping url -> attribute dict, passed on to Scraper.
                   When omitted, the persisted 'vertex_cache.json' is loaded, so that
                   no dict is shared between instances through a mutable default.
            store_cache: passed on to Scraper (whether to persist the cache after scraping)
        '''
        if cache is None:
            cache = open_cache('vertex_cache.json')

        attr_dict = Scraper(url, cache, store_cache)
        self.attr_dict = attr_dict
        self.title = attr_dict.get('title')
        self.blob = attr_dict.get('job_desc')
        self.company = attr_dict.get('company')
        self.industry = attr_dict.get('industry')
        # also accept the misspelled key 'expiraton' found in older cache entries
        self.expiration = attr_dict.get('expiration', attr_dict.get('expiraton'))
        self.followers = attr_dict.get('followers')
        self.logo = attr_dict.get('logo')
        self.location = attr_dict.get('location')
        self.category = attr_dict.get('category')
        self.job_type = attr_dict.get('job_type')
        self.skills = attr_dict.get('skills')
        self.language_req = attr_dict.get('language')
        self.date_scraped = attr_dict.get('date_scraped')
        self.link = url

        self.neighbors = {}

    def add_neighbor(self, neighbor, word):
        '''Records `word` as shared with `neighbor`, creating the word list on first use.'''
        self.neighbors.setdefault(neighbor, []).append(word)

    def get(self):
        '''Returns the url of this vertex.'''
        return self.link
        
def Generate_Vertices(vertices_count=9999):
    '''
    Builds Vertex objects for the links stored in 'link_cache.json'.
    Links that are not yet in 'vertex_cache.json' are first checked with Offline_test()
    and skipped when the job is offline.
    Input: vertices_count, the maximum number of vertices to create
    output: list of Vertex objects (at most vertices_count of them)
    '''
    links_cache = 'link_cache.json'
    vert_cache = 'vertex_cache.json'
    link_dict = open_cache(links_cache)
    vert_dict = open_cache(vert_cache)
    vertices = []

    for link in link_dict:
        # checked first so that skipped (offline) links can never jump over the limit
        if len(vertices) >= vertices_count:
            break
        # only links that were never scraped need the (network) offline check
        if not vert_dict.get(link) and Offline_test(link):
            continue
        vertices.append(Vertex(link, cache=vert_dict))

    return vertices
 
# Build the vertex list with the count limit set to 10.
vertices_list = Generate_Vertices(vertices_count=10)

10
[<__main__.Vertex object at 0x00000197348D5610>, <__main__.Vertex object at 0x00000197348D5070>, <__main__.Vertex object at 0x00000197348D5310>, <__main__.Vertex object at 0x0000019734172FA0>, <__main__.Vertex object at 0x00000197341725E0>, <__main__.Vertex object at 0x00000197341726A0>, <__main__.Vertex object at 0x00000197341723A0>, <__main__.Vertex object at 0x00000197341725B0>, <__main__.Vertex object at 0x00000197341723D0>, <__main__.Vertex object at 0x0000019734172E20>]


In [348]:
import yake
from deep_translator import GoogleTranslator

def key_word_extractor(extractor, text):
    '''
    Returns the first element of every entry produced by extractor.extract_keywords(text),
    in the order the extractor yields them.
    '''
    return [entry[0] for entry in extractor.extract_keywords(text)]


def generate_graph_dict(vertices_list, criteria):
    '''
    Links vertices that share keywords and returns them as a dictionary.
    For every vertex the text stored under attr_dict[criteria] is translated to English
    (first 3000 characters only) and keywords are extracted with YAKE. The keywords are
    cached per criteria in 'word_cache.json' as
        {'words': {word: [links]}, 'vertex': {link: [words]}}
    Vertices sharing a word are registered as each other's neighbors through
    Vertex.add_neighbor. Vertices whose text is missing or cannot be translated are skipped.
    Input:
        vertices_list: iterable of Vertex objects
        criteria: attribute key to extract keywords from (the cache comment in the
                  original code lists job_desc, skills and category)
    output: dict mapping link -> Vertex for every vertex that was processed
    '''
    # cache load (3 categories, job_desc, skills, category)
    word_cache_str = 'word_cache.json'
    word_cache = open_cache(word_cache_str)
    word_dict = word_cache.get(criteria)
    if not word_dict:
        word_dict = {'words': {}, 'vertex': {}}

    # Yake (keyword generator) config
    language = "en"
    max_ngram_size = 3
    deduplication_threshold = 0.3
    numOfKeywords = 40
    custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, top=numOfKeywords, features=None)

    # Translator for non-english text
    translator = GoogleTranslator(source='auto', target='en')

    # graph
    vertices = {}

    for vertex_obj in vertices_list:
        vertex = vertex_obj.link
        word_list = word_dict['vertex'].get(vertex)

        if not word_list:
            text = vertex_obj.attr_dict.get(criteria)
            if not text:
                # nothing to extract keywords from
                continue
            try:
                text_translated = translator.translate(text[:3000])
            except Exception:
                # best effort: skip vertices that cannot be translated, but let
                # KeyboardInterrupt/SystemExit through so a long run can be aborted
                continue
            word_list = key_word_extractor(custom_kw_extractor, text_translated)
            # only cache the vertex once its keywords are known
            word_dict['vertex'][vertex] = list(word_list)
            for word in word_list:
                word_dict['words'].setdefault(word, []).append(vertex)

        vertices.setdefault(vertex, vertex_obj)

        for word in word_list:
            # .get(word, ()) tolerates a cached word that is missing from the 'words' index
            for neighbor in word_dict['words'].get(word, ()):
                # only link to other vertices that are already part of this graph
                if neighbor == vertex or not vertices.get(neighbor):
                    continue
                vertices[vertex].add_neighbor(vertices[neighbor], word)
                vertices[neighbor].add_neighbor(vertices[vertex], word)

    word_cache[criteria] = word_dict
    save_cache(word_cache, word_cache_str)
    return vertices



# Build the vertex list with the count limit set to 100, then link vertices by
# the keywords found in their job descriptions.
vertices_list = Generate_Vertices(vertices_count=100)
x = generate_graph_dict(vertices_list=vertices_list, criteria='job_desc')

In [351]:
import networkx as nx
from math import sqrt
import random
from bokeh.palettes import Spectral4, Turbo256
def generate_graph_network(graph_dict, threshold=5, rm_isolates=True, seed=None):
    '''
    Builds a networkx graph from the vertex dictionary.
    Every key of graph_dict becomes a node carrying the attributes company, title,
    color and url. Two nodes are connected when they share more than `threshold` words;
    the edge carries company/title of the source vertex, the shared `words` and their
    count as `weight`.
    Input:
        graph_dict: dict mapping link -> Vertex (as returned by generate_graph_dict)
        threshold: an edge is created only if more than this many words are shared
        rm_isolates: drop nodes that end up without any edge
        seed: optional seed for the colour shuffle; with a seed the company -> colour
              assignment is reproducible, without one the global random state is used
    output: networkx.Graph
    '''
    G = nx.Graph()
    G.add_nodes_from(graph_dict)

    # generate color_map: one palette colour per company (sorted so a seed gives stable colours)
    unique_companies = sorted(
        {vertex.company for vertex in graph_dict.values()},
        key=lambda company: '' if company is None else str(company))
    colors = list(Turbo256)
    if seed is None:
        random.shuffle(colors)
    else:
        random.Random(seed).shuffle(colors)
    # cycle through the palette so more companies than colours cannot raise IndexError
    color_dict = {company: colors[position % len(colors)]
                  for position, company in enumerate(unique_companies)}

    edges = []
    node_attrs = {}

    for link, vertex in graph_dict.items():
        node_attrs[link] = {'company': vertex.company, 'title': vertex.title,
                            'color': color_dict[vertex.company], 'url': vertex.link}
        for neighbor, words in vertex.neighbors.items():
            if len(words) > threshold:
                edges.append((link, neighbor.link,
                              {'company': vertex.company, 'title': vertex.title,
                               'words': words, 'weight': len(words)}))

    G.add_edges_from(edges)
    nx.set_node_attributes(G, node_attrs)

    if rm_isolates:
        # materialise first: the graph must not change while isolates are being listed
        G.remove_nodes_from(list(nx.isolates(G)))

    return G
# Turn the keyword-linked vertices into a graph using the default threshold.
graph = generate_graph_network(graph_dict=x)

In [353]:
from bokeh.io import output_file, output_notebook
from bokeh.plotting import figure, show, from_networkx
from bokeh.models import (BoxSelectTool, Circle, EdgesAndLinkedNodes, HoverTool,
                          MultiLine, NodesAndLinkedEdges, Plot, Range1d, TapTool, NodesOnly)
from bokeh.palettes import Spectral4, Turbo256
from bokeh.transform import linear_cmap

#output_file('output_graph.html', title='graph_network')
def visualize(G, inspect_edges=False):
    '''
    Draws graph G as an interactive Bokeh plot using a spring layout.
    Output is configured for both the notebook and the file "networkx_graph.html"
    before the plot is shown.
    Input:
        G: graph whose nodes carry a 'color' attribute (used as fill colour);
           the hover tooltips refer to the company, title and url fields
        inspect_edges: when True, hovering inspects edges (and their linked nodes) and the
                       tooltip additionally shows the 'words' field as keywords;
                       when False, hovering inspects nodes only
    output: None
    '''
    output_notebook()
    output_file("networkx_graph.html")
    plot = figure(title="Networkx Integration Demonstration", x_range=(-1.1,1.1), y_range=(-1.1,1.1))
    graph_renderer = from_networkx(G, nx.spring_layout, scale=1, center=(0,0))

    # selection always highlights a node together with its edges; only hovering differs
    graph_renderer.selection_policy = NodesAndLinkedEdges()
    graph_renderer.inspection_policy = EdgesAndLinkedNodes() if inspect_edges else NodesOnly()

    hover_fields = [('company','@company'), ('title','@title'), ('url', '@url')]
    if inspect_edges:
        hover_fields.append(('keywords', '@words'))
    plot.add_tools(HoverTool(tooltips=hover_fields), TapTool(), BoxSelectTool())

    # node styling: fill colour comes from the per-node 'color' attribute
    node_renderer = graph_renderer.node_renderer
    node_renderer.data_source.data['color'] = [attrs['color'] for _, attrs in G.nodes(data=True)]
    node_renderer.glyph = Circle(size=15, fill_color='color', )
    node_renderer.selection_glyph = Circle(size=15, fill_color=Spectral4[2])
    node_renderer.hover_glyph = Circle(size=15, fill_color=Spectral4[1])

    # edge styling
    edge_renderer = graph_renderer.edge_renderer
    edge_renderer.glyph = MultiLine(line_color="#CCCCCC", line_alpha=0.8, line_width=5)
    edge_renderer.selection_glyph = MultiLine(line_color=Spectral4[2], line_width=5)
    edge_renderer.hover_glyph = MultiLine(line_color=Spectral4[1], line_width=5)

    plot.renderers.append(graph_renderer)

    show(plot)
# Render the graph with node-only hover inspection (the default).
visualize(G=graph)

{'Sund & Bælt Holding A/S': '#55fa76', 'Ingeniørforeningen, IDA': '#3c3285', 'Finans Danmark': '#db3a07', 'UNICEF Danmark': '#4673eb', 'Topsoe A/S': '#3f98fe', 'Cadeler A/S': '#a6fb3a', 'Magasin': '#4675ed', 'Danske Spil': '#b9f534', 'Erhvervsministeriet': '#7d0502', 'Transportministeriet': '#19e3b8', 'Vinderstrategi A/S': '#fda932', 'Økonomistyrelsen': '#424bb5', 'Deloitte Denmark': '#e1dc37', 'DM': '#3c9dfd', 'MAN Energy Solutions': '#e8d538', '2BM A/S': '#a61401', 'Arbejdsløshedskassen for Journalistik, Kommunikation og Sprog': '#fd952b', 'Milestone Systems': '#fcb035', 'Kemp & Lauritzen': '#4145ab', 'DAHL Advokatpartnerselskab': '#3a2c79', 'Energinet': '#c92903', 'Gladsaxe Kommune': '#98fe42', 'Ældre Sagen': '#59fb72'}


['Deloitte Denmark']


In [331]:
# Scratch check: rebuild a company -> colour mapping for the nodes that are left in `graph`
# and show which companies it covers.
unique_companies = list({x[node].company for node in graph})
colors = list(Turbo256)
random.shuffle(colors)
color_dict = {}
for position, company in enumerate(unique_companies):
    color_dict[company] = colors[position]

print(color_dict.keys())

dict_keys(['Ingeniørforeningen, IDA', 'Omnicom Media Group', 'Finans Danmark', 'UNICEF Danmark', 'Dansk Industri', 'Cadeler A/S', 'Danske Spil', 'Erhvervsministeriet', 'Transportministeriet', 'Alm. Brand', 'Økonomistyrelsen', 'Deloitte Denmark', 'Albertslund Kommune', 'Skatteministeriet', 'DSV A/S', 'MAN Energy Solutions', '2BM A/S', 'De Danske Bilimportører', 'Arbejdsløshedskassen for Journalistik, Kommunikation og Sprog', 'Milestone Systems', 'Kemp & Lauritzen', 'KL - på vegne af de danske kommuner', 'AP Pension', 'KPMG Acor Tax', 'Demant', 'VisitDenmark', 'PwC Denmark', 'Ørsted', 'DAHL Advokatpartnerselskab', 'Lederne', 'Gladsaxe Kommune', 'Ældre Sagen'])


In [337]:
# Scratch check: print the company and colour stored on every node of `graph`.
test1 = nx.get_node_attributes(graph, 'color')
test2 = nx.get_node_attributes(graph, 'company')
for node, node_color in test1.items():
    print(test2[node], node_color)

Omnicom Media Group #a91501
Deloitte Denmark #3e9bfe
Deloitte Denmark #3e9bfe
Ingeniørforeningen, IDA #3c3285
Gladsaxe Kommune #4670e8
Arbejdsløshedskassen for Journalistik, Kommunikation og Sprog #392972
DAHL Advokatpartnerselskab #f8be39
Ingeniørforeningen, IDA #3c3285
Alm. Brand #ed550f
Ingeniørforeningen, IDA #3c3285
Økonomistyrelsen #f9ba38
MAN Energy Solutions #fa7d20
Transportministeriet #20e9ac
Milestone Systems #ae1801
Kemp & Lauritzen #c02302
Cadeler A/S #fda932
Cadeler A/S #fda932
Danske Spil #24eca6
Finans Danmark #bef334
2BM A/S #e1dc37
Erhvervsministeriet #51f979
DSV A/S #25c0e6
Ældre Sagen #8b0901
UNICEF Danmark #32184a
Albertslund Kommune #34aaf8
Demant #e04008
Skatteministeriet #9efd3e
Demant #e04008
Demant #e04008
Demant #e04008
PwC Denmark #4291fe
PwC Denmark #4291fe
Demant #e04008
Demant #e04008
Demant #e04008
AP Pension #fb8022
Lederne #f0cb3a
Dansk Industri #2db4f1
Ørsted #1bcfd4
Ørsted #1bcfd4
Ørsted #1bcfd4
Ørsted #1bcfd4
Ørsted #1bcfd4
Ørsted #1bcfd4
Ørsted #1b