<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import-Packages" data-toc-modified-id="Import-Packages-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import Packages</a></span></li><li><span><a href="#Functions" data-toc-modified-id="Functions-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Functions</a></span></li></ul></div>

# LinkedIn Scraper

## Import Packages

In [1]:
from selenium.webdriver.common.action_chains import ActionChains
import time
from selenium import webdriver
from config import LinkedInProfile

## Functions

In [16]:
def sign_in(driver, li_profile):
    """
    Log the given browser session into LinkedIn.

    Opens the LinkedIn home page, follows the sign-in link in the top
    navigation, types the account credentials into the login form and
    submits it. Fixed pauses are used between the steps.

    Args:
        driver - Selenium WebDriver: WebDriver created by Selenium
        li_profile - LinkedInProfile: Object from config file; its `username`
            and `password` attributes are typed into the login form
    Returns:
        None
    """
    driver.get('https://www.linkedin.com')
    sign_in_link = driver.find_element_by_xpath("//nav/a[@class='nav__button-secondary']")
    sign_in_link.click()

    # Pause before looking up the form fields on the login page.
    time.sleep(2)

    login_xpaths = ("//input[@id='username']", "//input[@id='password']")
    # Both fields are located first, then filled in, username before password.
    user_field, pass_field = [driver.find_element_by_xpath(xp) for xp in login_xpaths]
    user_field.send_keys(li_profile.username)
    pass_field.send_keys(li_profile.password)

    # Pause again before submitting the form.
    time.sleep(1)

    submit_button = driver.find_element_by_xpath("//button[@type='submit']")
    submit_button.click()

def close_driver(driver):
    """
    Close the current browser window, then shut the WebDriver session down.

    Closing the window is best-effort: any ordinary exception it raises is
    ignored. `driver.quit()` is always attempted afterwards, and errors raised
    by `quit()` itself propagate to the caller.

    Args:
        driver - Selenium WebDriver: WebDriver created by Selenium
    Returns:
        None
    """
    try:
        driver.close()
    except Exception:
        # Deliberately ignored: closing is only a courtesy before quit().
        # Only `Exception` is caught so KeyboardInterrupt/SystemExit still
        # reach the caller instead of being silently swallowed.
        pass
    finally:
        # Runs even if close() was interrupted, so the session is not leaked.
        driver.quit()
    
def click_all_see_more_buttons(driver):
    """
    Click every "See more" button currently found on the loaded page.

    These buttons may appear in Experiences, Education, Volunteer Experiences
    and Licenses & Certifications. Each button is scrolled to before it is
    clicked. Buttons that only appear after a click are not handled by this
    call; run it again to pick them up.

    Args:
        driver - Selenium WebDriver: WebDriver created by Selenium
    Returns:
        None
    """
    see_more_button_xpath = "//button[@class='pv-profile-section__see-more-inline pv-profile-section__text-truncate-toggle link link-without-hover-state']"
    for button in driver.find_elements_by_xpath(see_more_button_xpath):
        # Yields a dict with the button's 'x' and 'y' coordinates, which are
        # then used to scroll the window to the button before clicking it.
        coordinates = button.location_once_scrolled_into_view
        driver.execute_script(f"window.scrollTo({coordinates['x']}, {coordinates['y']});")
        button.click()
    
def check_for_see_more_buttons(driver):
    """
    Report whether the page loaded in `driver` still contains "See more" buttons.

    Args:
        driver - Selenium WebDriver: WebDriver created by Selenium
    Returns:
        Boolean - True when at least one matching button is found, False otherwise
    """
    remaining_buttons = driver.find_elements_by_xpath(
        "//button[@class='pv-profile-section__see-more-inline pv-profile-section__text-truncate-toggle link link-without-hover-state']"
    )
    return len(remaining_buttons) > 0
    
    

def scrape_profile(driver, profile_url):
    """
    Scrape the top card, "about" text, experience and education of a LinkedIn profile.

    Notes:
        - The scraper walks the page in the order its content appears.
        - It relies on many hardcoded class names and ids, so it is likely to
          break whenever the page markup changes.
        - In headless mode, expanding the "about me" section and collecting the
          experience data do not work; neither is a problem when running
          non-headless.
        - Only the Experience and Education sections are collected; Volunteer
          Experiences and Licenses & Certifications are skipped.

    Args:
        driver - Selenium WebDriver: WebDriver created by Selenium
        profile_url - Str: URL of profile to scrape data from
    Returns:
        profile_data - Dict: with the keys 'profile_name', 'connection_degree',
            'profile_location', 'profile_n_conn', 'profile_header',
            'profile_about_me', 'experience' and 'education'. The last two are
            dicts keyed by the id of the list item they were read from:
            experience entries hold 'title' and 'company'; education entries
            hold 'institution' plus 'degree'/'years' when those lines exist.
    """
    driver.maximize_window()
    profile_data = {}

    driver.get(profile_url)

    # --- Top card -----------------------------------------------------------
    base_first_xpath = "//div[@class='application-outlet ']//div[@class='ph5 pb5']/div[@class='display-flex mt2']/div[@class='flex-1 mr5']"
    top_box_uls = driver.find_elements_by_xpath(f"{base_first_xpath}//ul/li")
    # NOTE(review): the fixed positions below assume a particular order of the
    # <li> items in the top card - confirm against the current page markup.
    profile_data['profile_name'] = top_box_uls[0].text
    profile_data['connection_degree'] = top_box_uls[1].text.split('\n')[1]
    profile_data['profile_location'] = top_box_uls[3].text
    profile_data['profile_n_conn'] = top_box_uls[4].text.split(' ')[0]

    profile_data['profile_header'] = driver.find_element_by_xpath(f"{base_first_xpath}//h2").text

    # --- About --------------------------------------------------------------
    about_xpath = "//div[@class='application-outlet ']//div[@class='profile-detail']/div/section/p/span"
    try:
        about_show_more_element = driver.find_element_by_xpath("//a[@id='line-clamp-show-more-button']")

        # Scroll to the "show more" link, then click it to expand the text.
        coordinates = about_show_more_element.location_once_scrolled_into_view
        driver.execute_script(f"window.scrollTo({coordinates['x']}, {coordinates['y']});")

        about_show_more_element.click()
    except Exception as e:
        # Best effort: if the link cannot be found or clicked, report why and
        # read whatever "about" text is visible.
        print(e)
    about_me_lines = driver.find_elements_by_xpath(about_xpath)
    profile_data['profile_about_me'] = " ".join(line.text for line in about_me_lines)

    # Keep expanding until no "See more" button is left on the page.
    while check_for_see_more_buttons(driver):
        click_all_see_more_buttons(driver)

    # --- Experience / Education ---------------------------------------------
    base_metadata_xpath = '//section[@class="pv-profile-section pv-profile-section--reorder-enabled background-section artdeco-container-card ember-view"]'
    section_xpath = base_metadata_xpath + '/div/section'

    experience_dict = {}
    education_dict = {}

    # Read every section's title and id once, up front. Headers and sections
    # are paired by position, so this assumes each section has exactly one
    # header - NOTE(review): confirm against the page markup.
    section_headers = driver.find_elements_by_xpath(section_xpath + '/header')
    section_elements = driver.find_elements_by_xpath(section_xpath)
    sections_info = [
        (header.text, element.get_attribute('id'))
        for header, element in zip(section_headers, section_elements)
    ]

    for section_title, section_id in sections_info:
        if section_title not in ('Experience', 'Education'):
            # Other sections (volunteering, licenses, ...) are not collected.
            continue

        items_xpath = section_xpath + f'[@id="{section_id}"]/ul/li'
        item_ids = [item.get_attribute('id') for item in driver.find_elements_by_xpath(items_xpath)]
        for item_id in item_ids:
            item_xpath_base = items_xpath + f'[@id="{item_id}"]'
            item_dict = {}
            if section_title == 'Experience':
                item_dict['title'] = driver.find_element_by_xpath(item_xpath_base + '//h3').text
                item_dict['company'] = driver.find_element_by_xpath(item_xpath_base + '//p[@class="pv-entity__secondary-title t-14 t-black t-normal"]').text
                experience_dict[item_id] = item_dict
            elif section_title == 'Education':
                item_dict['institution'] = driver.find_element_by_xpath(item_xpath_base + '//h3').text
                edu_p_list = driver.find_elements_by_xpath(item_xpath_base + '//p')

                # Each <p> text is split on newlines and the second line is
                # used as the value. Entries with exactly two <p> elements
                # get neither 'degree' nor 'years'.
                if len(edu_p_list) >= 3:
                    item_dict['degree'] = edu_p_list[0].text.split('\n')[1] + ' ' + edu_p_list[1].text.split('\n')[1]
                    item_dict['years'] = edu_p_list[2].text.split('\n')[1]
                elif len(edu_p_list) == 1:
                    item_dict['years'] = edu_p_list[0].text.split('\n')[1]

                education_dict[item_id] = item_dict

    profile_data['experience'] = experience_dict
    profile_data['education'] = education_dict

    return profile_data



def collect_posts(driver, profile_url, n_pages=2):
    """
    Scan a profile's recent-activity feed and record details of its posts.

    The feed is read `n_pages` times. Each pass waits 3 seconds, re-reads the
    list of feed entries and processes the entries not handled on an earlier
    pass. Every processed entry is scrolled into view afterwards.

    Args:
        driver - Selenium WebDriver: WebDriver created by Selenium
        profile_url - Str: URL of the profile, with or without a trailing slash
        n_pages - Int: number of passes over the feed (default 2)
    Returns:
        posts_dict - Dict: keyed by the id of the feed entry; each value is a
            dict with 'profile_link', 'post_time', 'post_body' and 'post_link'.
            The last three are '' when the matching element is not found.
            Entries without a profile link are printed and left out.
    """
    ## Example activity pages:
    ## https://www.linkedin.com/in/minaiskarous/detail/recent-activity/shares/
    ## https://www.linkedin.com/in/ckirkup/detail/recent-activity/shares/
    ## https://www.linkedin.com/in/julia-breed-11338915a/detail/recent-activity/shares/
    posts_dict = {}

    # Normalise the trailing slash so both URL forms give a valid address.
    driver.get(profile_url.rstrip('/') + '/detail/recent-activity')

    feed_xpath = "//div[@id='voyager-feed']"
    actor_class = "display-flex feed-shared-actor display-flex ember-view"
    # Ids already scraped or rejected, so later passes do not repeat them.
    handled_ids = set()

    for _ in range(n_pages):
        time.sleep(3)
        feed_elements = driver.find_elements_by_xpath(feed_xpath + '/div')
        feed_ids = [item.get_attribute('id') for item in feed_elements]
        for post_id in feed_ids:
            if post_id in handled_ids:
                continue

            post_root_xpath = feed_xpath + f'/div[@id="{post_id}"]'
            post_test_class = driver.find_elements_by_xpath(post_root_xpath + '//div')
            if len(post_test_class) <= 1:
                # Entry has (almost) no content; presumably not rendered yet.
                # Wait, and leave it unhandled so a later pass retries it.
                time.sleep(3)
                continue
            handled_ids.add(post_id)

            profile_xpath = post_root_xpath + f'//div[@class="{actor_class}"]/a'
            alt_profile_xpath = post_root_xpath + f'/div[@class="{actor_class}"]/a'
            post_xpath = post_root_xpath + '//div[@class="feed-shared-update-v2__description-wrapper ember-view"]//span[@class="ember-view"]'
            article_xpath = post_root_xpath + '//article//a'

            post_dict = {}
            try:
                post_dict['profile_link'] = driver.find_element_by_xpath(profile_xpath).get_attribute('href')
            except Exception:
                try:
                    post_dict['profile_link'] = driver.find_element_by_xpath(alt_profile_xpath).get_attribute('href')
                except Exception:
                    # No author link: print the entry for inspection and skip it.
                    print(post_id)
                    print(" ".join([item.text for item in post_test_class]))
                    continue

            # The remaining fields are optional; a missing element yields ''.
            try:
                post_dict['post_time'] = driver.find_element_by_xpath(profile_xpath + '//span[@aria-hidden="true"]').text
            except Exception:
                post_dict['post_time'] = ''
            try:
                post_dict['post_body'] = driver.find_element_by_xpath(post_xpath).text
            except Exception:
                post_dict['post_body'] = ''
            try:
                post_dict['post_link'] = driver.find_element_by_xpath(article_xpath).get_attribute('href')
            except Exception:
                post_dict['post_link'] = ''
            posts_dict[post_id] = post_dict

            # Reading this property scrolls the entry into view; the returned
            # coordinates themselves are not needed.
            _ = driver.find_element_by_xpath(post_root_xpath).location_once_scrolled_into_view
            # Small upward nudge after the scroll.
            # NOTE(review): the reason for -7 is not documented - confirm.
            driver.execute_script("window.scrollBy(0, -7);")

    print(len(posts_dict))
    return posts_dict
    
    
    
    
    

In [18]:
# Alternative target profile: 'https://www.linkedin.com/in/marcie-matthews-6812a22/'
profile_url = 'https://www.linkedin.com/in/ckirkup/'
li_profile = LinkedInProfile()

### Reuse the driver left over from an earlier run of this cell, if any.
### On a fresh kernel the name does not exist yet, so it is looked up through
### globals() instead of being referenced directly (which raises NameError).
driver = globals().get('driver')

return_dict = None
if driver:
    try:
        ### Collect posts from a specific user's activity with the existing driver
        return_dict = collect_posts(driver, profile_url, 50)
    except Exception as e:
        ### The existing driver could not be used; report why and start over below
        print(e)

if return_dict is None:
    ### No usable driver, so create one
    driver = webdriver.Firefox()
    ### Sign it into LinkedIn
    sign_in(driver, li_profile)
    ### Scrape a specific profile
    #scrape_profile(driver, profile_url)
    ### Collect posts from a specific user's activity
    return_dict = collect_posts(driver, profile_url, 10)
    

ember4313
Chris Kirkup’s job update
Status is reachable
Chris Kirkup
 • 1st
Data Science Partnerships at nference
2mo
2 months ago
Congratulate Chris for starting a new position as Data Science Partnerships at nference
24
4 Comments
Like
Comment
Reply to conversation with Congrats Chris
Congrats Chris
Reply to conversation with Congrats! Let's catch up
Congrats! Let's catch up
Reply to conversation with Congratulations!
Congratulations! Chris Kirkup’s job update Chris Kirkup’s job update  Status is reachable
Chris Kirkup
 • 1st
Data Science Partnerships at nference
2mo
2 months ago
Congratulate Chris for starting a new position as Data Science Partnerships at nference Status is reachable
Chris Kirkup
 • 1st
Data Science Partnerships at nference
2mo
2 months ago  Status is reachable Status is reachable Status is reachable Status is reachable Status is reachable Chris Kirkup
 • 1st
Data Science Partnerships at nference
2mo
2 months ago  • 1st Data Science Partnerships at nference 2mo
2 m

In [13]:
# Close the browser window and end the WebDriver session held in `driver`.
close_driver(driver)

In [19]:
# Bare expression: as the last line of a notebook cell it displays the posts
# dictionary returned by the most recent collect_posts call.
return_dict

{'ember203': {'profile_link': 'https://www.linkedin.com/in/eric-nieves-3ba327170?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAACigd34BBQuxjJJQQtJ5ozeD3aEm8adh1XM',
  'post_time': '20h • \n  \n ',
  'post_body': 'I am beyond excited to announce that I have accepted my 2nd co-op at MFS Investment Management! I will be working in the Competitive Analysis position and cannot wait to take on this role in July. I hope everyone stays happy and healthy!',
  'post_link': ''},
 'ember206': {'profile_link': 'https://www.linkedin.com/in/robert-coughlin-03b5397?miniProfileUrn=urn%3Ali%3Afs_miniProfile%3AACoAAAFTG-8BIGNxw4rAeyQr9M4qR9Bcd5iHTxA',
  'post_time': '1w • \n  \n ',
  'post_body': '',
  'post_link': 'https://www.bostonglobe.com/2020/03/05/opinion/how-greater-boston-biomedical-community-is-tackling-coronavirus/?outputType=amp&event=event25&__twitter_impression=true'},
 'ember209': {'profile_link': 'https://www.linkedin.com/in/cailin-joyce-8ab81222?miniProfileUrn=urn%3Ali%3Afs_miniProfile%