# Installing Dependencies

In [1]:
!pip install selenium



In [2]:
import getpass
import os
import re as re
import subprocess
import sys
import time
from pathlib import Path

import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Read in Inputs

In [3]:
# The workbook is expected to sit in the kernel's working directory.
input_path = Path.cwd() / 'input.xlsx'

# Read the sheet with its first column as the index, then keep only the
# column holding the public profile URLs.
raw_urls = pd.read_excel(io=input_path, index_col=0).loc[:, 'Public LI Profile']
raw_urls

NaN         https://www.linkedin.com/in/kree-zhang/
NaN    https://www.linkedin.com/in/michelleahn0418/
Name: Public LI Profile, dtype: object

# Start Chrome Driver

In [4]:
# Copy the ChromeDriver binary from the working directory into the directory
# that holds the running Python executable (presumably so that Selenium can
# find it on PATH -- confirm for your environment).
source = Path(os.path.abspath('')).absolute() / "chromedriver"
dest = os.path.dirname(sys.executable)

# Pass the command as an argument list (no shell) so paths containing spaces or
# shell metacharacters are handled correctly. check=True raises
# CalledProcessError on failure instead of reporting a copy that never happened.
subprocess.run(['cp', str(source), dest], check=True)

print('Copied ChromeDriver at {} to {}'.format(source, dest))

Copied ChromeDriver at /Users/mattzhou/Desktop/Projects/kree/chromedriver to /Users/mattzhou/opt/anaconda3/bin


In [5]:
# Set to True to run Chrome without a visible window. The extra flags below are
# only applied in that mode; with the default (False) Chrome starts with stock
# options, exactly as a bare webdriver.Chrome() would.
HEADLESS = False

chrome_options = webdriver.ChromeOptions()
if HEADLESS:
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')

# Shared browser session used by every later cell.
driver = webdriver.Chrome(options=chrome_options)

# Login to LinkedIn

In [8]:
# Prompt for the LinkedIn credentials at run time; nothing is hard-coded.
USERNAME = input("Username: ")
# getpass does not echo what is typed, so the password never ends up in the
# notebook's saved cell output the way it does with input().
PASSWORD = getpass.getpass("Password: ")

Username: mattzh1314@berkeley.edu
Password: [REDACTED]


In [9]:
# Point the shared browser session at LinkedIn's sign-in page.
login_url = "https://www.linkedin.com/login"
driver.get(login_url)

In [10]:
# Fill in and submit the sign-in form. If the form fields cannot be found, the
# login page was not shown -- presumably because this browser session is
# already authenticated -- so only that case is treated as "already logged in".
# Any other error (e.g. a dead driver) is allowed to surface.
try:
    email = driver.find_element(By.ID, "username")
    email.clear()
    email.send_keys(USERNAME)

    password = driver.find_element(By.ID, "password")
    password.clear()
    password.send_keys(PASSWORD)

    # Pressing Enter in the password field submits the form.
    password.send_keys(Keys.RETURN)
except NoSuchElementException:
    print("Already logged in!")

# Scrape Targets

In [23]:
def scrape(url, delay=10):
    """Scrape the experience and education sections of one LinkedIn profile.

    Drives the module-level ``driver`` (expected to be logged in already).

    Args:
        url: Profile URL to open.
        delay: Maximum number of seconds to wait for the experience section
            to be present in the page.

    Returns:
        A tuple ``(single_experiences, multi_experiences, education)``:

        * ``single_experiences`` -- raw text of every
          ``div.pv-entity__summary-info`` inside the experience section.
        * ``multi_experiences`` -- raw text of every
          ``div.pv-entity__company-summary-info`` inside the experience section.
        * ``education`` -- strings of the form ``"<school> - <first detail>"``,
          one per education entry that has both a school name and at least
          one detail item. Empty when the profile has no education section.

    Raises:
        TimeoutException: If the experience section does not appear within
            ``delay`` seconds.
    """
    driver.get(url)

    try:
        # until() returns the located element, so no second lookup is needed.
        experiences = WebDriverWait(driver, delay).until(
            EC.presence_of_element_located((By.ID, "experience-section"))
        )
    except TimeoutException as exc:
        raise TimeoutException("Scrape failed for {}".format(url)) from exc

    # Best effort: click the "see more" button so collapsed entries are
    # included. The button is absent on short profiles and may not be
    # clickable, so any failure here is deliberately ignored.
    try:
        more_experiences = experiences.find_element(By.CLASS_NAME, 'pv-experience-section__see-more')
        more_experiences.find_element(By.CLASS_NAME, 'pv-profile-section__see-more-inline').click()
    except Exception:
        pass

    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed.
    experiences_soup = bs(experiences.get_attribute('outerHTML'), 'html.parser')

    single_experiences = [
        e.getText() for e in experiences_soup.find_all('div', class_="pv-entity__summary-info")
    ]
    multi_experiences = [
        e.getText() for e in experiences_soup.find_all('div', class_="pv-entity__company-summary-info")
    ]

    education = []

    # A profile without an education section should still yield its experience.
    try:
        education_section = driver.find_element(By.ID, "education-section")
    except NoSuchElementException:
        return single_experiences, multi_experiences, education

    education_soup = bs(education_section.get_attribute('outerHTML'), 'html.parser')
    school_list = education_soup.find('ul', class_="pv-profile-section__section-info")
    if school_list is None:
        return single_experiences, multi_experiences, education

    # Walk only the list items that are direct children of the list; iterating
    # the <ul> itself would also yield whitespace text nodes between them.
    for li_tag in school_list.find_all('li', recursive=False):
        school = li_tag.find('h3', class_="pv-entity__school-name")
        details = li_tag.find_all('span', class_="pv-entity__comma-item")
        if school is None or not details:
            # Entries lacking a school name or any detail item are skipped.
            continue
        education.append('{} - {}'.format(school.getText(), details[0].getText()))

    return single_experiences, multi_experiences, education

In [24]:
# Per-profile scrape output; each key becomes one column of the results table.
results = {'Public LI Profile': [], 'Single Exp': [], 'Multi Exp': [], 'Education': []}
# Sentinel stored in place of scraped values when a profile could not be scraped.
error_text = "Error"

for url in raw_urls:
    try:
        single, multi, education = scrape(url)
    except Exception as exc:
        # Report the failure that actually occurred, then record the sentinel
        # so the profile still gets a row in the results.
        print('Scrape failed for {}: {!r}'.format(url, exc))
        single, multi, education = [error_text], [error_text], [error_text]

    results['Public LI Profile'].append(url)
    results['Single Exp'].append(single)
    results['Multi Exp'].append(multi)
    results['Education'].append(education)

In [25]:
# Turn the collected lists into a table: one row per profile URL, one column
# per key of `results`.
df_results = pd.DataFrame(data=results)
df_results

Unnamed: 0,Public LI Profile,Single Exp,Multi Exp,Education
0,https://www.linkedin.com/in/kree-zhang/,[\nStrategy Associate\nCompany Name\n\n E...,[\n\nCompany Name\nCMG Strategy Consulting\n\n...,"[University of California, Berkeley, Haas Scho..."
1,https://www.linkedin.com/in/michelleahn0418/,[\nBusiness Analyst Intern\nCompany Name\n\n ...,[\n\nCompany Name\nCMG Strategy Consulting\n\n...,"[University of California, Berkeley, Haas Scho..."


In [26]:
# Spot-check the education entries scraped for the first profile (row label 0).
df_results.loc[0, 'Education']

['University of California, Berkeley, Haas School of Business - Bachelor of Science - BS',
 'UC Berkeley College of Engineering - The Sutardja Center for Entrepreneurship & Technology',
 'University of California, Berkeley - Bachelor of Arts - BA']

# Data Cleaning

In [27]:
def _employer_names(entries, line_index):
    """Return the employer name found in each raw experience text.

    Each entry is split on newlines and empty lines are dropped; the line at
    position ``line_index`` (whitespace-stripped) is taken as the employer.
    Entries with too few lines are skipped instead of raising IndexError.
    """
    names = []
    for entry in entries:
        lines = [line for line in entry.split('\n') if line]
        if len(lines) > line_index:
            names.append(lines[line_index].strip())
    return names


cleaned = {'Public LI Profile': [], 'Previous Employers': [], 'Education': []}

rows = zip(
    df_results['Public LI Profile'],
    df_results['Single Exp'],
    df_results['Multi Exp'],
    df_results['Education'],
)

for profile_url, single, multi, education in rows:
    temp = []

    # In the scraped text the employer is the third non-empty line of a
    # single-role entry (title, "Company Name", employer) and the second
    # non-empty line of a multi-role entry ("Company Name", employer).
    for entries, line_index in ((single, 2), (multi, 1)):
        # Slice instead of indexing so an empty list (no entries of this kind
        # on the profile) does not raise IndexError.
        if entries[:1] == [error_text]:
            temp.append(error_text)
        else:
            temp.extend(_employer_names(entries, line_index))

    # dict.fromkeys drops duplicates while keeping first-seen order, so the
    # output is the same on every run (a set has no guaranteed order).
    cleaned['Previous Employers'].append(' | '.join(dict.fromkeys(temp)))
    cleaned['Public LI Profile'].append(profile_url)
    cleaned['Education'].append(' | '.join(education))

cleaned

{'Public LI Profile': ['https://www.linkedin.com/in/kree-zhang/',
  'https://www.linkedin.com/in/michelleahn0418/'],
 'Previous Employers': ['CMG Strategy Consulting | TED Conferences | ASUC Student Union | Ripple | FORM Life Ventures | Human Rights Watch | McKinsey & Company | EY-Parthenon',
  'CMG Strategy Consulting | Sports Business Group at Berkeley | Asthetic Studios | CONNECTED MOBILE | Cisco'],
 'Education': ['University of California, Berkeley, Haas School of Business - Bachelor of Science - BS | UC Berkeley College of Engineering - The Sutardja Center for Entrepreneurship & Technology | University of California, Berkeley - Bachelor of Arts - BA',
  'University of California, Berkeley, Haas School of Business - Bachelor of Science - BS | University of California, Berkeley - Bachelor of Arts - BA | Seoul International School - High School Diploma']}

In [28]:
# Build the final table from the cleaned columns: one row per profile.
df_cleaned = pd.DataFrame(data=cleaned)
df_cleaned

Unnamed: 0,Public LI Profile,Previous Employers,Education
0,https://www.linkedin.com/in/kree-zhang/,CMG Strategy Consulting | TED Conferences | AS...,"University of California, Berkeley, Haas Schoo..."
1,https://www.linkedin.com/in/michelleahn0418/,CMG Strategy Consulting | Sports Business Grou...,"University of California, Berkeley, Haas Schoo..."


# Download Results

In [104]:
# Save the cleaned table, index column included, as output.csv in the
# kernel's working directory.
output_path = Path.cwd() / 'output.csv'
df_cleaned.to_csv(path_or_buf=output_path)