# LinkedIn Scraper

In [1]:
# Scraping
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException

# Misc
import time
from random import randint
from pathlib import Path
import time

# Other modules
from secrets import u_name, pwd

#CSV
import pandas as pd
from pandas.errors import EmptyDataError
from csv import writer, reader

### Global variables

In [2]:
# Name of the .csv database (file name without the extension)
database_name = 'linkedin_database_contacts'

# Variable for not exceeding daily number of clicks
clicks = 0

# Create file if it doesn't exist, otherwise do nothing
file = Path(database_name + ".csv")
file.touch(exist_ok=True)

# Get the first two columns of every row in the database (header row included).
# contact_scraping() compares [name, description] pairs against this list to
# skip contacts that were already saved.
# newline='' is what the csv module expects for files handed to reader(), and
# UTF-8 matches the default encoding pandas uses to read and write this file.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open(file, 'r', newline='', encoding='utf-8') as r_obj:
    csv_r = reader(r_obj)
    filtered = [row[0:2] for row in csv_r]

# Converting the csv to DataFrame object; a freshly created (empty) file has
# nothing to parse, so fall back to an empty DataFrame
try:
    old_df = pd.read_csv(str(file))
except EmptyDataError:
    old_df = pd.DataFrame()

## Connections

In [3]:
# Initialize webdriver
def start(driver_path='/usr/local/bin/chromedriver', login_url='https://www.linkedin.com/login'):
    """Launch Chrome and open the login page.

    Args:
        driver_path: Path to the chromedriver executable. Defaults to the
            location that was previously hard-coded here.
        login_url: Page to load right after the browser starts.

    Returns:
        The Chrome WebDriver instance. The other helpers in this file read it
        from the module-level name ``driver``, so assign the result to that.
    """
    browser = webdriver.Chrome(driver_path)
    browser.get(login_url)

    return browser

# Login
def login():
    """Fill in the login form with ``u_name`` / ``pwd`` and submit it.

    Uses the module-level ``driver``. Randomised pauses of 2-4 seconds are
    inserted before typing, before submitting and after submitting.
    """
    time.sleep(randint(2, 4))

    # Locate both inputs first, then type into them in order
    user_field = driver.find_element_by_xpath('//input[@name="session_key"]')
    pass_field = driver.find_element_by_xpath('//input[@name="session_password"]')
    for field, value in ((user_field, u_name), (pass_field, pwd)):
        field.send_keys(value)

    time.sleep(randint(2, 4))

    submit_button = driver.find_element_by_xpath('//button[@type="submit"]')
    submit_button.click()
    time.sleep(randint(2, 4))

# 'My Network' button
def click_my_network():
    """Wait up to 20 seconds for the 'My Network' link to be clickable, then click it."""
    locator = (By.XPATH, '//*[@data-link-to="mynetwork"]')
    link = WebDriverWait(driver, 20).until(EC.element_to_be_clickable(locator))
    link.click()

# 'Connections' button
def click_my_connections():
    """Wait up to 20 seconds for the 'Connections' entry to be clickable, then click it."""
    locator = (By.XPATH, '//div[@class="mn-community-summary__entity-info"]')
    entry = WebDriverWait(driver, 20).until(EC.element_to_be_clickable(locator))
    entry.click()

# Getting the total number of connections
def get_connections():
    """Read the total number of connections from the connections page header.

    Waits up to 20 seconds for the header element, takes the first
    whitespace-separated token of its text and converts it to an int.

    Returns:
        int: The number shown at the start of the header text.

    Raises:
        IndexError: If the header text is empty.
        ValueError: If the first token is not a number.
    """
    header = WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.XPATH, '//header[@class="mn-connections__header"]'))
    )
    first_token = header.text.split()[0]

    # A count rendered with thousands separators (e.g. "1,234" or "1.234")
    # would make int() fail, so strip the separators before converting.
    number_of_connections = int(first_token.replace(',', '').replace('.', ''))

    return number_of_connections

# Getting the number of visible connections (html)
def get_visible_connections():
    """Return how many connection cards are currently present in the page.

    Waits up to 20 seconds for elements matching the card XPath and counts them.
    """
    card_locator = (
        By.XPATH,
        '//div[@class="scaffold-finite-scroll__content"]/ul/li[@class="mn-connection-card artdeco-list"]',
    )
    cards = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located(card_locator))
    return len(cards)

# Scrolling down until the number of visible connections in the page found is equal to the total number of connections
def scroll_to_end(connections, found_connections, max_stalled_rounds=10):
    """Scroll the connections list until every connection card is loaded.

    Each round scrolls to the bottom of the page, clicks the "load more"
    button if one can be found, and recounts the visible cards.

    Args:
        connections: Total number of connections expected.
        found_connections: Number of cards already visible when called.
        max_stalled_rounds: Give up after this many consecutive rounds in
            which the visible count did not grow. Without this limit the loop
            would never end if the page cannot show as many cards as the
            header total claims.

    Returns:
        None
    """
    stalled_rounds = 0

    while connections > found_connections:
        time.sleep(randint(2, 4))
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        try:
            driver.find_element_by_xpath('//button[@class="artdeco-button artdeco-button--muted artdeco-button--1 artdeco-button--full artdeco-button--secondary ember-view scaffold-finite-scroll__load-button"]').click()
        except Exception:
            # Best effort: the button is not always there. Catch Exception
            # rather than using a bare except so Ctrl-C still interrupts the loop.
            pass
        time.sleep(randint(2, 4))

        visible_now = get_visible_connections()
        if visible_now > found_connections:
            stalled_rounds = 0
        else:
            stalled_rounds += 1
            if stalled_rounds >= max_stalled_rounds:
                print(f'No new connections loaded after {stalled_rounds} rounds; stopping at {visible_now} of {connections}')
                break
        found_connections = visible_now

    return None

# Returning to 'My connections' with all the contacts visible
def restart():
    """Navigate to the connections list and scroll until all contacts are loaded.

    Clicks 'My Network', pauses 4-8 seconds, opens 'Connections', then hands
    the total and currently visible counts to scroll_to_end().
    """
    click_my_network()
    time.sleep(randint(4, 8))
    click_my_connections()

    # Arguments are evaluated left to right: total first, then the visible count
    scroll_to_end(get_connections(), get_visible_connections())

# Getting all the contact details
def get_details():
    """Scrape country, job title, company and email from the open profile page.

    Operates on whatever page the module-level ``driver`` currently shows.
    Opens the contact-info dialog to read the email and dismisses it again.

    Returns:
        tuple: ``(job_title, company, country, email)``; any value that could
        not be found is ``None``.

    Raises:
        selenium.common.exceptions.NoSuchElementException: If the contact-info
            link or the dialog's dismiss button is not on the page.
    """
    # Country: last comma-separated part of the location line
    try:
        location_text = driver.find_element_by_xpath('//div[@class="pb2 pv-text-details__left-panel"]/span[@class="text-body-small inline t-black--light break-words"]').text
        country = location_text.split(', ')[-1]
    except Exception:
        country = None

    # Job-title and Company.
    # Start from None so that a profile without an 'Experience' section returns
    # None values instead of hitting an UnboundLocalError at the return below.
    job_title, company = None, None
    for elem in driver.find_elements_by_class_name('mt4'):
        try:
            # An empty section text makes split()[0] raise IndexError
            if elem.text.split()[0] != 'Experience':
                continue
            job_title = elem.find_element_by_xpath('*//div[@class="display-flex align-items-center"]').text.split('\n')[0]
            company = elem.find_element_by_xpath('*//span[@class="t-14 t-normal"]').text.split('\n')[0].split(' ·')[0]
            break
        except (IndexError, NoSuchElementException):
            # Section is empty or lacks the expected sub-elements: treat the
            # values as missing and keep looking.
            job_title, company = None, None

    # Email
    driver.find_element_by_id('top-card-text-details-contact-info').click()
    time.sleep(randint(2, 4))

    try:
        email = driver.find_element_by_xpath('//section[@class="pv-contact-info__contact-type ci-email"]/div').text
    except Exception:
        email = None

    # Close window
    driver.find_element_by_css_selector("[aria-label=Dismiss]").click()

    return job_title, company, country, email

# Searching for connections not in the database
def contact_scraping(clicks=0, max_clicks=100):
    """Scrape every listed connection that is not yet in the CSV database.

    Iterates over the connection cards on the current page. For each card
    whose (name, description) pair is not in the module-level ``filtered``
    list, the profile is opened in a new tab, scraped with get_details(), and
    the tab is closed again. The new rows are merged with ``old_df``,
    de-duplicated on Email + Name (keeping the newest) and written back to
    ``<database_name>.csv``.

    Args:
        clicks: Number of profile visits already made; counting continues
            from here.
        max_clicks: Stop opening profiles once ``clicks`` reaches this value.

    Returns:
        None
    """
    start_time = time.time()

    names = []
    descriptions = []
    job_titles = []
    companies = []
    countries = []
    emails = []

    database = {'Name': names, 'Description': descriptions, 'Company': companies, 'Job Title': job_titles, 'Country': countries, 'Email': emails}

    # Hash-based lookup instead of scanning the list of rows for every card
    known = {tuple(row) for row in filtered}

    try:
        for contact in driver.find_elements_by_xpath('.//div[@class="mn-connection-card__details"]/a'):
            # >= rather than == so a starting value above the limit still stops
            if clicks >= max_clicks:
                print(f'{clicks} reached!')
                break

            name = contact.find_element_by_xpath('.//span[@class="mn-connection-card__name t-16 t-black t-bold"]').text
            description = contact.find_element_by_xpath('.//span[@class="mn-connection-card__occupation t-14 t-black--light t-normal"]').text

            if (name, description) in known:
                continue

            link = contact.get_attribute('href')
            # Pass the URL as a script argument instead of splicing it into
            # the JavaScript source, so quotes in the URL cannot break it.
            driver.execute_script('window.open(arguments[0], "_blank");', link)
            time.sleep(randint(2, 5))
            driver.switch_to.window(driver.window_handles[1])
            # The returned readyState is not inspected; the fixed sleeps
            # around this call do the actual waiting.
            driver.execute_script('return document.readyState;')

            time.sleep(randint(2, 5))
            job_title, company, country, email = get_details()

            # All six lists grow together so they stay the same length
            names.append(name)
            descriptions.append(description)
            job_titles.append(job_title)
            companies.append(company)
            countries.append(country)
            emails.append(email)

            time.sleep(randint(2, 5))
            driver.close()
            driver.switch_to.window(driver.window_handles[0])
            time.sleep(randint(2, 5))

            clicks += 1
            print(f'Clicks: {clicks}')
    finally:
        # Save in a finally block so contacts scraped before an error or a
        # manual interrupt are not lost.
        this_df = pd.DataFrame.from_dict(database)
        # DataFrame.append was removed in pandas 2.0; pd.concat replaces it.
        # Empty frames are left out of the concat.
        frames = [frame for frame in (old_df, this_df) if not frame.empty]
        combined = pd.concat(frames, ignore_index=True) if frames else this_df
        combined.drop_duplicates(subset=['Email', 'Name'], keep='last').to_csv(f'{database_name}.csv', index=False)

    print("--- %s seconds ---" % (time.time() - start_time))

    return None

## Prospects

In [4]:
# Variables used by the prospect search: position (text typed into the search box) and location (location filter)

In [18]:
def button_click(element_to_click):
    """Click the first <button> whose visible text equals ``element_to_click``.

    Waits up to 2 seconds for button elements to be present, filters them by
    exact text, pauses one second and clicks the first match. Raises
    IndexError when no button carries that text.
    """
    wanted_text = f'{element_to_click}'
    all_buttons = WebDriverWait(driver, 2).until(
        EC.presence_of_all_elements_located((By.TAG_NAME, 'button'))
    )

    matching = []
    for candidate in all_buttons:
        if candidate.text == wanted_text:
            matching.append(candidate)

    time.sleep(1)
    matching[0].click()

def go_down(num):
    """Send the DOWN arrow key to the page body ``num`` times, pausing 2 seconds after each."""
    for _ in range(num):
        # Re-locate <body> on every step, as the page may have changed in between
        driver.find_element_by_tag_name('body').send_keys(Keys.ARROW_DOWN)
        time.sleep(2)
        
def scroll_up_element(element):
    """Scroll ``element`` upwards by 999 pixels, five times in a row, via JavaScript."""
    for _ in range(5):
        driver.execute_script("arguments[0].scrollBy(0,-999);", element)

def find_new_people(position, location):
    """Run a people search for ``position`` filtered by ``location``.

    Types ``position`` into the global search box, switches to the 'People'
    results, opens the 'Locations' filter, types ``location``, picks the
    first suggestion with DOWN + RETURN and clicks 'Show results'. Fixed
    sleeps separate the steps.
    """
    def wait_clickable(xpath):
        # Wait up to 20 seconds for the element to become clickable
        return WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, xpath)))

    search_box = wait_clickable('//input[@class="search-global-typeahead__input always-show-placeholder"]')
    # Command+A; presumably selects any existing query so it gets replaced (macOS shortcut)
    search_box.send_keys(Keys.COMMAND + "a")
    time.sleep(2)
    search_box.send_keys(position)
    time.sleep(1)
    search_box.send_keys(Keys.RETURN)
    time.sleep(2)

    button_click('People')
    time.sleep(5)
    button_click('Locations')

    location_box = wait_clickable('//input[@placeholder="Add a location"]')
    time.sleep(1)
    # Type the location, move to the first suggestion, confirm it
    for keys in (location, Keys.DOWN, Keys.RETURN):
        location_box.send_keys(keys)
        time.sleep(2)

    button_click('Show results')

def prepare():
    """Get the results page ready and return its 'Message' buttons.

    Scrolls to the bottom and back to the top, clicks the second messaging
    overlay header control (per the original comment, this closes the
    messages panel), steps down once, and collects every button whose text
    is exactly 'Message'.

    Returns:
        list: The matching button elements, in document order.
    """
    # Go to bottom then to top
    for scroll_script in (
        "window.scrollTo(0, document.body.scrollHeight);",
        "window.scrollTo(document.body.scrollHeight, 0);",
    ):
        driver.execute_script(scroll_script)
        time.sleep(2)

    # Close messages
    overlay_controls = driver.find_elements_by_xpath('//button[@class="msg-overlay-bubble-header__control msg-overlay-bubble-header__control--new-convo-btn artdeco-button artdeco-button--circle artdeco-button--muted artdeco-button--1 artdeco-button--tertiary ember-view"]')
    overlay_controls[1].click()

    # Go down 1 step
    go_down(1)

    # Find message buttons
    return [
        candidate
        for candidate in driver.find_elements_by_tag_name('button')
        if candidate.text == 'Message'
    ]

In [19]:
# Search parameters; `place` is also filled into the message template by the
# messaging cells further down.
role = 'Head of Legal'
place = 'Germany'

# Needs a logged-in `driver` (run the "Init" cells first)
find_new_people(position=role, location=place)

## Init

In [15]:
# Launch the browser and keep the WebDriver in the module-level `driver`
# that all helper functions in this file read.
driver = start()

In [16]:
# Fill in and submit the login form on the page opened by start()
login()

In [9]:
# restart()
# contact_scraping()

In [20]:
# Quit the WebDriver session
driver.quit()

## Message prospects

In [None]:
# Message to send. The first {} is filled with the recipient's capitalised
# first name, the second with the global `place`.
message = 'Dear {}\n\nI would like to invite you to write a paper for the International In-house Counsel Journal about either regulatory, management or litigation successes your legal team have had in {}.\n\nTo learn more, please visit: https://www.iicj.net/write-a-paper\n\nMichael Bond\nbond@iicj.net'

# Number of clicks (compared against the limit of 100 below)
msg_count = 0

# Current page (zero-based; printed as current_page+1)
current_page = 0

# This cell reads the globals `driver` and `place` defined in other cells.
# NOTE(review): in its current state this looks like a dry run: the submit
# click is commented out and the draft is discarded afterwards. Lines marked
# DELETE are the original author's own markers.
# Loop while fewer than 2 pages have been processed and the click limit is not reached
while current_page < 2:    
    if msg_count == 100:
        print('Reached maximum messages per day limit')
        break
    else:
        # Scroll the page, close the messaging overlay and collect the 'Message' buttons
        msg_btns = prepare()
        # Loop through every person
        # NOTE(review): msg_btns[idx] assumes one 'Message' button per result,
        # in the same order as the results - confirm.
        for idx, contact in enumerate(driver.find_elements_by_xpath('//li[@class="reusable-search__result-container "]')):
            # Get name (first line of the result card's text)
            name = contact.find_elements_by_xpath('./div[@class="entity-result"]')[0].text.split('\n')[0]
            # NOTE(review): msg_count is incremented here and again after a
            # message is typed, so one contact can count twice.
            msg_count += 1 # DELETE
            print('Click {} on {}\nCurrent page: {}\n'.format(msg_count, name, current_page+1)) # DELETE
            msg_btns[idx].click()
            time.sleep(2)
            driver.find_element_by_css_selector("[aria-label=Dismiss]").click() #DELETE
            
            try:
                # Select all messages sent
                pop_msg = driver.find_element_by_class_name('msg-s-message-list')
                time.sleep(2)
                # Go to the top of the conversation to retrieve all messages
                scroll_up_element(pop_msg)
                time.sleep(2)
            except NoSuchElementException:
                # No message list found: nothing to scroll
                pass
            
            # Get all the messages sent (every <p> element on the page)
            paragraphs = driver.find_elements_by_tag_name('p')
            
            # Send a message if not previously sent.
            # message[9:187] skips the 'Dear {}\n\n' greeting and stops before
            # the location placeholder, i.e. the fixed part of the opening sentence.
            if not any([paragraph.text.startswith(message[9:187]) for paragraph in paragraphs]):
                time.sleep(1) # <----- WAIT 1
                # Type the message line by line, pressing RETURN after each line
                for line in message.format(name.split()[0].capitalize(), place).split("\n"):
                    ActionChains(driver).send_keys(line).perform()
                    ActionChains(driver).send_keys(Keys.RETURN).perform()
                    time.sleep(1)
                # Presumably removes the RETURN typed after the last line
                ActionChains(driver).send_keys(Keys.BACKSPACE).perform()
                time.sleep(1) # <----- WAIT 2
                # Sending is currently disabled (submit click commented out):
#                 driver.find_element_by_xpath('//button[@type="submit"]').click()
                time.sleep(2)
                msg_count += 1
            else:
                print('{} was messaged previously, continuing loop...'.format(name))
           
            # Close the conversation window, then click the 'Discard' button
            driver.find_element_by_xpath('//button[@data-control-name="overlay.close_conversation_window"]').click()
            time.sleep(2)
            button_click('Discard') # <----- DELETE

            # Scroll down 3 steps
            go_down(3)
            
            # Click next page
            # NOTE(review): this runs inside the per-contact loop, so the page
            # advances after every contact while the loop keeps iterating over
            # elements from the previous page. It looks like it belongs after
            # the for loop - confirm.
            driver.find_element_by_xpath('//button[@aria-label="Next"]').click()
            current_page += 1
            time.sleep(2)
    # NOTE(review): current_page was already incremented above, so this prints
    # the number of the next page rather than the one just processed - confirm.
    print('\nEND scraping page {}\n'.format(current_page+1))
    
print('DONE!')

## Messaging contacts

In [14]:
# Message to send. The first {} is filled with the recipient's capitalised
# first name, the second with the global `place`.
message = 'Dear {}\n\nI would like to invite you to write a paper for the International In-house Counsel Journal about either regulatory, management or litigation successes your legal team have had in {}.\n\nTo learn more, please visit: https://www.iicj.net/write-a-paper\n\nMichael Bond\nbond@iicj.net'

# This cell reads the globals `driver` and `place` defined in other cells.
# Go to top
driver.execute_script("window.scrollTo(document.body.scrollHeight, 0);")
time.sleep(2)

# Close messages (clicks the second matching overlay header control)
driver.find_elements_by_xpath('//button[@class="msg-overlay-bubble-header__control msg-overlay-bubble-header__control--new-convo-btn artdeco-button artdeco-button--circle artdeco-button--muted artdeco-button--1 artdeco-button--tertiary ember-view"]')[1].click()

# Go down 3 steps
go_down(3)

# Scrolling sequence: number of DOWN steps after each contact, cycled by index
scrolls = [2, 2, 2, 3]

# Find message buttons
buttons = driver.find_elements_by_tag_name('button')
msg_btns = [btn for btn in buttons if btn.text == 'Message']

# Variable to stop iterations (counts messages typed in this run)
stop = 0

# Find all people
# NOTE(review): msg_btns[idx] assumes one 'Message' button per connection
# card, in the same order as the cards - confirm.
for idx, contact in enumerate(driver.find_elements_by_xpath('//li[@class="mn-connection-card artdeco-list"]')):
    if stop >= 25:
        print('REACHED MAXIMUM MESSAGES PER DAY LIMIT')
        break
    else:
        # `stop` only grows when a message is typed, so the same number can be
        # printed for several contacts
        print('Click: {}'.format(stop+1))
        # Get name
        name = contact.find_elements_by_xpath('./div/a/span[@class="mn-connection-card__name t-16 t-black t-bold"]')[0].text
        # Click on 'message' button
        msg_btns[idx].click()
        time.sleep(2)
        try:
            # Select all messages sent
            pop_msg = driver.find_element_by_class_name('msg-s-message-list')
            time.sleep(2)
            # Go to the top of the conversation to retrieve all messages
            scroll_up_element(pop_msg)
            time.sleep(2)
        except NoSuchElementException:
            # No message list found: nothing to scroll
            pass
        
        # Get all the messages sent (every <p> element on the page)
        paragraphs = driver.find_elements_by_tag_name('p')
        # Send a message if not previously sent.
        # message[9:187] skips the 'Dear {}\n\n' greeting and stops before the
        # location placeholder, i.e. the fixed part of the opening sentence.
        if not any([paragraph.text.startswith(message[9:187]) for paragraph in paragraphs]):
            time.sleep(1) # <----- WAIT 1
            # Type the message line by line, pressing RETURN after each line
            for line in message.format(name.split()[0].capitalize(), place).split("\n"):
                ActionChains(driver).send_keys(line).perform()
                ActionChains(driver).send_keys(Keys.RETURN).perform()
                time.sleep(1)
            # Presumably removes the RETURN typed after the last line
            ActionChains(driver).send_keys(Keys.BACKSPACE).perform()
            time.sleep(1) # <----- WAIT 2
            # Sending is currently disabled (submit click commented out):
#             driver.find_element_by_xpath('//button[@type="submit"]').click()
            time.sleep(2)
            stop += 1
        else:
            print('{} was messaged previously, continuing loop...'.format(name))
           
    # Close conversation (runs for every contact, whether or not a message was typed)
    driver.find_element_by_xpath('//button[@data-control-name="overlay.close_conversation_window"]').click()
    time.sleep(2)
    # NOTE(review): button_click raises IndexError when no 'Discard' button is
    # present; confirm the button always appears, including for contacts that
    # were skipped as already messaged.
    button_click('Discard') # <----- DELETE 
    
    # Scroll down to next number of sequence
    go_down(scrolls[idx%len(scrolls)])
    
print('DONE!')

Click: 1
Click: 2
Click: 3
Click: 4
Click: 5
Click: 6
Click: 7


KeyboardInterrupt: 