# LinkedIn Scraper

In [15]:
# Scraping
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait

# Misc
import time
from random import randint
from pathlib import Path
import time

# Other modules
from secrets import u_name, pwd

#CSV
import pandas as pd
from pandas.errors import EmptyDataError
from csv import writer, reader

### Global variables

In [27]:
# Name of the .csv database (file name without the extension)
database_name = 'linkedin_database_contacts'

# Variable for not exceeding daily number of clicks
clicks = 0

# Create file if it doesn't exist, otherwise do nothing
file = Path(database_name + ".csv")
file.touch(exist_ok=True)

# Get the first two columns of every stored row; contact_scraping() compares
# [name, description] pairs against this list to skip contacts already saved.
# newline='' is what the csv module requires for file objects, and UTF-8 matches
# the encoding pandas uses by default when it writes the database.
with open(file, 'r', newline='', encoding='utf-8') as r_obj:
    csv_r = reader(r_obj)
    filtered = [row[0:2] for row in csv_r]

# Converting the csv to DataFrame object (an empty file gives an empty frame)
try:
    old_df = pd.read_csv(str(file))
except EmptyDataError:
    old_df = pd.DataFrame()

### Functions

In [28]:
# Initialize webdriver
def start(driver_path='/usr/local/bin/chromedriver', login_url='https://www.linkedin.com/login'):
    """Launch Chrome and open the LinkedIn login page.

    Args:
        driver_path: Location of the chromedriver executable.
        login_url: Page loaded right after the browser starts.

    Returns:
        The started Chrome WebDriver instance.
    """
    from selenium.webdriver.chrome.service import Service

    try:
        # Selenium 4 receives the chromedriver location through a Service object.
        driver = webdriver.Chrome(service=Service(driver_path))
    except TypeError:
        # Selenium 3 has no `service` keyword; it takes the path positionally.
        driver = webdriver.Chrome(driver_path)

    driver.get(login_url)

    return driver

# Login
def login():
    """Fill in the login form with the stored credentials and submit it.

    Works on the module-level `driver`, which must already be showing the
    login page, and types the imported `u_name` / `pwd` values into the form.
    Randomised pauses of 2-4 seconds are inserted between the steps.
    """
    time.sleep(randint(2,4))

    # find_element(By.XPATH, ...) is used instead of find_element_by_xpath,
    # which newer Selenium releases no longer provide.
    username = driver.find_element(By.XPATH, '//input[@name="session_key"]')
    password = driver.find_element(By.XPATH, '//input[@name="session_password"]')

    username.send_keys(u_name)
    password.send_keys(pwd)

    time.sleep(randint(2,4))

    driver.find_element(By.XPATH, '//button[@type="submit"]').click()
    time.sleep(randint(2,4))

    return None

# 'My Network' button
def click_my_network():
    """Wait up to 20 seconds for the 'My Network' link to be clickable, then click it."""
    locator = (By.XPATH, '//*[@data-link-to="mynetwork"]')
    button = WebDriverWait(driver, 20).until(EC.element_to_be_clickable(locator))
    button.click()

# 'Connections' button
def click_my_connections():
    """Wait up to 20 seconds for the 'Connections' summary entry to be clickable, then click it."""
    locator = (By.XPATH, '//div[@class="mn-community-summary__entity-info"]')
    entry = WebDriverWait(driver, 20).until(EC.element_to_be_clickable(locator))
    entry.click()

# Turning the header text (e.g. "1,234 Connections") into an integer
def _parse_connection_count(header_text):
    """Return the integer found in the first word of *header_text*.

    Grouping characters such as ',' or '.' are dropped, so "1,234" parses as
    1234 instead of raising ValueError.

    Raises:
        IndexError: If the text contains no words at all.
        ValueError: If the first word contains no digits.
    """
    first_word = header_text.split()[0]
    digits = ''.join(ch for ch in first_word if ch.isdigit())
    return int(digits)

# Getting the total number of connections
def get_connections():
    """Read the total number of connections from the connections page header.

    Waits up to 20 seconds for the header element to be present.

    Returns:
        The connection count as an int.
    """
    header = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, '//header[@class="mn-connections__header"]')))
    number_of_connections = _parse_connection_count(header.text)

    return number_of_connections

# Getting the number of visible connections (html)
def get_visible_connections():
    """Count the connection cards currently present in the page's HTML.

    Waits up to 20 seconds for at least one card to be present.
    """
    cards_locator = (By.XPATH, '//div[@class="scaffold-finite-scroll__content"]/ul/li[@class="mn-connection-card artdeco-list"]')
    cards = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located(cards_locator))
    return len(cards)

# Scrolling down until the number of visible connections in the page found is equal to the total number of connections
def scroll_to_end(connections, found_connections, max_stalls=5):
    """Scroll the connections page until every connection has been loaded.

    Each round scrolls to the bottom, clicks the "load more" button when it
    can be clicked, and recounts the visible cards.

    Args:
        connections: Total number of connections expected.
        found_connections: Number of connection cards currently loaded.
        max_stalls: Consecutive rounds without any newly loaded card after
            which the loop gives up. Without this limit the loop would never
            end when the page shows fewer cards than the reported total.
    """
    from selenium.common.exceptions import WebDriverException

    load_more_xpath = '//button[@class="artdeco-button artdeco-button--muted artdeco-button--1 artdeco-button--full artdeco-button--secondary ember-view scaffold-finite-scroll__load-button"]'
    stalls = 0

    while connections > found_connections and stalls < max_stalls:
        time.sleep(randint(2,4))
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        try:
            driver.find_element(By.XPATH, load_more_xpath).click()
        except WebDriverException:
            # Best effort: the button is not always there or clickable.
            # Only Selenium errors are ignored, so Ctrl-C still interrupts.
            pass
        time.sleep(randint(2,4))

        previous = found_connections
        found_connections = get_visible_connections()
        stalls = stalls + 1 if found_connections <= previous else 0

    return None

# Returning to 'My connections' with all the contacts visible
def restart():
    """Go to My Network -> Connections and scroll until the list is fully loaded."""
    click_my_network()
    time.sleep(randint(4,8))
    click_my_connections()
    # Arguments are evaluated left to right: total first, then the visible count.
    scroll_to_end(get_connections(), get_visible_connections())

#Getting all the contact details
def get_details():
    """Collect job title, company, country and email from the open profile page.

    Works on the module-level `driver`, which must be showing a profile.
    Fields that cannot be found on the page come back as None. The contact
    info dialog is opened to read the email and dismissed afterwards.

    Returns:
        A tuple (job_title, company, country, email).
    """
    from selenium.common.exceptions import WebDriverException

    def _first_line(parent, xpath):
        # First text line of the element under `parent`, or None if the lookup fails.
        try:
            return parent.find_element(By.XPATH, xpath).text.split('\n')[0]
        except WebDriverException:
            return None

    # Country: the last comma-separated part of the location line
    try:
        country = driver.find_element(By.XPATH, '//div[@class="pb2 pv-text-details__left-panel"]/span[@class="text-body-small inline t-black--light break-words"]').text.split(', ')[-1]
    except WebDriverException:
        country = None

    # Job-title and Company
    # Both default to None so they are defined even when the profile has no
    # section starting with 'Experience'.
    job_title, company = None, None
    for section in driver.find_elements(By.CLASS_NAME, 'mt4'):
        words = section.text.split()
        if not words or words[0] != 'Experience':
            continue
        job_title = _first_line(section, '*//div[@class="display-flex align-items-center"]')
        company = _first_line(section, '*//span[@class="t-14 t-normal"]')
        if company is not None:
            # Keep only the part before the ' ·' separator
            company = company.split(' ·')[0]
        break

    # Email
    driver.find_element(By.ID, 'top-card-text-details-contact-info').click()
    time.sleep(randint(2,4))

    try:
        email = driver.find_element(By.XPATH, '//section[@class="pv-contact-info__contact-type ci-email"]/div').text
    except WebDriverException:
        email = None

    # Close window
    driver.find_element(By.CSS_SELECTOR, "[aria-label=Dismiss]").click()

    return job_title, company, country, email

# Searching for connections not in the database
def contact_scraping(clicks=0, max_clicks=100):
    """Visit every listed connection that is not in the database and save its details.

    Each unknown connection is opened in a new tab, scraped with
    get_details(), and the tab is closed again. The collected rows are merged
    with `old_df`, de-duplicated on ('Email', 'Name') and written to
    '<database_name>.csv'. The file is also written when scraping stops early
    because of an error, so profiles already visited are not lost.

    NOTE: `filtered` and `old_df` are the snapshots loaded when the module
    started; this function does not refresh them.

    Args:
        clicks: Number of profile visits already made.
        max_clicks: Visit limit; scraping stops once `clicks` reaches it.
    """
    from selenium.common.exceptions import TimeoutException

    start_time = time.time()

    columns = ['Name', 'Description', 'Company', 'Job Title', 'Country', 'Email']
    rows = []

    # Tuples in a set give constant-time "already in the database?" checks.
    known = {tuple(row) for row in filtered}
    main_window = driver.current_window_handle

    try:
        for contact in driver.find_elements(By.XPATH, './/div[@class="mn-connection-card__details"]/a'):
            if clicks >= max_clicks:
                print(f'{clicks} reached!')
                break

            name = contact.find_element(By.XPATH, './/span[@class="mn-connection-card__name t-16 t-black t-bold"]').text
            description = contact.find_element(By.XPATH, './/span[@class="mn-connection-card__occupation t-14 t-black--light t-normal"]').text

            if (name, description) in known:
                continue

            link = contact.get_attribute('href')
            # The URL is passed as a script argument rather than pasted into the JS source.
            driver.execute_script('window.open(arguments[0], "_blank");', link)
            time.sleep(randint(2,5))
            driver.switch_to.window(driver.window_handles[1])

            try:
                try:
                    # Wait for the profile document to finish loading.
                    WebDriverWait(driver, 20).until(lambda d: d.execute_script('return document.readyState;') == 'complete')
                except TimeoutException:
                    # Carry on; get_details() copes with missing elements.
                    pass

                time.sleep(randint(2,5))
                job_title, company, country, email = get_details()

                rows.append({'Name': name, 'Description': description, 'Company': company, 'Job Title': job_title, 'Country': country, 'Email': email})

                time.sleep(randint(2,5))
            finally:
                # Always close the profile tab and return to the connections list.
                driver.close()
                driver.switch_to.window(main_window)

            time.sleep(randint(2,5))

            clicks += 1
            print(f'Clicks: {clicks}')
    finally:
        this_df = pd.DataFrame(rows, columns=columns)
        # pd.concat replaces DataFrame.append, which newer pandas no longer has.
        # Empty frames are left out; this_df still supplies the column names
        # when there is nothing to merge.
        frames = [frame for frame in (old_df, this_df) if not frame.empty]
        merged = pd.concat(frames, ignore_index=True) if frames else this_df
        merged.drop_duplicates(subset=['Email', 'Name'], keep='last').to_csv(f'{database_name}.csv', index=False)

    print("--- %s seconds ---" % (time.time() - start_time))

    return None

## Init

In [29]:
# Start the browser session; the scraping functions in this file read this module-level `driver`.
driver = start()

In [30]:
# Log in with the credentials imported as `u_name` and `pwd`.
login()

In [31]:
# Open My Network -> Connections and scroll until the visible count reaches the total.
restart()

In [None]:
# Scrape the listed connections that are not in the CSV yet and write the database back (click counter starts at 0).
contact_scraping()

In [33]:
# Shut down the WebDriver session.
driver.quit()