In [47]:
#Run this cell first; it opens a Chrome window through chromedriver

#import libraries
import selenium 
from selenium import webdriver
import time
import pickle
import itertools
from bs4 import BeautifulSoup
import pickle

#setup and return driver
def setup_driver():
    """Start Chrome and navigate to the investovc.com founders listing.

    The chromedriver executable is looked up at ``./chromedriver`` (relative
    to the current working directory).

    Returns:
        The ``webdriver.Chrome`` instance after ``get`` has been called on
        ``https://investovc.com/fundadores``.
    """
    options = webdriver.ChromeOptions()
    # The second flag overrides the User-Agent so the request looks like it
    # comes from a regular Chrome browser.
    for flag in ("--disable-infobars", "--user-agent=Chrome/74.0.3729.131"):
        options.add_argument(flag)

    browser = webdriver.Chrome("./chromedriver", options=options)
    browser.get("https://investovc.com/fundadores")
    return browser

#get all of the fund elements
def get_funds(driver):
    """Return the fund header elements found on the current page.

    Args:
        driver: object exposing ``find_elements_by_xpath`` (the Selenium
            driver created by ``setup_driver``).

    Returns:
        The list produced by querying the first ``panel-group`` container for
        elements whose class contains ``'panel-heading panel-head'``.

    Raises:
        IndexError: if the page has no ``div`` with class ``panel-group``.
    """
    container = driver.find_elements_by_xpath("//div[@class='panel-group']")[0]
    # NOTE(review): this XPath begins with "//", so it is evaluated against
    # the whole document rather than relative to `container` - confirm that
    # this is intended before narrowing it to ".//".
    return container.find_elements_by_xpath(
        "//div[contains(@class, 'panel-heading panel-head')]"
    )

#get the fund title
def get_fund_title(ind, fund):
    """Return the inner HTML of the ``ind``-th fund title heading.

    Args:
        ind: zero-based position of the wanted ``h3.panel-title`` element in
            the list returned by the XPath query.
        fund: element (or driver) exposing ``find_elements_by_xpath``.

    Returns:
        The ``innerHTML`` attribute of the selected heading.

    Raises:
        IndexError: if fewer than ``ind + 1`` headings are found.
    """
    # NOTE(review): the leading "//" makes this a document-wide search, which
    # is presumably why the title is picked by index instead of taking the
    # first match under `fund` - confirm.
    headings = fund.find_elements_by_xpath("//h3[@class='panel-title']")
    return headings[ind].get_attribute("innerHTML")

#get the number of startups listed for a fund
def get_startups_for_fund(ind, driver):
    """Return the first integer found in the ``ind``-th ``h4.panel-title``.

    The heading's inner HTML is split on whitespace and every all-digit token
    is converted to ``int``; the first of those is returned. The caller uses
    it as the number of startups belonging to the fund.

    Args:
        ind: zero-based index of the heading to read.
        driver: object exposing ``find_elements_by_xpath``.

    Returns:
        int: the first numeric token of the heading text.

    Raises:
        IndexError: if the heading does not exist or contains no numeric
            token.
    """
    heading = driver.find_elements_by_xpath("//h4[@class='panel-title']")[ind]
    tokens = heading.get_attribute("innerHTML").split()
    numbers = [int(token) for token in tokens if token.isdigit()]
    return numbers[0]

#clean the description
def clean_description(description):
    """Strip UI artefacts from a fund description string.

    Non-breaking spaces become regular spaces, and the literal fragments
    ``'Show more>'``, ``'Show less>'`` and ``'...'`` are removed, in that
    order.

    Args:
        description: raw description text (inner HTML of the description
            paragraph).

    Returns:
        The cleaned string.
    """
    substitutions = (
        (u'\xa0', u' '),
        ('Show more>', ''),
        ('Show less>', ''),
        ('...', ''),
    )
    for old, new in substitutions:
        description = description.replace(old, new)
    return description

#generic clean text
def clean_text(loc):
    """Remove newlines, ``<p>``/``</p>`` tags and hyphens from ``loc``.

    The removals are applied one after another in the order listed, so a
    fragment that only forms after an earlier removal is removed as well.

    Args:
        loc: raw text (inner HTML of an info block).

    Returns:
        The text with those fragments deleted.
    """
    for fragment in ('\n', '<p>', '</p>', '-'):
        loc = loc.replace(fragment, "")
    return loc

def get_startup_info_from_panel(driver):
    """Scrape the startup pop-up that is currently visible, then dismiss it.

    Args:
        driver: Selenium driver whose page may be showing the
            ``sweet-alert sa-investors-modal`` pop-up.

    Returns:
        ``None`` when no visible pop-up is found (nothing is dismissed and no
        wait happens in that case). Otherwise a dict with the keys ``name``,
        ``location``, ``description``, ``icon_url``, ``website``,
        ``amount_raised`` and ``founders`` (a list of unique founder names in
        the order they appear in the pop-up).

    Raises:
        IndexError, AttributeError, TypeError: if the pop-up markup lacks the
            headings, paragraphs, tagline span or logo image read below.
    """
    panel = driver.find_elements_by_xpath("//div[@class='sweet-alert sa-investors-modal showSweetAlert visible']")
    if not panel:
        return None

    soup = BeautifulSoup(panel[0].get_attribute("innerHTML"), 'html.parser')
    headings = soup.find_all('h3')
    paragraphs = soup.find_all('p')

    # Fields are picked by position; the indices depend on the pop-up's
    # markup: first <h3> is the company, paragraphs 1/3/5 hold the values.
    company_name = headings[0].text
    location = paragraphs[1].text
    description = soup.find('span', {'class': 'tagline_style'}).text
    img_url = soup.find('img', {'class': 'panel-startup-logo-image'})['src']
    website = paragraphs[3].text
    raised = paragraphs[5].text

    # Every <h3> after the first is treated as a founder name. dict.fromkeys
    # removes duplicates while keeping document order, so the result is the
    # same on every run (a set would yield an arbitrary order).
    founders = list(dict.fromkeys(heading.text.strip() for heading in headings[1:]))

    startup_info = {"name": company_name.strip(),
                    "location": location.strip().replace("\n", ""),
                    "description": description.strip(),
                    "icon_url": img_url.strip(),
                    "website": website.strip(),
                    "amount_raised": raised.strip(),
                    "founders": founders}

    # u'\ue007' is the WebDriver key code for Enter (selenium Keys.ENTER);
    # sending it to the confirm button closes the pop-up.
    driver.find_elements_by_class_name("confirm")[0].send_keys(u'\ue007')
    # Give the pop-up a moment to close before the caller clicks again.
    time.sleep(1)
    return startup_info

def read_startups(startups, all_info, driver):
    """Click through every startup tile and attach the details to its fund.

    ``startups`` is one flat list covering all funds; each fund consumes the
    next ``info["fund_amt"]`` entries from it. After each fund is processed
    the whole ``all_info`` list is pickled to ``pckl/<page>.pckl``, so a
    partial result survives if a later fund fails.

    Args:
        startups: list of clickable startup elements, in page order.
        all_info: list of per-fund dicts, each holding at least ``fund_amt``;
            a ``startups`` key is added to every dict in place.
        driver: Selenium driver for the page being scraped.

    Returns:
        None. Results are written into ``all_info`` and to disk.

    Raises:
        IndexError: if ``startups`` holds fewer elements than the sum of the
            ``fund_amt`` values.
        FileNotFoundError: if the ``pckl/`` directory does not exist.
    """
    time.sleep(3)
    # Running position in the flat `startups` list across all funds.
    startup_count = 0

    for fund_count, info in enumerate(all_info):
        amount = info["fund_amt"]
        startups_for_fund = []
        print("Fund: ", fund_count+1)

        for i in range(amount):
            current_startup = startups[startup_count]
            driver.execute_script("arguments[0].scrollIntoView();", current_startup)
            startup_count += 1

            # Open the pop-up, wait for it to render, then scrape it.
            current_startup.click()
            time.sleep(2)
            specific_info = get_startup_info_from_panel(driver)
            if specific_info is not None:
                startups_for_fund.append(specific_info)

            print(i + 1, " of ", amount)

        info["startups"] = startups_for_fund

        # The page number is whatever follows the first "-" in the URL when
        # the URL contains "page"; otherwise this is page 1.
        current_url = driver.current_url
        if "page" in current_url:
            page_num = current_url[current_url.index("-") + 1:]
        else:
            page_num = "1"

        # `with` guarantees the file is closed even if pickling fails.
        with open("pckl/" + page_num + ".pckl", 'wb') as f:
            pickle.dump(all_info, f)

def load_page(driver):
    """Scrape all funds on the current listing page and their startups.

    Steps, in order:
      1. Expand every fund panel, recording its title and startup count.
      2. Read each fund's description paragraph.
      3. Walk the info blocks to pick up each fund's location and URL.
      4. Click every "more investments" button so all startup tiles exist.
      5. Hand the startup tiles to ``read_startups``, which scrapes them and
         pickles the result.

    Args:
        driver: Selenium driver already showing the page to scrape.

    Returns:
        None. The collected data is persisted by ``read_startups``.
    """
    all_info = []
    funds = get_funds(driver)

    #go through all of the funds in the current page, and open them
    for ind, fund in enumerate(funds):

        #click to open
        driver.execute_script("arguments[0].scrollIntoView();", fund)
        fund_title = get_fund_title(ind, fund)
        fund_amt = get_startups_for_fund(ind, driver)
        fund.click()
        time.sleep(2)

        all_info.append({"fund_name": fund_title, "fund_amt":fund_amt})

    #description
    time.sleep(10)
    elms = driver.find_elements_by_xpath("//p[@class='more smsl']")
    for i, elm in enumerate(elms):
        text = elm.get_attribute("innerHTML")
        # Name the parser explicitly so the result does not depend on which
        # parsers happen to be installed (and no warning is emitted).
        all_info[i]["description"] = BeautifulSoup(clean_description(text), 'html.parser').get_text()

    #location
    # count / count_url: index of the next fund to receive a location / url.
    # in_between: number of info blocks seen since the last "Location" block.
    count = 0
    count_url = 0
    in_between = 0
    elms = driver.find_elements_by_xpath("//div[@class='tcb-info-block']")
    for i, elm in enumerate(elms):
        text = elm.get_attribute("innerHTML")
        in_between +=1
        if "Location" in text:
            # NOTE(review): a gap of more than 23 blocks is presumably taken
            # to mean a fund without a Location block was passed; that fund
            # gets a placeholder so later locations stay aligned - confirm
            # the threshold against the page layout.
            if in_between > 23:
                all_info[count]["location"] = "Not Available on Site"
                count +=1
            # The value is read from the block that follows the label block.
            all_info[count]["location"] = clean_text(elms[i+1].get_attribute("innerHTML"))
            count +=1
            in_between = 0
        if "Web Presence" in text:
            # The link is looked up in the parent of the label block.
            par = elm.find_element_by_xpath('..')
            html = par.get_attribute("innerHTML")
            soup = BeautifulSoup(html, 'html.parser')
            all_info[count_url]["url"] = soup.a.get("href")
            count_url += 1

    #click on all the more buttons
    time.sleep(3)
    more_buttons = driver.find_elements_by_xpath("//div[@class='investor_block investor-more-investments']")
    for more_button in more_buttons:
        driver.execute_script("arguments[0].scrollIntoView();", more_button)
        more_button.click()
        time.sleep(3)
    time.sleep(5)
    startups = driver.find_elements_by_xpath("//div[@class='investor_block  quickinfo quickinfo-startup ']")

    read_startups(startups, all_info, driver)

#open original page, let it load
# Runs at cell execution: setup_driver() launches Chrome and loads the
# listing page. The scraping cell below reads the same c_driver, so the page
# shown in that browser when the next cell runs is the one that gets scraped.
c_driver = setup_driver()

In [48]:
#Run this cell after choosing the page you want to scrape; the results are
#saved under the pckl/ and csv/ folders (both must already exist).
import csv

# Give the chosen page a few seconds to finish loading.
time.sleep(5)
#gather information
load_page(c_driver)

#get page num programmatically: the text after the first "-" in the URL when
#the URL contains "page", otherwise page 1
if "page" in c_driver.current_url:
    page_num = c_driver.current_url[c_driver.current_url.index("-") + 1:]
else:
    page_num = "1"

#load the pickle that load_page has just written for this page
# NOTE: pickle.load executes whatever the file encodes; this is acceptable
# only because the file was produced locally by this notebook.
with open("pckl/" + page_num + ".pckl", 'rb') as f:
    obj = pickle.load(f)

#make CSV file
# Build the header from every row, keeping first-seen order: funds can carry
# different keys (e.g. "location" or "url" may be missing on some), and
# DictWriter raises ValueError for a row key that is not in the header.
# Rows lacking a key get an empty cell.
keys = list(dict.fromkeys(key for row in obj for key in row))
# newline='' lets the csv module control line endings; without it extra blank
# rows appear on platforms that translate "\n" to "\r\n".
with open("csv/" + page_num + '.csv', 'w', newline='') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(obj)

Fund:  1
1  of  1
Fund:  2
1  of  1
Fund:  3
1  of  1
Fund:  4
1  of  1
Fund:  5
1  of  1
Fund:  6
1  of  1
Fund:  7
1  of  1
Fund:  8
1  of  1
Fund:  9
1  of  1
Fund:  10
1  of  1
Fund:  11
1  of  1
Fund:  12
1  of  1
Fund:  13
1  of  1
Fund:  14
1  of  1
Fund:  15
1  of  1
Fund:  16
1  of  1
Fund:  17
1  of  1
