In [171]:
from selenium import webdriver

from selenium.webdriver.firefox.service import Service
from webdriver_manager.firefox import GeckoDriverManager

from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver import ActionChains


import pandas as pd
import numpy as np
import time
import random

In [172]:
def get_driver():
    options = webdriver.FirefoxOptions()
    # options.headless = True
    driver = webdriver.Firefox(options = options, service=Service(GeckoDriverManager().install()))
    return driver

In [237]:
def get_url_list(driver):
    body = driver.find_element(By.CLASS_NAME,"company-cards-list")
    companies = body.find_elements(By.CLASS_NAME,"box-view-item")
    links = []
    for i, company in enumerate(companies):
        links.append(company.find_element(By.TAG_NAME,"a").get_attribute("href"))
        if i % 5 == 0:
            WAIT_TIME = random.uniform(0,2)
            time.sleep(WAIT_TIME)
        
    driver.quit()
    return links




In [238]:
driver = get_driver()
url = "https://finder.startupnationcentral.org/startups/search"
driver.get(url)

links = get_url_list(driver)



Current firefox version is 99.0
Get LATEST geckodriver version for 99.0 firefox
Driver [C:\Users\matan\.wdm\drivers\geckodriver\win64\v0.31.0\geckodriver.exe] found in cache


In [233]:
def get_data_from_page(page):
    """Get the data from a given page\n
    page is a WebDriver object.\n
    Return one line df"""

    d = {}
    d.update(get_profile_data(page))
    d.update(get_fund_data(page))
    d.update(get_listing_data(page))
    d.update(get_clasiffication_data(page))
    d.update(get_tags_and_markets(page))

    return pd.DataFrame([d])
        



In [223]:
def get_profile_data(page) -> dict:

    name = page.find_element(By.CLASS_NAME,"top-profile-section").find_element(By.CLASS_NAME,"title").text
    about = page.find_element(By.CLASS_NAME,"about").text
    d = {'company_name' : name, 'company_about': about}

    company_profile = page.find_element(By.CLASS_NAME,"zyno-card-4")
    for info in company_profile.find_elements(By.CLASS_NAME,"metadata-item"):
        var = info.find_element(By.CLASS_NAME,"item-bottom").text
        value = info.find_element(By.CLASS_NAME,"metadata-description").text
        d.update({var.lower() : value})


    titles = []
    texts = []

    divs = company_profile.find_elements(By.TAG_NAME,"div")
    separator = False
    i = 0
    while separator is False:
        if divs[i].get_attribute("class") == "separator":
            separator = True
        i += 1
    
    while True:
        titles.append(divs[i].text.lower())
        texts.append(divs[i+1].text.split(','))
        i += 3
        if divs[i].get_attribute("class") == 'general-info-item-wrapper':
            break
    
    for ttl, txt in zip(titles, texts):
        d.update({ttl : txt})

    return d

In [224]:
def get_clasiffication_data(page) -> dict:
    d = {} 
    classifications = page.find_element(By.CLASS_NAME, "js-startup-classification-section").find_elements(By.CLASS_NAME,"classification-item")
    classifications_list = []

    for cls in classifications:

        elements = cls.find_elements(By.CLASS_NAME,"js-lead-item")

        for elm in elements:
            elm_title = elm.find_element(By.CLASS_NAME,"row-container").text
            classifications_list.append(elm_title)

            for subject in elm.find_elements(By.CLASS_NAME,"js-child-item"):
                classifications_list.append(subject.text)

    for elm in classifications_list:
        d.update({elm : 1})
    
    return d

In [225]:
def get_tags_and_markets(page) -> dict:
    """scrape TAGS and TARGET MARKETS"""
    
    d= {}
    tags_and_markets_list =  page.find_elements(By.CLASS_NAME, "tags-wrapper")
    
        # scrape TAGS 
    try:
        tags = [tag.text for tag in tags_and_markets_list[0].find_elements(By.CLASS_NAME,"label")]
        for tag in tags:
            d.update({f"tag_{tag}": 1})
    except:
        tags = None

        # scrape TARGET MARKETS
    try:
        markets = [market.text for market in tags_and_markets_list[1].find_elements(By.CLASS_NAME,"label")]
        for market in markets:
            d.update({f"targetmarket_{market}": 1})

    except:
        markets = None 
    
    return d

In [226]:
def get_fund_data(page) -> dict:
    d={}
    try:
        fund_data = [x.text for x in page.find_element(By.CLASS_NAME, "funding-metadata").find_elements(By.CLASS_NAME,"title")]
    except:
        fund_data = [np.nan, np.nan, np.nan, np.nan]

    d.update({'fund_stage':fund_data[0], 'total_raised':fund_data[1], 'total_rounds':fund_data[2], 'investors': fund_data[3]})
    return d

In [227]:
def get_listing_data(page) -> dict:
    d = {}
    try:
        listing_data = page.find_element(By.CLASS_NAME,"po-wrapper").find_elements(By.CLASS_NAME,"column")
        ipo_price = listing_data[-1].text
        d.update({'ipo_price':ipo_price})
    except:
        d.update({'ipo_price':np.nan})
    return d

In [228]:
df = pd.DataFrame()
driver = get_driver()
WAIT_TIME = random.uniform(1,3)
for link in links[:5]:
    driver.get(link)
    df = pd.concat([df,get_data_from_page(driver)], ignore_index=True)
    time.sleep(WAIT_TIME)
    
    
    
driver.quit()
df



Current firefox version is 99.0
Get LATEST geckodriver version for 99.0 firefox
Driver [C:\Users\matan\.wdm\drivers\geckodriver\win64\v0.31.0\geckodriver.exe] found in cache


Unnamed: 0,company_name,company_about,founded,business model,employees,funding stage,product stage,geographical markets,products,fund_stage,...,tag_medical-devices,tag_mobile-applications,tag_medical-technologies,tag_breastfeeding,tag_non-invasive,tag_digital-healthcare,tag_digital-therapeutics,tag_monitoring,targetmarket_mothers,targetmarket_babies
0,Hypervision,Hypervision is building the next generation of...,10/2019,B2B,1-10,Seed,R&D,[Israel],"[XR/RR270, HyperOcular]",Seed,...,,,,,,,,,,
1,Golan Plastic Products,Golan Plastic Products (GPP) specializes in de...,1/1964,"B2B, B2G",11-50,Public,Released,"[South America, Europe, Africa, Canada, Un...","[PEXGOL, MULTYGOL]",,...,,,,,,,,,,
2,Zooz Power,Zooz Power provides flywheel energy storage te...,4/2013,B2B,11-50,Public,Released,"[North America, Europe, Western Europe, Aus...",[Kinetic Power Booster],Public,...,,,,,,,,,,
3,Zohar CleanTech,"Zohar CleanTech is the creator of ZoharX, a pa...",2/2017,"B2B, B2G",1-10,Seed,Alpha,[Global],[ZoharX],Seed,...,,,,,,,,,,
4,Annabella,Annabella has developed a breast pump that mim...,1/2017,"B2B, B2C, B2B2C",1-10,Seed,Beta,[Global],[Annabella],Seed,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [231]:
df.sum()

  df.sum()


company_name                HypervisionGolan Plastic ProductsZooz PowerZoh...
company_about               Hypervision is building the next generation of...
founded                                       10/20191/19644/20132/20171/2017
business model                          B2BB2B, B2GB2BB2B, B2GB2B, B2C, B2B2C
employees                                              1-1011-5011-501-101-10
                                                  ...                        
tag_digital-healthcare                                                    1.0
tag_digital-therapeutics                                                  1.0
tag_monitoring                                                            1.0
targetmarket_mothers                                                      1.0
targetmarket_babies                                                       1.0
Length: 97, dtype: object