## Data scraping using Selenium

We have decided to scrape data from [Start-Up Nation Finder](https://finder.startupnationcentral.org/) web.  
This website lists 

In [2]:
from selenium import webdriver

from selenium.webdriver.firefox.service import Service
from webdriver_manager.firefox import GeckoDriverManager

from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver import ActionChains
from selenium.webdriver.support.relative_locator import locate_with

import pandas as pd
import numpy as np
import time
import random

In [3]:
def get_driver():
    options = webdriver.FirefoxOptions()
    # options.headless = True
    driver = webdriver.Firefox(options = options, service=Service(GeckoDriverManager().install()))
    return driver

In [4]:
def rest(a,b) -> None:
    time.sleep(random.uniform(a,b))

In [237]:
# def get_url_list(driver):
#     body = driver.find_element(By.CLASS_NAME,"company-cards-list")
#     companies = body.find_elements(By.CLASS_NAME,"box-view-item")
#     links = []
#     for i, company in enumerate(companies):
#         links.append(company.find_element(By.TAG_NAME,"a").get_attribute("href"))
#         if i % 5 == 0:
#             WAIT_TIME = random.uniform(0,2)
#             time.sleep(WAIT_TIME)
        
#     driver.quit()
#     return links




In [238]:
# driver = get_driver()
# url = "https://finder.startupnationcentral.org/startups/search"
# driver.get(url)

# links = get_url_list(driver)



Current firefox version is 99.0
Get LATEST geckodriver version for 99.0 firefox
Driver [C:\Users\matan\.wdm\drivers\geckodriver\win64\v0.31.0\geckodriver.exe] found in cache


In [5]:
def get_data_from_page(page):
    """Get the data from a given page\n
    page is a WebDriver object.\n
    Return one line df"""

    d = {}
    d.update(get_profile_data(page))
    d.update(get_products_and_geomarkets(page))
    d.update(get_fund_data(page))
    d.update(get_listing_data(page))
    d.update(get_clasiffication_data(page))
    d.update(get_tags_and_markets(page))

    return pd.DataFrame([d])
        



In [6]:
def get_products_and_geomarkets(driver):
    d = {}
    company_profile = driver.find_element(By.CLASS_NAME,"zyno-card-4")
    titles = company_profile.find_elements(By.CLASS_NAME,"section-title")
    for title in titles:
        value = driver.find_element(locate_with(By.TAG_NAME,"div").below(title))
        d.update({title.text.lower() : value.text.lower()})
    return d

In [7]:
def get_profile_data(page) -> dict:

    name = page.find_element(By.CLASS_NAME,"top-profile-section").find_element(By.CLASS_NAME,"title").text
    about = page.find_element(By.CLASS_NAME,"about").text
    d = {'company_name' : name, 'company_about': about}

    company_profile = page.find_element(By.CLASS_NAME,"zyno-card-4")
    for info in company_profile.find_elements(By.CLASS_NAME,"metadata-item"):
        var = info.find_element(By.CLASS_NAME,"item-bottom").text
        value = info.find_element(By.CLASS_NAME,"metadata-description").text
        d.update({var.lower() : value})

    # status : PRIVATE / PUBLIC / ACQUIRED / NOT ACTIVE  
    status = 'active'

    try:
        topbar = page.find_element(By.CLASS_NAME,"top-bar-wrapper")
        if "Not Active" in topbar.text:
            status = 'not_active'
        
    except:
        # topbar = None
        pass

    d.update({'status' : status})
 

    return d

In [8]:
def get_clasiffication_data(page) -> dict:
    d = {} 
    classifications = page.find_element(By.CLASS_NAME, "js-startup-classification-section").find_elements(By.CLASS_NAME,"classification-item")
    classifications_list = []

    for cls in classifications:

        elements = cls.find_elements(By.CLASS_NAME,"js-lead-item")
        title = "_".join(cls.find_element(By.CLASS_NAME,"classification-title").text.lower().split(" "))
        for elm in elements:
            elm_title = elm.find_element(By.CLASS_NAME,"row-container").text
            classifications_list.append(f"{title}_{elm_title}")

            for subject in elm.find_elements(By.CLASS_NAME,"js-child-item"):
                classifications_list.append(f"{title}_{elm_title}_{subject.text}")

    for elm in classifications_list:
        d.update({elm : 1})
    
    return d

In [9]:
def get_tags_and_markets(page) -> dict:
    """scrape TAGS and TARGET MARKETS"""
    
    d= {}
    tags_and_markets_list =  page.find_elements(By.CLASS_NAME, "tags-wrapper")
    
        # scrape TAGS 
    try:
        tags = [tag.text for tag in tags_and_markets_list[0].find_elements(By.CLASS_NAME,"label")]
        for tag in tags:
            d.update({f"tag_{tag}": 1})
    except:
        tags = None

        # scrape TARGET MARKETS
    try:
        markets = [market.text for market in tags_and_markets_list[1].find_elements(By.CLASS_NAME,"label")]
        for market in markets:
            d.update({f"targetmarket_{market}": 1})

    except:
        markets = None 
    
    return d

In [10]:
def get_fund_data(page) -> dict:
    d={}
    try:
        fund_data = [x.text for x in page.find_element(By.CLASS_NAME, "funding-metadata").find_elements(By.CLASS_NAME,"title")]
    except:
        fund_data = [np.nan, np.nan, np.nan, np.nan]

    d.update({'fund_stage':fund_data[0], 'total_raised':fund_data[1], 'total_rounds':fund_data[2], 'investors': fund_data[3]})
    return d

In [11]:
def get_listing_data(page) -> dict:
    try:
        d = {}
        topbar = page.find_element(By.CLASS_NAME,"top-bar-wrapper")
        if "Public" in topbar.text:
            ipo_price = topbar.find_element(By.CLASS_NAME,"right").find_element(By.CLASS_NAME,'bold').text
            d.update({'ipo_price':ipo_price})
        
    except:
        d.update({'ipo_price':np.nan})
        
    return d

In [12]:
links = []

with open("data/full_links_list.txt", "r") as f:
    for line in f:
        links.append(line.strip('\n'))

len(links)

13102

In [51]:
df = pd.read_csv('df0_1000.csv')
# driver = get_driver()

  exec(code_obj, self.user_global_ns, self.user_ns)


In [13]:
df = pd.DataFrame()
driver = get_driver()
LONG_WAIT = 10 # Minutes

for i, link in enumerate(links[5000:]):
    try:
        driver.get(link)
        df = pd.concat([df,get_data_from_page(driver)], ignore_index=True)
        if i % 100 == 0:
            print(i+1)
        if (i+1) % 500 == 0:
            print(f"Sleeping for {LONG_WAIT} minutes")
            time.sleep(60 * LONG_WAIT)
        else:
            rest(2,5)
    except Exception as e:
        print(f"Error on page {i} -> {str(e)}")
    

driver.quit()
df



Current firefox version is 100.0
Get LATEST geckodriver version for 100.0 firefox
Driver [C:\Users\matan\.wdm\drivers\geckodriver\win64\v0.31.0\geckodriver.exe] found in cache


1
101
Error on page 102 -> Message: Unable to locate element: .js-startup-classification-section
Stacktrace:
WebDriverError@chrome://remote/content/shared/webdriver/Errors.jsm:183:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.jsm:395:5
element.find/</<@chrome://remote/content/marionette/element.js:300:16

201
301
401
Sleeping for 10 minutes
501
601
701
801
901
Error on page 993 -> Message: Unable to locate element: .js-startup-classification-section
Stacktrace:
WebDriverError@chrome://remote/content/shared/webdriver/Errors.jsm:183:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.jsm:395:5
element.find/</<@chrome://remote/content/marionette/element.js:300:16

Error on page 996 -> Message: Unable to locate element: .js-startup-classification-section
Stacktrace:
WebDriverError@chrome://remote/content/shared/webdriver/Errors.jsm:183:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.jsm:395:5
element.find/</<@chrome://remote/cont

Unnamed: 0,company_name,company_about,founded,business model,employees,funding stage,raised,product stage,status,geographical markets,...,tag_lead-acid-batteries,tag_car-audio,tag_fuel-management,tag_trip,tag_derms,tag_flexible-heating-fabric,tag_outwear,tag_cars-heating,tag_medical-heat-treatment,tag_augmented-sound
0,CargoZone Workspace,CargoZone specializes in helping organizations...,1/2020,B2B,1-10,Pre-Seed,$350K,Beta,active,"americas, north america, europe, asia, israel",...,,,,,,,,,,
1,Hyperspace,"Hyperspace provides a purpose-built, high-perf...",2/2021,B2B,1-10,Pre-Seed,,Alpha,active,,...,,,,,,,,,,
2,DataWiz,DataWiz is developing a platform using busines...,7/2021,B2B,1-10,Bootstrapped,,R&D,active,,...,,,,,,,,,,
3,TUATARIX,"Tuatarix provides a complete, end-to-end digit...",7/2021,B2B,1-10,Bootstrapped,,Customer development,active,,...,,,,,,,,,,
4,Rupert,Rupert is a platform that integrates with anal...,5/2019,B2B,1-10,Seed,,Beta,active,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8055,TriEye,TriEye is a fabless semiconductor company that...,11/2017,B2B,51-200,ROUND A,$96M,R&D,active,global,...,,,,,,,,,,
8056,LYNX Smartcars,LYNX is developing software for connected and ...,1/2016,B2B,1-10,Bootstrapped,,R&D,not_active,,...,,,,,,,,,,
8057,Deeyook Location Technologies,Deeyook seeks to redefine location technology ...,3/2017,B2B,11-50,Seed,,Released,active,global,...,,,,,,,,,,
8058,SafeCue,SafeCue combines the power of deep learning wi...,1/2016,B2B,1-10,Seed,$500K,Beta,not_active,"asia, germany, india, united states",...,,,,,,,,,,


In [14]:
df.to_csv('df5000_13102.csv')

In [15]:
df_complete = pd.concat([pd.read_csv('df0_5000.csv'), df])
df_complete.shape

  exec(code_obj, self.user_global_ns, self.user_ns)


(13048, 2870)

In [16]:
df_complete.to_csv('df_complete.csv')

In [147]:
tag_cols = [col for col in df.columns if col.startswith('tag_')]
targetmarket_cols = [col for col in df.columns if col.startswith('targetmarket_')]

df_test = pd.get_dummies(df, columns=tag_cols)
df_test = pd.get_dummies(df_test, columns=targetmarket_cols)
df_test

Unnamed: 0,company_name,company_about,founded,business model,employees,funding stage,raised,product stage,status,geographical markets,...,targetmarket_seniors_1.0,targetmarket_emerging-markets_1.0,targetmarket_hospitals_1.0,targetmarket_supermarkets_1.0,targetmarket_electric-utilities_1.0,targetmarket_advertisers_1.0,targetmarket_healthcare-providers_1.0,targetmarket_public-utilities_1.0,targetmarket_students_1.0,targetmarket_service-providers_1.0
0,Tastewise,Tastewise is the AI platform food brands use t...,7/2017,B2B,51-200,ROUND A,$21.5M,Released,active,"australia, canada, france, india, united kingd...",...,0,0,0,0,0,0,0,0,0,0
1,Wilk Technologies,Wilk is dedicated to revolutionizing the dairy...,6/2018,"B2B, B2B2C",11-50,Public,$4.69M,R&D,active,,...,0,0,0,0,0,0,0,0,0,0
2,Eco Pack Green Box,Eco Pack Green Box has developed and patented ...,3/2008,B2B,11-50,Revenue Financed,,Released,not_active,"canada, mexico, spain, united states",...,0,0,0,0,0,0,0,0,0,0
3,BeeHero,BeeHero has developed a platform that can pred...,10/2017,B2B,1-10,ROUND A,$24M,Released,active,"global, united states",...,0,0,0,0,0,0,0,0,0,0
4,Cham Foods,Cham Foods is a multinational company with man...,12/1970,"B2B, B2B2C",11-50,Public,,Released,active,"north america, europe, global, france, germany...",...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,DriftSense,DriftSense creates multilayered solutions for ...,7/2019,"B2B, B2G",1-10,Pre-Seed,,R&D,active,global,...,0,0,0,0,0,0,0,0,0,0
96,Xiologic,Xiologic promotes and commercializes organic p...,3/2018,"B2B, B2C",1-10,Bootstrapped,,Released,active,global,...,0,0,0,0,0,0,0,0,0,0
97,Taranis,Taranis offers a precision intelligence platfo...,5/2014,B2B,51-200,ROUND C+,$59.55M,Released,active,"south america, asia, argentina, australia, bra...",...,0,0,0,0,0,0,0,0,0,0
98,Optiguide,Optiguide offers solutions designed to maintai...,5/2004,B2B,11-50,Revenue Financed,,Released,active,"north america, south america, europe, china, i...",...,0,0,0,0,0,0,0,0,0,0


In [148]:
tag_cols = [col for col in df_test.columns if col.startswith('tag_')]
targetmarket_cols = [col for col in df_test.columns if col.startswith('targetmarket_')]

In [149]:
#  replace all tag columns in tags_vector col.
def vectorize_and_replace(df, cols: list, prefix: str):
    """Vectorize the {cols} columns in {dataframe}\n
    and returns a {dataframe} copy with the vector after removing the {cols}\n
    new col name is {prefix}_vec """
    # df = dataframe.copy()
    from sklearn import preprocessing

    lb = preprocessing.LabelBinarizer()
    vec = lb.fit_transform(df[cols]).tolist()

    df[f"{prefix}_vec"] = vec
    df = df.drop(cols, axis = 1)
    return df


In [156]:
df.shape

(100, 560)

In [143]:
print([col for col in df.columns])

['company_name', 'company_about', 'founded', 'business model', 'employees', 'funding stage', 'raised', 'product stage', 'status', 'geographical markets', 'products', 'fund_stage', 'total_raised', 'total_rounds', 'investors', 'ipo_price', 'sector_AgriFood-tech & Water', 'target_industry_Food Retail & Consumption', 'target_industry_Food Retail & Consumption_Home', 'target_industry_Food Retail & Consumption_Restaurants', 'target_industry_Food Retail & Consumption_Wholesale', 'core_technology_Artificial Intelligence', 'tag_computer-vision', 'tag_behavior-analytics', 'tag_machine-learning', 'tag_consumer-packaged-goods', 'tag_natural-language-processing', 'tag_foodtech', 'tag_personalization', 'tag_artificial-intelligence', 'tag_data-analytics', 'tag_predictive-analytics', 'tag_food-ingredients', 'tag_business-intelligence', 'targetmarket_retailers', 'targetmarket_food-and-beverage', 'targetmarket_restaurants', 'sector_AgriFood-tech & Water_Alternative Food Sources', 'target_industry_Agricu

In [144]:
df.to_csv("example_df.csv")

In [150]:
df2 = df_test.copy()
df2 = vectorize_and_replace(df2, tag_cols, "tags")
df2 = vectorize_and_replace(df2, targetmarket_cols, "targetmarket")
df2

Unnamed: 0,company_name,company_about,founded,business model,employees,funding stage,raised,product stage,status,geographical markets,...,sector_Retail & Marketing,sector_Retail & Marketing_Social Commerce,target_industry_Consumers_Recreational & Lifestyle,target_industry_Enterprise & Professional Services_Enterprises,core_technology_Materials & Substances_Nanomaterials,sector_Security Technologies_Emergency Response,sector_Energy-tech_Energy Storage,core_technology_Platforms & Interfaces_API,tags_vec,targetmarket_vec
0,Tastewise,Tastewise is the AI platform food brands use t...,7/2017,B2B,51-200,ROUND A,$21.5M,Released,active,"australia, canada, france, india, united kingd...",...,,,,,,,,,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ...","[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Wilk Technologies,Wilk is dedicated to revolutionizing the dairy...,6/2018,"B2B, B2B2C",11-50,Public,$4.69M,R&D,active,,...,,,,,,,,,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, ...","[0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Eco Pack Green Box,Eco Pack Green Box has developed and patented ...,3/2008,B2B,11-50,Revenue Financed,,Released,not_active,"canada, mexico, spain, united states",...,,,,,,,,,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,BeeHero,BeeHero has developed a platform that can pred...,10/2017,B2B,1-10,ROUND A,$24M,Released,active,"global, united states",...,,,,,,,,,"[0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Cham Foods,Cham Foods is a multinational company with man...,12/1970,"B2B, B2B2C",11-50,Public,,Released,active,"north america, europe, global, france, germany...",...,,,,,,,,,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,DriftSense,DriftSense creates multilayered solutions for ...,7/2019,"B2B, B2G",1-10,Pre-Seed,,R&D,active,global,...,,,,,,,,1.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, ..."
96,Xiologic,Xiologic promotes and commercializes organic p...,3/2018,"B2B, B2C",1-10,Bootstrapped,,Released,active,global,...,,,,,,,,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
97,Taranis,Taranis offers a precision intelligence platfo...,5/2014,B2B,51-200,ROUND C+,$59.55M,Released,active,"south america, asia, argentina, australia, bra...",...,,,,,,,,,"[1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
98,Optiguide,Optiguide offers solutions designed to maintai...,5/2004,B2B,11-50,Revenue Financed,,Released,active,"north america, south america, europe, china, i...",...,,,,,,,,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [154]:
print([col for col in df2.columns])

['company_name', 'company_about', 'founded', 'business model', 'employees', 'funding stage', 'raised', 'product stage', 'status', 'geographical markets', 'products', 'fund_stage', 'total_raised', 'total_rounds', 'investors', 'ipo_price', 'sector_AgriFood-tech & Water', 'target_industry_Food Retail & Consumption', 'target_industry_Food Retail & Consumption_Home', 'target_industry_Food Retail & Consumption_Restaurants', 'target_industry_Food Retail & Consumption_Wholesale', 'core_technology_Artificial Intelligence', 'sector_AgriFood-tech & Water_Alternative Food Sources', 'target_industry_Agriculture & Food', 'target_industry_Agriculture & Food_Livestock', 'core_technology_Biologicals', 'core_technology_Biologicals_Cells', 'sector_AgriFood-tech & Water_Food Processing', 'target_industry_Agriculture & Food_Processed Foods', 'core_technology_Materials & Substances', 'sector_AgriFood-tech & Water_Yield Optimization & Harvest', 'core_technology_Machinery & Robotics', 'core_technology_Sensing

In [None]:
sec = [col for col in df_test.columns if col.startswith('sector')]
cor = [col for col in df_test.columns if col.startswith('core')]
sec = [col for col in df_test.columns if col.startswith('target')]
sec = [col for col in df_test.columns if col.startswith('sector')]