# Loading Related Modules

In [1]:
import json
import logging
from pprint import pprint
from time import sleep
import re


In [2]:
import requests
from lxml import html
from bs4 import BeautifulSoup
from elasticsearch import Elasticsearch


# Scraping Webpage  

In [3]:
url = 'https://www.cbinsights.com/research-unicorn-companies'
req = requests.get(url)
page = req.text
soup = BeautifulSoup(page, 'lxml')
print soup.prettify()

<!DOCTYPE html>
<html>
 <head>
  <meta content="" name="iy453p9485yheisruhs5"/>
  <meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
  <meta content="text/html; charset=utf-8" http-equiv="content-type"/>
  <title>
   The Complete List of Unicorn Companies
  </title>
  <meta content="A unicorn company is a startup with a valuation over $1 billion. This is an ongoing list of all unicorn companies in the U.S. and abroad.  " name="description"/>
  <meta content="unicorn companies, billion dollar startups, unicorn startups" name="keywords"/>
  <link href="https://www.cbinsights.com/research-unicorn-companies" rel="canonical"/>
  <meta content="en_US" property="og:locale"/>
  <meta content="article" property="og:type"/>
  <meta content="The Complete List of Unicorn Companies" property="og:title"/>
  <meta content="A unicorn company is a startup with a valuation over $1 billion. This is an ongoing list of all unicorn companies in the U.S. and abroad.  " property="og:description"/

In [4]:
"""Select data table content"""
table = soup.select('.sortable-theme-bootstrap')[0]

# Parsing Content Table Data

In [5]:
def parse_table(table):
    """ Get data from table """
    
    parsed = []
    for row in table.find_all("tr"):
        for cell in row.find_all(['td','th']):
            parsed.append(cell.get_text())
            
    header = parsed[:6]
    result = []
    
    """convert list mappings into dictionary """
    
    for row in [parsed[i:i + 6] for i in range(6, len(parsed), 6)]:
        temp ={}
        for n in range(6):
            ## data cleaning : remove extra characters /n/t, $symbol
            row[n] = re.sub('\s+', ' ', row[n])
            row[n] =row[n].replace("$", "")

            temp[header[n]]=row[n]
            
        result.append(temp)        
            
    return result

In [6]:
unicorn = parse_table(table)

In [7]:
unicorn 

[{u'Company': u'Uber',
  u'Country': u'United States',
  u'Date Joined': u'8/23/2013',
  u'Industry': u'On-Demand',
  u'Select Investors': u'Lowercase Capital, Benchmark Capital, Google Ventures',
  u'Valuation ($B)': u'72'},
 {u'Company': u'Didi Chuxing ',
  u'Country': u'China',
  u'Date Joined': u'12/31/2014',
  u'Industry': u'On-Demand',
  u'Select Investors': u'Matrix Partners, Tiger Global Management, Softbank Corp.,',
  u'Valuation ($B)': u'56'},
 {u'Company': u'Airbnb',
  u'Country': u'United States',
  u'Date Joined': u'7/26/2011',
  u'Industry': u'eCommerce/Marketplace',
  u'Select Investors': u'General Catalyst Partners, Andreessen Horowitz, ENIAC Ventures',
  u'Valuation ($B)': u'29.3'},
 {u'Company': u'SpaceX',
  u'Country': u'United States',
  u'Date Joined': u'12/1/2012',
  u'Industry': u'Other Transportation',
  u'Select Investors': u'Founders Fund, Draper Fisher Jurvetson, Rothenberg Ventures',
  u'Valuation ($B)': u'21.5'},
 {u'Company': u'Palantir Technologies',
  u'

# Option 1 - Query through Kibana DevTool Console

### Create json file for bulk import

In [8]:
"""In order to create json file for bulk import, we shall add action data for each index"""

action_list = []
for i in range(len(unicorn)):
    # create index for each entry
    action = { "index" : { "_index" : "unicorn", "_type" : "default", "_id" : i+1 } }
    action_list.append(action)

In [9]:
"""create json file in specific format for bulk API"""
## action_and_meta_data\n
## optional_source\n

with open('unicorn.json', 'w') as outfile:
    for (idx, line) in zip(action_list, unicorn):
        json.dump(idx, outfile)
        outfile.write('\n')
        json.dump(line, outfile)
        outfile.write('\n')

### Create index and mappings


PUT /unicorn

{
  "mappings":{   
  
    "default":{
      "properties": {
        "Company":{
          "type": "text"
        },
        
        "Country":{
          "type": "text",
          "fields": {
            "keyword":{
              "type": "keyword"
            }
          }
        },
        
        "Date Joined":{
          "type": "date",
          "format": "MM/dd/yyyy"
        },
        "Industry":{
          "type": "text",
        },
        "Select Investors":{
          "type": "text",
          "fields": {
            "keyword":{
              "type": "text"
            }
          }
        },
        "Valuation ($B)": {
          "type": "float"
        }
      }
    }
  }
}

### Index documents via bulk

 $ curl -s -H "Content-Type: application/xOST localhost:9200/unicorn/_bulk --data-binary "@unicorn.json"

### Query Examples 

""" we can use Kibana Devtool to do the query"""

GET /unicorn/default/_search

{
  "size": 0,
  "aggs": {
    "date_range": {
      "range": {
        "field": "Date Joined",
        "ranges": [
          {
            "to": "01/01/2017||+1y",
            "from": "01/01/2017"
            
          }
        ]
      }
    }
  }
}

# Option 2 - Query by ES Python API

### Create Index

In [10]:
def create_index(es_object, index_name):
    created = False
    # index settings
    settings = {
        "settings": {
            "number_of_shards": 1,
            "number_of_replicas": 0
        },
    "mappings": {
    "default":{
      "dynamic": "strict",
      "properties": {
       "Company":{
          "type": "text"
        },
        
        "Country":{
          "type": "text",
          "fields": {
            "keyword":{
              "type": "keyword"
            }
          }
        },
        
        "Date Joined":{
          "type": "date",
          "format": "MM/dd/yyyy"
        },
        "Industry":{
          "type": "text",
        },
        "Select Investors":{
          "type": "text",
          "fields": {
            "keyword":{
              "type": "text"
            }
          }
        },
        "Valuation ($B)": {
          "type": "float"
        }
      }
    }
  }
}


    try:
        if not es_object.indices.exists(index_name):
            # Ignore 400 means to ignore "Index Already Exist" error.
            es_object.indices.create(index=index_name, ignore=400, body=settings)
            print('Created Index')
        created = True
    except Exception as ex:
        print(str(ex))
    finally:
        return created

In [11]:
def store_record(elastic_object, index_name, idx, record):
    is_stored = True
    try:
        outcome = elastic_object.index(index=index_name, doc_type='default', id= idx, body=record)
        print(outcome)
    except Exception as ex:
        print('Error in indexing data')
        print(str(ex))
        is_stored = False
    finally:
        return is_stored

In [12]:
def connect_elasticsearch():
    _es = None
    _es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    if _es.ping():
        print('Yay Connected')
    else:
        print('Awww it could not connect!')
    return _es

In [13]:
def search(es_object, index_name, search ):
    res = es_object.search(index = index_name, body = search)
    pprint(res)

### Adds document in a specific index

In [15]:
es = connect_elasticsearch()

for idx, row in enumerate(unicorn):
    if es is not None:
        if create_index(es, 'unicorn'):
            out = store_record(es, 'unicorn', idx, row)
            print('Data indexed successfully')

Yay Connected
{u'_type': u'default', u'_seq_no': 261, u'_shards': {u'successful': 1, u'failed': 0, u'total': 1}, u'_index': u'unicorn', u'_version': 2, u'_primary_term': 1, u'result': u'updated', u'_id': u'0'}
Data indexed successfully
{u'_type': u'default', u'_seq_no': 262, u'_shards': {u'successful': 1, u'failed': 0, u'total': 1}, u'_index': u'unicorn', u'_version': 2, u'_primary_term': 1, u'result': u'updated', u'_id': u'1'}
Data indexed successfully
{u'_type': u'default', u'_seq_no': 263, u'_shards': {u'successful': 1, u'failed': 0, u'total': 1}, u'_index': u'unicorn', u'_version': 2, u'_primary_term': 1, u'result': u'updated', u'_id': u'2'}
Data indexed successfully
{u'_type': u'default', u'_seq_no': 264, u'_shards': {u'successful': 1, u'failed': 0, u'total': 1}, u'_index': u'unicorn', u'_version': 2, u'_primary_term': 1, u'result': u'updated', u'_id': u'3'}
Data indexed successfully
{u'_type': u'default', u'_seq_no': 265, u'_shards': {u'successful': 1, u'failed': 0, u'total': 1},

### Execute search requests

In [16]:
es = connect_elasticsearch()

search_object_1 = {
  "query":{
    "match": {
    "Country": "United States"
    }
  }
}


search_object_2 = {
  "query": {
    "match": {
      "Select Investors.keyword":  "SoftBank Group"
      }
    }
  }


search(es, 'unicorn', search_object_2)

Yay Connected
{u'_shards': {u'failed': 0, u'skipped': 0, u'successful': 1, u'total': 1},
 u'hits': {u'hits': [{u'_id': u'20',
                      u'_index': u'unicorn',
                      u'_score': 5.964426,
                      u'_source': {u'Company': u'Manbang Group',
                                   u'Country': u'China',
                                   u'Date Joined': u'4/24/2018',
                                   u'Industry': u'Supply chain & Logistics',
                                   u'Select Investors': u'Softbank Group, CapitalG',
                                   u'Valuation ($B)': u'6'},
                      u'_type': u'default'},
                     {u'_id': u'30',
                      u'_index': u'unicorn',
                      u'_score': 4.787507,
                      u'_source': {u'Company': u'Fanatics',
                                   u'Country': u'United States',
                                   u'Date Joined': u'6/6/2012',
                 