In [159]:
from scrapy import Selector
from selenium import webdriver
import requests
import urlparse
import os
import re
import datetime

In [234]:
# Global variables
XPATH_COMPANY_LIST = '//div[@class="info-block"]/h4/a[contains(@href, "organization")]/@href'
XPATH_COMPANY_FUNDING_DATE = '//h2[@class="title_date"]'
XPATH_COMPANY_FUNDING_ROUND = '//table[@class="table container"]//a/text()'
XPATH_COMPANY_FUNDING_AMOUNT = '//table[@class="table container"]//td[2]/text()'
XPATH_COMPANY_CRUNCHBASE_LINK = '//div[@class="info-block"]//a/@href'
XPATH_COMPANY_SITE_LINK = '//div[@class="definition-list container"]//dd[5]/a/@href'
XPATH_COMPANY_LINKEDIN_LINK = '//dd[@class="social-links"]//a[@class="icons linkedin"]/@href'
XPATH_COMPANY_NAME = '//h1[@id="profile_header_heading"]//a/text()'
XPATH_COMPANY_DESCRIPTION = '//div[@class="definition-list container"]//dd[2]/text()'

XPATH_TEAM_MEMBER_LIST = '//div[@class="base info-tab people"]//ul[@class="section-list container"]'
XPATH_TEAM_MEMBER_FULL_NAME = '//div[@class="info-block"]/div[@class="large"]//a[@class="follow_card"]/text()'
XPATH_TEAM_MEMBER_POSITION = '//div[@class="info-block"]/div[@class="large"]/h5/text()'
XPATH_TEAM_MEMBER_CRUNCHBASE_LINK = '//h4/a[@data-type="person"][@class="follow_card"]/@href'
XPATH_TEAM_MEMBER_LINKEDIN_LINK = '//dd[@class="social-links"]/a[contains(@href, "linkedin")]/@href'
XPATH_TEAM_MEMBER_PERSONAL_DETAILS = '//div[@class="base info-tab description"]//div[@class="card-content box container card-slim"]//text()'

XPATH_CONTENT_BLOCKED = '//meta[contains(@content, "blocked")]'

base_url = 'https://www.crunchbase.com/'
url = 'https://www.crunchbase.com/funding-rounds'

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/600.7.12 (KHTML, like Gecko) Version/8.0.7 Safari/600.7.12'}

In [296]:
# Create rendering engine
browser = webdriver.Chrome()

In [None]:
# Get company list raw html content
response = requests.get(url, headers=headers)

In [None]:
# Save company list raw html content on disk 
# for developing and testing purposes
with open('company_list_raw_content.html', 'wb') as fout:
    fout.write(response.content)

In [None]:
# Get company list rendered html content
browser.get('file:///{}/company_list_raw_content.html'.format(os.getcwd()))
company_list_rendered_content = browser.page_source

In [None]:
# Save company list rendered html content on disk
# for developing and testing purposes
with open('company_list_rendered_content.html', 'wb') as fout:
    fout.write(company_list_rendered_content.encode('utf-8'))

In [None]:
# Get founding_blocks with company list selector
funding_blocks_sel = Selector(text=company_list_rendered_content)

In [None]:
# Extract funding blocks with company lists
funding_blocks = funding_blocks_sel.xpath(XPATH_COMPANY_FUNDING_DATE)

In [None]:
# Extract funding date from first funding block
funding_block = funding_blocks[0]
funding_date = datetime.datetime.strptime(funding_block.xpath('text()').extract_first(), '%B %d, %Y').date()
print('funding date: {}'.format(funding_date))

In [None]:
# Extract company list from first funding block with given funding date
company_list = [urlparse.urljoin(base_url, item) for item in funding_block.xpath(XPATH_COMPANY_LIST).extract()]
print('company_list:')
for company_url in company_list:
    print('\t{}'.format(company_url))

In [None]:
# Get first company crunchbase link url
company_url = company_list[0]

In [None]:
# Get company info raw html content
company_info_raw_content = requests.get(company_url, headers=headers)

In [None]:
# Save company info raw html content on disk
# for developing and testing purposes
with open('company_info_raw_content.html', 'wb') as fout:
    fout.write(company_info_raw_content.content)

In [163]:
# Get company info rendered html content
browser.get('file:///{}/company_info_raw_content.html'.format(os.getcwd()))
company_info_rendered_content = browser.page_source

In [162]:
# Save company info rendered html content on disk
# for developing and testing purposes
with open('company_info_rendered_content.html', 'wb') as fout:
    fout.write(company_info_rendered_content.encode('utf-8'))

In [164]:
# Create company info selector
company_info_sel = Selector(text=company_info_rendered_content)

In [195]:
# Extract company info
company_crunchbase_link = company_url
name = company_info_sel.xpath(XPATH_COMPANY_NAME).extract_first(),
description = company_info_sel.xpath(XPATH_COMPANY_DESCRIPTION).extract_first(),
site_link = company_info_sel.xpath(XPATH_COMPANY_SITE_LINK).extract_first(),
linkedin_link = company_info_sel.xpath(XPATH_COMPANY_LINKEDIN_LINK).extract_first()

# Company funding info
funding_round = company_info_sel.xpath(XPATH_COMPANY_FUNDING_ROUND).extract_first(),
funding_amount = company_info_sel.xpath(XPATH_COMPANY_FUNDING_AMOUNT).extract_first()

print('Company info')
print('\tname: {}'.format(name))
print('\tdescription: {}'.format(description))
print('\tcompany_crunchbase_link: {}'.format(company_crunchbase_link))
print('\tsite_link: {}'.format(site_link))
print('\tlinkedin_link: {}'.format(linkedin_link))
print('\tfunding_date: {}'.format(funding_date))
print('\tfunding_round: {}'.format(funding_round))
print('\tfunding_amount: {}'.format(funding_amount))

Company info
	name: (u'Go-Jek',)
	description: (u'The fastest courier, transport, and shopping service in Jakarta.',)
	company_crunchbase_link: https://www.crunchbase.com/organization/go-jek
	site_link: (u'http://go-jek.com',)
	linkedin_link: https://www.linkedin.com/company/pt--go-jek-indonesia
	funding_date: 2016-08-04
	funding_round: (u'Venture',)
	funding_amount: $550M / 


In [206]:
# Get company's team member info
block = company_info_sel.xpath(XPATH_TEAM_MEMBER_LIST)
team_members = zip([urlparse.urljoin(base_url, item) for item in block.xpath(XPATH_TEAM_MEMBER_CRUNCHBASE_LINK).extract()],
                   block.xpath(XPATH_TEAM_MEMBER_FULL_NAME).extract(),
                   block.xpath(XPATH_TEAM_MEMBER_POSITION).extract())
print('Team member info\n')
for team_member in team_members:
    print('full name: {}'.format(team_member[1]))
    print('position: {}'.format(team_member[2]))
    print('crunchbase_link: {}\n'.format(team_member[0]))

Team member info

full name: Kevin Aluwi
position: CFO
crunchbase_link: https://www.crunchbase.com/person/kevin-aluwi

full name: Sidu Ponnappa Chonira
position: Director
crunchbase_link: https://www.crunchbase.com/person/sidu-ponnappa-chonira

full name: Dayu Dara Permata
position: Co-Founder, Co-Head of GO-LIFE.
crunchbase_link: https://www.crunchbase.com/person/dayu-dara-permata

full name: Nadiem Makarim
position: CEO
crunchbase_link: https://www.crunchbase.com/person/nadiem-makarim

full name: Michaelangelo Moran
position: Co-Founder, Brand Director
crunchbase_link: https://www.crunchbase.com/person/michaelangelo-moran



In [None]:
# Get team member info raw html content
team_member_info_raw_content = requests.get(team_members[0], headers=headers)

In [None]:
# Save team member info raw html content on disk
# for developing and testing purposes
with open('team_member_info_raw_content.html', 'wb') as fout:
    fout.write(team_member_info_raw_content.content)

In [207]:
# Get team member info rendered html content
browser.get('file:///{}/team_member_info_raw_content.html'.format(os.getcwd()))
team_member_info_rendered_content = browser.page_source

In [None]:
# Save team member info rendered html content on disk
# for developing and testing purposes
with open('team_member_info_rendered_content.html', 'wb') as fout:
    fout.write(company_info_rendered_content.encode('utf-8'))

In [208]:
# Create team member info selector
team_member_info_sel = Selector(text=team_member_info_rendered_content)

In [209]:
# Extract team member info 
team_member_details = team_member_info_sel.xpath(XPATH_TEAM_MEMBER_PERSONAL_DETAILS).extract_first()
team_member_linkedin_link = team_member_info_sel.xpath(XPATH_TEAM_MEMBER_LINKEDIN_LINK).extract_first()
print('\tdetails: {}'.format(team_member_details))
print('\tlinkedin_link: {}'.format(team_member_linkedin_link))

	details: None
	linkedin_link: None


In [None]:
# test loading funding dates
import selenium
# browser.get('file:///{}/company_list_rendered_content.html'.format(os.getcwd()))
# content = browser.page_source
# sel = Selector(text=content)
# print(sel.xpath('//script[contains(., "set_date_titles")]/text()').extract_first())

script = """(function() {
      $.each($('.funding_rounds ul.section-list li'), function() {
      });
        var fr_date_str = $(this).data('date');

        try {
          if(fr_date_str &amp;amp;&amp;amp; $('li h2.title_date[data-date=' + fr_date_str + ']').length == 0) {
            $(this).before("&amp;lt;li&amp;gt;&amp;lt;h2 class='title_date' data-date='" + fr_date_str + "'&amp;gt;" + fr_date_str + "&amp;lt;/h2&amp;gt;&amp;lt;/li&amp;gt;");
          }
        } catch(e) {
          if(fr_date_str &amp;amp;&amp;amp; $('li h2.title_date[data-date="' + fr_date_str + '"]').length == 0) {
            $(this).before("&amp;lt;li&amp;gt;&amp;lt;h2 class='title_date' data-date='" + fr_date_str + "'&amp;gt;" + fr_date_str + "&amp;lt;/h2&amp;gt;&amp;lt;/li&amp;gt;");
          }
        }
});"""


# script = """(function(){
#     $("p").click(function(){
#         $(this).hide();
#     });
# });"""

# print(script)

elem = browser.find_element_by_xpath('//div/script[contains(., "set_date_titles")]/script')
print(elem.text)
# browser.execute_script(script)

In [258]:
# Get team member info rendered html content
response = requests.get('https://www.crunchbase.com/organization/synaptec#/entity', headers=headers)

with open('company_synaptec_raw_content.html', 'wb') as fout: 
    fout.write(response.content)
    
browser.get('file:///{}/company_synaptec_raw_content.html'.format(os.getcwd()))
company_info_rendered_content = browser.page_source

with open('company_synaptec_rendered_content.html', 'wb') as fout: 
    fout.write(company_info_rendered_content.encode('utf-8'))

company3_sel = Selector(text=company_info_rendered_content)

In [250]:
with open('{}/company_info_rendered_content.html'.format(os.getcwd())) as fin: 
    company2_sel = Selector(text=fin.read())

In [259]:
print company_sel.xpath('//div[@class="definition-list container"]//dd//a[@target="_blank"]/@href').extract_first()
print company2_sel.xpath('//div[@class="definition-list container"]//dd//a[@target="_blank"]/@href').extract_first()
print company3_sel.xpath('//div[@class="definition-list container"]//dd//a[@target="_blank"]/@href').extract_first()

http://hypothes.is
http://go-jek.com
http://synapt.ec/


In [281]:
print [urlparse.urljoin(base_url, '{}{}'.format(item,'#/entity')) for item in company_sel.xpath('//div[@class="base info-tab people"]//div[@class="info-block"]//a/@href').extract()]
print [urlparse.urljoin(base_url, '{}{}'.format(item,'#/entity')) for item in company2_sel.xpath('//div[@class="base info-tab people"]//div[@class="info-block"]//a/@href').extract()]
print [urlparse.urljoin(base_url, '{}{}'.format(item,'#/entity')) for item in company3_sel.xpath(XPATH_TEAM_MEMBER_CRUNCHBASE_LINK).extract()]

[]
['https://www.crunchbase.com/person/kevin-aluwi#/entity', 'https://www.crunchbase.com/person/sidu-ponnappa-chonira#/entity', 'https://www.crunchbase.com/person/dayu-dara-permata#/entity', 'https://www.crunchbase.com/person/nadiem-makarim#/entity', 'https://www.crunchbase.com/person/michaelangelo-moran#/entity']
[]


In [319]:
browser.get('file:///{}/raw_content.html'.format(os.getcwd()))
rendered_content = browser.page_source

sel = Selector(text=rendered_content)
sel.xpath('//meta[contains(@content, "blocked")]').extract_first()


u'<meta http-equiv="refresh" content="10; url=/distil_r_blocked.html?Ref=/organization/hypothes&amp;distil_RID=8AC59ED0-5D23-11E6-8276-FB62304ED9BC&amp;distil_TID=20160808044951">'