In [1]:
from scrapy import Selector
from selenium import webdriver
import requests
import urlparse
import os
import re
import datetime

In [2]:
# Global variables
XPATH_COMPANY_LIST = '//div[@class="info-block"]/h4/a[contains(@href, "organization")]/@href'
XPATH_COMPANY_FUNDING_DATE = '//h2[@class="title_date"]'
XPATH_COMPANY_FUNDING_ROUND = '//table[@class="table container"]//a/text()'
XPATH_COMPANY_FUNDING_AMOUNT = '//table[@class="table container"]//td[2]/text()'
XPATH_COMPANY_CRUNCHBASE_LINK = '//div[@class="info-block"]//a/@href'
XPATH_COMPANY_SITE_LINK = '//div[@class="definition-list container"]//dd[5]/a/@href'
XPATH_COMPANY_LINKEDIN_LINK = '//dd[@class="social-links"]//a[@class="icons linkedin"]/@href'
XPATH_COMPANY_NAME = '//h1[@id="profile_header_heading"]//a/text()'
XPATH_COMPANY_DESCRIPTION = '//div[@class="definition-list container"]//dd[2]/text()'

XPATH_TEAM_MEMBER_LIST = '//div[@class="base info-tab people"]//ul[@class="section-list container"]'
XPATH_TEAM_MEMBER_FULL_NAME = '//div[@class="info-block"]/div[@class="large"]//a[@class="follow_card"]/text()'
XPATH_TEAM_MEMBER_POSITION = '//div[@class="info-block"]/div[@class="large"]/h5/text()'
XPATH_TEAM_MEMBER_CRUNCHBASE_LINK = '//h4/a[@data-type="person"][@class="follow_card"]/@href'
XPATH_TEAM_MEMBER_LINKEDIN_LINK = '//dd[@class="social-links"]/a[contains(@href, "linkedin")]/@href'
XPATH_TEAM_MEMBER_PERSONAL_DETAILS = '//div[@class="base info-tab description"]//div[@class="card-content box container card-slim"]//text()'

XPATH_CONTENT_BLOCKED = '//meta[contains(@content, "blocked")]'

base_url = 'https://www.crunchbase.com/'
url = 'https://www.crunchbase.com/funding-rounds'

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/600.7.12 (KHTML, like Gecko) Version/8.0.7 Safari/600.7.12'}

In [25]:
# Create rendering engine
browser = webdriver.Chrome()

In [None]:
# Get company list raw html content
response = requests.get(url, headers=headers)

In [None]:
# Save company list raw html content on disk 
# for developing and testing purposes
with open('company_list_raw_content.html', 'wb') as fout:
    fout.write(response.content)

In [None]:
# Get company list rendered html content
browser.get('file:///{}/company_list_raw_content.html'.format(os.getcwd()))
company_list_rendered_content = browser.page_source

In [None]:
# Save company list rendered html content on disk
# for developing and testing purposes
with open('company_list_rendered_content.html', 'wb') as fout:
    fout.write(company_list_rendered_content.encode('utf-8'))

In [None]:
# Get founding_blocks with company list selector
funding_blocks_sel = Selector(text=company_list_rendered_content)

In [None]:
# Extract funding blocks with company lists
funding_blocks = funding_blocks_sel.xpath(XPATH_COMPANY_FUNDING_DATE)

In [None]:
# Extract funding date from first funding block
funding_block = funding_blocks[0]
funding_date = datetime.datetime.strptime(funding_block.xpath('text()').extract_first(), '%B %d, %Y').date()
print('funding date: {}'.format(funding_date))

In [None]:
# Extract company list from first funding block with given funding date
company_list = [urlparse.urljoin(base_url, item) for item in funding_block.xpath(XPATH_COMPANY_LIST).extract()]
print('company_list:')
for company_url in company_list:
    print('\t{}'.format(company_url))

In [None]:
# Get first company crunchbase link url
company_url = company_list[0]

In [None]:
# Get company info raw html content
company_info_raw_content = requests.get(company_url, headers=headers)

In [None]:
# Save company info raw html content on disk
# for developing and testing purposes
with open('company_info_raw_content.html', 'wb') as fout:
    fout.write(company_info_raw_content.content)

In [163]:
# Get company info rendered html content
browser.get('file:///{}/company_info_raw_content.html'.format(os.getcwd()))
company_info_rendered_content = browser.page_source

In [162]:
# Save company info rendered html content on disk
# for developing and testing purposes
with open('company_info_rendered_content.html', 'wb') as fout:
    fout.write(company_info_rendered_content.encode('utf-8'))

In [164]:
# Create company info selector
company_info_sel = Selector(text=company_info_rendered_content)

In [195]:
# Extract company info
company_crunchbase_link = company_url
name = company_info_sel.xpath(XPATH_COMPANY_NAME).extract_first(),
description = company_info_sel.xpath(XPATH_COMPANY_DESCRIPTION).extract_first(),
site_link = company_info_sel.xpath(XPATH_COMPANY_SITE_LINK).extract_first(),
linkedin_link = company_info_sel.xpath(XPATH_COMPANY_LINKEDIN_LINK).extract_first()

# Company funding info
funding_round = company_info_sel.xpath(XPATH_COMPANY_FUNDING_ROUND).extract_first(),
funding_amount = company_info_sel.xpath(XPATH_COMPANY_FUNDING_AMOUNT).extract_first()

print('Company info')
print('\tname: {}'.format(name))
print('\tdescription: {}'.format(description))
print('\tcompany_crunchbase_link: {}'.format(company_crunchbase_link))
print('\tsite_link: {}'.format(site_link))
print('\tlinkedin_link: {}'.format(linkedin_link))
print('\tfunding_date: {}'.format(funding_date))
print('\tfunding_round: {}'.format(funding_round))
print('\tfunding_amount: {}'.format(funding_amount))

Company info
	name: (u'Go-Jek',)
	description: (u'The fastest courier, transport, and shopping service in Jakarta.',)
	company_crunchbase_link: https://www.crunchbase.com/organization/go-jek
	site_link: (u'http://go-jek.com',)
	linkedin_link: https://www.linkedin.com/company/pt--go-jek-indonesia
	funding_date: 2016-08-04
	funding_round: (u'Venture',)
	funding_amount: $550M / 


In [206]:
# Get company's team member info
block = company_info_sel.xpath(XPATH_TEAM_MEMBER_LIST)
team_members = zip([urlparse.urljoin(base_url, item) for item in block.xpath(XPATH_TEAM_MEMBER_CRUNCHBASE_LINK).extract()],
                   block.xpath(XPATH_TEAM_MEMBER_FULL_NAME).extract(),
                   block.xpath(XPATH_TEAM_MEMBER_POSITION).extract())
print('Team member info\n')
for team_member in team_members:
    print('full name: {}'.format(team_member[1]))
    print('position: {}'.format(team_member[2]))
    print('crunchbase_link: {}\n'.format(team_member[0]))

Team member info

full name: Kevin Aluwi
position: CFO
crunchbase_link: https://www.crunchbase.com/person/kevin-aluwi

full name: Sidu Ponnappa Chonira
position: Director
crunchbase_link: https://www.crunchbase.com/person/sidu-ponnappa-chonira

full name: Dayu Dara Permata
position: Co-Founder, Co-Head of GO-LIFE.
crunchbase_link: https://www.crunchbase.com/person/dayu-dara-permata

full name: Nadiem Makarim
position: CEO
crunchbase_link: https://www.crunchbase.com/person/nadiem-makarim

full name: Michaelangelo Moran
position: Co-Founder, Brand Director
crunchbase_link: https://www.crunchbase.com/person/michaelangelo-moran



In [None]:
# Get team member info raw html content
team_member_info_raw_content = requests.get(team_members[0], headers=headers)

In [None]:
# Save team member info raw html content on disk
# for developing and testing purposes
with open('team_member_info_raw_content.html', 'wb') as fout:
    fout.write(team_member_info_raw_content.content)

In [207]:
# Get team member info rendered html content
browser.get('file:///{}/team_member_info_raw_content.html'.format(os.getcwd()))
team_member_info_rendered_content = browser.page_source

In [None]:
# Save team member info rendered html content on disk
# for developing and testing purposes
with open('team_member_info_rendered_content.html', 'wb') as fout:
    fout.write(company_info_rendered_content.encode('utf-8'))

In [208]:
# Create team member info selector
team_member_info_sel = Selector(text=team_member_info_rendered_content)

In [209]:
# Extract team member info 
team_member_details = team_member_info_sel.xpath(XPATH_TEAM_MEMBER_PERSONAL_DETAILS).extract_first()
team_member_linkedin_link = team_member_info_sel.xpath(XPATH_TEAM_MEMBER_LINKEDIN_LINK).extract_first()
print('\tdetails: {}'.format(team_member_details))
print('\tlinkedin_link: {}'.format(team_member_linkedin_link))

	details: None
	linkedin_link: None


In [41]:
# test loading funding dates
import selenium
# browser.get('file:///{}/company_list_rendered_content.html'.format(os.getcwd()))
# content = browser.page_source
# sel = Selector(text=content)
# print(sel.xpath('//script[contains(., "set_date_titles")]/text()').extract_first())

script = """(function() {
  $(function() {
    function set_date_titles() {
      $.each($('.funding_rounds ul.section-list li'), function() {
        var fr_date_str = $(this).data('date');

        try {
          if(fr_date_str && $('li h2.title_date[data-date=' + fr_date_str + ']').length == 0) {
            $(this).before("<li><h2 class='title_date' data-date='" + fr_date_str + "'>" + fr_date_str + "</h2></li>");
          }
        } catch(e) {
          if(fr_date_str && $('li h2.title_date[data-date="' + fr_date_str + '"]').length == 0) {
            $(this).before("<li><h2 class='title_date' data-date='" + fr_date_str + "'>" + fr_date_str + "</h2></li>");
          }
        }
      });
    }

    set_date_titles();

    CB.infiniteScroll(set_date_titles);
  });
})(jQuery);"""

script_2 = """var CB=CB||{};CB.infiniteScroll=function(e,t){function n(){if(c=window.location.toString().indexOf("?")>-1?"&":"?",keys=[],h.keyset){for(var e in h.keyset)if(h.keyset.hasOwnProperty(e)){var t=h.keyset[e].val();keys.push(e+"="+t)}}else keys.push("page=");return c+=keys.join("&"),h.query=c,h.query}function i(){return h.keyset?h.pathName+h.query:h.pathName+h.query+h.number}function o(){return h.number++}function r(){$(".info-tab .card-content").append('<div class="loader infinite-scroll"><span class="loader-text"></span>Loading....</div>').trigger("append")}function s(){$(".loader").remove()}function l(e){if(h.finished!==!0){var t=CB.responsive.el,n=t.$headerHeight(),i=t.$documentHeight(),o=t.$window(),r=t.$windowHeight(),s=h.currentScroll=$(this).scrollTop();if(o.scrollTop()+r>=i-n&&h.requestMade===!1)return a(e),h.requestMade=!0;if(!(s>=h.windowHeight/2&&h.requestMade===!0))return 20===$(".first-column .section-list "+d).length&&o.scrollTop()+r>=i-n&&h.requestMade===!1?(a(e),h.requestMade=!0):void 0}}function a(e){o(),r(),n(),$.ajax({type:"GET",url:i(),dataType:"json"}).done(function(n){if(null!=t.handlebars_template)return 0==n.data.length?(s(),h.finished=!0,!0):(h.element.append(Handlebars.templates[t.handlebars_template](n)),s(),"undefined"!=typeof e&&e&&e(),$(".follow_card").follow_card(),h.requestMade=!1);var i=$(n.template),o=i.find(d);return 0===o.length?(s(),"undefined"!=typeof e&&e&&e(),h.finished=!0,!0):(s(),h.element.append(o),"undefined"!=typeof e&&e&&e(),$(".follow_card").follow_card(),h.requestMade=!1)})}var t="undefined"!=typeof t?t:{},d="undefined"!=typeof t.listElem?t.listElem:"li",u=window.location;null!=t.handlebars_template&&(u=window.location+"/content");var f=t.keyset,c="",h={numLists:$(".first-column .section-list").length,firstListSize:$(".first-column .section-list:first").find(d).length,element:$(".first-column .section-list:last"),lis:$(".first-column .section-list").find(d),pathName:u,query:c,number:1,windowHeight:$(document).height(),currentScroll:$(window).scrollTop(),shouldMakeRequest:!1,requestLength:20,initHeight:$(".first-column .card-content").height(),finished:!1,requestMade:!1,keyset:f};h.numLists>1&&h.firstListSize+h.lis.length<h.requestLength||$(document).ready(function(){$("#main-content").outerHeight()<$(window).outerHeight()&&l(e),$(window).scroll(function(){l(e)})})};"""

script_3 = """(window.NREUM||(NREUM={})).loader_config={xpid:"Ug4FUEVSCAACV1lRBg=="};window.NREUM||(NREUM={}),__nr_require=function(t,e,n){function r(n){if(!e[n]){var o=e[n]={exports:{}};t[n][0].call(o.exports,function(e){var o=t[n][1][e];return r(o||e)},o,o.exports)}return e[n].exports}if("function"==typeof __nr_require)return __nr_require;for(var o=0;o<n.length;o++)r(n[o]);return r}({1:[function(t,e,n){function r(t){try{s.console&&console.log(t)}catch(e){}}var o,i=t("ee"),a=t(15),s={};try{o=localStorage.getItem("__nr_flags").split(","),console&&"function"==typeof console.log&&(s.console=!0,o.indexOf("dev")!==-1&&(s.dev=!0),o.indexOf("nr_dev")!==-1&&(s.nrDev=!0))}catch(c){}s.nrDev&&i.on("internal-error",function(t){r(t.stack)}),s.dev&&i.on("fn-err",function(t,e,n){r(n.stack)}),s.dev&&(r("NR AGENT IN DEVELOPMENT MODE"),r("flags: "+a(s,function(t,e){return t}).join(", ")))},{}],2:[function(t,e,n){function r(t,e,n,r,o){try{d?d-=1:i("err",[o||new UncaughtException(t,e,n)])}catch(s){try{i("ierr",[s,(new Date).getTime(),!0])}catch(c){}}return"function"==typeof f&&f.apply(this,a(arguments))}function UncaughtException(t,e,n){this.message=t||"Uncaught error with no additional information",this.sourceURL=e,this.line=n}function o(t){i("err",[t,(new Date).getTime()])}var i=t("handle"),a=t(16),s=t("ee"),c=t("loader"),f=window.onerror,u=!1,d=0;c.features.err=!0,t(1),window.onerror=r;try{throw new Error}catch(l){"stack"in l&&(t(8),t(7),"addEventListener"in window&&t(5),c.xhrWrappable&&t(9),u=!0)}s.on("fn-start",function(t,e,n){u&&(d+=1)}),s.on("fn-err",function(t,e,n){u&&(this.thrown=!0,o(n))}),s.on("fn-end",function(){u&&!this.thrown&&d>0&&(d-=1)}),s.on("internal-error",function(t){i("ierr",[t,(new Date).getTime(),!0])})},{}],3:[function(t,e,n){t("loader").features.ins=!0},{}],4:[function(t,e,n){function r(t){}if(window.performance&&window.performance.timing&&window.performance.getEntriesByType){var o=t("ee"),i=t("handle"),a=t(8),s=t(7),c="learResourceTimings",f="addEventListener",u="resourcetimingbufferfull",d="bstResource",l="resource",p="-start",h="-end",m="fn"+p,w="fn"+h,v="bstTimer",y="pushState";t("loader").features.stn=!0,t(6);var g=NREUM.o.EV;o.on(m,function(t,e){var n=t[0];n instanceof g&&(this.bstStart=Date.now())}),o.on(w,function(t,e){var n=t[0];n instanceof g&&i("bst",[n,e,this.bstStart,Date.now()])}),a.on(m,function(t,e,n){this.bstStart=Date.now(),this.bstType=n}),a.on(w,function(t,e){i(v,[e,this.bstStart,Date.now(),this.bstType])}),s.on(m,function(){this.bstStart=Date.now()}),s.on(w,function(t,e){i(v,[e,this.bstStart,Date.now(),"requestAnimationFrame"])}),o.on(y+p,function(t){this.time=Date.now(),this.startPath=location.pathname+location.hash}),o.on(y+h,function(t){i("bstHist",[location.pathname+location.hash,this.startPath,this.time])}),f in window.performance&&(window.performance["c"+c]?window.performance[f](u,function(t){i(d,[window.performance.getEntriesByType(l)]),window.performance["c"+c]()},!1):window.performance[f]("webkit"+u,function(t){i(d,[window.performance.getEntriesByType(l)]),window.performance["webkitC"+c]()},!1)),document[f]("scroll",r,!1),document[f]("keypress",r,!1),document[f]("click",r,!1)}},{}],5:[function(t,e,n){function r(t){for(var e=t;e&&!e.hasOwnProperty(u);)e=Object.getPrototypeOf(e);e&&o(e)}function o(t){s.inPlace(t,[u,d],"-",i)}function i(t,e){return t[1]}var a=t("ee").get("events"),s=t(17)(a),c=t("gos"),f=XMLHttpRequest,u="addEventListener",d="removeEventListener";e.exports=a,"getPrototypeOf"in Object?(r(document),r(window),r(f.prototype)):f.prototype.hasOwnProperty(u)&&(o(window),o(f.prototype)),a.on(u+"-start",function(t,e){if(t[1]){var n=t[1];if("function"==typeof n){var r=c(n,"nr@wrapped",function(){return s(n,"fn-",null,n.name||"anonymous")});this.wrapped=t[1]=r}else"function"==typeof n.handleEvent&&s.inPlace(n,["handleEvent"],"fn-")}}),a.on(d+"-start",function(t){var e=this.wrapped;e&&(t[1]=e)})},{}],6:[function(t,e,n){var r=t("ee").get("history"),o=t(17)(r);e.exports=r,o.inPlace(window.history,["pushState","replaceState"],"-")},{}],7:[function(t,e,n){var r=t("ee").get("raf"),o=t(17)(r),i="equestAnimationFrame";e.exports=r,o.inPlace(window,["r"+i,"mozR"+i,"webkitR"+i,"msR"+i],"raf-"),r.on("raf-start",function(t){t[0]=o(t[0],"fn-")})},{}],8:[function(t,e,n){function r(t,e,n){t[0]=a(t[0],"fn-",null,n)}function o(t,e,n){this.method=n,this.timerDuration="number"==typeof t[1]?t[1]:0,t[0]=a(t[0],"fn-",this,n)}var i=t("ee").get("timer"),a=t(17)(i),s="setTimeout",c="setInterval",f="clearTimeout",u="-start",d="-";e.exports=i,a.inPlace(window,[s,"setImmediate"],s+d),a.inPlace(window,[c],c+d),a.inPlace(window,[f,"clearImmediate"],f+d),i.on(c+u,r),i.on(s+u,o)},{}],9:[function(t,e,n){function r(t,e){d.inPlace(e,["onreadystatechange"],"fn-",s)}function o(){var t=this,e=u.context(t);t.readyState>3&&!e.resolved&&(e.resolved=!0,u.emit("xhr-resolved",[],t)),d.inPlace(t,w,"fn-",s)}function i(t){v.push(t),h&&(g=-g,b.data=g)}function a(){for(var t=0;t<v.length;t++)r([],v[t]);v.length&&(v=[])}function s(t,e){return e}function c(t,e){for(var n in t)e[n]=t[n];return e}t(5);var f=t("ee"),u=f.get("xhr"),d=t(17)(u),l=NREUM.o,p=l.XHR,h=l.MO,m="readystatechange",w=["onload","onerror","onabort","onloadstart","onloadend","onprogress","ontimeout"],v=[];e.exports=u;var y=window.XMLHttpRequest=function(t){var e=new p(t);try{u.emit("new-xhr",[e],e),e.addEventListener(m,o,!1)}catch(n){try{u.emit("internal-error",[n])}catch(r){}}return e};if(c(p,y),y.prototype=p.prototype,d.inPlace(y.prototype,["open","send"],"-xhr-",s),u.on("send-xhr-start",function(t,e){r(t,e),i(e)}),u.on("open-xhr-start",r),h){var g=1,b=document.createTextNode(g);new h(a).observe(b,{characterData:!0})}else f.on("fn-end",function(t){t[0]&&t[0].type===m||a()})},{}],10:[function(t,e,n){function r(t){var e=this.params,n=this.metrics;if(!this.ended){this.ended=!0;for(var r=0;r<d;r++)t.removeEventListener(u[r],this.listener,!1);if(!e.aborted){if(n.duration=(new Date).getTime()-this.startTime,4===t.readyState){e.status=t.status;var i=o(t,this.lastSize);if(i&&(n.rxSize=i),this.sameOrigin){var a=t.getResponseHeader("X-NewRelic-App-Data");a&&(e.cat=a.split(", ").pop())}}else e.status=0;n.cbTime=this.cbTime,f.emit("xhr-done",[t],t),s("xhr",[e,n,this.startTime])}}}function o(t,e){var n=t.responseType;if("json"===n&&null!==e)return e;var r="arraybuffer"===n||"blob"===n||"json"===n?t.response:t.responseText;return h(r)}function i(t,e){var n=c(e),r=t.params;r.host=n.hostname+":"+n.port,r.pathname=n.pathname,t.sameOrigin=n.sameOrigin}var a=t("loader");if(a.xhrWrappable){var s=t("handle"),c=t(11),f=t("ee"),u=["load","error","abort","timeout"],d=u.length,l=t("id"),p=t(14),h=t(13),m=window.XMLHttpRequest;a.features.xhr=!0,t(9),f.on("new-xhr",function(t){var e=this;e.totalCbs=0,e.called=0,e.cbTime=0,e.end=r,e.ended=!1,e.xhrGuids={},e.lastSize=null,p&&(p>34||p<10)||window.opera||t.addEventListener("progress",function(t){e.lastSize=t.loaded},!1)}),f.on("open-xhr-start",function(t){this.params={method:t[0]},i(this,t[1]),this.metrics={}}),f.on("open-xhr-end",function(t,e){"loader_config"in NREUM&&"xpid"in NREUM.loader_config&&this.sameOrigin&&e.setRequestHeader("X-NewRelic-ID",NREUM.loader_config.xpid)}),f.on("send-xhr-start",function(t,e){var n=this.metrics,r=t[0],o=this;if(n&&r){var i=h(r);i&&(n.txSize=i)}this.startTime=(new Date).getTime(),this.listener=function(t){try{"abort"===t.type&&(o.params.aborted=!0),("load"!==t.type||o.called===o.totalCbs&&(o.onloadCalled||"function"!=typeof e.onload))&&o.end(e)}catch(n){try{f.emit("internal-error",[n])}catch(r){}}};for(var a=0;a<d;a++)e.addEventListener(u[a],this.listener,!1)}),f.on("xhr-cb-time",function(t,e,n){this.cbTime+=t,e?this.onloadCalled=!0:this.called+=1,this.called!==this.totalCbs||!this.onloadCalled&&"function"==typeof n.onload||this.end(n)}),f.on("xhr-load-added",function(t,e){var n=""+l(t)+!!e;this.xhrGuids&&!this.xhrGuids[n]&&(this.xhrGuids[n]=!0,this.totalCbs+=1)}),f.on("xhr-load-removed",function(t,e){var n=""+l(t)+!!e;this.xhrGuids&&this.xhrGuids[n]&&(delete this.xhrGuids[n],this.totalCbs-=1)}),f.on("addEventListener-end",function(t,e){e instanceof m&&"load"===t[0]&&f.emit("xhr-load-added",[t[1],t[2]],e)}),f.on("removeEventListener-end",function(t,e){e instanceof m&&"load"===t[0]&&f.emit("xhr-load-removed",[t[1],t[2]],e)}),f.on("fn-start",function(t,e,n){e instanceof m&&("onload"===n&&(this.onload=!0),("load"===(t[0]&&t[0].type)||this.onload)&&(this.xhrCbStart=(new Date).getTime()))}),f.on("fn-end",function(t,e){this.xhrCbStart&&f.emit("xhr-cb-time",[(new Date).getTime()-this.xhrCbStart,this.onload,e],e)})}},{}],11:[function(t,e,n){e.exports=function(t){var e=document.createElement("a"),n=window.location,r={};e.href=t,r.port=e.port;var o=e.href.split("://");!r.port&&o[1]&&(r.port=o[1].split("/")[0].split("@").pop().split(":")[1]),r.port&&"0"!==r.port||(r.port="https"===o[0]?"443":"80"),r.hostname=e.hostname||n.hostname,r.pathname=e.pathname,r.protocol=o[0],"/"!==r.pathname.charAt(0)&&(r.pathname="/"+r.pathname);var i=!e.protocol||":"===e.protocol||e.protocol===n.protocol,a=e.hostname===document.domain&&e.port===n.port;return r.sameOrigin=i&&(!e.hostname||a),r}},{}],12:[function(t,e,n){function r(){}function o(t,e,n){return function(){return i(t,[(new Date).getTime()].concat(s(arguments)),e?null:this,n),e?void 0:this}}var i=t("handle"),a=t(15),s=t(16),c=t("ee").get("tracer"),f=NREUM;"undefined"==typeof window.newrelic&&(newrelic=f);var u=["setPageViewName","setCustomAttribute","finished","addToTrace","inlineHit"],d="api-",l=d+"ixn-";a(u,function(t,e){f[e]=o(d+e,!0,"api")}),f.addPageAction=o(d+"addPageAction",!0),e.exports=newrelic,f.interaction=function(){return(new r).get()};var p=r.prototype={createTracer:function(t,e){var n={},r=this,o="function"==typeof e;return i(l+"tracer",[Date.now(),t,n],r),function(){if(c.emit((o?"":"no-")+"fn-start",[Date.now(),r,o],n),o)try{return e.apply(this,arguments)}finally{c.emit("fn-end",[Date.now()],n)}}}};a("setName,setAttribute,save,ignore,onEnd,getContext,end,get".split(","),function(t,e){p[e]=o(l+e)}),newrelic.noticeError=function(t){"string"==typeof t&&(t=new Error(t)),i("err",[t,(new Date).getTime()])}},{}],13:[function(t,e,n){e.exports=function(t){if("string"==typeof t&&t.length)return t.length;if("object"==typeof t){if("undefined"!=typeof ArrayBuffer&&t instanceof ArrayBuffer&&t.byteLength)return t.byteLength;if("undefined"!=typeof Blob&&t instanceof Blob&&t.size)return t.size;if(!("undefined"!=typeof FormData&&t instanceof FormData))try{return JSON.stringify(t).length}catch(e){return}}}},{}],14:[function(t,e,n){var r=0,o=navigator.userAgent.match(/Firefox[\/\s](\d+\.\d+)/);o&&(r=+o[1]),e.exports=r},{}],15:[function(t,e,n){function r(t,e){var n=[],r="",i=0;for(r in t)o.call(t,r)&&(n[i]=e(r,t[r]),i+=1);return n}var o=Object.prototype.hasOwnProperty;e.exports=r},{}],16:[function(t,e,n){function r(t,e,n){e||(e=0),"undefined"==typeof n&&(n=t?t.length:0);for(var r=-1,o=n-e||0,i=Array(o<0?0:o);++r<o;)i[r]=t[e+r];return i}e.exports=r},{}],17:[function(t,e,n){function r(t){return!(t&&"function"==typeof t&&t.apply&&!t[a])}var o=t("ee"),i=t(16),a="nr@original",s=Object.prototype.hasOwnProperty,c=!1;e.exports=function(t){function e(t,e,n,o){function nrWrapper(){var r,a,s,c;try{a=this,r=i(arguments),s="function"==typeof n?n(r,a):n||{}}catch(u){d([u,"",[r,a,o],s])}f(e+"start",[r,a,o],s);try{return c=t.apply(a,r)}catch(l){throw f(e+"err",[r,a,l],s),l}finally{f(e+"end",[r,a,c],s)}}return r(t)?t:(e||(e=""),nrWrapper[a]=t,u(t,nrWrapper),nrWrapper)}function n(t,n,o,i){o||(o="");var a,s,c,f="-"===o.charAt(0);for(c=0;c<n.length;c++)s=n[c],a=t[s],r(a)||(t[s]=e(a,f?s+o:o,i,s))}function f(e,n,r){if(!c){c=!0;try{t.emit(e,n,r)}catch(o){d([o,e,n,r])}c=!1}}function u(t,e){if(Object.defineProperty&&Object.keys)try{var n=Object.keys(t);return n.forEach(function(n){Object.defineProperty(e,n,{get:function(){return t[n]},set:function(e){return t[n]=e,e}})}),e}catch(r){d([r])}for(var o in t)s.call(t,o)&&(e[o]=t[o]);return e}function d(e){try{t.emit("internal-error",e)}catch(n){}}return t||(t=o),e.inPlace=n,e.flag=a,e}},{}],ee:[function(t,e,n){function r(){}function o(t){function e(t){return t&&t instanceof r?t:t?s(t,a,i):i()}function n(n,r,o){t&&t(n,r,o);for(var i=e(o),a=l(n),s=a.length,c=0;c<s;c++)a[c].apply(i,r);var u=f[w[n]];return u&&u.push([v,n,r,i]),i}function d(t,e){m[t]=l(t).concat(e)}function l(t){return m[t]||[]}function p(t){return u[t]=u[t]||o(n)}function h(t,e){c(t,function(t,n){e=e||"feature",w[n]=e,e in f||(f[e]=[])})}var m={},w={},v={on:d,emit:n,get:p,listeners:l,context:e,buffer:h};return v}function i(){return new r}var a="nr@context",s=t("gos"),c=t(15),f={},u={},d=e.exports=o();d.backlog=f},{}],gos:[function(t,e,n){function r(t,e,n){if(o.call(t,e))return t[e];var r=n();if(Object.defineProperty&&Object.keys)try{return Object.defineProperty(t,e,{value:r,writable:!0,enumerable:!1}),r}catch(i){}return t[e]=r,r}var o=Object.prototype.hasOwnProperty;e.exports=r},{}],handle:[function(t,e,n){function r(t,e,n,r){o.buffer([t],r),o.emit(t,e,n)}var o=t("ee").get("handle");e.exports=r,r.ee=o},{}],id:[function(t,e,n){function r(t){var e=typeof t;return!t||"object"!==e&&"function"!==e?-1:t===window?0:a(t,i,function(){return o++})}var o=1,i="nr@id",a=t("gos");e.exports=r},{}],loader:[function(t,e,n){function r(){if(!g++){var t=y.info=NREUM.info,e=u.getElementsByTagName("script")[0];if(t&&t.licenseKey&&t.applicationID&&e){c(w,function(e,n){t[e]||(t[e]=n)});var n="https"===m.split(":")[0]||t.sslForHttp;y.proto=n?"https://":"http://",s("mark",["onload",a()],null,"api");var r=u.createElement("script");r.src=y.proto+t.agent,e.parentNode.insertBefore(r,e)}}}function o(){"complete"===u.readyState&&i()}function i(){s("mark",["domContent",a()],null,"api")}function a(){return(new Date).getTime()}var s=t("handle"),c=t(15),f=window,u=f.document,d="addEventListener",l="attachEvent",p=f.XMLHttpRequest,h=p&&p.prototype;NREUM.o={ST:setTimeout,CT:clearTimeout,XHR:p,REQ:f.Request,EV:f.Event,PR:f.Promise,MO:f.MutationObserver},t(12);var m=""+location,w={beacon:"bam.nr-data.net",errorBeacon:"bam.nr-data.net",agent:"js-agent.newrelic.com/nr-963.min.js"},v=p&&h&&h[d]&&!/CriOS/.test(navigator.userAgent),y=e.exports={offset:a(),origin:m,features:{},xhrWrappable:v};u[d]?(u[d]("DOMContentLoaded",i,!1),f[d]("load",r,!1)):(u[l]("onreadystatechange",o),f[l]("onload",r)),s("mark",["firstbyte",a()],null,"api");var g=0},{}]},{},["loader",2,10,4,3]);
"""

# script = """(function(){
#     $("p").click(function(){
#         $(this).hide();
#     });
# });"""

# print(script)

# elem = browser.find_element_by_xpath('//div/script[contains(., "set_date_titles")]/script')
# print(elem.text)
# help(browser.execute_async_script)     
browser.set_script_timeout(10)
print(browser.execute_async_script(script_3))

TimeoutException: Message: asynchronous script timeout: result was not received in 10 seconds
  (Session info: chrome=52.0.2743.82)
  (Driver info: chromedriver=2.20.353124 (035346203162d32c80f1dce587c8154a1efa0c3b),platform=Linux 4.4.0-34-generic x86_64)


In [258]:
# Get team member info rendered html content
response = requests.get('https://www.crunchbase.com/organization/synaptec#/entity', headers=headers)

with open('company_synaptec_raw_content.html', 'wb') as fout: 
    fout.write(response.content)
    
browser.get('file:///{}/company_synaptec_raw_content.html'.format(os.getcwd()))
company_info_rendered_content = browser.page_source

with open('company_synaptec_rendered_content.html', 'wb') as fout: 
    fout.write(company_info_rendered_content.encode('utf-8'))

company3_sel = Selector(text=company_info_rendered_content)

In [250]:
with open('{}/company_info_rendered_content.html'.format(os.getcwd())) as fin: 
    company2_sel = Selector(text=fin.read())

In [259]:
print company_sel.xpath('//div[@class="definition-list container"]//dd//a[@target="_blank"]/@href').extract_first()
print company2_sel.xpath('//div[@class="definition-list container"]//dd//a[@target="_blank"]/@href').extract_first()
print company3_sel.xpath('//div[@class="definition-list container"]//dd//a[@target="_blank"]/@href').extract_first()

http://hypothes.is
http://go-jek.com
http://synapt.ec/


In [281]:
print [urlparse.urljoin(base_url, '{}{}'.format(item,'#/entity')) for item in company_sel.xpath('//div[@class="base info-tab people"]//div[@class="info-block"]//a/@href').extract()]
print [urlparse.urljoin(base_url, '{}{}'.format(item,'#/entity')) for item in company2_sel.xpath('//div[@class="base info-tab people"]//div[@class="info-block"]//a/@href').extract()]
print [urlparse.urljoin(base_url, '{}{}'.format(item,'#/entity')) for item in company3_sel.xpath(XPATH_TEAM_MEMBER_CRUNCHBASE_LINK).extract()]

[]
['https://www.crunchbase.com/person/kevin-aluwi#/entity', 'https://www.crunchbase.com/person/sidu-ponnappa-chonira#/entity', 'https://www.crunchbase.com/person/dayu-dara-permata#/entity', 'https://www.crunchbase.com/person/nadiem-makarim#/entity', 'https://www.crunchbase.com/person/michaelangelo-moran#/entity']
[]


In [319]:
browser.get('file:///{}/raw_content.html'.format(os.getcwd()))
rendered_content = browser.page_source

sel = Selector(text=rendered_content)
sel.xpath('//meta[contains(@content, "blocked")]').extract_first()


u'<meta http-equiv="refresh" content="10; url=/distil_r_blocked.html?Ref=/organization/hypothes&amp;distil_RID=8AC59ED0-5D23-11E6-8276-FB62304ED9BC&amp;distil_TID=20160808044951">'

In [23]:
# Get company list raw html content
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36',
           'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
           'Accept-Encoding': 'gzip, deflate, sdch, br',
           'Accept-Language': 'en-US,en;q=0.8,ru;q=0.6',
           'Connection': 'keep-alive',
           'Host': 'www.crunchbase.com',
           'Referer': 'https://www.crunchbase.com/funding-rounds',
           'Upgrade-Insecure-Requests': 1,
           'Cache-Control': 'max-age=0'}

# response = requests.get('https://www.crunchbase.com/funding-rounds', headers=headers)
print(response.headers)
print(response.cookies)

# sel = Selector(text=response.content)
# sel.xpath(XPATH_CONTENT_BLOCKED).extract_first()

{'Status': '200 OK', 'X-Request-Id': 'bd9cfe70-fc6a-443d-96cd-865e90d0c44d', 'X-RUN-ESI': 'true', 'X-Content-Type-Options': 'nosniff', 'X-Cache': 'HIT - 646', 'Content-Encoding': 'gzip', 'Transfer-Encoding': 'chunked', 'Strict-Transport-Security': 'max-age=31536000', 'Vary': 'Accept-Encoding, Accept-Encoding, X-Requested-With', 'X-Runtime': '2.639960', 'Server': 'nginx', 'Connection': 'keep-alive', 'X-XSS-Protection': '1; mode=block', 'Pragma': 'no-cache', 'Cache-Control': 'no-cache, private, max-age=0, must-revalidate', 'Date': 'Tue, 09 Aug 2016 10:24:12 GMT', 'X-Frame-Options': 'SAMEORIGIN', 'Content-Type': 'text/html; charset=utf-8'}
<RequestsCookieJar[]>


In [24]:
cookie_1 = 'D_SID=195.160.235.253:2J+H1seWJcNc9ROjDZyEGqg5cDQ0o1BfTwgyXPiiXro; __qca=P0-45756871-1469613379559; s_sq=%5B%5BB%5D%5D; multivariate_bot=false; _ga=GA1.2.234616352.1470730370; _gat=1; _gat_newTracker=1; _hp2_props.973801186=%7B%22Logged%20In%22%3A%22false%22%7D; D_PID=16ADDCBE-394F-36B1-861B-8543C7875060; D_IID=1DEE125F-B0A1-3027-80A3-5F4204F4CD1C; D_UID=696BA689-1260-37B8-B384-EFBAF0E45681; D_HID=eMLkvZu/3MLUzP5eaVEehpA8gddSEOyiBThUnHhvJfg; _hp2_id.973801186=5137505279108583.8036308288180402.7282235641451016; _hp2_ses_props.973801186=%7B%22r%22%3A%22https%3A%2F%2Fwww.crunchbase.com%2Ffunding-rounds%22%2C%22ts%22%3A1470730371012%2C%22d%22%3A%22www.crunchbase.com%22%2C%22h%22%3A%22%2Ffunding-rounds%22%7D; AMCV_6B25357E519160E40A490D44%40AdobeOrg=1256414278%7CMCMID%7C47594697863871500846797941151795451047%7CMCAAMLH-1471335173%7C6%7CMCAAMB-1471335173%7CNRX38WO0n5BH8Th-nqAG_A%7CMCAID%7CNONE; s_pers=%20s_getnr%3D1470730373795-New%7C1533802373795%3B%20s_nrgvo%3DNew%7C1533802373828%3B; s_cc=true; __uvt=; uvts=4tMFxhRs2z7Lr6Rl; _px=eyJ0IjoxNDcwNzMwNzQ0OTU5LCJzIjp7ImEiOjAsImIiOjB9LCJoIjoiODgxOTM5ODBjNTc4ZmI0MmVkMTk5ZTg1MzI5ZTVlZTgwMjQ5MGM1MjM0YTliMzUxMzNlMjUyZjU5YTAxODNlNCJ9'.split('; ')
# print(cookie_1)
d_1 = dict(item.split('=') for item in cookie_1)

import json
print(json.dumps(d_1))

cookie_2 = 'D_SID=195.160.235.253:2J+H1seWJcNc9ROjDZyEGqg5cDQ0o1BfTwgyXPiiXro; __qca=P0-45756871-1469613379559'.split('; ')
d_2 = dict(item.split('=') for item in cookie_2)

print(json.dumps(d_2))
print(d_1['D_SID'])
print(d_2['D_SID'])

{"_px": "eyJ0IjoxNDcwNzMwNzQ0OTU5LCJzIjp7ImEiOjAsImIiOjB9LCJoIjoiODgxOTM5ODBjNTc4ZmI0MmVkMTk5ZTg1MzI5ZTVlZTgwMjQ5MGM1MjM0YTliMzUxMzNlMjUyZjU5YTAxODNlNCJ9", "D_SID": "195.160.235.253:2J+H1seWJcNc9ROjDZyEGqg5cDQ0o1BfTwgyXPiiXro", "__qca": "P0-45756871-1469613379559", "D_UID": "696BA689-1260-37B8-B384-EFBAF0E45681", "_gat_newTracker": "1", "D_IID": "1DEE125F-B0A1-3027-80A3-5F4204F4CD1C", "D_PID": "16ADDCBE-394F-36B1-861B-8543C7875060", "_ga": "GA1.2.234616352.1470730370", "uvts": "4tMFxhRs2z7Lr6Rl", "D_HID": "eMLkvZu/3MLUzP5eaVEehpA8gddSEOyiBThUnHhvJfg", "_hp2_props.973801186": "%7B%22Logged%20In%22%3A%22false%22%7D", "_hp2_ses_props.973801186": "%7B%22r%22%3A%22https%3A%2F%2Fwww.crunchbase.com%2Ffunding-rounds%22%2C%22ts%22%3A1470730371012%2C%22d%22%3A%22www.crunchbase.com%22%2C%22h%22%3A%22%2Ffunding-rounds%22%7D", "s_cc": "true", "s_sq": "%5B%5BB%5D%5D", "multivariate_bot": "false", "s_pers": "%20s_getnr%3D1470730373795-New%7C1533802373795%3B%20s_nrgvo%3DNew%7C1533802373828%3B", "_hp2_

In [27]:
help(browser.get)
# rendered_content = browser.page_source

Help on method get in module selenium.webdriver.remote.webdriver:

get(self, url) method of selenium.webdriver.chrome.webdriver.WebDriver instance
    Loads a web page in the current browser session.

