# This is a template for a Python scraper on morph.io (https://morph.io)
# including some code snippets below that you should find helpful
# import scraperwiki
# import lxml.html
#
# # Read in a page
# html = scraperwiki.scrape("http://foo.com")
#
# # Find something on the page using css selectors
# root = lxml.html.fromstring(html)
# root.cssselect("div[align='left']")
#
# # Write out to the sqlite database using scraperwiki library
# scraperwiki.sqlite.save(unique_keys=['name'], data={"name": "susan", "occupation": "software developer"})
#
# # An arbitrary query against the database
# scraperwiki.sql.select("* from data where 'name'='peter'")
# You don't have to do things with the ScraperWiki and lxml libraries.
# You can use whatever libraries you want: https://morph.io/documentation/python
# All that matters is that your final data is written to an SQLite database
# called "data.sqlite" in the current working directory which has at least a table
# called "data".
## Load modules
import re
import time
import traceback
from urllib.parse import quote

import pandas as pd
import requests
import scraperwiki
from bs4 import BeautifulSoup
started_scrape = False
while True:
    try:
        if not started_scrape:
            print("Loading companies to scrape...")
            rescrape_companies = pd.read_csv("companies_only.csv")
        s = requests.Session()
        ## Make the initial request for the search page - still need to work
        ## out how to extract the "global session cookie" automatically;
        ## currently it is pulled out of the response by hand below
        headers = {
            'Accept-Encoding': 'gzip, deflate, sdch, br',
            'Accept-Language': 'en-US,en;q=0.8',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Connection': 'keep-alive',
        }
        r1 = s.get('https://www.ipa.gov.pg/pngmaster/service/create.html?targetAppCode=pngmaster&targetRegisterAppCode=pngcompanies&service=registerItemSearch&target=pngmaster', headers=headers)
        url = r1.url
        session = re.search(r".*id=(.*)&.*", url).group(1)
        timestamp = re.search(r".*timestamp=(.*)$", url).group(1)
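        # The two regexes above work on the redirect URL; an equivalent,
        # untested sketch using the standard library's query-string parser:
        # from urllib.parse import urlparse, parse_qs
        # params = parse_qs(urlparse(url).query)
        # session, timestamp = params['id'][0], params['timestamp'][0]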
        ## Parse the search page and retrieve the node IDs and session ID that
        ## every later callback has to echo back
        search_page_content = BeautifulSoup(r1.text, 'html.parser')
        script = search_page_content.find_all('script')[7].contents[0]
        sessionID = re.search(r'.*sessionId:\'(.*)\',sessionIdHeaderName.*', script).group(1)
        CBNODE = search_page_content.findAll("a", {"class": "appButton registerItemSearch-tabs-criteriaAndButtons-buttonPad-search appSearchButton appPrimaryButton appButtonSecondary appSubmitButton appNotReadOnly appIndex2"})[0]['id'].replace('node', '')
        CBHTMLFRAGNODEID = search_page_content.findAll("div", {"class": "appRecord appBlock appRecordGraphIndexNodes appRecordNone registerItemSearch pageHelp appNotReadOnly appChildCount2"})[0]['id'].replace('node', '')
        VIKEY = re.search(r'.*viewInstanceKey:\'(.*)\',callbackNodeParam.*', r1.text).group(1)
        CBHTMLFRAGID = timestamp
        ADVANCEDNODE = search_page_content.findAll("div", {"class": "registerItemSearch-tabs-criteriaAndButtons-criteria-AdvancedSearchCriteria appAttribute Advanced Attribute-Advanced appDc-registerItemSearch.tabs.criteriaAndButtons.criteria.AdvancedSearchCriteria appNonBlankAttribute expand appNotReadOnly appIndex2 appRestrictedAtt appRestrictedAttExpand"})[0]['id'].replace('node', '')
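        # The selectors above match the full space-separated class strings, so
        # they break if the app reorders its utility classes. A looser,
        # untested alternative keys on one stable token instead (BeautifulSoup's
        # class_ matches any single class on the element), e.g.:
        # CBNODE = search_page_content.find("a", class_="appSearchButton")["id"].replace("node", "")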
        index = 0
        while not rescrape_companies.empty and index <= rescrape_companies[-1:].index.tolist()[0]:
            try:
                for index, row_values in rescrape_companies.iterrows():
                    print(row_values['Entity Number'])
                    print("Working...")
                    ## Submit the search for this company (cookies stripped
                    ## from the recorded request)
                    headers = {
                        'Origin': 'https://www.ipa.gov.pg',
                        'Accept-Encoding': 'gzip, deflate, br',
                        'Accept-Language': 'en-US,en;q=0.8',
                        'X-Requested-With': 'XMLHttpRequest',
                        'x-catalyst-session-global': sessionID,
                        'Connection': 'keep-alive',
                        'x-catalyst-async': 'true',
                        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
                        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                        'x-security-token': 'null',
                        'Accept': 'text/html, */*; q=0.01',
                        'Referer': 'https://www.ipa.gov.pg/pngmaster/viewInstance/view.html?id=%s' % (session,),
                        'x-catalyst-secured': 'true',
                    }
                    SEARCH_STRING = quote(row_values['Search term'])
                    data = '_CBHTMLFRAGID_=%s&_CBHTMLFRAG_=true&_CBNODE_=%s&_VIKEY_=%s&_CBNAME_=buttonPush&_CBASYNCUPDATE_=true&_CBHTMLFRAGNODEID_=%s&QueryString=%s&SourceAppCode=pngcompanies&node%s-Advanced=N' \
                        % (CBHTMLFRAGID, CBNODE, VIKEY, CBHTMLFRAGNODEID, SEARCH_STRING, ADVANCEDNODE)
                    r2 = requests.post('https://www.ipa.gov.pg/pngmaster/viewInstance/update.html?id=%s' % (session,), headers=headers, cookies=r1.cookies, data=data)
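                    # The _CB* fields are the back end's callback parameters
                    # (the x-catalyst-* headers suggest a Catalyst-style UI
                    # framework): _CBNAME_=buttonPush appears to replay a click
                    # on the Search button identified by CBNODE, within the
                    # view instance named by VIKEY.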
                    ## Request the company page using the result's node id
                    cookies = {
                        'x-catalyst-session-global': sessionID,
                        'x-catalyst-locale': 'en',
                        'x-catalyst-timezone': 'Pacific/Port_Moresby',
                    }
                    headers = {
                        'Origin': 'https://www.ipa.gov.pg',
                        'Accept-Encoding': 'gzip, deflate, br',
                        'Accept-Language': 'en-US,en;q=0.8',
                        'Upgrade-Insecure-Requests': '1',
                        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
                        'Content-Type': 'application/x-www-form-urlencoded',
                        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                        'Cache-Control': 'max-age=0',
                        'Connection': 'keep-alive',
                    }
                    results_page = BeautifulSoup(r2.text, 'html.parser')
                    company_id = results_page.findAll("a", {"class": "registerItemSearch-results-page-line-ItemBox-resultLeft-viewMenu appMenu appMenuItem appMenuDepth0 noSave appItemSearchResult viewInstanceUpdateStackPush appReadOnly appIndex0"})[0]['id'].replace('node', '')
                    data = 'QueryString=%s&SourceAppCode=pngcompanies&node%s-Advanced=N&_scrollTop=118&_CBNODE_=%s&_CBNAME_=invokeMenuCb&_CBVALUE_=&_VIKEY_=%s' \
                        % (SEARCH_STRING, ADVANCEDNODE, company_id, VIKEY)
                    r3 = requests.post('https://www.ipa.gov.pg/pngmaster/viewInstance/update.html?id=%s' % (session,), headers=headers, cookies=cookies, data=data)
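                    # r3 is now the company record view. Tabs are selected by
                    # index below: on this app tab 2 holds directors, tab 3
                    # shareholders and tab 5 secretaries, which is why only
                    # [2, 3, 5] are requested.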
                    for tab in [2, 3, 5]:
                        cookies = {
                            'x-catalyst-session-global': sessionID,
                            'x-catalyst-locale': 'en',
                            'x-catalyst-timezone': 'Pacific/Port_Moresby',
                        }
                        headers = {
                            'Origin': 'https://www.ipa.gov.pg',
                            'Accept-Encoding': 'gzip, deflate, br',
                            'Accept-Language': 'en-US,en;q=0.8',
                            'X-Requested-With': 'XMLHttpRequest',
                            'x-catalyst-session-global': sessionID,
                            'Connection': 'keep-alive',
                            'x-catalyst-async': 'true',
                            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
                            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                            'x-security-token': 'null',
                            'Accept': 'text/html, */*; q=0.01',
                            'x-catalyst-secured': 'true',
                        }
                        company_page = BeautifulSoup(r3.text, 'html.parser')
                        tabul = company_page.findAll("ul", {"class": "appTabs"})[0]["id"].replace("tabul", "")
                        VIKEY2 = re.search(r'.*viewInstanceKey:\'(.*)\',callbackNodeParam.*', r3.text).group(1)
                        posturl = company_page.findAll("form")[1]["action"]
                        data = '_CBHTMLFRAGID_=%s&_CBHTMLFRAG_=true&_CBNODE_=%s&_VIKEY_=%s&_CBNAME_=tabSelect&_CBVALUE_=%s&_CBASYNCUPDATE_=true' % (timestamp, tabul, VIKEY2, tab)
                        r4 = requests.post(posturl, headers=headers, cookies=cookies, data=data)
                        soup = BeautifulSoup(r4.text, 'html.parser')
                        for div in soup.findAll("div", {"class": "appDialogRepeaterRowContent"}):
                            labels = []
                            values = []
                            for label in div.select(".appAttrLabelBox"):
                                label = label.text
                                if label == "Company Name":
                                    label = label.replace("Company ", "")
                                labels.append(label)
                            for value in div.select(".appAttrValue"):
                                values.append(value.text)
                            values = [value.strip() for value in values]
                            comb = dict(zip(labels, values))
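                            # comb maps one person's attribute labels to their
                            # values - illustrative shape only:
                            # {"Name": "...", "Residential Address": "...", ...}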
comb["company_number"] = row_values['Entity Number']
if tab == 2:
scraperwiki.sqlite.save(["Name","company_number","Residential Address"], comb, table_name="directors")
elif tab == 3:
scraperwiki.sqlite.save(["Name", "company_number", "Residential or Registered Office Address"], comb, table_name="shareholders")
elif tab == 5:
scraperwiki.sqlite.save(["Name","company_number","Residential Address"], comb, table_name="secretaries")
scraperwiki.sqlite.save(["company_number"], {"company_number": row_values['Entity Number'],"error": False}, table_name="companies")
except:
scraperwiki.sqlite.save(["company_number"], {"company_number": row_values['Entity Number'],"error": True}, table_name="companies")
print "Error. Skipping company. Starting from " + str(index) + "."
print traceback.format_exc()
rescrape_companies = rescrape_companies.ix[index + 1:]
started_scrape = True
break
    except Exception:
        print(traceback.format_exc())
        time.sleep(15)
        print("Error. Restarting scraper...")