Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
markbrough committed May 19, 2017
0 parents commit 8171135
Showing 1 changed file with 31 additions and 0 deletions.
31 changes: 31 additions & 0 deletions scraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# World Factbook field-listing page (field 2080) fetched by get_page();
# run() reads a country name and a fiscal-year value from each table row.
URL = "https://www.cia.gov/library/publications/the-world-factbook/fields/2080.html"

from lxml import html
import scraperwiki
import requests

def get_page():
    """Download the page at ``URL`` and return it as a parsed lxml HTML tree.

    Returns:
        The root element produced by ``lxml.html.fromstring``.

    Raises:
        requests.RequestException: on a network failure, a timeout, or a
            non-success (4xx/5xx) HTTP response.
    """
    # A timeout keeps the scraper from hanging forever on a stalled connection.
    response = requests.get(URL, timeout=60)
    # Fail loudly instead of parsing an error page as if it were data.
    response.raise_for_status()
    # Pass the raw bytes so lxml can honour the encoding declared in the
    # document itself rather than relying on requests' header-based guess.
    return html.fromstring(response.content)

def clean_fy(value):
    """Return the start date of the fiscal year described by *value*.

    Args:
        value: Raw fiscal-year text, e.g. ``"1 April - 31 March"``,
            ``"calendar year"`` or ``"NA"``. May be ``None`` or empty.

    Returns:
        ``"1 January"`` for ``"calendar year"``; ``"Unknown"`` for ``"NA"``,
        ``None`` or blank input; otherwise the text before the first hyphen
        with surrounding whitespace removed. Text without a hyphen is
        returned stripped but otherwise unchanged.
    """
    if value is None:
        return "Unknown"
    value = value.strip()
    if not value or value == "NA":
        return "Unknown"
    if value == "calendar year":
        return "1 January"
    # partition() splits on the first hyphen only, so values with no hyphen
    # or with extra hyphens (e.g. a trailing note) no longer raise ValueError.
    fy_start, _, _ = value.partition("-")
    # "1 April - 31 March" would otherwise yield "1 April " with a stray space.
    return fy_start.strip()

def run():
    """Scrape the fiscal-year table and save one record per country row.

    Each saved record has the keys ``code`` (the row's ``id`` attribute,
    upper-cased), ``name`` (the country name from the first cell) and
    ``fy_start`` (the second cell's text passed through ``clean_fy``).
    Records are written with ``scraperwiki.sqlite.save`` using ``code`` as
    the unique key.

    Rows without an ``id`` attribute or with fewer than two ``td`` cells are
    skipped instead of aborting the whole run.
    """
    page = get_page()
    table = page.xpath("//table")[0]
    # NOTE(review): "//tr" is an absolute XPath, so it matches every row in
    # the whole document, not only rows inside `table`. Kept as-is to
    # preserve which rows are scraped; ".//tr" would restrict it to `table`.
    for row in table.xpath("//tr")[1:]:
        country_code = row.get("id")
        cols = row.xpath("td")
        if country_code is None or len(cols) < 2:
            # Not a country data row (e.g. a heading or spacer row).
            continue

        # Prefer the link text; fall back to the cell's own text if the
        # cell has no direct <a> child.
        link = cols[0].find("a")
        name_node = link if link is not None else cols[0]
        country_name = (name_node.text or "").strip()

        # .text is None when the cell has no leading text; "NA" is the
        # value clean_fy maps to "Unknown".
        fiscal_year = cols[1].text or "NA"

        scraperwiki.sqlite.save(unique_keys=['code'], data={
            "code": country_code.upper(),
            # Store the scraped name; the quoted literal "country_name" was
            # previously written for every row.
            "name": country_name,
            "fy_start": clean_fy(fiscal_year)})

# Module-level call: the scrape starts as soon as this file is executed
# (it would also start if the file were imported).
run()

0 comments on commit 8171135

Please sign in to comment.