Skip to content

Commit

Permalink
Push to github, using code borrowed from @andylolz
Browse files Browse the repository at this point in the history
  • Loading branch information
markbrough committed May 19, 2017
1 parent 36672bb commit 349c570
Showing 1 changed file with 58 additions and 10 deletions.
68 changes: 58 additions & 10 deletions scraper.py
Expand Up @@ -3,6 +3,15 @@
import csv
import shutil
from glob import glob
from os import environ, remove
from os.path import join

import requests
import scraperwiki
import unicodecsv
from git import Repo
from lxml import html

# Working directory for the local git clone that gets pushed to GitHub;
# created/removed by init_git_repo() and push_to_github().
output_dir = "output"
# Directory the scraped CSV output is written to by run().
data_dir = "data"

def get_page():
r = requests.get(URL)
Expand All @@ -14,18 +23,57 @@ def clean_fy(value):
if value == "NA": return "Unknown"
fy_start, fy_end = value.split("-")
return fy_start


def save_csv(name, codelist, fieldnames):
    """Write *codelist* (an iterable of dicts) to ``data_dir/name`` as CSV.

    All fields are quoted and the column order follows *fieldnames*.
    The original body referenced an undefined ``writer`` and ignored both
    ``name`` and ``fieldnames``; it now creates the writer and header itself.
    unicodecsv encodes rows to UTF-8 bytes, so the file is opened in binary
    mode.
    """
    with open(join(data_dir, name), 'wb') as f:
        writer = unicodecsv.DictWriter(f, fieldnames=fieldnames,
                                       quoting=csv.QUOTE_ALL)
        writer.writeheader()
        for row in codelist:
            writer.writerow(row)

def init_git_repo():
    """Prepare a fresh local repo in ``output_dir`` on an ``update`` branch.

    Removes any stale ``output_dir``, initialises a git repo there with an
    authenticated ``origin`` remote, pulls the remote ``update`` branch
    (falling back to ``gh-pages`` when ``update`` does not exist yet),
    checks out a new ``update`` branch, and deletes previously generated
    CSVs so upstream deletions are reflected in the next commit.

    Requires the ``MORPH_GH_API_KEY`` environment variable.
    """
    shutil.rmtree(output_dir, ignore_errors=True)
    git = Repo.init(output_dir).git
    # The API token is embedded in the remote URL so later pushes need no
    # interactive credentials.
    git.remote('add', 'origin',
               'https://{}@github.com/markbrough/country-fiscal-years.git'.format(
                   environ.get('MORPH_GH_API_KEY')))
    try:
        git.pull('origin', 'update')
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # still propagate. The `update` branch may simply not exist on the
        # remote yet - start from gh-pages instead.
        git.pull('origin', 'gh-pages')
    git.checkout(b='update')
    # NOTE(review): this clears CSVs in data_dir while the repo lives in
    # output_dir - confirm data_dir is meant to be (or be copied into) the
    # repo working tree before push_to_github() commits.
    for to_remove in glob(join(data_dir, '*.csv')):
        remove(to_remove)

def push_to_github():
    """Commit everything in ``output_dir`` to the ``update`` branch, push it,
    open a pull request against ``gh-pages``, then delete ``output_dir``.

    Reads credentials from the ``MORPH_GH_EMAIL``, ``MORPH_GH_USERNAME`` and
    ``MORPH_GH_API_KEY`` environment variables. Assumes init_git_repo() has
    already set up the repo and its authenticated ``origin`` remote.
    """
    url = 'https://api.github.com/repos/markbrough/country-fiscal-years/pulls'
    # Re-attach to the existing repo created by init_git_repo(); Repo.init
    # on an existing repo does not reinitialise its history.
    git = Repo.init(output_dir).git
    git.add('.')
    git.config('user.email', environ.get('MORPH_GH_EMAIL'))
    git.config('user.name', environ.get('MORPH_GH_USERNAME'))
    git.commit(m='Update')
    # Push authenticates via the token embedded in the origin URL.
    git.push('origin', 'update')
    payload = {
    'title': 'Merge in latest changes',
    'body': 'This is an auto- pull request.',
    'head': 'update',
    'base': 'gh-pages',
    }
    # NOTE(review): the response is never checked, so a failed PR creation
    # (e.g. a PR already open for this branch, HTTP 422) passes silently -
    # confirm this best-effort behaviour is intended.
    r = requests.post(url, json=payload, auth=(environ.get('MORPH_GH_USERNAME'), environ.get('MORPH_GH_API_KEY')))
    # Clean up the working clone regardless of the PR outcome.
    shutil.rmtree(output_dir, ignore_errors=True)

def run():
    """Scrape the fiscal-year table, saving each row to sqlite and to a CSV.

    Fetches the page via get_page(), walks the first table's data rows,
    upserts each country (keyed on ``code``) into the scraperwiki sqlite
    store, and mirrors the same rows into
    ``data_dir/countries_fiscal_years.csv`` with all fields quoted.

    Fixes over the previous version:
    - the table was iterated twice, upserting every row into sqlite twice;
      a single pass now does both the sqlite save and the CSV write.
    - the CSV file was opened in text mode (``'w'``, encoding='utf-8') even
      though unicodecsv writers emit UTF-8 *bytes*; it is now opened 'wb'.
    """
    page = get_page()
    table = page.xpath("//table")[0]
    with open(join(data_dir, 'countries_fiscal_years.csv'), 'wb') as f:
        writer = unicodecsv.DictWriter(f, fieldnames=["code", "name", "fy_start"],
                                       quoting=csv.QUOTE_ALL)
        writer.writeheader()
        # Skip the header row; each data row's id attribute is the country code.
        for row in table.xpath("//tr")[1:]:
            cols = row.xpath("td")
            data = {
                "code": row.get("id").upper(),
                "name": cols[0].find("a").text,
                "fy_start": clean_fy(cols[1].text),
            }
            scraperwiki.sqlite.save(unique_keys=['code'], data=data)
            writer.writerow(data)

# Only scrape when executed as a script, so the module can be imported
# (e.g. for testing) without side effects.
if __name__ == "__main__":
    run()

0 comments on commit 349c570

Please sign in to comment.