Skip to content
Permalink
Browse files

WIP collect supranational assemblies

  • Loading branch information
Duncan Parkes
Duncan Parkes committed Feb 19, 2015
1 parent e3f6a90 commit 402664bd8638e908f2cd00fdd77848500c8d5bdf
Showing with 34 additions and 14 deletions.
  1. +34 −14 scraper.py
@@ -6,15 +6,10 @@
import requests
from bs4 import BeautifulSoup

source_url = 'http://en.wikipedia.org/wiki/List_of_legislatures_by_country'
html = requests.get(source_url).text
soup = BeautifulSoup(html, 'html.parser')

title_span = soup.find('span', {'id': 'Legislatures_of_UN_member_states'})
data_table = title_span.parent.find_next('table')


class WikiTable(object):
legistlature_type = None

def __init__(self, table_element):
self.element = table_element
self.column_indices = dict(enumerate(
@@ -27,7 +22,7 @@ def store_data(self, keys=None, id_keys=None):
and id_keys as a list of keys to provide an id when
concatenated."""
remaining_rowspans = [0] * len(self.column_indices)
data = {}
data = {'legistlature type': self.legistlature_type}

for row in self.element.find_all('tr')[1:]:
tds = row.find_all('td')
@@ -47,19 +42,44 @@ def store_data(self, keys=None, id_keys=None):
key = self.column_indices.get(col)
data[key] = self.get_data(key, td)

data['id'] = hashlib.md5(
(u'-'.join([data[id_key] for id_key in id_keys])).encode('utf_8')
).hexdigest()
try:
data['id'] = hashlib.md5(
(u'-'.join([data[id_key] for id_key in id_keys])).encode('utf_8')
).hexdigest()
except:
print data
raise

scraperwiki.sqlite.save(unique_keys=('id',), data=data)


class UNMembersTable(WikiTable):
def get_data(self, key, td):
return ' '.join(td.stripped_strings)


UNMembersTable(data_table).store_data(
# WikiTable variant used for the table found under the
# 'Legislatures_of_UN_member_states' heading. store_data() seeds each saved
# record with this value under the 'legistlature type' key.
# NOTE(review): 'legistlature' looks like a misspelling of 'legislature'; it
# is left as-is because the same spelling is used for the stored key.
class UNMembersTable(WikiTable):
legistlature_type = 'UN member'


# WikiTable variant used for the table found under the
# 'Supranational_legislatures' heading. store_data() seeds each saved record
# with this value under the 'legistlature type' key.
# NOTE(review): 'legistlature' looks like a misspelling of 'legislature'; it
# is left as-is because the same spelling is used for the stored key.
class SupranationalTable(WikiTable):
legistlature_type = 'Supranational'


# Script entry: download the Wikipedia list of legislatures once, then scrape
# two of its tables (UN member states and supranational bodies) from the same
# parsed document.
source_url = 'http://en.wikipedia.org/wiki/List_of_legislatures_by_country'
# NOTE(review): no timeout or HTTP status check is applied to this request.
html = requests.get(source_url).text
soup = BeautifulSoup(html, 'html.parser')

# Each table is located via the <span> carrying the section's id: take the
# span's parent element and then the first <table> that follows it.
# NOTE(review): soup.find() returns None when nothing matches, so a renamed
# section id would surface here as an AttributeError on `.parent`.
un_members_title_span = soup.find('span', {'id': 'Legislatures_of_UN_member_states'})
un_members_table = un_members_title_span.parent.find_next('table')

# The values of the id_keys columns are joined and hashed by store_data() to
# form each record's unique 'id'.
UNMembersTable(un_members_table).store_data(
keys=('Country', 'Name of house'),
id_keys=('Country', 'Name of house'),
)

# Same lookup for the supranational section; its table is keyed by
# 'Organisation' rather than 'Country'.
supranational_span = soup.find('span', {'id': 'Supranational_legislatures'})
supranational_table = supranational_span.parent.find_next('table')

SupranationalTable(supranational_table).store_data(
keys=('Organisation', 'Name of house'),
id_keys=('Organisation', 'Name of house'),
)

0 comments on commit 402664b

Please sign in to comment.
You can’t perform that action at this time.