Skip to content

Commit

Permalink
WIP collect supranational assemblies
Browse files Browse the repository at this point in the history
  • Loading branch information
Duncan Parkes committed Feb 19, 2015
1 parent e3f6a90 commit b14d1ce
Showing 1 changed file with 27 additions and 11 deletions.
38 changes: 27 additions & 11 deletions scraper.py
Expand Up @@ -6,15 +6,10 @@
import requests
from bs4 import BeautifulSoup

# Wikipedia page that the legislature tables are scraped from.
source_url = 'http://en.wikipedia.org/wiki/List_of_legislatures_by_country'
# Download the page body and parse it with the 'html.parser' backend.
html = requests.get(source_url).text
soup = BeautifulSoup(html, 'html.parser')

# Locate the section heading span by its id, then take the first <table>
# that follows the span's parent element.
# NOTE(review): soup.find() returns None when the id is absent, which would
# make the next line fail with AttributeError - confirm this is acceptable.
title_span = soup.find('span', {'id': 'Legislatures_of_UN_member_states'})
data_table = title_span.parent.find_next('table')


class WikiTable(object):
legistlature_type = None

def __init__(self, table_element):
self.element = table_element
self.column_indices = dict(enumerate(
Expand All @@ -27,7 +22,7 @@ def store_data(self, keys=None, id_keys=None):
and id_keys as a list of keys to provide an id when
concatenated."""
remaining_rowspans = [0] * len(self.column_indices)
data = {}
data = {'legistlature type': self.legistlature_type}

for row in self.element.find_all('tr')[1:]:
tds = row.find_all('td')
Expand All @@ -53,13 +48,34 @@ def store_data(self, keys=None, id_keys=None):

scraperwiki.sqlite.save(unique_keys=('id',), data=data)


class UNMembersTable(WikiTable):
def get_data(self, key, td):
    """Return the text of the cell ``td`` as one space-separated string.

    ``key`` is accepted but not used by this implementation.
    """
    text_fragments = td.stripped_strings
    return ' '.join(text_fragments)


UNMembersTable(data_table).store_data(
class UNMembersTable(WikiTable):
    """WikiTable variant whose legislature type label is 'UN member'."""

    # NOTE(review): 'legistlature' is a misspelling of 'legislature'; the
    # name is kept as-is because the base class declares the same attribute.
    legistlature_type = 'UN member'


class SupranationalTable(WikiTable):
    """WikiTable variant whose legislature type label is 'Supranational'."""

    # NOTE(review): 'legistlature' is a misspelling of 'legislature'; the
    # name is kept as-is because the base class declares the same attribute.
    legistlature_type = 'Supranational'


# Wikipedia page that both legislature tables are scraped from.
source_url = 'http://en.wikipedia.org/wiki/List_of_legislatures_by_country'

# Fail fast on a hung connection or an HTTP error response instead of
# parsing whatever body happened to come back.
response = requests.get(source_url, timeout=30)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'html.parser')


def _find_section_table(heading_id):
    """Return ``(heading_span, table)`` for the section anchored at heading_id.

    The table is the first ``<table>`` found after the parent element of
    ``<span id=heading_id>``.

    Raises:
        ValueError: if the span, or a table following it, is not present
            in the parsed page. This replaces the opaque AttributeError
            that a None lookup result would otherwise cause.
    """
    heading_span = soup.find('span', {'id': heading_id})
    if heading_span is None:
        raise ValueError(
            'No <span id=%r> found in %s' % (heading_id, source_url))
    table = heading_span.parent.find_next('table')
    if table is None:
        raise ValueError(
            'No table follows <span id=%r> in %s' % (heading_id, source_url))
    return heading_span, table


un_members_title_span, un_members_table = _find_section_table(
    'Legislatures_of_UN_member_states')

# Rows are identified by the combination of country and house name.
UNMembersTable(un_members_table).store_data(
    keys=('Country', 'Name of house'),
    id_keys=('Country', 'Name of house'),
)

supranational_span, supranational_table = _find_section_table(
    'Supranational_legislatures')

# Rows are identified by the combination of organisation and house name.
SupranationalTable(supranational_table).store_data(
    keys=('Organisation', 'Name of house'),
    id_keys=('Organisation', 'Name of house'),
)

0 comments on commit b14d1ce

Please sign in to comment.