Skip to content

Commit

Permalink
Update scraper.py
Browse files (browse the repository at this point in the history)
  • Loading branch information
mxfh committed Oct 26, 2014
1 parent 44360d4 commit d36a28a
Showing 1 changed file with 14 additions and 9 deletions.
23 changes: 14 additions & 9 deletions scraper.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -18,7 +18,6 @@ def iriToUri(iri):

domain = "http://www.archiv-buergerbewegung.de/"
url = domain + "index.php/demonstrationen"
print (url)
html = scraperwiki.scrape(url) # download the html content of the page
soup = BeautifulSoup(html) # load the html into beautifulsoup

Expand All @@ -34,16 +33,22 @@ def iriToUri(iri):
bezirke.append({"name": bezirk, "url": url})

for b in bezirke:
html = scraperwiki.scrape(b["url"])
url = b["url"]
bezirk = b["name"]
print (url)
html = scraperwiki.scrape(url)
soup = BeautifulSoup(html).find(id="overlay-content")
for ortli in soup.find_all("li"):
url= iriToUri(domain + ortli.a['href'])
url= iriToUri(domain + ortli.a['href'])
title = ortli.a.contents[0]
ort = re.sub(' \(.*$', '', title)
orte.append({"name": ort, "url": url}) # put the values extracted into a list
orte.append({"name": ort, "bezirk": bezirk, "url": url}) # put the values extracted into a list

for o in orte:
html = scraperwiki.scrape(o['url'])
ort = o["name"]
bezirk = o["bezirk"]
url = o['url']
html = scraperwiki.scrape(url)
soup = BeautifulSoup(html).find(id="overlay-content")
for evententries in soup.find_all("div", class_="entry"):
datum = re.sub('^[ ]*', '', evententries.find('b', text="Datum:").next_sibling) ## remove leading spaces
Expand All @@ -60,9 +65,9 @@ def iriToUri(iri):
date = ttuple.date()
obj = {
"id" : i,
"uniq" : o["bezirk"] + o["ort"] + datum,
"bezirk": o["bezirk"],
"ort": o["ort"],
"uniq" : bezirk + ort + datum,
"bezirk": bezirk,
"ort": ort,
"datum": datum,
"jahr": ttuple.timetuple().tm_year,
"monat": ttuple.timetuple().tm_mon,
Expand All @@ -78,7 +83,7 @@ def iriToUri(iri):
"einwohner": einwohner,
"demo": demo,
"kirche": kirche,
"url": o['url']
"url": url
}
print (obj["id"],obj["uniq"],obj["teilnehmer"])
events.append(obj)
Expand Down

0 comments on commit d36a28a

Please sign in to comment.