
Commit

add scraper for conservative party website
bcampbell committed Mar 25, 2011
1 parent e8dd1f8 commit c79fbb5
Showing 1 changed file with 142 additions and 0 deletions.
conservative_party.py
@@ -0,0 +1,142 @@
import dateutil.parser
from base import BaseScraper
import re
import urllib2
import lxml.html
from subprocess import Popen, PIPE, STDOUT
import email
import tempfile

# Scrape press release archive from the conservative party website.
#
# The site commits a litany of atrocities, which makes it a good
# example of a typical web site.
#
# Requirements:
#  python-dateutil, lxml (python modules)
#  antiword (word->text converter)
#  Email::Outlook::Message (CPAN module)
#
# As if press releases in word docs weren't bad enough, a lot (~75%?)
# of the conservative releases are in .msg files, which is an
# undocumented microsoft exchange (or outlook?) format.
#
# Luckily, Email::Outlook::Message in CPAN can parse them and turn
# them into something sane.
#
# To install it from CPAN (please forgive the perl-naivety here):
#
# $ sudo perl -MCPAN -e shell
# cpan> install Email::Outlook::Message
#
# It depends in turn on OLE::Storage_Lite and IO::All; the cpan shell
# should ask if you want to install them first.
#
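# A quick way to check that both converters work from the shell (the
# filenames here are just illustrative):
#
#   $ antiword -m UTF-8 some_release.doc
#   $ perl -e "use Email::Outlook::Message; print new \
#       Email::Outlook::Message('some_release.msg')->to_email_mime->as_string;"
#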

class Scraper(BaseScraper):
    long_name = "The Conservative Party"

    def run(self):
        # grab the latest 100
        start_url = 'http://www.conservatives.com/Activist_centre/Press_Releases.aspx?take=100'

        links_page = self.get_url(start_url)
        for link in links_page.cssselect('.results .clfx h2 a'):
            page_url = link.get('href')
            self.extract(page_url)


    def extract(self, url):
        html = urllib2.urlopen(url).read()
        # convert to unicode. page meta tags _claim_ page is iso-8859-1, but
        # http headers say it's utf-8. sigh.
        html = html.decode('utf-8')  # (The headers are right.)
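        # (if in doubt, the header's claim can be read back directly with
        # something like urllib2.urlopen(url).info().getparam('charset'))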
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        date_txt = unicode(doc.cssselect(".lg-content .info")[0].text_content()).strip()
        title = unicode(doc.cssselect(".lg-content h1")[0].text_content()).strip()

        published = dateutil.parser.parse(date_txt)
        # print " title: ", title
        # print " date: ", published

        # annoyingly, the press releases can be either:
        #  - on the page, as you'd expect
        #  - attached as a word doc. grr.
        #  - attached as a .msg file. WTF?
        attached = doc.cssselect('.lnklist a')
        txt = u''
        if len(attached) > 0:
            # press release in attachment
            attachment_url = attached[0].get('href')
            txt = self.text_from_attachment(attachment_url)
        else:
            # press release on page
            maintxt = doc.cssselect('.main-txt')[0]
            for cruft in maintxt.cssselect('.botlnks'):
                cruft.getparent().remove(cruft)
            txt = unicode(maintxt.text_content())

        self.upsert_press_release({
            'published': published,
            'title': title,
            'text': txt,
            'source_link': url,
        })

    def text_from_attachment(self, attachment_url):
        attachment_url = attachment_url.replace(' ', '%20')
        f = urllib2.urlopen(attachment_url)
        data = f.read()

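        # the server supplies the original filename in the Content-Disposition
        # header, typically something like: attachment; filename=Some Release.doc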
        m = re.compile(r'filename=(.*)').search(f.info()['Content-Disposition'])
        filename = m.group(1)
        if filename.lower().endswith('.doc'):
            # word file - convert to text.
            p = Popen(['antiword', '-m', 'UTF-8', '-'], stdout=PIPE, stdin=PIPE, stderr=STDOUT)
            txt = p.communicate(input=data)[0]

            # kill the table at the top
            pat = re.compile(r'^\s*[|].*?$', re.DOTALL | re.MULTILINE)
            txt = pat.sub('', txt).strip()

            txt = unicode(txt, 'utf-8')

            return txt

        if filename.lower().endswith('.msg'):
            # it's an exchange ".msg" file! oh, the humanity...

            # first step - extract from silly .msg format
            f = tempfile.NamedTemporaryFile()
            f.write(data)
            f.flush()
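            # the perl one-liner below has Email::Outlook::Message read the
            # temp file and print a standard MIME message to stdout, which
            # python's stdlib email module can then parse.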
            perlcmd = """use Email::Outlook::Message; print new Email::Outlook::Message('""" + f.name + """')->to_email_mime->as_string;"""
            devnull = open('/dev/null', 'w')
            p = Popen(['perl', '-w', '-e', perlcmd], stdout=PIPE, stdin=PIPE, stderr=devnull)
            mimedata = p.communicate()[0]
            f.close()

            # second step - find the email text (just look for the first
            # 'text/plain' part)
            msg = email.message_from_string(mimedata)
            for part in msg.walk():
                if part.get_content_type() == 'text/plain':
                    txt = part.get_payload()
                    # txt = txt.decode(part.get_content_charset())
                    txt = txt.decode('utf-8', 'ignore')  # KLUDGE!
                    txt = txt.strip()
                    return txt

            assert False  # uhoh - couldn't get email text

        assert False  # shouldn't get this far...

if __name__ == "__main__":
    scraper = Scraper()
    scraper.run()

