add scraper for conservative party website

mediastandardstrust · Mar 25, 2011 · c79fbb5 · c79fbb5
1 parent e8dd1f8
commit c79fbb5
Showing 1 changed file with 142 additions and 0 deletions.
diff --git a/conservative_party.py b/conservative_party.py
@@ -0,0 +1,142 @@
+import dateutil.parser
+from base import BaseScraper
+import re
+import urllib2
+import lxml.html
+from subprocess import Popen, PIPE, STDOUT
+import email
+import email.feedparser
+import tempfile
+
+# Scrape press release archive from the conservative party website.
+#
+# The site commits a litany of atrocities, which makes it a good
+# example of a typical web site.
+#
+# Requirements:
+#   antiword  (word->text converter)
+#   Email::Outlook::Message  (CPAN module)
+#
+# As if press releases in word docs weren't bad enough, a lot (~75%?)
+# of the conservative releases are in .msg files, which is an
+# undocumented microsoft exchange (or outlook?) format.
+#
+# Luckily, Email::Outlook::Message in CPAN can parse them and turn
+# them into something sane.
+#
+# To install it from CPAN (please forgive the perl-naivite here):
+#
+#  $ sudo perl -MCPAN -e shell
+#  cpan> install Email::Outlook::Message
+#
+# it depends in turn on OLE::Storage_Lite and IO:All - cpan shell
+# should ask you if you want to install them first.
+#
+
+class Scraper(BaseScraper):
+    long_name = "The Conservative Party" 
+
+    def run(self):
+
+        # grab the latest 100
+        start_url = 'http://www.conservatives.com/Activist_centre/Press_Releases.aspx?take=100'
+
+        links_page = self.get_url(start_url)
+        for link in links_page.cssselect('.results .clfx h2 a'):
+            page_url = link.get('href')
+            title = unicode(link.text_content()).strip()
+            self.extract(page_url)
+            continue
+
+
+    def extract(self,url):
+        html = urllib2.urlopen(url).read()
+        # convert to unicode. page meta tags _claim_ page is iso-8859-1, but
+        # http headers say it's utf-8. sigh.
+        html = html.decode('utf-8')  # (The headers are right.)
+        doc = lxml.html.fromstring(html)
+        doc.make_links_absolute(url)
+
+        date_txt = unicode(doc.cssselect(".lg-content .info")[0].text_content()).strip()
+        title = unicode(doc.cssselect(".lg-content h1")[0].text_content()).strip()
+
+        published = dateutil.parser.parse(date_txt)
+#        print " title: ",title
+#        print " date: ",published
+
+        # annoyingly, the press releases can be either:
+        # - on the page, as you'd expect
+        # - attached as a word doc. grr.
+        # - attached as a .msg file. WTF?
+        attached = doc.cssselect('.lnklist a')
+        txt = u''
+        if len(attached)>0:
+            # press release in attachement
+            attachment_url = attached[0].get('href')
+            txt = self.text_from_attachment(attachment_url)
+        else:
+            # press release on page
+            maintxt = doc.cssselect('.main-txt')[0]
+            for cruft in maintxt.cssselect('.botlnks'):
+                cruft.getparent().remove(cruft)
+            txt = unicode(maintxt.text_content())
+
+        self.upsert_press_release({
+            'published'     : published,
+            'title'         : title,
+            'text'          : txt,
+            'source_link'   : url,
+        })
+
+    def text_from_attachment(self,attachment_url):
+        attachment_url = attachment_url.replace(' ','%20')
+        f = urllib2.urlopen(attachment_url)
+        data = f.read()
+
+        m = re.compile(r'filename=(.*)').search(f.info()['Content-Disposition'])
+        filename = m.group(1)
+        if filename.lower().endswith('.doc'):
+            # word file - convert to text.
+            p = Popen(['antiword', '-m', 'UTF-8', '-'], stdout=PIPE, stdin=PIPE, stderr=STDOUT)
+            txt = p.communicate(input=data)[0]
+
+            # kill the table at the top
+            pat = re.compile('^\s*[|].*?$',re.DOTALL|re.MULTILINE)
+            txt = pat.sub('',txt).strip()
+
+            txt = unicode(txt,'utf-8')
+
+            return txt
+
+        if filename.lower().endswith('.msg'):
+            # it's an exchange ".msg" file! oh, the humanity...
+
+            # first step - extract from silly .msg format
+            f = tempfile.NamedTemporaryFile()
+            f.write(data)
+            f.flush()
+            perlcmd = """use Email::Outlook::Message; print new Email::Outlook::Message('""" + f.name + """')->to_email_mime->as_string;"""
+            devnull = open('/dev/null', 'w')
+            p = Popen(['perl', '-w', '-e', perlcmd], stdout=PIPE, stdin=PIPE, stderr=devnull)
+            mimedata = p.communicate()[0]
+            f.close()
+
+            # second step - find the email text (just look for the first 'text/plain' part)
+            msg = email.message_from_string(mimedata)
+            for part in msg.walk():
+                if part.get_content_type()=='text/plain':
+                    txt = part.get_payload()
+#                    txt = txt.decode(part.get_content_charset())
+                    txt = txt.decode('utf-8',"ignore")       # KLUDGE!
+                    txt = txt.strip()
+                    return txt
+
+            assert False        # uhoh - couldn't get email text
+
+        assert False    # shouldn't get this far...
+
+if __name__ == "__main__":
+    scraper = Scraper()
+    scraper.run()
+
+