
Commit

added new files and trying to split the code up a little
rossjones committed Oct 18, 2011
1 parent 064db07 commit 3101fb9
Showing 3 changed files with 144 additions and 118 deletions.
214 changes: 96 additions & 118 deletions scraper/nice.py
@@ -14,140 +14,118 @@
Source code originally from https://scraperwiki.com/scrapers/nice_scraper/
"""
import os, sys

from scraper import Scraper

import lxml.html
import lxml.etree
import urllib2
import urlparse


class NiceScraper( Scraper ):

    base_url = 'http://www.nice.org.uk/guidance/index.jsp?action=ByType&type=2&status=3&p=off'
    months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]

    def run(self):
        print '+ Running NICE Scraper'

        print '+ Fetching base page'
        html = self.get_content(self.base_url)
        #print html
        root = lxml.html.fromstring(html)
        rows = root.cssselect("table#row tr")
        headers = [ th.text_content().strip() for th in rows[0] ]
        assert headers == ['Ref', 'Title', 'Date Issued', 'Review'], headers

        for n, row in enumerate(rows[1:]):
            assert row[1][0].tag == "a", lxml.html.tostring(row)
            data = dict(zip(headers, [ td.text_content().strip() for td in row ]))
            data["link"] = row[1][0].attrib.get("href")
            data['Date Issued'] = self.month_date(data['Date Issued'])
            if data['Review'] in ["", "TBC"]:
                data.pop('Review')
            else:
                data['Review'] = self.month_date(data['Review'])
            data["rownumber"] = n
            pdata = self.guide_from_page(data["link"])
            data.update(pdata)

            print data
            # scraperwiki.sqlite.save(["rownumber"], data)

    def month_date(self, d):
        # e.g. "Oct 2011" -> "2011-10"
        return "%s-%02d" % (d[4:], self.months.index(d[:3])+1)

    def get_direct_pdf(self, durl):
        html = self.get_content(durl)
        root = lxml.html.fromstring(html)
        dlink = root.cssselect("div.contentInner a#hyperlink")
        return urlparse.urljoin(durl, dlink[0].attrib.get("href"))

    def get_headings_from_pdf(self, pdfurl):
        return ""

        # pdfdata = urllib2.urlopen(pdfurl).read()
        # pdfxml = scraperwiki.pdftoxml(pdfdata)
        # root = lxml.etree.fromstring(pdfxml)

        # ldata = [ ]
        # for page in root:
        #     for el in page:
        #         # needs also to do concatenation between headings that run to two lines,
        #         # and handle headings with italics in them <i>
        #         if el.tag == "text" and el.attrib.get("font") == "10" and len(el) == 1 and el[0].tag == "b":
        #             data = {"pdfurl":pdfurl, "pagenumber":int(page.attrib.get("number")), "heading":el[0].text}
        #             ldata.append(data)
        # scraperwiki.sqlite.save(["pdfurl", "pagenumber", "heading"], ldata, "subheadings")

    def guide_from_page(self, purl):
        print '+ Getting guide from ' + purl
        phtml = self.get_content(purl)
        proot = lxml.html.fromstring(phtml)
        uloptions = proot.cssselect("div.guidance-content ul.options")
        pdata = { }
        for li in uloptions[0].cssselect("li"):
            if not li.text:
                continue
            key = li.text.strip()
            if key == 'No documents found':
                continue
            if key in ['Full guideline', 'Distribution List']:
                continue
            assert key in ["NICE guidance written for patients and carers", 'Quick reference guide', 'NICE guideline', 'Full guideline'], key
            for a in li:
                assert a.tag == "a"
                format = a.text.strip()
                if format == "Fformat MS Word":
                    continue
                if format == "documents":
                    continue
                assert format in ["PDF format", "MS Word format"], format
                ckey = "%s - %s" % (key, format[:-7])
                dpdf = a.attrib.get("href") # holding page
                pdata[ckey] = dpdf
                if format == "PDF format":
                    pdfurl = self.get_direct_pdf(dpdf)
                    pdata[key+" - PDF"] = pdfurl
                    self.get_headings_from_pdf(pdfurl)

        return pdata


if __name__ == '__main__':
    n = NiceScraper()
    n.run()
8 changes: 8 additions & 0 deletions scraper/rcog.py
@@ -0,0 +1,8 @@
import os, sys

from scraper import Scraper

import lxml.html
import lxml.etree
import urllib2
import urlparse
40 changes: 40 additions & 0 deletions scraper/scraper.py
@@ -0,0 +1,40 @@
import os, sys
from optparse import OptionParser
from ConfigParser import ConfigParser
import urllib2
import urlparse


class Scraper(object):

    def __init__(self, *args, **kwargs):
        parser = OptionParser()
        parser.add_option("-c", "--config", dest="config",
                          help="Path to the configuration file", metavar="FILE")
        parser.add_option("-v", "--verbose",
                          action="store_true", dest="verbose", default=False,
                          help="Write verbose output")
        (self.settings, args) = parser.parse_args()
        self.load_config()

    def load_config(self):
        if (not self.settings.config) or (not os.path.exists(self.settings.config)):
            print """
Can't run unless we have a config file
Please specify the path to the file with the -c option\n"""
            sys.exit(1)

        config = ConfigParser()
        config.readfp( open( self.settings.config ) )

        output_folder = config.get('scraper_settings', 'pdf_output')
        output_folder = os.path.join( os.path.dirname(__file__), output_folder)
        output_folder = os.path.abspath(output_folder)
        if self.settings.verbose:
            print 'Will save output files to %s' % (output_folder,)
        self.settings.output_folder = output_folder

    def get_content(self, url):
        return urllib2.urlopen(url).read()
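
load_config expects an INI-style file with a scraper_settings section naming a pdf_output directory, resolved relative to the scraper package. A minimal example config; the path value is illustrative only:

    [scraper_settings]
    pdf_output = ../output/pdfs

With that saved as, say, scraper.conf, a scraper would then run as:

    python scraper/nice.py -c scraper.conf -v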
