Skip to content

Commit

Permalink
Re #6897. Adding logging and ability to download files.
Browse files Browse the repository at this point in the history
  • Loading branch information
peterfpeterson committed Apr 30, 2013
1 parent b698a21 commit 3e738da
Show file tree
Hide file tree
Showing 2 changed files with 131 additions and 1 deletion.
38 changes: 37 additions & 1 deletion Code/Mantid/docs/qtassistant/checkmissingimg.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#!/usr/bin/env python
import logging
import os
import re
from fetchimg import Fetcher

from mediawiki import IMG_NOT_FOUND

Expand All @@ -25,6 +27,9 @@ def processHtml(htmldir, filename):
@param filename The html file to parse and look for missing images in.
@returns All Missing image files.
"""
logger = logging.getLogger("processHtml")
logger.info("processHtml(%s, %s)" % (htmldir, filename))

# read in the entire html file
handle = file(filename, 'r')
text = handle.read()
Expand All @@ -49,24 +54,28 @@ def processHtml(htmldir, filename):
options.append(candidate[:end])
# that are set to IMG_NOT_FOUND
if IMG_NOT_FOUND in text:
logger.info("IMAGE_NOT_FOUND in '%s'" % filename)
candidates = []
index = 0
while index >= 0:
index = text.find(IMG_NOT_FOUND, index)
end = text.find("</figure>")
end = text.find("</figure>", index)
if end < index or index < 0:
break
figs = re.findall(r'Missing image:\s+(.+)</figcaption>',
text[index:end])
candidates.extend(figs)
index += len(IMG_NOT_FOUND)
logger.info("CANDIDATES: %s" % str(candidates))
options.extend(candidates)

# add them to the list of missing images if not found
results = []
for candidate in options:
candidate = os.path.join(htmldir, candidate)
logger.info("looking for '%s'" % candidate)
if not os.path.exists(candidate):
logger.info("candidate = '%s' not found" % candidate)
results.append(candidate)

# return everything that isn't found
Expand All @@ -84,6 +93,15 @@ def processHtml(htmldir, filename):
parser.add_option('', '--nosummary', dest='summary',
default=True, action="store_false",
help="Turn off the summary information")
parser.add_option('', '--download', dest="downloadarea",
default=None,
help="Download the missing images from the mantid wiki to the specified directory")
parser.add_option('', '--loglevel', dest="loglevel",
default="warn",
help="Set the logging level (options are 'debug', 'info', 'warn', 'error')")
parser.add_option('', '--logfile', dest="logfile",
default=None,
help="Set filename to log to")
(options, args) = parser.parse_args()

# get the html base directory
Expand All @@ -93,6 +111,19 @@ def processHtml(htmldir, filename):
if not os.path.exists(htmldir):
parser.error("Must specify an existing html directory")

# configure the logger
if options.loglevel.startswith('debug'):
options.loglevel=logging.DEBUG
elif options.loglevel.startswith('info'):
options.loglevel=logging.INFO
elif options.loglevel.startswith('warn'):
options.loglevel=logging.WARNING
elif options.loglevel.startswith('error'):
options.loglevel=logging.ERROR
else:
parser.error("Failed to specify valid log level: '%s'" % options.loglevel)
logging.basicConfig(filename=options.logfile, level=options.loglevel)

# get the list of html files
htmlfiles = getHtml(htmldir)
if options.summary:
Expand All @@ -115,3 +146,8 @@ def processHtml(htmldir, filename):
print os.path.split(filename)[-1]
else:
print filename

if options.downloadarea is not None:
for filename in missing:
fetcher = Fetcher(filename)
fetcher.download(options.downloadarea)
94 changes: 94 additions & 0 deletions Code/Mantid/docs/qtassistant/fetchimg.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import logging
import os
import re
import urllib

# Base url of the mantid wiki; image description pages live at "/File:<name>".
WIKI_BASE_URL = "http://www.mantidproject.org"
# Base url of the project download site.
DOWN_BASE_URL = "http://download.mantidproject.org/"
# Location of algorithm screenshot images on the download site.
ALGO_BASE_URL = DOWN_BASE_URL + "algorithm_screenshots/ScreenShotImages"

def wiki_fig_key(stuff):
    """Sort key for candidate wiki image links.

    Orders candidates so that direct images sort first ("aaa/" prefix),
    thumbnails sort last ("zzz/" prefix), and within each group the
    lower-resolution files come first.  Mediawiki thumbnails carry a
    "<res>px-" filename prefix; it is parsed as the resolution and
    stripped from the name used for tie-breaking.

    @param stuff The relative image link, e.g. "/images/thumb/1/12/120px-Foo.png".
    @returns A string usable as a sort key (not a valid link).
    """
    filename = os.path.split(stuff)[-1]
    # aaa will come first for things that aren't thumbnails
    prefix = "aaa/"
    # thumbnails are bad - push them to the end of the sort order
    if stuff.startswith('/images/thumb/'):
        prefix = "zzz/"
    # parse and strip the "<res>px-" thumbnail prefix; resolution is
    # always bound so non-thumbnail names sort with resolution 0
    index = filename.find('px-')
    if index > 0:
        resolution = int(filename[:index])
        filename = filename[index + 3:]
    else:
        resolution = 0
    # zero-pad so lexicographic order matches numeric order
    prefix += "%06d/" % resolution
    return prefix + filename

class Fetcher:
    """Locates an image on the mantid wiki and downloads it.

    The constructor fetches the wiki "File:" description page for the
    image and scrapes it for candidate "/images/..." links; the best
    candidate (non-thumbnail, lowest resolution, per wiki_fig_key) is
    stored as ``self.url`` for a later call to ``download``.
    """

    def __init__(self, filename, shrinkname=True):
        """
        @param filename The image filename (possibly including a path).
        @param shrinkname Whether to drop the directory portion of the
                          name before looking it up on the wiki.
        @raises RuntimeError If no candidate link is found on the page.
        """
        self._logger = logging.getLogger(__name__ + '.' + self.__class__.__name__)
        self.filename = filename
        shortfilename = str(os.path.split(self.filename)[-1])
        if shrinkname:
            self.filename = shortfilename
        self._logger.info(self.filename)

        # download the wiki page that describes the image
        url = WIKI_BASE_URL + "/File:" + self.filename
        handle = urllib.urlopen(url)
        text = handle.read()
        handle.close()

        candidates = []

        # find the location of the file from the image tags
        candidates.extend(self._getWikiCandidates(shortfilename, text, "<img alt=", "/>"))

        # find the location of the file from the anchor tags
        candidates.extend(self._getWikiCandidates(shortfilename, text, '<a href="/images/', '">'))

        # remove duplicates and sort so the favorite candidate is first
        candidates = list(set(candidates))
        candidates.sort(key=wiki_fig_key)
        self._logger.info("candidates:" + str(candidates))

        # always choose the first one
        if len(candidates) <= 0:
            raise RuntimeError("Failed to find any candidates")
        rel_link = candidates[0]
        self._logger.debug("rel_link = '%s'" % rel_link)
        self.url = WIKI_BASE_URL + rel_link

    def _getWikiCandidates(self, shortname, text, starttag, endtag):
        """Collect "/images/..." links found between starttag/endtag pairs.

        @param shortname The bare image filename (currently unused here).
        @param text The html of the wiki "File:" page.
        @param starttag Substring opening a region of interest.
        @param endtag Substring closing the region.
        @returns Relative links found; thumbnails ("/images/thumb...")
                 and archived revisions ("archive") are excluded.
        """
        candidates = []
        text_len = len(text)
        index = 0
        while index >= 0 and index < text_len:
            index = text.find(starttag, index)
            end = text.find(endtag, index)
            self._logger.debug("indices: %d to %d" % (index, end))
            if end < index or index < 0:
                break
            stuff = text[index:end]
            index = end
            self._logger.debug(stuff)
            attrs = stuff.split()
            for attr in attrs:
                if '/images/' in attr:
                    # value of e.g. src="..." with surrounding quotes stripped
                    rel_link = attr.split('=')[-1][1:]
                    if rel_link.endswith('"') or rel_link.endswith("'"):
                        rel_link = rel_link[:-1]
                    # skip archived revisions and thumbnails
                    if not "archive" in rel_link:
                        if not rel_link.startswith('/images/thumb'):
                            candidates.append(rel_link)
        return candidates

    def download(self, destdir):
        """Download the image chosen by the constructor into ``destdir``."""
        destname = os.path.join(destdir, self.filename)
        print("Downloading %s from %s" % (destname, self.url))
        urllib.urlretrieve(self.url, filename=destname)

0 comments on commit 3e738da

Please sign in to comment.