Skip to content

Commit

Permalink
Re #6897. Adding logging and ability to download files.
Browse files Browse the repository at this point in the history
  • Loading branch information
peterfpeterson committed Apr 30, 2013
1 parent b698a21 commit 3e738da
Show file tree
Hide file tree
Showing 2 changed files with 131 additions and 1 deletion.
38 changes: 37 additions & 1 deletion Code/Mantid/docs/qtassistant/checkmissingimg.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#!/usr/bin/env python
import logging
import os
import re
from fetchimg import Fetcher

from mediawiki import IMG_NOT_FOUND

Expand All @@ -25,6 +27,9 @@ def processHtml(htmldir, filename):
@param filename The html file to parse and look for missing images in.
@returns All Missing image files.
"""
logger = logging.getLogger("processHtml")
logger.info("processHtml(%s, %s)" % (htmldir, filename))

# read in the entire html file
handle = file(filename, 'r')
text = handle.read()
Expand All @@ -49,24 +54,28 @@ def processHtml(htmldir, filename):
options.append(candidate[:end])
# that are set to IMG_NOT_FOUND
if IMG_NOT_FOUND in text:
logger.info("IMAGE_NOT_FOUND in '%s'" % filename)
candidates = []
index = 0
while index >= 0:
index = text.find(IMG_NOT_FOUND, index)
end = text.find("</figure>")
end = text.find("</figure>", index)
if end < index or index < 0:
break
figs = re.findall(r'Missing image:\s+(.+)</figcaption>',
text[index:end])
candidates.extend(figs)
index += len(IMG_NOT_FOUND)
logger.info("CANDIDATES: %s" % str(candidates))
options.extend(candidates)

# add them to the list of missing images if not found
results = []
for candidate in options:
candidate = os.path.join(htmldir, candidate)
logger.info("looking for '%s'" % candidate)
if not os.path.exists(candidate):
logger.info("candidate = '%s' not found" % candidate)
results.append(candidate)

# return everything that isn't found
Expand All @@ -84,6 +93,15 @@ def processHtml(htmldir, filename):
parser.add_option('', '--nosummary', dest='summary',
default=True, action="store_false",
help="Turn off the summary information")
parser.add_option('', '--download', dest="downloadarea",
default=None,
help="Download the missing images from the mantid wiki to the specified directory")
parser.add_option('', '--loglevel', dest="loglevel",
default="warn",
help="Set the logging level (options are 'debug', 'info', 'warn', 'error')")
parser.add_option('', '--logfile', dest="logfile",
default=None,
help="Set filename to log to")
(options, args) = parser.parse_args()

# get the html base directory
Expand All @@ -93,6 +111,19 @@ def processHtml(htmldir, filename):
if not os.path.exists(htmldir):
parser.error("Must specify an existing html directory")

# configure the logger
if options.loglevel.startswith('debug'):
options.loglevel=logging.DEBUG
elif options.loglevel.startswith('info'):
options.loglevel=logging.INFO
elif options.loglevel.startswith('warn'):
options.loglevel=logging.WARNING
elif options.loglevel.startswith('error'):
options.loglevel=logging.ERROR
else:
parser.error("Failed to specify valid log level: '%s'" % options.loglevel)
logging.basicConfig(filename=options.logfile, level=options.loglevel)

# get the list of html files
htmlfiles = getHtml(htmldir)
if options.summary:
Expand All @@ -115,3 +146,8 @@ def processHtml(htmldir, filename):
print os.path.split(filename)[-1]
else:
print filename

if options.downloadarea is not None:
for filename in missing:
fetcher = Fetcher(filename)
fetcher.download(options.downloadarea)
94 changes: 94 additions & 0 deletions Code/Mantid/docs/qtassistant/fetchimg.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import logging
import os
import re
import urllib

# Base url of the mantid wiki; image description pages live at "/File:<name>".
WIKI_BASE_URL = "http://www.mantidproject.org"
# Base url of the project download site.
DOWN_BASE_URL = "http://download.mantidproject.org/"
# Location of algorithm screenshot images on the download site.
ALGO_BASE_URL = DOWN_BASE_URL + "algorithm_screenshots/ScreenShotImages"

def wiki_fig_key(stuff):
    """Sort key for candidate wiki image links.

    Orders candidates so that direct images sort first ("aaa/" prefix),
    thumbnails sort last ("zzz/" prefix), and within each group the
    lower-resolution files come first.  Mediawiki thumbnails carry a
    "<res>px-" filename prefix; it is parsed as the resolution and
    stripped from the name used for tie-breaking.

    @param stuff The relative image link, e.g. "/images/thumb/1/12/120px-Foo.png".
    @returns A string usable as a sort key (not a valid link).
    """
    filename = os.path.split(stuff)[-1]
    # aaa will come first for things that aren't thumbnails
    prefix = "aaa/"
    # thumbnails are bad - push them to the end of the sort order
    if stuff.startswith('/images/thumb/'):
        prefix = "zzz/"
    # parse and strip the "<res>px-" thumbnail prefix; resolution is
    # always bound so non-thumbnail names sort with resolution 0
    index = filename.find('px-')
    if index > 0:
        resolution = int(filename[:index])
        filename = filename[index + 3:]
    else:
        resolution = 0
    # zero-pad so lexicographic order matches numeric order
    prefix += "%06d/" % resolution
    return prefix + filename

class Fetcher:
    """Locates an image on the mantid wiki and downloads it.

    The constructor fetches the wiki "File:" description page for the
    image and scrapes it for candidate "/images/..." links; the best
    candidate (non-thumbnail, lowest resolution, per wiki_fig_key) is
    stored as ``self.url`` for a later call to ``download``.
    """

    def __init__(self, filename, shrinkname=True):
        """
        @param filename The image filename (possibly including a path).
        @param shrinkname Whether to drop the directory portion of the
                          name before looking it up on the wiki.
        @raises RuntimeError If no candidate link is found on the page.
        """
        self._logger = logging.getLogger(__name__ + '.' + self.__class__.__name__)
        self.filename = filename
        shortfilename = str(os.path.split(self.filename)[-1])
        if shrinkname:
            self.filename = shortfilename
        self._logger.info(self.filename)

        # download the wiki page that describes the image
        url = WIKI_BASE_URL + "/File:" + self.filename
        handle = urllib.urlopen(url)
        text = handle.read()
        handle.close()

        candidates = []

        # find the location of the file from the image tags
        candidates.extend(self._getWikiCandidates(shortfilename, text, "<img alt=", "/>"))

        # find the location of the file from the anchor tags
        candidates.extend(self._getWikiCandidates(shortfilename, text, '<a href="/images/', '">'))

        # remove duplicates and sort so the favorite candidate is first
        candidates = list(set(candidates))
        candidates.sort(key=wiki_fig_key)
        self._logger.info("candidates:" + str(candidates))

        # always choose the first one
        if len(candidates) <= 0:
            raise RuntimeError("Failed to find any candidates")
        rel_link = candidates[0]
        self._logger.debug("rel_link = '%s'" % rel_link)
        self.url = WIKI_BASE_URL + rel_link

    def _getWikiCandidates(self, shortname, text, starttag, endtag):
        """Collect "/images/..." links found between starttag/endtag pairs.

        @param shortname The bare image filename (currently unused here).
        @param text The html of the wiki "File:" page.
        @param starttag Substring opening a region of interest.
        @param endtag Substring closing the region.
        @returns Relative links found; thumbnails ("/images/thumb...")
                 and archived revisions ("archive") are excluded.
        """
        candidates = []
        text_len = len(text)
        index = 0
        while index >= 0 and index < text_len:
            index = text.find(starttag, index)
            end = text.find(endtag, index)
            self._logger.debug("indices: %d to %d" % (index, end))
            if end < index or index < 0:
                break
            stuff = text[index:end]
            index = end
            self._logger.debug(stuff)
            attrs = stuff.split()
            for attr in attrs:
                if '/images/' in attr:
                    # value of e.g. src="..." with surrounding quotes stripped
                    rel_link = attr.split('=')[-1][1:]
                    if rel_link.endswith('"') or rel_link.endswith("'"):
                        rel_link = rel_link[:-1]
                    # skip archived revisions and thumbnails
                    if not "archive" in rel_link:
                        if not rel_link.startswith('/images/thumb'):
                            candidates.append(rel_link)
        return candidates

    def download(self, destdir):
        """Download the image chosen by the constructor into ``destdir``."""
        destname = os.path.join(destdir, self.filename)
        print("Downloading %s from %s" % (destname, self.url))
        urllib.urlretrieve(self.url, filename=destname)

0 comments on commit 3e738da

Please sign in to comment.