Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

version 0.8 is much faster

  • Loading branch information...
commit 75c1e90a0973c7e72e811223557896d7f5215971 1 parent 2786c69
@peterbe authored
View
5 docs/api.rst
@@ -30,6 +30,11 @@ API
has to be a dict. So, for example ``{'script-encoding': 'latin1'}``
becomes ``--script-encoding=latin1``.
+ * ``optimize_lookup=True``
+ If true, will make a set of all ids and class names in all
+ processed documents and use these to avoid some expensive CSS
+ query searches.
+
Instances of this allow you to use the following methods:
* ``process(*urls)``
View
7 docs/changelog.rst
@@ -5,6 +5,13 @@
Changelog
=========
+v0.8.0 (2013-02-26)
+-------------------
+
+Much faster! Unless you pass ``Processor(optimize_lookup=False)`` when
+creating the processor instance. See
+http://www.peterbe.com/plog/mincss-0.8
+
v0.7.0 (2013-02-13)
-------------------
View
6 docs/features.rst
@@ -31,12 +31,12 @@ Supported Features and Limitations
**Things that don't yet work:**
* Javascript events that manipulate the DOM tree.
- A future version might use a parser that supports Javascript but
- likely it will never be perfect.
+ You can use PhantomJS to do the downloading but it still won't get
+ every possible piece of HTML generated based on complex Javascript.
* keyframes are always left untouched even if they're never referenced
-* Broken HTML or broken/invalid CSS isn't support and good results can
+* Broken HTML or broken/invalid CSS isn't supported and good results can
not be guaranteed.
View
2  mincss/__init__.py
@@ -1 +1 @@
-__version__ = '0.7.0'
+__version__ = '0.8.0'
View
49 mincss/processor.py
@@ -1,6 +1,5 @@
import os
import sys
-import collections
import functools
import random
import re
@@ -14,6 +13,8 @@
RE_FIND_MEDIA = re.compile("(@media.+?)(\{)", re.DOTALL | re.MULTILINE)
RE_NESTS = re.compile('@(-|keyframes).*?({)', re.DOTALL | re.M)
+RE_CLASS_DEF = re.compile('\.([\w-]+)')
+RE_ID_DEF = re.compile('#([\w-]+)')
EXCEPTIONAL_SELECTORS = (
@@ -50,7 +51,8 @@ def __init__(self,
debug=False,
preserve_remote_urls=True,
phantomjs=False,
- phantomjs_options=None):
+ phantomjs_options=None,
+ optimize_lookup=True):
self.debug = debug
self.preserve_remote_urls = preserve_remote_urls
self.tab = ' ' * 4
@@ -58,6 +60,9 @@ def __init__(self,
self.inlines = []
self.links = []
self._bodies = []
+ self.optimize_lookup = optimize_lookup
+ self._all_ids = set()
+ self._all_classes = set()
self.phantomjs = phantomjs
self.phantomjs_options = phantomjs_options
@@ -157,6 +162,16 @@ def process_html(self, html, url):
lines = html.splitlines()
body, = CSSSelector('body')(page)
self._bodies.append(body)
+ if self.optimize_lookup:
+ for each in body.iter():
+ id = each.attrib.get('id')
+ if id:
+ self._all_ids.add(id)
+ classes = each.attrib.get('class')
+ if classes:
+ for class_ in classes.split():
+ self._all_classes.add(class_)
+
for style in CSSSelector('style')(page):
first_line = style.text.strip().splitlines()[0]
for i, line in enumerate(lines):
@@ -174,7 +189,10 @@ def process_html(self, html, url):
key = (link_url, link.attrib['href'])
self.blocks[key] = self._download(link_url)
if self.preserve_remote_urls:
- self.blocks[key] = self._rewrite_urls(self.blocks[key], link_url)
+ self.blocks[key] = self._rewrite_urls(
+ self.blocks[key],
+ link_url
+ )
def _rewrite_urls(self, content, link_url):
"""Suppose you run mincss on www.example.org and it references:
@@ -248,11 +266,6 @@ def commentmatcher(match):
outside = nearest_close > nearest_open
else:
raise Exception("can this happen?!")
- print repr(match.group())
- print "nearest", (nearest_close, nearest_open)
- print nearest_close < nearest_open
- #print "next", (next_close, next_open)
- print
if outside:
temp_key = '@%scomment{}' % _get_random_string()
@@ -394,12 +407,26 @@ def _get_contents(self, match, original_content):
)
def _found(self, bodies, selector):
+ if self._all_ids:
+ try:
+ id_ = RE_ID_DEF.findall(selector)[0]
+ if id_ not in self._all_ids:
+ # don't bother then
+ return False
+ except IndexError:
+ pass
+
+ if self._all_classes:
+ for class_ in RE_CLASS_DEF.findall(selector):
+ if class_ not in self._all_classes:
+ # don't bother then
+ return False
+
#print "SELECTOR", repr(selector)
- r = self.__found(bodies, selector)
- #print "R", repr(r)
+ r = self._selector_query_found(bodies, selector)
return r
- def __found(self, bodies, selector):
+ def _selector_query_found(self, bodies, selector):
selector = selector.split(':')[0]
if '}' in selector:
View
12 proxy/app.py
@@ -9,6 +9,7 @@
import urllib
import urlparse
import shutil
+import time
from lxml import etree
from lxml.cssselect import CSSSelector
@@ -30,7 +31,7 @@
CLOSING_REGEX = re.compile(
'(<(script|iframe|textarea|div)\s*[^>]+/>)',
- flags=re.M|re.DOTALL
+ flags=re.M | re.DOTALL
)
@@ -60,12 +61,19 @@ def proxy(path):
if query:
url += '?%s' % query
logging.info('Downloading %s' % url)
+ t0 = time.time()
html = download(url)
+ t1 = time.time()
+ print "%.4f seconds to download" % (t1 - t0)
- p = Processor(debug=False)
+ p = Processor(debug=False, optimize_lookup=True)
# since we've already downloaded the HTML
+ t0 = time.time()
p.process_html(html, url)
+ t1 = time.time()
p.process()
+ t2 = time.time()
+ print "%.4f seconds to parse and process" % (t2 - t1)
collect_stats = request.args.get('MINCSS_STATS', False)
stats = []
View
37 run.py
@@ -1,11 +1,13 @@
#!/usr/bin/env python
import os
import sys
+import time
# make sure it's running the mincss here and not anything installed
sys.path.insert(0, os.path.dirname(__file__))
from mincss.processor import Processor
+
def run(args):
options = {'debug': args.verbose}
if args.phantomjs_path:
@@ -13,7 +15,10 @@ def run(args):
elif args.phantomjs:
options['phantomjs'] = True
p = Processor(**options)
+ t0 = time.time()
p.process(args.url)
+ t1 = time.time()
+ print "TOTAL TIME ", t1 - t0
for inline in p.inlines:
print "ON", inline.url
print "AT line", inline.line
@@ -23,7 +28,6 @@ def run(args):
print inline.after
print
- _here = os.path.dirname(__file__)
output_dir = args.outputdir
if not os.path.isdir(output_dir):
os.mkdir(output_dir)
@@ -33,9 +37,11 @@ def run(args):
#print link.before
#print "AFTER ".ljust(79, '-')
#print link.after
- with open(os.path.join(output_dir, link.href.split('/')[-1]), 'w') as f:
+ orig_name = link.href.split('/')[-1]
+ with open(os.path.join(output_dir, orig_name), 'w') as f:
f.write(link.after)
- with open(os.path.join(output_dir, 'before_' + link.href.split('/')[-1]), 'w') as f:
+ before_name = 'before_' + link.href.split('/')[-1]
+ with open(os.path.join(output_dir, before_name), 'w') as f:
f.write(link.before)
print "Files written to", output_dir
print
@@ -51,18 +57,19 @@ def run(args):
if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser()
- parser.add_argument("url", type=str,
- help="URL to process")
- parser.add_argument("--outputdir", action="store",
- default="./output",
- help="directory where to put output (default ./output)")
- parser.add_argument("-v", "--verbose", action="store_true",
- help="increase output verbosity")
- parser.add_argument("--phantomjs", action="store_true",
- help="Use PhantomJS to download the source")
- parser.add_argument("--phantomjs-path", action="store",
- default="",
- help="Where is the phantomjs executable")
+ add = parser.add_argument
+ add("url", type=str,
+ help="URL to process")
+ add("--outputdir", action="store",
+ default="./output",
+ help="directory where to put output (default ./output)")
+ add("-v", "--verbose", action="store_true",
+ help="increase output verbosity")
+ add("--phantomjs", action="store_true",
+ help="Use PhantomJS to download the source")
+ add("--phantomjs-path", action="store",
+ default="",
+ help="Where is the phantomjs executable")
args = parser.parse_args()
sys.exit(run(args))
Please sign in to comment.
Something went wrong with that request. Please try again.