Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.
Download ZIP
Browse files

you can now use phantomjs to download the HTML

  • Loading branch information...
commit 2d4e806c32480694b748a5e187491a8c327b4e75 1 parent cc9c956
@peterbe authored
View
1  .gitignore
@@ -4,3 +4,4 @@ docs/_build/
mincss.egg-info/
/build/
/dist/
+simple.js
View
9 docs/api.rst
@@ -21,6 +21,15 @@ API
``url(/background.png)`` then the CSS will be rewritten to become
``url(http://cdn.cloudware.com/background.png)``
+ * ``phantomjs=False``
+ If ``True``, the ``phantomjs`` command is used as the executable. If a
+ string, it is assumed to be the path to the ``phantomjs`` executable.
+
+ * ``phantomjs_options=None``
+ Additional options/switches to pass to the ``phantomjs`` command. This
+ has to be a dict. For example, ``{'script-encoding': 'latin1'}``
+ becomes ``--script-encoding=latin1``.
+
Instances of this allows you to use the following methods:
* ``process(*urls)``
View
9 docs/changelog.rst
@@ -6,8 +6,15 @@ Changelog
=========
+v0.6.0 (2013-02-01)
+-------------------
+
+New option, `phantomjs`, that allows you to download the HTML using
+PhantomJS instead of Python's regular urllib.
+
+
v0.5.0 (2013-01-24)
------------------
+-------------------
New option `preserve_remote_urls` to `Processor()` class. Useful when
the hrefs in link tags are of different domain than the URL you're
View
8 docs/gettingstarted.rst
@@ -38,3 +38,11 @@ Now, let's use ``mincss`` as follows::
As you can see, it automatically discovered that the ``input:hover``
and the ``.bar`` selectors are not used in the HTML DOM tree.
+
+If you have ``phantomjs`` installed and can do things like
+``$ phantomjs --help`` on your command line you can run mincss like
+this::
+
+ >>> from mincss.processor import Processor
+ >>> p = Processor(phantomjs=True)
+ >>> p.process('http://localhost/page-with-javascript.html')
View
8 docs/index.rst
@@ -11,11 +11,9 @@ which CSS is actually being used. It does this by downloading the whole
page(s) and finds all inline and linked CSS and analyses which
selectors are still in use somewhere.
-It currently does the analysis entirely statically and does not
-support Javascript.
-
-``mincss`` is currently under development and the API is possibly
-changing.
+Optionally, you can use `PhantomJS <http://phantomjs.org/>`_ to
+download the HTML source from a URL which means it will at least load
+all the Javascript that gets executed onload.
Installation should be as simple as ``pip install mincss``. The code
is `available on Github <https://github.com/peterbe/mincss>`_.
View
2  mincss/__init__.py
@@ -1 +1 @@
-__version__ = '0.5.0'
+__version__ = '0.6.0'
View
6 mincss/download.js
@@ -0,0 +1,6 @@
+var page = require('webpage').create();
+page.open(phantom.args[0], function () {
+ //page.render('screenshot.png');
+ console.log(page.content);
+ phantom.exit();
+});
View
55 mincss/processor.py
@@ -5,6 +5,8 @@
import random
import re
import urlparse
+import time
+import subprocess
from lxml import etree
from lxml.cssselect import CSSSelector, SelectorSyntaxError, ExpressionError
import urllib
@@ -19,6 +21,12 @@
)
+DOWNLOAD_JS = os.path.join(
+ os.path.dirname(__file__),
+ 'download.js'
+)
+
+
class ParserError(Exception):
"""happens when we fail to parse the HTML"""
pass
@@ -38,7 +46,11 @@ def _get_random_string():
class Processor(object):
- def __init__(self, debug=False, preserve_remote_urls=True):
+ def __init__(self,
+ debug=False,
+ preserve_remote_urls=True,
+ phantomjs=False,
+ phantomjs_options=None):
self.debug = debug
self.preserve_remote_urls = preserve_remote_urls
self.tab = ' ' * 4
@@ -46,6 +58,8 @@ def __init__(self, debug=False, preserve_remote_urls=True):
self.inlines = []
self.links = []
self._bodies = []
+ self.phantomjs = phantomjs
+ self.phantomjs_options = phantomjs_options
def _download(self, url):
try:
@@ -60,6 +74,40 @@ def _download(self, url):
except IOError:
raise IOError(url)
+ def _download_with_phantomjs(self, url):
+ if self.phantomjs is True:
+ # otherwise, assume it's a path
+ self.phantomjs = 'phantomjs'
+ elif not os.path.isfile(self.phantomjs):
+ raise IOError('%s is not a path to phantomjs' % self.phantomjs)
+
+ command = [self.phantomjs]
+ if self.phantomjs_options:
+ if 'load-images' not in self.phantomjs_options:
+ # not entirely sure if this helps but there can't be any point
+ # at all to download image for mincss
+ self.phantomjs_options['load-images'] = 'no'
+ for key, value in self.phantomjs_options.items():
+ command.append('--%s=%s' % (key, value))
+
+ command.append(DOWNLOAD_JS)
+ assert ' ' not in url
+ command.append(url)
+
+ t0 = time.time()
+ process = subprocess.Popen(
+ ' '.join(command),
+ shell=True,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE
+ )
+ out, err = process.communicate()
+ t1 = time.time()
+ if self.debug:
+ print "Took", t1 - t0, "seconds to download with PhantomJS"
+
+ return unicode(out, 'utf-8')
+
def process(self, *urls):
for url in urls:
self.process_url(url)
@@ -91,7 +139,10 @@ def process(self, *urls):
)
def process_url(self, url):
- html = self._download(url)
+ if self.phantomjs:
+ html = self._download_with_phantomjs(url)
+ else:
+ html = self._download(url)
self.process_html(html.strip(), url=url)
def process_html(self, html, url):
View
47 run.py
@@ -1,21 +1,18 @@
#!/usr/bin/env python
import os
-from mincss.processor import Processor
-
+import sys
-if __name__ == '__main__':
- import argparse
- parser = argparse.ArgumentParser()
- parser.add_argument("url", type=str,
- help="URL to process")
- parser.add_argument("--outputdir", action="store",
- default="./output",
- help="directory where to put output (default ./output)")
- parser.add_argument("-v", "--verbose", action="store_true",
- help="increase output verbosity")
+# make sure it's running the mincss here and not anything installed
+sys.path.insert(0, os.path.dirname(__file__))
+from mincss.processor import Processor
- args = parser.parse_args()
- p = Processor(debug=args.verbose)
+def run(args):
+ options = {'debug': args.verbose}
+ if args.phantomjs_path:
+ options['phantomjs'] = args.phantomjs_path
+ elif args.phantomjs:
+ options['phantomjs'] = True
+ p = Processor(**options)
p.process(args.url)
for inline in p.inlines:
print "ON", inline.url
@@ -47,3 +44,25 @@
(len(link.before), len(link.after),
len(link.before) - len(link.after))
)
+
+ return 0
+
+
+if __name__ == '__main__':
+ import argparse
+ parser = argparse.ArgumentParser()
+ parser.add_argument("url", type=str,
+ help="URL to process")
+ parser.add_argument("--outputdir", action="store",
+ default="./output",
+ help="directory where to put output (default ./output)")
+ parser.add_argument("-v", "--verbose", action="store_true",
+ help="increase output verbosity")
+ parser.add_argument("--phantomjs", action="store_true",
+ help="Use PhantomJS to download the source")
+ parser.add_argument("--phantomjs-path", action="store",
+ default="",
+ help="Where is the phantomjs executable")
+
+ args = parser.parse_args()
+ sys.exit(run(args))
View
11 tests/fake_phantomjs
@@ -0,0 +1,11 @@
+#!/usr/bin/env python
+import urllib
+
+def run(url):
+ print urllib.urlopen(url).read()
+ return 0
+
+if __name__ == '__main__':
+ import sys
+ url = sys.argv[-1]
+ sys.exit(run(url))
View
30 tests/test_mincss.py
@@ -1,11 +1,17 @@
import os
import unittest
from nose.tools import eq_, ok_
+
+# make sure it's running the mincss here and not anything installed
+import sys
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
from mincss.processor import Processor
HERE = os.path.dirname(__file__)
+PHANTOMJS = os.path.join(HERE, 'fake_phantomjs')
+
class TestMinCSS(unittest.TestCase):
@@ -191,3 +197,27 @@ def test_preserve_remote_urls(self):
ok_('url("file:///east.png")' in after)
url = 'file://' + HERE + '/west.png'
ok_('url("%s")' % url in after)
+
+ def test_download_with_phantomjs(self):
+ html = os.path.join(HERE, 'one.html')
+ url = 'file://' + html
+ p = Processor(
+ phantomjs=PHANTOMJS,
+ phantomjs_options={'cookies-file': 'bla'}
+ )
+ p.process(url)
+ # on line 7 there inline css starts
+ # one.html only has 1 block on inline CSS
+ inline = p.inlines[0]
+ lines_after = inline.after.strip().splitlines()
+ eq_(inline.line, 7)
+ ok_(len(inline.after) < len(inline.before))
+
+ # compare line by line
+ expect = '''
+ h1, h2, h3 { text-align: center; }
+ h3 { font-family: serif; }
+ h2 { color:red }
+ '''
+ for i, line in enumerate(expect.strip().splitlines()):
+ eq_(line.strip(), lines_after[i].strip())
Please sign in to comment.
Something went wrong with that request. Please try again.