Skip to content

Commit

Permalink
you can now use phantomjs to download the HTML
Browse files Browse the repository at this point in the history
  • Loading branch information
peterbe committed Feb 1, 2013
1 parent cc9c956 commit 2d4e806
Show file tree
Hide file tree
Showing 11 changed files with 163 additions and 23 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ docs/_build/
mincss.egg-info/
/build/
/dist/
simple.js
9 changes: 9 additions & 0 deletions docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,15 @@ API
``url(/background.png)`` then the CSS will be rewritten to become
``url(http://cdn.cloudware.com/background.png)``

* ``phantomjs=False``
If ``True``, defaults to the ``phantomjs`` command. If a string,
it's assumed to be the path to the ``phantomjs`` executable.

* ``phantomjs_options=None``
Additional options/switches to pass to the ``phantomjs`` command. If
given, this has to be a dict. For example, ``{'script-encoding': 'latin1'}``
becomes ``--script-encoding=latin1``.

Instances of this allows you to use the following methods:

* ``process(*urls)``
Expand Down
9 changes: 8 additions & 1 deletion docs/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,15 @@ Changelog
=========


v0.6.0 (2013-02-01)
-------------------

New option, `phantomjs`, that allows you to download the HTML using
PhantomJS instead of Python's regular urllib.


v0.5.0 (2013-01-24)
-----------------
-------------------

New option `preserve_remote_urls` to `Processor()` class. Useful when
the hrefs in link tags are of different domain than the URL you're
Expand Down
8 changes: 8 additions & 0 deletions docs/gettingstarted.rst
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,11 @@ Now, let's use ``mincss`` as follows::

As you can see, it automatically discovered that the ``input:hover``
and the ``.bar`` selectors are not used in the HTML DOM tree.

If you have ``phantomjs`` installed and can do things like
``$ phantomjs --help`` on your command line you can run mincss like
this::

>>> from mincss.processor import Processor
>>> p = Processor(phantomjs=True)
>>> p.process('http://localhost/page-with-javascript.html')
8 changes: 3 additions & 5 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,9 @@ which CSS is actually being used. It does this by download the whole
page(s) and finds all inline and linked CSS and analyses which
selectors are still in use somewhere.

It currently does the analysis entirely statically and does not
support Javascript.

``mincss`` is currently under development and the API is possibly
changing.
Optionally, you can use `PhantomJS <http://phantomjs.org/>`_ to
download the HTML source from a URL which means it will at least load
all the Javascript that gets executed onload.

Installation should be as simple as ``pip install mincss``. The code
is `available on Github <https://github.com/peterbe/mincss>`_.
Expand Down
2 changes: 1 addition & 1 deletion mincss/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.5.0'
__version__ = '0.6.0'
6 changes: 6 additions & 0 deletions mincss/download.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
// Usage: phantomjs [--switches] download.js <url>
//
// Opens <url> in a headless page and prints the resulting HTML to stdout.
var page = require('webpage').create();

// `phantom.args` is deprecated and absent from PhantomJS 2. Fall back to
// `system.args`, whose first element is the script name, so it is sliced off
// to keep the URL at index 0 either way.
var args = phantom.args || require('system').args.slice(1);

page.open(args[0], function () {
    // The load status passed to this callback is not inspected; whatever the
    // page currently contains is printed.
    console.log(page.content);
    phantom.exit();
});
55 changes: 53 additions & 2 deletions mincss/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import random
import re
import urlparse
import time
import subprocess
from lxml import etree
from lxml.cssselect import CSSSelector, SelectorSyntaxError, ExpressionError
import urllib
Expand All @@ -19,6 +21,12 @@
)


# Path to the PhantomJS script that lives next to this module; it is passed
# to the phantomjs executable followed by the URL to download.
DOWNLOAD_JS = os.path.join(os.path.dirname(__file__), 'download.js')


class ParserError(Exception):
    """Raised when the HTML fails to parse."""
Expand All @@ -38,14 +46,20 @@ def _get_random_string():

class Processor(object):

def __init__(self, debug=False, preserve_remote_urls=True):
def __init__(self,
debug=False,
preserve_remote_urls=True,
phantomjs=False,
phantomjs_options=None):
self.debug = debug
self.preserve_remote_urls = preserve_remote_urls
self.tab = ' ' * 4
self.blocks = {}
self.inlines = []
self.links = []
self._bodies = []
self.phantomjs = phantomjs
self.phantomjs_options = phantomjs_options

def _download(self, url):
try:
Expand All @@ -60,6 +74,40 @@ def _download(self, url):
except IOError:
raise IOError(url)

def _download_with_phantomjs(self, url):
    """Download ``url`` by running it through PhantomJS.

    ``self.phantomjs`` is either ``True`` (run the ``phantomjs`` command)
    or the path to the executable. Each item of
    ``self.phantomjs_options`` is passed as a ``--key=value`` switch, and
    ``load-images=no`` is added when options are given without that key.

    :param url: the URL handed to the ``download.js`` script.
    :returns: the process's standard output decoded as UTF-8.
    :raises IOError: if ``self.phantomjs`` is not ``True`` and is not the
        path of an existing file.
    """
    if self.phantomjs is True:
        # Keep the resolved command in a local name. Overwriting
        # self.phantomjs would send the next call down the "path" branch,
        # where isfile('phantomjs') fails and IOError is raised.
        executable = 'phantomjs'
    elif os.path.isfile(self.phantomjs):
        executable = self.phantomjs
    else:
        raise IOError('%s is not a path to phantomjs' % self.phantomjs)

    command = [executable]
    if self.phantomjs_options:
        # work on a copy so the caller's dict is never modified
        options = dict(self.phantomjs_options)
        # not entirely sure if this helps but there can't be any point
        # at all to download image for mincss
        options.setdefault('load-images', 'no')
        for key, value in options.items():
            command.append('--%s=%s' % (key, value))

    command.append(DOWNLOAD_JS)
    command.append(url)

    t0 = time.time()
    # Pass the argument list directly instead of a joined string through
    # the shell, so characters such as '&', ';', '?' or spaces in the URL,
    # the paths or the option values reach PhantomJS untouched.
    process = subprocess.Popen(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE
    )
    out, _ = process.communicate()
    t1 = time.time()
    if self.debug:
        print("Took %s seconds to download with PhantomJS" % (t1 - t0))

    return out.decode('utf-8')

def process(self, *urls):
for url in urls:
self.process_url(url)
Expand Down Expand Up @@ -91,7 +139,10 @@ def process(self, *urls):
)

def process_url(self, url):
html = self._download(url)
if self.phantomjs:
html = self._download_with_phantomjs(url)
else:
html = self._download(url)
self.process_html(html.strip(), url=url)

def process_html(self, html, url):
Expand Down
47 changes: 33 additions & 14 deletions run.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,18 @@
#!/usr/bin/env python
import os
from mincss.processor import Processor

import sys

if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("url", type=str,
help="URL to process")
parser.add_argument("--outputdir", action="store",
default="./output",
help="directory where to put output (default ./output)")
parser.add_argument("-v", "--verbose", action="store_true",
help="increase output verbosity")
# make sure it's running the mincss here and not anything installed
sys.path.insert(0, os.path.dirname(__file__))
from mincss.processor import Processor

args = parser.parse_args()
p = Processor(debug=args.verbose)
def run(args):
options = {'debug': args.verbose}
if args.phantomjs_path:
options['phantomjs'] = args.phantomjs_path
elif args.phantomjs:
options['phantomjs'] = True
p = Processor(**options)
p.process(args.url)
for inline in p.inlines:
print "ON", inline.url
Expand Down Expand Up @@ -47,3 +44,25 @@
(len(link.before), len(link.after),
len(link.before) - len(link.after))
)

return 0


if __name__ == '__main__':
    import argparse

    # command line: one positional URL plus optional switches
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "url",
        type=str,
        help="URL to process",
    )
    cli.add_argument(
        "--outputdir",
        action="store",
        default="./output",
        help="directory where to put output (default ./output)",
    )
    cli.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="increase output verbosity",
    )
    cli.add_argument(
        "--phantomjs",
        action="store_true",
        help="Use PhantomJS to download the source",
    )
    cli.add_argument(
        "--phantomjs-path",
        action="store",
        default="",
        help="Where is the phantomjs executable",
    )

    # run() returns the process exit status
    args = cli.parse_args()
    sys.exit(run(args))
11 changes: 11 additions & 0 deletions tests/fake_phantomjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/usr/bin/env python
import urllib

def run(url):
    """Print the body read from ``url`` to stdout and return exit status 0."""
    body = urllib.urlopen(url).read()
    print(body)
    return 0

if __name__ == '__main__':
    import sys
    # The URL is taken from the last argument; everything before it is
    # ignored.
    sys.exit(run(sys.argv[-1]))
30 changes: 30 additions & 0 deletions tests/test_mincss.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,17 @@
import os
import unittest
from nose.tools import eq_, ok_

# make sure it's running the mincss here and not anything installed
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
from mincss.processor import Processor


# Directory containing this test module; fixture files are located relative
# to it.
HERE = os.path.dirname(__file__)

# Path to the fake_phantomjs file next to this module, passed to Processor as
# the phantomjs executable.
PHANTOMJS = os.path.join(HERE, 'fake_phantomjs')


class TestMinCSS(unittest.TestCase):

Expand Down Expand Up @@ -191,3 +197,27 @@ def test_preserve_remote_urls(self):
ok_('url("file:///east.png")' in after)
url = 'file://' + HERE + '/west.png'
ok_('url("%s")' % url in after)

def test_download_with_phantomjs(self):
    """Process one.html through the fake phantomjs and check the inline CSS."""
    page_url = 'file://' + os.path.join(HERE, 'one.html')
    processor = Processor(
        phantomjs=PHANTOMJS,
        phantomjs_options={'cookies-file': 'bla'}
    )
    processor.process(page_url)

    # one.html has a single block of inline CSS, which starts on line 7
    inline = processor.inlines[0]
    actual_lines = inline.after.strip().splitlines()
    eq_(inline.line, 7)
    ok_(len(inline.after) < len(inline.before))

    # compare line by line, ignoring surrounding whitespace
    expected = '''
    h1, h2, h3 { text-align: center; }
    h3 { font-family: serif; }
    h2 { color:red }
    '''
    expected_lines = expected.strip().splitlines()
    for index, expected_line in enumerate(expected_lines):
        eq_(expected_line.strip(), actual_lines[index].strip())

0 comments on commit 2d4e806

Please sign in to comment.