diff --git a/prose_wc/tests/tests.py b/prose_wc/tests/tests.py index dabf34d..dd40222 100644 --- a/prose_wc/tests/tests.py +++ b/prose_wc/tests/tests.py @@ -63,7 +63,7 @@ def test_output_json(self, mock_dump, mock_print): self.assertTrue(mock_print.called_once) @patch.object(wc, '_mockable_print') - @patch.object(yaml, 'dump') + @patch.object(yaml, 'safe_dump') def test_output_yaml(self, mock_dump, mock_print): wc.prose_wc(wc.setup(['-f', 'yaml', self.plaintext])) mock_dump.assert_called_once_with({ diff --git a/prose_wc/wc.py b/prose_wc/wc.py index dd06416..57ba6fa 100755 --- a/prose_wc/wc.py +++ b/prose_wc/wc.py @@ -1,6 +1,9 @@ #!/usr/bin/python -from __future__ import print_function +from __future__ import ( + print_function, + unicode_literals, +) import argparse from bs4 import BeautifulSoup import json @@ -23,6 +26,9 @@ apostrophes """ +NEWLINE_PATTERN = re.compile(r'[\r|\n|\r\n]') +NEWPARA_PATTERN = re.compile(r'[\r|\n|\r\n]{2}') + def _mockable_print(arg): """A print function that can be mocked in tests. @@ -62,8 +68,8 @@ def setup(argv): help='output format.') parser.add_argument('-i', '--indent', type=int, nargs='?', default=4, help='indentation depth (default: 4).') - parser.add_argument('file', type=argparse.FileType('r'), - help='file to count (or - for STDIN)') + parser.add_argument('file', type=argparse.FileType('rb'), + help='file to parse (or - for STDIN)') return parser.parse_args(argv) @@ -77,7 +83,7 @@ def prose_wc(args): return 1 if args.split_hyphens: INTERSTITIAL_PUNCTUATION.append(re.compile(r'-')) - content = args.file.read() + content = args.file.read().decode('utf-8') filename = args.file.name body = strip_frontmatter(content) parsed = markdown_to_text(body) @@ -89,7 +95,7 @@ def prose_wc(args): update_file(filename, result, content, args.indent) else: _mockable_print({ - 'yaml': yaml.dump(result, default_flow_style=False, + 'yaml': yaml.safe_dump(result, default_flow_style=False, indent=args.indent), 'json': json.dumps(result, indent=args.indent), 'default': default_dump(result), @@ -151,7 +157,7 @@ def wc(filename, contents, parsed=None, is_jekyll=False): body = parsed.strip() if parsed else contents.strip() # Strip the body down to just words - words = re.sub(r'\n', ' ', body) + words = NEWLINE_PATTERN.sub(' ', body) words = re.sub(r'\s+', ' ', words) for punctuation in INTERSTITIAL_PUNCTUATION: words = re.sub(punctuation, ' ', words) @@ -164,7 +170,7 @@ def wc(filename, contents, parsed=None, is_jekyll=False): 'counts': { 'file': filename, 'type': fmt, - 'paragraphs': len(contents.strip().split('\n\n')), + 'paragraphs': len(NEWPARA_PATTERN.split(contents.strip())), 'words': len(re.split('\s+', words)), 'characters_real': len(real_characters), 'characters_total': len(words), @@ -195,7 +201,7 @@ def update_file(filename, result, content, indent): # Set the frontmatter part backed to the stringified version of the # frontmatter object parts[1] = '\n{}'.format( - yaml.dump(frontmatter, default_flow_style=False, indent=indent)) + yaml.safe_dump(frontmatter, default_flow_style=False, indent=indent)) result = '---'.join(parts) # Write everything back to the file diff --git a/setup.py b/setup.py index 03a4d6b..3a90970 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ setup( name='prose-wc', - version='0.2.1', + version='0.3.0', description='Jekyll-aware prose wordcount utility', long_description=long_description, author='Madison Scott-Clary', diff --git a/tox.ini b/tox.ini index 70d491b..804dc00 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = 2.7,3.4,3.5 +envlist = py27, py35 [testenv] deps = nose