From 461f535dc626474be7f9f08f1939270fff316ea5 Mon Sep 17 00:00:00 2001 From: Madison Scott-Clary Date: Thu, 10 Nov 2016 11:11:10 -0700 Subject: [PATCH 1/6] Handle newlines better --- prose_wc/wc.py | 9 ++++++--- setup.py | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/prose_wc/wc.py b/prose_wc/wc.py index dd06416..386b649 100755 --- a/prose_wc/wc.py +++ b/prose_wc/wc.py @@ -23,6 +23,9 @@ apostrophes """ +NEWLINE_PATTERN = re.compile(r'[\r|\n|\r\n]') +NEWPARA_PATTERN = re.compile(r'[\r|\n|\r\n]{2}') + def _mockable_print(arg): """A print function that can be mocked in tests. @@ -107,7 +110,7 @@ def markdown_to_text(body): Plaintext with all tags and frills removed """ # Turn our input into HTML - md = markdown.markdown(body, extensions=[ + md = markdown.markdown(body.decode('utf-8'), extensions=[ 'markdown.extensions.extra' ]) @@ -151,7 +154,7 @@ def wc(filename, contents, parsed=None, is_jekyll=False): body = parsed.strip() if parsed else contents.strip() # Strip the body down to just words - words = re.sub(r'\n', ' ', body) + words = NEWLINE_PATTERN.sub(' ', body) words = re.sub(r'\s+', ' ', words) for punctuation in INTERSTITIAL_PUNCTUATION: words = re.sub(punctuation, ' ', words) @@ -164,7 +167,7 @@ def wc(filename, contents, parsed=None, is_jekyll=False): 'counts': { 'file': filename, 'type': fmt, - 'paragraphs': len(contents.strip().split('\n\n')), + 'paragraphs': len(NEWPARA_PATTERN.split(contents.strip())), 'words': len(re.split('\s+', words)), 'characters_real': len(real_characters), 'characters_total': len(words), diff --git a/setup.py b/setup.py index 03a4d6b..b7f6241 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ setup( name='prose-wc', - version='0.2.1', + version='0.2.2', description='Jekyll-aware prose wordcount utility', long_description=long_description, author='Madison Scott-Clary', From f7592e862ac7bd194c4f9fa53282342e94cb353e Mon Sep 17 00:00:00 2001 From: Madison Scott-Clary Date: Thu, 10 Nov 2016 11:28:43 -0700 
Subject: [PATCH 2/6] Unicode heck --- prose_wc/wc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prose_wc/wc.py b/prose_wc/wc.py index 386b649..baf695b 100755 --- a/prose_wc/wc.py +++ b/prose_wc/wc.py @@ -110,7 +110,7 @@ def markdown_to_text(body): Plaintext with all tags and frills removed """ # Turn our input into HTML - md = markdown.markdown(body.decode('utf-8'), extensions=[ + md = markdown.markdown(unicode(body).decode('utf-8'), extensions=[ 'markdown.extensions.extra' ]) From 16e0472ccccb17672c857a0251ffcc23feb599d9 Mon Sep 17 00:00:00 2001 From: Madison Scott-Clary Date: Thu, 10 Nov 2016 11:42:41 -0700 Subject: [PATCH 3/6] Unicode HECK --- prose_wc/tests/tests.py | 2 +- prose_wc/wc.py | 11 +++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/prose_wc/tests/tests.py b/prose_wc/tests/tests.py index dabf34d..dd40222 100644 --- a/prose_wc/tests/tests.py +++ b/prose_wc/tests/tests.py @@ -63,7 +63,7 @@ def test_output_json(self, mock_dump, mock_print): self.assertTrue(mock_print.called_once) @patch.object(wc, '_mockable_print') - @patch.object(yaml, 'dump') + @patch.object(yaml, 'safe_dump') def test_output_yaml(self, mock_dump, mock_print): wc.prose_wc(wc.setup(['-f', 'yaml', self.plaintext])) mock_dump.assert_called_once_with({ diff --git a/prose_wc/wc.py b/prose_wc/wc.py index baf695b..fc68f82 100755 --- a/prose_wc/wc.py +++ b/prose_wc/wc.py @@ -1,6 +1,9 @@ #!/usr/bin/python -from __future__ import print_function +from __future__ import ( + print_function, + unicode_literals, +) import argparse from bs4 import BeautifulSoup import json @@ -92,7 +95,7 @@ def prose_wc(args): update_file(filename, result, content, args.indent) else: _mockable_print({ - 'yaml': yaml.dump(result, default_flow_style=False, + 'yaml': yaml.safe_dump(result, default_flow_style=False, indent=args.indent), 'json': json.dumps(result, indent=args.indent), 'default': default_dump(result), @@ -110,7 +113,7 @@ def markdown_to_text(body): Plaintext 
with all tags and frills removed """ # Turn our input into HTML - md = markdown.markdown(unicode(body).decode('utf-8'), extensions=[ + md = markdown.markdown(unicode(body, 'utf-8'), extensions=[ 'markdown.extensions.extra' ]) @@ -198,7 +201,7 @@ def update_file(filename, result, content, indent): # Set the frontmatter part backed to the stringified version of the # frontmatter object parts[1] = '\n{}'.format( - yaml.dump(frontmatter, default_flow_style=False, indent=indent)) + yaml.safe_dump(frontmatter, default_flow_style=False, indent=indent)) result = '---'.join(parts) # Write everything back to the file From 4f6967f0cf3317a7a95a33479119384fc6bd4309 Mon Sep 17 00:00:00 2001 From: Madison Scott-Clary Date: Thu, 10 Nov 2016 11:48:47 -0700 Subject: [PATCH 4/6] UNICODE HECK OFF --- prose_wc/wc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prose_wc/wc.py b/prose_wc/wc.py index fc68f82..ec8a1a9 100755 --- a/prose_wc/wc.py +++ b/prose_wc/wc.py @@ -113,7 +113,7 @@ def markdown_to_text(body): Plaintext with all tags and frills removed """ # Turn our input into HTML - md = markdown.markdown(unicode(body, 'utf-8'), extensions=[ + md = markdown.markdown(body.encode('utf-8'), extensions=[ 'markdown.extensions.extra' ]) From ff5beef27970890e2f68029f276b77203f03a2d7 Mon Sep 17 00:00:00 2001 From: Madison Scott-Clary Date: Thu, 10 Nov 2016 12:59:00 -0700 Subject: [PATCH 5/6] Fixed??? 
(tox's fault) --- prose_wc/wc.py | 8 ++++---- tox.ini | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/prose_wc/wc.py b/prose_wc/wc.py index ec8a1a9..57ba6fa 100755 --- a/prose_wc/wc.py +++ b/prose_wc/wc.py @@ -68,8 +68,8 @@ def setup(argv): help='output format.') parser.add_argument('-i', '--indent', type=int, nargs='?', default=4, help='indentation depth (default: 4).') - parser.add_argument('file', type=argparse.FileType('r'), - help='file to count (or - for STDIN)') + parser.add_argument('file', type=argparse.FileType('rb'), + help='file to parse (or - for STDIN)') return parser.parse_args(argv) @@ -83,7 +83,7 @@ def prose_wc(args): return 1 if args.split_hyphens: INTERSTITIAL_PUNCTUATION.append(re.compile(r'-')) - content = args.file.read() + content = args.file.read().decode('utf-8') filename = args.file.name body = strip_frontmatter(content) parsed = markdown_to_text(body) @@ -113,7 +113,7 @@ def markdown_to_text(body): Plaintext with all tags and frills removed """ # Turn our input into HTML - md = markdown.markdown(body.encode('utf-8'), extensions=[ + md = markdown.markdown(body, extensions=[ 'markdown.extensions.extra' ]) diff --git a/tox.ini b/tox.ini index 70d491b..804dc00 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = 2.7,3.4,3.5 +envlist = py27, py35 [testenv] deps = nose From 8dc5fd8ae28322ebd554124e4fbb5360413801a1 Mon Sep 17 00:00:00 2001 From: Madison Scott-Clary Date: Thu, 10 Nov 2016 13:01:59 -0700 Subject: [PATCH 6/6] Bump revno to minor; fixes py2 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b7f6241..3a90970 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ setup( name='prose-wc', - version='0.2.2', + version='0.3.0', description='Jekyll-aware prose wordcount utility', long_description=long_description, author='Madison Scott-Clary',