Permalink
Browse files

All trailing whitespace is stored and ignored while diffing

  • Loading branch information...
orf committed Jul 29, 2013
1 parent d441222 commit 44e697fa9a8b580326bfaf6ffffeda3220c6c733
Showing with 33 additions and 19 deletions.
  1. +33 −19 src/lxml/html/diff.py
View
@@ -121,7 +121,7 @@ def compress_merge_back(tokens, tok):
else:
text = _unicode(last)
if last.trailing_whitespace:
text += ' '
text += last.trailing_whitespace
text += tok
merged = token(text,
pre_tags=last.pre_tags,
@@ -141,7 +141,7 @@ def markup_serialize_tokens(tokens, markup_func):
html = token.html()
html = markup_func(html, token.annotation)
if token.trailing_whitespace:
html += ' '
html += token.trailing_whitespace
yield html
for post in token.post_tags:
yield post
@@ -170,6 +170,7 @@ def htmldiff(old_html, new_html):
"""
old_html_tokens = tokenize(old_html)
new_html_tokens = tokenize(new_html)
print old_html_tokens, new_html_tokens
result = htmldiff_tokens(old_html_tokens, new_html_tokens)
result = ''.join(result).strip()
return fixup_ins_del_tags(result)
@@ -221,7 +222,7 @@ def expand_tokens(tokens, equal=False):
yield pre
if not equal or not token.hide_when_equal:
if token.trailing_whitespace:
yield token.html() + ' '
yield token.html() + token.trailing_whitespace
else:
yield token.html()
for post in token.post_tags:
@@ -451,7 +452,7 @@ class token(_unicode):
# displayed diff if no change has occurred:
hide_when_equal = False
def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=False):
def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=None):
obj = _unicode.__new__(cls, text)
if pre_tags is not None:
@@ -469,7 +470,8 @@ def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=False)
return obj
def __repr__(self):
return 'token(%s, %r, %r)' % (_unicode.__repr__(self), self.pre_tags, self.post_tags)
return 'token(%s, %r, %r, %r)' % (_unicode.__repr__(self), self.pre_tags,
self.post_tags, self.trailing_whitespace)
def html(self):
return _unicode(self)
@@ -481,7 +483,7 @@ class tag_token(token):
is only represented in a document by a tag. """
def __new__(cls, tag, data, html_repr, pre_tags=None,
post_tags=None, trailing_whitespace=False):
post_tags=None, trailing_whitespace=None):
obj = token.__new__(cls, "%s: %s" % (type, data),
pre_tags=pre_tags,
post_tags=post_tags,
@@ -492,7 +494,7 @@ def __new__(cls, tag, data, html_repr, pre_tags=None,
return obj
def __repr__(self):
return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%s)' % (
return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%r)' % (
self.tag,
self.data,
self.html_repr,
@@ -569,6 +571,14 @@ def cleanup_html(html):
end_whitespace_re = re.compile(r'[ \t\n\r]$')
def split_trailing_whitespace(word):
"""
This function takes a word, such as 'test\n\n' and returns ('test','\n\n')
"""
stripped_length = len(word.rstrip())
return word[0:stripped_length], word[stripped_length:]
def fixup_chunks(chunks):
"""
This function takes a list of chunks and produces a list of tokens.
@@ -580,34 +590,37 @@ def fixup_chunks(chunks):
if isinstance(chunk, tuple):
if chunk[0] == 'img':
src = chunk[1]
tag = chunk[2]
if tag.endswith(' '):
tag = tag[:-1]
trailing_whitespace = True
tag, whitespace = split_trailing_whitespace(chunk[2])
if whitespace:
trailing_whitespace = whitespace
else:
trailing_whitespace = False
trailing_whitespace = None
cur_word = tag_token('img', src, html_repr=tag,
pre_tags=tag_accum,
trailing_whitespace=trailing_whitespace)
tag_accum = []
result.append(cur_word)
elif chunk[0] == 'href':
href = chunk[1]
cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=True)
tag_accum = []
result.append(cur_word)
continue
if is_word(chunk):
if chunk.endswith(' '):
chunk = chunk[:-1]
trailing_whitespace = True
chunk, whitespace = split_trailing_whitespace(chunk)
if whitespace:
trailing_whitespace = whitespace
else:
trailing_whitespace = False
trailing_whitespace = None
cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace)
tag_accum = []
result.append(cur_word)
elif is_start_tag(chunk):
tag_accum.append(chunk)
elif is_end_tag(chunk):
if tag_accum:
tag_accum.append(chunk)
@@ -701,14 +714,15 @@ def flatten_el(el, include_hrefs, skip_tag=False):
for word in end_words:
yield html_escape(word)
split_words_re = re.compile(r'\S+(?:\s|$)')
def split_words(text):
""" Splits some text into words. Includes trailing whitespace (one
space) on each word when appropriate. """
if not text or not text.strip():
return []
words = [w + ' ' for w in text.strip().split()]
if not end_whitespace_re.search(text):
words[-1] = words[-1][:-1]
words = split_words_re.findall(text)
return words
start_whitespace_re = re.compile(r'^[ \t\n\r]')

0 comments on commit 44e697f

Please sign in to comment.