Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix block scalar mangling bug #231 #232

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
163 changes: 134 additions & 29 deletions src/yamlfix/adapters.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""Define adapter / helper classes to hide unrelated functionality in."""

import io
import logging
import re
from functools import partial
Expand All @@ -12,6 +12,7 @@
from ruyaml.tokens import CommentToken

from yamlfix.model import YamlfixConfig, YamlNodeStyle
from yamlfix.util import walk_object

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -334,18 +335,12 @@ def __init__(self, yaml: Yaml, config: Optional[YamlfixConfig]) -> None:
self.yaml = yaml.yaml
self.config = config or YamlfixConfig()

def fix(self, source_code: str) -> str:
"""Run all yaml source code fixers.

Args:
source_code: Source code to be corrected.

Returns:
Corrected source code.
"""
log.debug("Running source code fixers...")

fixers = [
# The list of fixers to run, and the index of the currently called fixer.
# This allows fixers which might need to reinvoke other fixers to reinvoke the
# previous fixers (currently fix comments does this as it needs to re-emit the
# YAML).
self._fixer_idx = -1
self._fixers = [
self._fix_truthy_strings,
self._fix_jinja_variables,
self._ruamel_yaml_fixer,
Expand All @@ -359,8 +354,40 @@ def fix(self, source_code: str) -> str:
self._add_newline_at_end_of_file,
]

for fixer in fixers:
def _rerun_previous_fixers(self, source_code: str) -> str:
"""Run all the previous source code fixers except the currently running one.

This is re-entrant safe and will correctly restore the fixer idx once it's
complete.

Args:
source_code: Source code to be corrected.

Returns:
Corrected source code.
"""
cur_fixer_idx = self._fixer_idx
for fixer_idx, fixer in enumerate(self._fixers[:cur_fixer_idx]):
self._fixer_idx = fixer_idx
source_code = fixer(source_code)
# Restore fixer idx
self._fixer_idx = cur_fixer_idx
return source_code

def fix(self, source_code: str) -> str:
"""Run all yaml source code fixers.

Args:
source_code: Source code to be corrected.

Returns:
Corrected source code.
"""
log.debug("Running source code fixers...")

# Just use the re-run system do it.
self._fixer_idx = len(self._fixers)
source_code = self._rerun_previous_fixers(source_code)

return source_code

Expand Down Expand Up @@ -573,24 +600,102 @@ def _restore_truthy_strings(source_code: str) -> str:
def _fix_comments(self, source_code: str) -> str:
log.debug("Fixing comments...")
config = self.config
comment_start = " " * config.comments_min_spaces_from_content + "#"

fixed_source_lines = []
# We need the source lines for the comment fixers to analyze whitespace easily
source_lines = source_code.splitlines()

for line in source_code.splitlines():
# Comment at the start of the line
if config.comments_require_starting_space and re.search(r"(^|\s)#\w", line):
line = line.replace("#", "# ")
# Comment in the middle of the line, but it's not part of a string
if (
config.comments_min_spaces_from_content > 1
and " #" in line
and line[-1] not in ["'", '"']
):
line = re.sub(r"(.+\S)(\s+?)#", rf"\1{comment_start}", line)
fixed_source_lines.append(line)
yaml = YAML(typ="rt")
# Hijack config options from the regular fixer
yaml.explicit_start = self.yaml.explicit_start
yaml.width = self.yaml.width
# preserve_quotes however must always be true, otherwise we change output
# unexpectedly.
yaml.preserve_quotes = True # type: ignore
yaml_documents = list(yaml.load_all(source_code))

return "\n".join(fixed_source_lines)
handled_comments: List[CommentToken] = []

def _comment_token_cb(target: Any) -> None: # noqa:W0613,ANN401
"""This function is a callback for walk_object.

This implements the actual comment fixing, and is used because comment
objects are structured in nested-lists and can repeat.
"""
if not isinstance(target, CommentToken):
return
if any(target is e for e in handled_comments):
# This comment was handled at a higher level already.
return
if target.value is None:
return
comment_lines = target.value.split("\n")
fixed_comment_lines = []
for line in comment_lines:
if config.comments_require_starting_space and re.search(
r"(^|\s)#\w", line
):
line = line.replace("#", "# ")
fixed_comment_lines.append(line)

# Update the comment with the fixed lines
target.value = "\n".join(fixed_comment_lines)

if config.comments_min_spaces_from_content > 1:
# It's hard to reconstruct exactly where the content is, but since we
# have the line numbers what we do is lookup the literal source line
# here and check where the whitespace is compared to where we know the
# comment starts.
source_line = source_lines[target.start_mark.line]
content_part = source_line[0 : target.start_mark.column] # noqa: E203
# Find the non-whitespace position in the content part
rx_match = re.match(r"^.*\S", content_part)
if (
rx_match is not None
): # If no match then nothing to do - no content to be away from
_, content_end = rx_match.span()
# If there's less than min-spaces from content, we're going to add
# some.
if (
target.start_mark.column - content_end
< config.comments_min_spaces_from_content
):
# Handled
target.start_mark.column = (
content_end + config.comments_min_spaces_from_content
)
# Some ruyaml objects will return attached comments at multiple levels (but
# not all). Keep track of which comments we've already processed to avoid
# double processing them (important because we use raw source lines to
# determine content position above).
handled_comments.append(target)

def _comment_fixer(target: Any) -> None: # noqa:W0613,ANN401
"""This function is a callback for walk_object.

walk_object calls it for every object it finds, and then will walk the
mapping/sequence subvalues and call this function on those too. This gives
us direct access to all round tripped comments.
"""
if not hasattr(target, "ca"):
# Scalar or other object with no comment parameter.
return
# Find all comment tokens and fix them
walk_object(target.ca.comment, _comment_token_cb)
walk_object(target.ca.end, _comment_token_cb)
walk_object(target.ca.items, _comment_token_cb)
walk_object(target.ca.pre, _comment_token_cb)

# Walk the object and invoke the comment fixer
walk_object(yaml_documents, _comment_fixer)

# Dump out the YAML documents
stream = io.StringIO()
yaml.dump_all(yaml_documents, stream=stream)

fixed_source_code = stream.getvalue()
# Reinvoke the previous fixers to ensure we fix the new output we just created.
fixed_source_code = self._rerun_previous_fixers(fixed_source_code)
return fixed_source_code

def _fix_whitelines(self, source_code: str) -> str:
"""Fixes number of consecutive whitelines.
Expand Down
18 changes: 18 additions & 0 deletions src/yamlfix/util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
"""Define utility helpers."""
from typing import Any, Callable, Iterable, Mapping


def walk_object(
target: Any, callback_fn: Callable[[Any], None] # noqa: ANN401
) -> None:
"""Walk a YAML/JSON-like object and call a function on all values."""
callback_fn(target) # Call the callback and whatever we received.

if isinstance(target, Mapping):
# Map type
for _, value in target.items():
walk_object(value, callback_fn)
elif isinstance(target, Iterable) and not isinstance(target, (bytes, str)):
# List type
for value in target:
walk_object(value, callback_fn)
48 changes: 48 additions & 0 deletions tests/unit/test_adapter_yaml.py
Original file line number Diff line number Diff line change
Expand Up @@ -939,3 +939,51 @@ def test_enforcing_flow_style_together_with_adjustable_newlines(self) -> None:
result = fix_code(source, config)

assert result == fixed_source

def test_block_scalar_whitespace_is_preserved(self) -> None:
"""Check that multi-line blocks with #'s in them are not treated as comments.
Regression test for https://github.com/lyz-code/yamlfix/issues/231
"""
source = dedent(
"""\
---
addn_doc_key: |-
#######################################
# This would also be broken #
#######################################
---
#Comment above the key
key: |-
###########################################
# Value with lots of whitespace #
# Some More Whitespace #
###########################################
#Comment below

#Comment with some whitespace below
""" # noqa: W293
)
fixed_source = dedent(
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If fixed_source == source I think it will be cleaner to use result == source. If they are different I was not able to tell them apart :S

"""\
---
addn_doc_key: |-
#######################################
# This would also be broken #
#######################################
---
# Comment above the key
key: |-
###########################################
# Value with lots of whitespace #
# Some More Whitespace #
###########################################
# Comment below

# Comment with some whitespace below
""" # noqa: W293
)
config = YamlfixConfig()

result = fix_code(source, config)

assert result == fixed_source
10 changes: 9 additions & 1 deletion tests/unit/test_services.py
Original file line number Diff line number Diff line change
Expand Up @@ -469,7 +469,7 @@ def test_fix_code_functions_emit_debug_logs(

fix_code("") # act

expected_logs = {
expected_logs = { # noqa: W0130
"Setting up ruamel yaml 'quote simple values' configuration...",
"Setting up ruamel yaml 'sequence flow style' configuration...",
"Running ruamel yaml base configuration...",
Expand All @@ -481,6 +481,14 @@ def test_fix_code_functions_emit_debug_logs(
"Restoring jinja2 variables...",
"Restoring double exclamations...",
"Fixing comments...",
# Fixing comments causes a re-run of fixers, so we get duplicates from here
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't like the re-running of previous fixers, can't you fix comments before so that we don't have to rerun everything?

I think it is really bad from the point of view of the performance of the tool to do this.

"Fixing truthy strings...",
"Fixing jinja2 variables...",
"Running ruamel yaml fixer...",
"Restoring truthy strings...",
"Restoring jinja2 variables...",
"Restoring double exclamations...",
# End fixing comments duplicates
"Fixing top level lists...",
"Fixing flow-style lists...",
}
Expand Down