Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Code parsing for run selection in terminal - Python side #14457

Merged
merged 20 commits into from Oct 23, 2020
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions news/2 Fixes/14048.md
@@ -0,0 +1 @@
Update the logic for parsing and sending selected code to the REPL.
201 changes: 76 additions & 125 deletions pythonFiles/normalizeForInterpreter.py
Expand Up @@ -2,142 +2,93 @@
# Licensed under the MIT License.

import ast
import io
import operator
import os
import sys
import textwrap
import token
import tokenize


# AST visitor that collects the line numbers of nodes starting at column 0.
# NOTE(review): indentation was lost in this view, so the exact nesting of the
# statements inside generic_visit cannot be confirmed from here.
class Visitor(ast.NodeVisitor):
def __init__(self, lines):
# Stored as given; not read anywhere in the visible part of this class.
self._lines = lines
# Receives node.lineno for nodes that pass the col_offset == 0 check below.
self.line_numbers_with_nodes = set()
# Receives node.lineno for ast.stmt nodes, in the order they are visited.
self.line_numbers_with_statements = []

def generic_visit(self, node):
# Only nodes that expose both position attributes and start at column 0 are recorded.
if (
hasattr(node, "col_offset")
and hasattr(node, "lineno")
and node.col_offset == 0
):
self.line_numbers_with_nodes.add(node.lineno)
# NOTE(review): presumably nested under the column-0 check above — confirm.
if isinstance(node, ast.stmt):
self.line_numbers_with_statements.append(node.lineno)
# NOTE(review): this import looks like a line from the new revision's import
# block that the diff view interleaved here — confirm against the repository.
import sys

# Hand over to the base implementation, which visits the node's children.
ast.NodeVisitor.generic_visit(self, node)

def _get_multiline_statements(selection):
"""
Process a multiline selection into a list of its top-level statements.
This will remove empty newlines around and within the selection, dedent it,
and split it using the result of `ast.parse()`.
"""
statements = []
kimadeline marked this conversation as resolved.
Show resolved Hide resolved

def _tokenize(source):
    """Return a generator of tokens for the given Python source text."""
    # tokenize.generate_tokens is used deliberately: it is an undocumented API, but
    # the documented Python 2.7 one does not work as needed across versions.
    running_python2 = sys.version_info < (3,)
    if running_python2 and isinstance(source, str):
        # On Python 2 a `str` is a byte string; io.StringIO needs text.
        source = source.decode()
    reader = io.StringIO(source)
    return tokenize.generate_tokens(reader.readline)
# Remove blank lines within the selection to prevent the REPL from thinking the block is finished.
lines = [line for line in selection.splitlines(False) if line.strip() != ""]
kimadeline marked this conversation as resolved.
Show resolved Hide resolved

# Dedent the selection and parse it using the ast module.
# Note that leading comments in the selection will be discarded during parsing.
source = textwrap.dedent("\n".join(lines))
tree = ast.parse(source)

def _indent_size(line):
    """Return the index of the first non-whitespace character in ``line``.

    Returns ``None`` when the line is empty or consists only of whitespace.
    """
    return next(
        (
            position
            for position, character in enumerate(line)
            if not character.isspace()
        ),
        None,
    )
# We'll need the dedented lines to rebuild the selection.
lines = source.splitlines(False)

# Get the line ranges for top-level blocks returned from parsing the dedented text
# and split the selection accordingly.
# tree.body is a list of AST objects, which we rely on to extract top-level statements.
# If we supported Python 3.8+ only we could use the lineno and end_lineno attributes of each object
# to get the boundaries of each block.
# However, earlier Python versions only have the lineno attribute, which is the range start position (1-indexed).
# Therefore, to retrieve the end line of each block in a version-agnostic way we need to do
# `end = next_block.lineno - 1`
# for all blocks except the last one, which will just run until the last line.
last_idx = len(tree.body) - 1
for idx, node in enumerate(tree.body):
kimadeline marked this conversation as resolved.
Show resolved Hide resolved
# Given this selection:
# if (m > 0 and
# n < 3):
# print('foo')
# value = 'bar'
kimadeline marked this conversation as resolved.
Show resolved Hide resolved
#
# The first block would have lineno = 1, and the second block lineno = 4
start = node.lineno - 1
end = len(lines) if idx == last_idx else tree.body[idx + 1].lineno - 1
block = "\n".join(lines[start:end])

# If the block is multiline, add an extra newline character at its end.
# This way, when joining blocks back together, there will be a blank line between each multiline statement
# and no blank lines between single-line statements, or it would look like this:
# >>> x = 22
# >>>
# >>> y = 30
# >>>
# >>> total = x + y
# >>>
if end - start > 1:
block += "\n"
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One thing that didn't occur to me when we were discussing this, is that for the multiline parentheses case, this newline is redundant, since the closing parenthesis terminates the statement already. So for this input:

x = [
   1
]
y = [
   2
]

The REPL will get a blank line between the two, even though it's not strictly needed:

>>> x = [
...   1
... ]
>>> 
>>> y = [
...   2
...]

I think that's okay, since that extra blank line doesn't change behavior, and appearance is not such a big deal for an uncommon code pattern. But it might be worth pointing out in the comments.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does it change the REPL default _ by any chance? It should not, but just curious.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But it might be worth pointing out in the comments.

I'll add a comment.

Does it change the REPL default _ by any chance?

You mean the extra blank statement? Doesn't seem like it:

image


statements.append(block)

return statements


def normalize_lines(selection):
"""
Normalize the text selection received from the extension and send it to the REPL.

If it is a single line selection, dedent it, append a newline and send it to the REPL.
Otherwise, sanitize the multiline selection before sending it to the REPL:
split it in a list of top-level statements
and add newlines between each of them to tell the REPL where each block ends.
"""

def _get_global_statement_blocks(source, lines):
"""Return a list of all global statement blocks.
# Check if it is a singleline or multiline selection.
is_singleline = len(selection.splitlines()) == 1
kimadeline marked this conversation as resolved.
Show resolved Hide resolved

The list consists of 3-item tuples that contain the starting line number,
ending line number and whether the statement is a single line.
# If it is a single line statement: Dedent and skip to the end.
# Else: Parse the multiline selection into a list of top-level blocks.
if is_singleline:
statements = [textwrap.dedent(selection)]
kimadeline marked this conversation as resolved.
Show resolved Hide resolved
else:
statements = _get_multiline_statements(selection)
kimadeline marked this conversation as resolved.
Show resolved Hide resolved

"""
tree = ast.parse(source)
visitor = Visitor(lines)
visitor.visit(tree)

statement_ranges = []
for index, line_number in enumerate(visitor.line_numbers_with_statements):
remaining_line_numbers = visitor.line_numbers_with_statements[index + 1 :]
end_line_number = (
len(lines)
if len(remaining_line_numbers) == 0
else min(remaining_line_numbers) - 1
)
current_statement_is_oneline = line_number == end_line_number

if len(statement_ranges) == 0:
statement_ranges.append(
(line_number, end_line_number, current_statement_is_oneline)
)
continue

previous_statement = statement_ranges[-1]
previous_statement_is_oneline = previous_statement[2]
if previous_statement_is_oneline and current_statement_is_oneline:
statement_ranges[-1] = previous_statement[0], end_line_number, True
else:
statement_ranges.append(
(line_number, end_line_number, current_statement_is_oneline)
)

return statement_ranges


def normalize_lines(source):
"""Normalize blank lines for sending to the terminal.

Blank lines within a statement block are removed to prevent the REPL
from thinking the block is finished. Newlines are added to separate
top-level statements so that the REPL does not think there is a syntax
error.
# Insert a newline between each top-level statement, and append a newline to the selection.
source = "\n".join(statements) + "\n"

"""
# Ensure to dedent the code (#2837)
lines = textwrap.dedent(source).splitlines(False)
# If we have two blank lines, then add two blank lines.
# Do not trim the spaces; if we have blank lines with spaces, it's possible
# we have indented code.
if (len(lines) > 1 and len("".join(lines[-2:])) == 0) or source.endswith(
("\n\n", "\r\n\r\n")
):
trailing_newline = "\n" * 2
# Find out if we have any trailing blank lines
elif len(lines[-1].strip()) == 0 or source.endswith(("\n", "\r\n")):
trailing_newline = "\n"
else:
trailing_newline = ""

# Step 1: Remove empty lines.
tokens = _tokenize(source)
newlines_indexes_to_remove = (
spos[0]
for (toknum, tokval, spos, epos, line) in tokens
if len(line.strip()) == 0
and token.tok_name[toknum] == "NL"
and spos[0] == epos[0]
)

for line_number in reversed(list(newlines_indexes_to_remove)):
del lines[line_number - 1]

# Step 2: Add blank lines between each global statement block.
# Consecutive single-line blocks of code will be treated as a single statement,
# just to ensure we do not unnecessarily add too many blank lines.
source = "\n".join(lines)
tokens = _tokenize(source)
dedent_indexes = (
spos[0]
for (toknum, tokval, spos, epos, line) in tokens
if toknum == token.DEDENT and _indent_size(line) == 0
)

global_statement_ranges = _get_global_statement_blocks(source, lines)
start_positions = map(operator.itemgetter(0), reversed(global_statement_ranges))
for line_number in filter(lambda x: x > 1, start_positions):
lines.insert(line_number - 1, "")

sys.stdout.write("\n".join(lines) + trailing_newline)
# Finally, send the formatted selection to the REPL.
sys.stdout.write(source)
kimadeline marked this conversation as resolved.
Show resolved Hide resolved
sys.stdout.flush()


Expand Down
106 changes: 77 additions & 29 deletions pythonFiles/tests/test_normalize_for_interpreter.py
@@ -1,30 +1,20 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import pytest
import sys
import textwrap

import normalizeForInterpreter


class TestNormalizationScript(object):
"""Basic unit tests for the normalization script."""
"""Unit tests for the normalization script."""

@pytest.mark.skipif(
sys.version_info.major == 2,
reason="normalizeForInterpreter not working for 2.7, see GH #4805",
)
def test_basicNormalization(self, capsys):
src = 'print("this is a test")'
expected = src + "\n"
normalizeForInterpreter.normalize_lines(src)
captured = capsys.readouterr()
assert captured.out == src
assert captured.out == expected

@pytest.mark.skipif(
sys.version_info.major == 2,
reason="normalizeForInterpreter not working for 2.7, see GH #4805",
)
def test_moreThanOneLine(self, capsys):
src = textwrap.dedent(
"""\
Expand All @@ -34,14 +24,17 @@ def show_something():
print("Something")
"""
)
expected = textwrap.dedent(
"""\
def show_something():
print("Something")

"""
)
normalizeForInterpreter.normalize_lines(src)
captured = capsys.readouterr()
assert captured.out == src
assert captured.out == expected

@pytest.mark.skipif(
sys.version_info.major == 2,
reason="normalizeForInterpreter not working for 2.7, see GH #4805",
)
def test_withHangingIndent(self, capsys):
src = textwrap.dedent(
"""\
Expand All @@ -54,14 +47,21 @@ def test_withHangingIndent(self, capsys):
print("The answer to life, the universe, and everything")
"""
)
expected = textwrap.dedent(
"""\
x = 22
y = 30
z = -10
result = x + y + z
if result == 42:
print("The answer to life, the universe, and everything")

"""
)
normalizeForInterpreter.normalize_lines(src)
captured = capsys.readouterr()
assert captured.out == src
assert captured.out == expected

@pytest.mark.skipif(
sys.version_info.major == 2,
reason="normalizeForInterpreter not working for 2.7, see GH #4805",
)
def test_clearOutExtraneousNewlines(self, capsys):
src = textwrap.dedent(
"""\
Expand All @@ -81,17 +81,12 @@ def test_clearOutExtraneousNewlines(self, capsys):
value_y = 30
value_z = -10
print(value_x + value_y + value_z)

"""
)
normalizeForInterpreter.normalize_lines(src)
result = capsys.readouterr()
assert result.out == expectedResult

@pytest.mark.skipif(
sys.version_info.major == 2,
reason="normalizeForInterpreter not working for 2.7, see GH #4805",
)
def test_clearOutExtraLinesAndWhitespace(self, capsys):
src = textwrap.dedent(
"""\
Expand All @@ -114,9 +109,62 @@ def test_clearOutExtraLinesAndWhitespace(self, capsys):
z = -10

print(x + y + z)

"""
)
normalizeForInterpreter.normalize_lines(src)
result = capsys.readouterr()
assert result.out == expectedResult

def test_partialSingleLine(self, capsys):
    """A single indented line is written back dedented, followed by a newline."""
    src = " print('foo')"
    normalizeForInterpreter.normalize_lines(src)
    written = capsys.readouterr().out
    assert written == textwrap.dedent(src) + "\n"

def test_multiLineWithIndent(self, capsys):
# Passes an if/else selection that contains blank lines to normalize_lines and
# compares the captured stdout with the expected text further down.
# NOTE(review): leading whitespace inside both string literals appears to have
# been lost in this view — confirm the exact indentation against the repository.
src = """\

if (x > 0
and condition == True):
print('foo')
else:

print('bar')
"""

# Expected output: the same statement with no blank lines inside it and a
# single blank line after it.
expectedResult = textwrap.dedent(
"""\
if (x > 0
and condition == True):
print('foo')
else:
print('bar')

"""
)

# normalize_lines writes to stdout, so the result is read back through capsys.
normalizeForInterpreter.normalize_lines(src)
result = capsys.readouterr()
assert result.out == expectedResult

def test_multiLineWithComment(self, capsys):
# Passes a function definition whose body contains a comment line, preceded by a
# blank line, and expects the comment to still be present in the output.
# NOTE(review): leading whitespace inside both string literals appears to have
# been lost in this view — confirm the exact indentation against the repository.
src = textwrap.dedent(
"""\

def show_something():
# A comment
print("Something")
"""
)
# Expected output: leading blank line gone, one blank line after the block.
expected = textwrap.dedent(
"""\
def show_something():
# A comment
print("Something")

"""
)
# normalize_lines writes to stdout, so the result is read back through capsys.
normalizeForInterpreter.normalize_lines(src)
captured = capsys.readouterr()
assert captured.out == expected