Skip to content

Commit

Permalink
Use Unicode-standard char width to wrap comments or strings. (#3275)
Browse files Browse the repository at this point in the history
  • Loading branch information
wada314 authored and topecongiro committed Jan 14, 2019
1 parent 503cdde commit a01990c
Show file tree
Hide file tree
Showing 11 changed files with 136 additions and 59 deletions.
8 changes: 8 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions Cargo.toml
Expand Up @@ -53,6 +53,8 @@ rustc-ap-syntax = "306.0.0"
rustc-ap-syntax_pos = "306.0.0"
failure = "0.1.1"
bytecount = "0.4"
unicode-width = "0.1.5"
unicode_categories = "0.1.1"

# A noop dependency that changes in the Rust repository, it's a bit of a hack.
# See the `src/tools/rustc-workspace-hack/README.md` file in `rust-lang/rust`
Expand Down
40 changes: 23 additions & 17 deletions src/comment.rs
Expand Up @@ -19,7 +19,9 @@ use config::Config;
use rewrite::RewriteContext;
use shape::{Indent, Shape};
use string::{rewrite_string, StringFormat};
use utils::{count_newlines, first_line_width, last_line_width, trim_left_preserve_layout};
use utils::{
count_newlines, first_line_width, last_line_width, trim_left_preserve_layout, unicode_str_width,
};
use {ErrorKind, FormattingError};

fn is_custom_comment(comment: &str) -> bool {
Expand Down Expand Up @@ -264,7 +266,8 @@ fn identify_comment(
) -> Option<String> {
let style = comment_style(orig, false);

// Computes the len of line taking into account a newline if the line is part of a paragraph.
// Computes the byte length of line taking into account a newline if the line is part of a
// paragraph.
fn compute_len(orig: &str, line: &str) -> usize {
if orig.len() > line.len() {
if orig.as_bytes()[line.len()] == b'\r' {
Expand Down Expand Up @@ -498,7 +501,7 @@ struct CommentRewrite<'a> {
item_block: Option<ItemizedBlock>,
comment_line_separator: String,
indent_str: String,
max_chars: usize,
max_width: usize,
fmt_indent: Indent,
fmt: StringFormat<'a>,

Expand All @@ -520,7 +523,7 @@ impl<'a> CommentRewrite<'a> {
comment_style(orig, config.normalize_comments()).to_str_tuplet()
};

let max_chars = shape
let max_width = shape
.width
.checked_sub(closer.len() + opener.len())
.unwrap_or(1);
Expand All @@ -534,7 +537,7 @@ impl<'a> CommentRewrite<'a> {
code_block_attr: None,
item_block: None,
comment_line_separator: format!("{}{}", indent_str, line_start),
max_chars,
max_width,
indent_str,
fmt_indent,

Expand All @@ -543,7 +546,7 @@ impl<'a> CommentRewrite<'a> {
closer: "",
line_start,
line_end: "",
shape: Shape::legacy(max_chars, fmt_indent),
shape: Shape::legacy(max_width, fmt_indent),
trim_end: true,
config,
},
Expand Down Expand Up @@ -583,14 +586,14 @@ impl<'a> CommentRewrite<'a> {

if let Some(ref ib) = self.item_block {
// the last few lines are part of an itemized block
self.fmt.shape = Shape::legacy(self.max_chars, self.fmt_indent);
self.fmt.shape = Shape::legacy(self.max_width, self.fmt_indent);
let item_fmt = ib.create_string_format(&self.fmt);
self.result.push_str(&self.comment_line_separator);
self.result.push_str(&ib.opener);
match rewrite_string(
&ib.trimmed_block_as_string(),
&item_fmt,
self.max_chars.saturating_sub(ib.indent),
self.max_width.saturating_sub(ib.indent),
) {
Some(s) => self.result.push_str(&Self::join_block(
&s,
Expand Down Expand Up @@ -626,14 +629,14 @@ impl<'a> CommentRewrite<'a> {
return false;
}
self.is_prev_line_multi_line = false;
self.fmt.shape = Shape::legacy(self.max_chars, self.fmt_indent);
self.fmt.shape = Shape::legacy(self.max_width, self.fmt_indent);
let item_fmt = ib.create_string_format(&self.fmt);
self.result.push_str(&self.comment_line_separator);
self.result.push_str(&ib.opener);
match rewrite_string(
&ib.trimmed_block_as_string(),
&item_fmt,
self.max_chars.saturating_sub(ib.indent),
self.max_width.saturating_sub(ib.indent),
) {
Some(s) => self.result.push_str(&Self::join_block(
&s,
Expand Down Expand Up @@ -710,8 +713,11 @@ impl<'a> CommentRewrite<'a> {
}
}

if self.fmt.config.wrap_comments() && line.len() > self.fmt.shape.width && !has_url(line) {
match rewrite_string(line, &self.fmt, self.max_chars) {
if self.fmt.config.wrap_comments()
&& unicode_str_width(line) > self.fmt.shape.width
&& !has_url(line)
{
match rewrite_string(line, &self.fmt, self.max_width) {
Some(ref s) => {
self.is_prev_line_multi_line = s.contains('\n');
self.result.push_str(s);
Expand All @@ -721,8 +727,8 @@ impl<'a> CommentRewrite<'a> {
// Remove the trailing space, then start rewrite on the next line.
self.result.pop();
self.result.push_str(&self.comment_line_separator);
self.fmt.shape = Shape::legacy(self.max_chars, self.fmt_indent);
match rewrite_string(line, &self.fmt, self.max_chars) {
self.fmt.shape = Shape::legacy(self.max_width, self.fmt_indent);
match rewrite_string(line, &self.fmt, self.max_width) {
Some(ref s) => {
self.is_prev_line_multi_line = s.contains('\n');
self.result.push_str(s);
Expand All @@ -743,20 +749,20 @@ impl<'a> CommentRewrite<'a> {
// 1 = " "
let offset = 1 + last_line_width(&self.result) - self.line_start.len();
Shape {
width: self.max_chars.saturating_sub(offset),
width: self.max_width.saturating_sub(offset),
indent: self.fmt_indent,
offset: self.fmt.shape.offset + offset,
}
} else {
Shape::legacy(self.max_chars, self.fmt_indent)
Shape::legacy(self.max_width, self.fmt_indent)
};
} else {
if line.is_empty() && self.result.ends_with(' ') && !is_last {
// Remove space if this is an empty comment or a doc comment.
self.result.pop();
}
self.result.push_str(line);
self.fmt.shape = Shape::legacy(self.max_chars, self.fmt_indent);
self.fmt.shape = Shape::legacy(self.max_width, self.fmt_indent);
self.is_prev_line_multi_line = false;
}

Expand Down
2 changes: 2 additions & 0 deletions src/lib.rs
Expand Up @@ -29,7 +29,9 @@ extern crate serde_json;
extern crate syntax;
extern crate syntax_pos;
extern crate toml;
extern crate unicode_categories;
extern crate unicode_segmentation;
extern crate unicode_width;

use std::cell::RefCell;
use std::collections::HashMap;
Expand Down
3 changes: 2 additions & 1 deletion src/overflow.rs
Expand Up @@ -431,7 +431,8 @@ impl<'a> Context<'a> {
};

if let Some(rewrite) = rewrite {
let rewrite_first_line = Some(rewrite[..first_line_width(&rewrite)].to_owned());
// splitn(2, *).next().unwrap() is always safe.
let rewrite_first_line = Some(rewrite.splitn(2, '\n').next().unwrap().to_owned());
last_list_item.item = rewrite_first_line;
Some(rewrite)
} else {
Expand Down
70 changes: 44 additions & 26 deletions src/string.rs
Expand Up @@ -11,11 +11,12 @@
// Format string literals.

use regex::Regex;
use unicode_categories::UnicodeCategories;
use unicode_segmentation::UnicodeSegmentation;

use config::Config;
use shape::Shape;
use utils::wrap_str;
use utils::{unicode_str_width, wrap_str};

const MIN_STRING: usize = 10;

Expand Down Expand Up @@ -53,7 +54,7 @@ impl<'a> StringFormat<'a> {
/// indentation into account.
///
/// If we cannot put at least a single character per line, the rewrite won't succeed.
fn max_chars_with_indent(&self) -> Option<usize> {
fn max_width_with_indent(&self) -> Option<usize> {
Some(
self.shape
.width
Expand All @@ -62,10 +63,10 @@ impl<'a> StringFormat<'a> {
)
}

/// Like max_chars_with_indent but the indentation is not subtracted.
/// Like max_width_with_indent but the indentation is not subtracted.
/// This allows to fit more graphemes from the string on a line when
/// SnippetState::EndWithLineFeed.
fn max_chars_without_indent(&self) -> Option<usize> {
fn max_width_without_indent(&self) -> Option<usize> {
Some(self.config.max_width().checked_sub(self.line_end.len())?)
}
}
Expand All @@ -75,8 +76,8 @@ pub fn rewrite_string<'a>(
fmt: &StringFormat<'a>,
newline_max_chars: usize,
) -> Option<String> {
let max_chars_with_indent = fmt.max_chars_with_indent()?;
let max_chars_without_indent = fmt.max_chars_without_indent()?;
let max_width_with_indent = fmt.max_width_with_indent()?;
let max_width_without_indent = fmt.max_width_without_indent()?;
let indent_with_newline = fmt.shape.indent.to_string_with_newline(fmt.config);
let indent_without_newline = fmt.shape.indent.to_string(fmt.config);

Expand All @@ -99,11 +100,11 @@ pub fn rewrite_string<'a>(

// Snip a line at a time from `stripped_str` until it is used up. Push the snippet
// onto result.
let mut cur_max_chars = max_chars_with_indent;
let mut cur_max_width = max_width_with_indent;
let is_bareline_ok = fmt.line_start.is_empty() || is_whitespace(fmt.line_start);
loop {
// All the input starting at cur_start fits on the current line
if graphemes.len() - cur_start <= cur_max_chars {
if graphemes_width(&graphemes[cur_start..]) <= cur_max_width {
for (i, grapheme) in graphemes[cur_start..].iter().enumerate() {
if is_new_line(grapheme) {
// take care of blank lines
Expand All @@ -123,7 +124,7 @@ pub fn rewrite_string<'a>(

// The input starting at cur_start needs to be broken
match break_string(
cur_max_chars,
cur_max_width,
fmt.trim_end,
fmt.line_end,
&graphemes[cur_start..],
Expand All @@ -133,7 +134,7 @@ pub fn rewrite_string<'a>(
result.push_str(fmt.line_end);
result.push_str(&indent_with_newline);
result.push_str(fmt.line_start);
cur_max_chars = newline_max_chars;
cur_max_width = newline_max_chars;
cur_start += len;
}
SnippetState::EndWithLineFeed(line, len) => {
Expand All @@ -143,11 +144,11 @@ pub fn rewrite_string<'a>(
result.push_str(&line);
if is_bareline_ok {
// the next line can benefit from the full width
cur_max_chars = max_chars_without_indent;
cur_max_width = max_width_without_indent;
} else {
result.push_str(&indent_without_newline);
result.push_str(fmt.line_start);
cur_max_chars = max_chars_with_indent;
cur_max_width = max_width_with_indent;
}
cur_start += len;
}
Expand Down Expand Up @@ -226,9 +227,10 @@ fn not_whitespace_except_line_feed(g: &str) -> bool {
is_new_line(g) || !is_whitespace(g)
}

/// Break the input string at a boundary character around the offset `max_chars`. A boundary
/// Break the input string at a boundary character around the offset `max_width`. A boundary
/// character is either a punctuation or a whitespace.
fn break_string(max_chars: usize, trim_end: bool, line_end: &str, input: &[&str]) -> SnippetState {
/// FIXME(issue#3281): We must follow UAX#14 algorithm instead of this.
fn break_string(max_width: usize, trim_end: bool, line_end: &str, input: &[&str]) -> SnippetState {
let break_at = |index /* grapheme at index is included */| {
// Take in any whitespaces to the left/right of `input[index]` while
// preserving line feeds
Expand Down Expand Up @@ -272,19 +274,33 @@ fn break_string(max_chars: usize, trim_end: bool, line_end: &str, input: &[&str]
}
};

// find a first index where the unicode width of input[0..x] become > max_width
let max_width_index_in_input = {
let mut cur_width = 0;
let mut cur_index = 0;
for (i, grapheme) in input.iter().enumerate() {
cur_width += unicode_str_width(grapheme);
cur_index = i;
if cur_width > max_width {
break;
}
}
cur_index
};

// Find the position in input for breaking the string
if line_end.is_empty()
&& trim_end
&& !is_whitespace(input[max_chars - 1])
&& is_whitespace(input[max_chars])
&& !is_whitespace(input[max_width_index_in_input - 1])
&& is_whitespace(input[max_width_index_in_input])
{
// At a breaking point already
// The line won't invalidate the rewriting because:
// - no extra space needed for the line_end character
// - extra whitespaces to the right can be trimmed
return break_at(max_chars - 1);
return break_at(max_width_index_in_input - 1);
}
if let Some(url_index_end) = detect_url(input, max_chars) {
if let Some(url_index_end) = detect_url(input, max_width_index_in_input) {
let index_plus_ws = url_index_end
+ input[url_index_end..]
.iter()
Expand All @@ -297,27 +313,28 @@ fn break_string(max_chars: usize, trim_end: bool, line_end: &str, input: &[&str]
return SnippetState::LineEnd(input[..=index_plus_ws].concat(), index_plus_ws + 1);
};
}
match input[0..max_chars]

match input[0..max_width_index_in_input]
.iter()
.rposition(|grapheme| is_whitespace(grapheme))
{
// Found a whitespace and what is on its left side is big enough.
Some(index) if index >= MIN_STRING => break_at(index),
// No whitespace found, try looking for a punctuation instead
_ => match input[0..max_chars]
_ => match input[0..max_width_index_in_input]
.iter()
.rposition(|grapheme| is_punctuation(grapheme))
{
// Found a punctuation and what is on its left side is big enough.
Some(index) if index >= MIN_STRING => break_at(index),
// Either no boundary character was found to the left of `input[max_chars]`, or the line
// got too small. We try searching for a boundary character to the right.
_ => match input[max_chars..]
_ => match input[max_width_index_in_input..]
.iter()
.position(|grapheme| is_whitespace(grapheme) || is_punctuation(grapheme))
{
// A boundary was found after the line limit
Some(index) => break_at(max_chars + index),
Some(index) => break_at(max_width_index_in_input + index),
// No boundary to the right, the input cannot be broken
None => SnippetState::EndOfInput(input.concat()),
},
Expand All @@ -335,10 +352,11 @@ fn is_whitespace(grapheme: &str) -> bool {
}

fn is_punctuation(grapheme: &str) -> bool {
match grapheme.as_bytes()[0] {
b':' | b',' | b';' | b'.' => true,
_ => false,
}
grapheme.chars().all(|c| c.is_punctuation_other())
}

fn graphemes_width(graphemes: &[&str]) -> usize {
graphemes.iter().map(|s| unicode_str_width(s)).sum()
}

#[cfg(test)]
Expand Down

0 comments on commit a01990c

Please sign in to comment.