Use Unicode-standard char width to wrap comments or strings. (#3275)

rust-lang · Jan 14, 2019 · a01990c · a01990c
1 parent 503cdde
commit a01990c
Show file tree

Hide file tree

Showing 11 changed files with 136 additions and 59 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -53,6 +53,8 @@ rustc-ap-syntax = "306.0.0"
 rustc-ap-syntax_pos = "306.0.0"
 failure = "0.1.1"
 bytecount = "0.4"
+unicode-width = "0.1.5"
+unicode_categories = "0.1.1"
 
 # A noop dependency that changes in the Rust repository, it's a bit of a hack.
 # See the `src/tools/rustc-workspace-hack/README.md` file in `rust-lang/rust`

diff --git a/src/comment.rs b/src/comment.rs
@@ -19,7 +19,9 @@ use config::Config;
 use rewrite::RewriteContext;
 use shape::{Indent, Shape};
 use string::{rewrite_string, StringFormat};
-use utils::{count_newlines, first_line_width, last_line_width, trim_left_preserve_layout};
+use utils::{
+    count_newlines, first_line_width, last_line_width, trim_left_preserve_layout, unicode_str_width,
+};
 use {ErrorKind, FormattingError};
 
 fn is_custom_comment(comment: &str) -> bool {
@@ -264,7 +266,8 @@ fn identify_comment(
 ) -> Option<String> {
     let style = comment_style(orig, false);
 
-    // Computes the len of line taking into account a newline if the line is part of a paragraph.
+    // Computes the byte length of line taking into account a newline if the line is part of a
+    // paragraph.
     fn compute_len(orig: &str, line: &str) -> usize {
         if orig.len() > line.len() {
             if orig.as_bytes()[line.len()] == b'\r' {
@@ -498,7 +501,7 @@ struct CommentRewrite<'a> {
     item_block: Option<ItemizedBlock>,
     comment_line_separator: String,
     indent_str: String,
-    max_chars: usize,
+    max_width: usize,
     fmt_indent: Indent,
     fmt: StringFormat<'a>,
 
@@ -520,7 +523,7 @@ impl<'a> CommentRewrite<'a> {
             comment_style(orig, config.normalize_comments()).to_str_tuplet()
         };
 
-        let max_chars = shape
+        let max_width = shape
             .width
             .checked_sub(closer.len() + opener.len())
             .unwrap_or(1);
@@ -534,7 +537,7 @@ impl<'a> CommentRewrite<'a> {
             code_block_attr: None,
             item_block: None,
             comment_line_separator: format!("{}{}", indent_str, line_start),
-            max_chars,
+            max_width,
             indent_str,
             fmt_indent,
 
@@ -543,7 +546,7 @@ impl<'a> CommentRewrite<'a> {
                 closer: "",
                 line_start,
                 line_end: "",
-                shape: Shape::legacy(max_chars, fmt_indent),
+                shape: Shape::legacy(max_width, fmt_indent),
                 trim_end: true,
                 config,
             },
@@ -583,14 +586,14 @@ impl<'a> CommentRewrite<'a> {
 
         if let Some(ref ib) = self.item_block {
             // the last few lines are part of an itemized block
-            self.fmt.shape = Shape::legacy(self.max_chars, self.fmt_indent);
+            self.fmt.shape = Shape::legacy(self.max_width, self.fmt_indent);
             let item_fmt = ib.create_string_format(&self.fmt);
             self.result.push_str(&self.comment_line_separator);
             self.result.push_str(&ib.opener);
             match rewrite_string(
                 &ib.trimmed_block_as_string(),
                 &item_fmt,
-                self.max_chars.saturating_sub(ib.indent),
+                self.max_width.saturating_sub(ib.indent),
             ) {
                 Some(s) => self.result.push_str(&Self::join_block(
                     &s,
@@ -626,14 +629,14 @@ impl<'a> CommentRewrite<'a> {
                 return false;
             }
             self.is_prev_line_multi_line = false;
-            self.fmt.shape = Shape::legacy(self.max_chars, self.fmt_indent);
+            self.fmt.shape = Shape::legacy(self.max_width, self.fmt_indent);
             let item_fmt = ib.create_string_format(&self.fmt);
             self.result.push_str(&self.comment_line_separator);
             self.result.push_str(&ib.opener);
             match rewrite_string(
                 &ib.trimmed_block_as_string(),
                 &item_fmt,
-                self.max_chars.saturating_sub(ib.indent),
+                self.max_width.saturating_sub(ib.indent),
             ) {
                 Some(s) => self.result.push_str(&Self::join_block(
                     &s,
@@ -710,8 +713,11 @@ impl<'a> CommentRewrite<'a> {
             }
         }
 
-        if self.fmt.config.wrap_comments() && line.len() > self.fmt.shape.width && !has_url(line) {
-            match rewrite_string(line, &self.fmt, self.max_chars) {
+        if self.fmt.config.wrap_comments()
+            && unicode_str_width(line) > self.fmt.shape.width
+            && !has_url(line)
+        {
+            match rewrite_string(line, &self.fmt, self.max_width) {
                 Some(ref s) => {
                     self.is_prev_line_multi_line = s.contains('\n');
                     self.result.push_str(s);
@@ -721,8 +727,8 @@ impl<'a> CommentRewrite<'a> {
                     // Remove the trailing space, then start rewrite on the next line.
                     self.result.pop();
                     self.result.push_str(&self.comment_line_separator);
-                    self.fmt.shape = Shape::legacy(self.max_chars, self.fmt_indent);
-                    match rewrite_string(line, &self.fmt, self.max_chars) {
+                    self.fmt.shape = Shape::legacy(self.max_width, self.fmt_indent);
+                    match rewrite_string(line, &self.fmt, self.max_width) {
                         Some(ref s) => {
                             self.is_prev_line_multi_line = s.contains('\n');
                             self.result.push_str(s);
@@ -743,20 +749,20 @@ impl<'a> CommentRewrite<'a> {
                 // 1 = " "
                 let offset = 1 + last_line_width(&self.result) - self.line_start.len();
                 Shape {
-                    width: self.max_chars.saturating_sub(offset),
+                    width: self.max_width.saturating_sub(offset),
                     indent: self.fmt_indent,
                     offset: self.fmt.shape.offset + offset,
                 }
             } else {
-                Shape::legacy(self.max_chars, self.fmt_indent)
+                Shape::legacy(self.max_width, self.fmt_indent)
             };
         } else {
             if line.is_empty() && self.result.ends_with(' ') && !is_last {
                 // Remove space if this is an empty comment or a doc comment.
                 self.result.pop();
             }
             self.result.push_str(line);
-            self.fmt.shape = Shape::legacy(self.max_chars, self.fmt_indent);
+            self.fmt.shape = Shape::legacy(self.max_width, self.fmt_indent);
             self.is_prev_line_multi_line = false;
         }
 

diff --git a/src/lib.rs b/src/lib.rs
@@ -29,7 +29,9 @@ extern crate serde_json;
 extern crate syntax;
 extern crate syntax_pos;
 extern crate toml;
+extern crate unicode_categories;
 extern crate unicode_segmentation;
+extern crate unicode_width;
 
 use std::cell::RefCell;
 use std::collections::HashMap;

diff --git a/src/overflow.rs b/src/overflow.rs
@@ -431,7 +431,8 @@ impl<'a> Context<'a> {
         };
 
         if let Some(rewrite) = rewrite {
-            let rewrite_first_line = Some(rewrite[..first_line_width(&rewrite)].to_owned());
+            // splitn(2, *).next().unwrap() is always safe.
+            let rewrite_first_line = Some(rewrite.splitn(2, '\n').next().unwrap().to_owned());
             last_list_item.item = rewrite_first_line;
             Some(rewrite)
         } else {

diff --git a/src/string.rs b/src/string.rs
@@ -11,11 +11,12 @@
 // Format string literals.
 
 use regex::Regex;
+use unicode_categories::UnicodeCategories;
 use unicode_segmentation::UnicodeSegmentation;
 
 use config::Config;
 use shape::Shape;
-use utils::wrap_str;
+use utils::{unicode_str_width, wrap_str};
 
 const MIN_STRING: usize = 10;
 
@@ -53,7 +54,7 @@ impl<'a> StringFormat<'a> {
     /// indentation into account.
     ///
     /// If we cannot put at least a single character per line, the rewrite won't succeed.
-    fn max_chars_with_indent(&self) -> Option<usize> {
+    fn max_width_with_indent(&self) -> Option<usize> {
         Some(
             self.shape
                 .width
@@ -62,10 +63,10 @@ impl<'a> StringFormat<'a> {
         )
     }
 
-    /// Like max_chars_with_indent but the indentation is not subtracted.
+    /// Like max_width_with_indent but the indentation is not subtracted.
     /// This allows to fit more graphemes from the string on a line when
     /// SnippetState::EndWithLineFeed.
-    fn max_chars_without_indent(&self) -> Option<usize> {
+    fn max_width_without_indent(&self) -> Option<usize> {
         Some(self.config.max_width().checked_sub(self.line_end.len())?)
     }
 }
@@ -75,8 +76,8 @@ pub fn rewrite_string<'a>(
     fmt: &StringFormat<'a>,
     newline_max_chars: usize,
 ) -> Option<String> {
-    let max_chars_with_indent = fmt.max_chars_with_indent()?;
-    let max_chars_without_indent = fmt.max_chars_without_indent()?;
+    let max_width_with_indent = fmt.max_width_with_indent()?;
+    let max_width_without_indent = fmt.max_width_without_indent()?;
     let indent_with_newline = fmt.shape.indent.to_string_with_newline(fmt.config);
     let indent_without_newline = fmt.shape.indent.to_string(fmt.config);
 
@@ -99,11 +100,11 @@ pub fn rewrite_string<'a>(
 
     // Snip a line at a time from `stripped_str` until it is used up. Push the snippet
     // onto result.
-    let mut cur_max_chars = max_chars_with_indent;
+    let mut cur_max_width = max_width_with_indent;
     let is_bareline_ok = fmt.line_start.is_empty() || is_whitespace(fmt.line_start);
     loop {
         // All the input starting at cur_start fits on the current line
-        if graphemes.len() - cur_start <= cur_max_chars {
+        if graphemes_width(&graphemes[cur_start..]) <= cur_max_width {
             for (i, grapheme) in graphemes[cur_start..].iter().enumerate() {
                 if is_new_line(grapheme) {
                     // take care of blank lines
@@ -123,7 +124,7 @@ pub fn rewrite_string<'a>(
 
         // The input starting at cur_start needs to be broken
         match break_string(
-            cur_max_chars,
+            cur_max_width,
             fmt.trim_end,
             fmt.line_end,
             &graphemes[cur_start..],
@@ -133,7 +134,7 @@ pub fn rewrite_string<'a>(
                 result.push_str(fmt.line_end);
                 result.push_str(&indent_with_newline);
                 result.push_str(fmt.line_start);
-                cur_max_chars = newline_max_chars;
+                cur_max_width = newline_max_chars;
                 cur_start += len;
             }
             SnippetState::EndWithLineFeed(line, len) => {
@@ -143,11 +144,11 @@ pub fn rewrite_string<'a>(
                 result.push_str(&line);
                 if is_bareline_ok {
                     // the next line can benefit from the full width
-                    cur_max_chars = max_chars_without_indent;
+                    cur_max_width = max_width_without_indent;
                 } else {
                     result.push_str(&indent_without_newline);
                     result.push_str(fmt.line_start);
-                    cur_max_chars = max_chars_with_indent;
+                    cur_max_width = max_width_with_indent;
                 }
                 cur_start += len;
             }
@@ -226,9 +227,10 @@ fn not_whitespace_except_line_feed(g: &str) -> bool {
     is_new_line(g) || !is_whitespace(g)
 }
 
-/// Break the input string at a boundary character around the offset `max_chars`. A boundary
+/// Break the input string at a boundary character around the offset `max_width`. A boundary
 /// character is either a punctuation or a whitespace.
-fn break_string(max_chars: usize, trim_end: bool, line_end: &str, input: &[&str]) -> SnippetState {
+/// FIXME(issue#3281): We must follow UAX#14 algorithm instead of this.
+fn break_string(max_width: usize, trim_end: bool, line_end: &str, input: &[&str]) -> SnippetState {
     let break_at = |index /* grapheme at index is included */| {
         // Take in any whitespaces to the left/right of `input[index]` while
         // preserving line feeds
@@ -272,19 +274,33 @@ fn break_string(max_chars: usize, trim_end: bool, line_end: &str, input: &[&str]
         }
     };
 
+    // find a first index where the unicode width of input[0..x] become > max_width
+    let max_width_index_in_input = {
+        let mut cur_width = 0;
+        let mut cur_index = 0;
+        for (i, grapheme) in input.iter().enumerate() {
+            cur_width += unicode_str_width(grapheme);
+            cur_index = i;
+            if cur_width > max_width {
+                break;
+            }
+        }
+        cur_index
+    };
+
     // Find the position in input for breaking the string
     if line_end.is_empty()
         && trim_end
-        && !is_whitespace(input[max_chars - 1])
-        && is_whitespace(input[max_chars])
+        && !is_whitespace(input[max_width_index_in_input - 1])
+        && is_whitespace(input[max_width_index_in_input])
     {
         // At a breaking point already
         // The line won't invalidate the rewriting because:
         // - no extra space needed for the line_end character
         // - extra whitespaces to the right can be trimmed
-        return break_at(max_chars - 1);
+        return break_at(max_width_index_in_input - 1);
     }
-    if let Some(url_index_end) = detect_url(input, max_chars) {
+    if let Some(url_index_end) = detect_url(input, max_width_index_in_input) {
         let index_plus_ws = url_index_end
             + input[url_index_end..]
                 .iter()
@@ -297,27 +313,28 @@ fn break_string(max_chars: usize, trim_end: bool, line_end: &str, input: &[&str]
             return SnippetState::LineEnd(input[..=index_plus_ws].concat(), index_plus_ws + 1);
         };
     }
-    match input[0..max_chars]
+
+    match input[0..max_width_index_in_input]
         .iter()
         .rposition(|grapheme| is_whitespace(grapheme))
     {
         // Found a whitespace and what is on its left side is big enough.
         Some(index) if index >= MIN_STRING => break_at(index),
         // No whitespace found, try looking for a punctuation instead
-        _ => match input[0..max_chars]
+        _ => match input[0..max_width_index_in_input]
             .iter()
             .rposition(|grapheme| is_punctuation(grapheme))
         {
             // Found a punctuation and what is on its left side is big enough.
             Some(index) if index >= MIN_STRING => break_at(index),
             // Either no boundary character was found to the left of `input[max_chars]`, or the line
             // got too small. We try searching for a boundary character to the right.
-            _ => match input[max_chars..]
+            _ => match input[max_width_index_in_input..]
                 .iter()
                 .position(|grapheme| is_whitespace(grapheme) || is_punctuation(grapheme))
             {
                 // A boundary was found after the line limit
-                Some(index) => break_at(max_chars + index),
+                Some(index) => break_at(max_width_index_in_input + index),
                 // No boundary to the right, the input cannot be broken
                 None => SnippetState::EndOfInput(input.concat()),
             },
@@ -335,10 +352,11 @@ fn is_whitespace(grapheme: &str) -> bool {
 }
 
 fn is_punctuation(grapheme: &str) -> bool {
-    match grapheme.as_bytes()[0] {
-        b':' | b',' | b';' | b'.' => true,
-        _ => false,
-    }
+    grapheme.chars().all(|c| c.is_punctuation_other())
+}
+
+fn graphemes_width(graphemes: &[&str]) -> usize {
+    graphemes.iter().map(|s| unicode_str_width(s)).sum()
 }
 
 #[cfg(test)]