Skip to content
This repository has been archived by the owner on Nov 10, 2020. It is now read-only.

Commit

Permalink
Backout "smart" unicode functions
Browse files Browse the repository at this point in the history
These functions should *not* use the unicode tests, because they work on
bytes. This can lead (and has led) to thinking something is a "\r" while
it is in fact part of a larger (2-byte) character and, thus, something
completely different.
  • Loading branch information
miekg committed Feb 27, 2016
1 parent 4c1c004 commit 001cc53
Show file tree
Hide file tree
Showing 4 changed files with 60 additions and 47 deletions.
59 changes: 29 additions & 30 deletions block.go
Expand Up @@ -330,7 +330,7 @@ func (p *parser) isPrefixHeader(data []byte) bool {
for level < 6 && data[level] == '#' {
level++
}
if !iswhitespace(data[level]) {
if data[level] != ' ' {
return false
}
}
Expand All @@ -353,7 +353,7 @@ func (p *parser) prefixHeader(out *bytes.Buffer, data []byte) int {
level++
}
i, end := 0, 0
for i = level; iswhitespace(data[i]); i++ {
for i = level; data[i] == ' '; i++ {
}
for end = i; data[end] != '\n'; end++ {
}
Expand Down Expand Up @@ -383,13 +383,13 @@ func (p *parser) prefixHeader(out *bytes.Buffer, data []byte) int {
for end > 0 && data[end-1] == '#' {
// CommonMark: a # directly following the header name is allowed and we
// should keep it
if end > 1 && data[end-2] != '#' && !iswhitespace(data[end-2]) {
if end > 1 && data[end-2] != '#' && data[end-2] != ' ' {
end++
break
}
end--
}
for end > 0 && iswhitespace(data[end-1]) {
for end > 0 && data[end-1] == ' ' {
end--
}
if end > i {
Expand Down Expand Up @@ -425,7 +425,7 @@ func (p *parser) isUnderlinedHeader(data []byte) int {
for data[i] == '=' {
i++
}
for iswhitespace(data[i]) {
for data[i] == ' ' {
i++
}
if data[i] == '\n' {
Expand All @@ -440,7 +440,7 @@ func (p *parser) isUnderlinedHeader(data []byte) int {
for data[i] == '-' {
i++
}
for iswhitespace(data[i]) {
for data[i] == ' ' {
i++
}
if data[i] == '\n' {
Expand Down Expand Up @@ -470,7 +470,7 @@ func (p *parser) isPartHeader(data []byte) bool {
}

if p.flags&EXTENSION_SPACE_HEADERS != 0 {
if !iswhitespace(data[2]) {
if data[2] != ' ' {
return false
}
}
Expand All @@ -495,7 +495,7 @@ func (p *parser) isSpecialHeader(data []byte) bool {
}

if p.flags&EXTENSION_SPACE_HEADERS != 0 {
if !iswhitespace(data[2]) {
if data[2] != ' ' {
return false
}
}
Expand All @@ -520,7 +520,7 @@ func (p *parser) specialHeader(out *bytes.Buffer, data []byte) int {
}

i, end := 0, 0
for i = 2; iswhitespace(data[i]); i++ {
for i = 2; data[i] == ' '; i++ {
}
for end = i; data[end] != '\n'; end++ {
}
Expand Down Expand Up @@ -552,13 +552,13 @@ func (p *parser) specialHeader(out *bytes.Buffer, data []byte) int {
for end > 0 && data[end-1] == '#' {
// CommonMark: a # directly following the header name is allowed and we
// should keep it
if end > 1 && data[end-2] != '#' && !iswhitespace(data[end-2]) {
if end > 1 && data[end-2] != '#' && data[end-2] != ' ' {
end++
break
}
end--
}
for end > 0 && iswhitespace(data[end-1]) {
for end > 0 && data[end-1] == ' ' {
end--
}
if end > i {
Expand Down Expand Up @@ -615,7 +615,7 @@ func (p *parser) partHeader(out *bytes.Buffer, data []byte) int {
}

i, end := 0, 0
for i = 2; iswhitespace(data[i]); i++ {
for i = 2; data[i] == ' '; i++ {
}
for end = i; data[end] != '\n'; end++ {
}
Expand Down Expand Up @@ -646,13 +646,13 @@ func (p *parser) partHeader(out *bytes.Buffer, data []byte) int {
for end > 0 && data[end-1] == '#' {
// CommonMark: a # directly following the header name is allowed and we
// should keep it
if end > 1 && data[end-2] != '#' && !iswhitespace(data[end-2]) {
if end > 1 && data[end-2] != '#' && data[end-2] != ' ' {
end++
break
}
end--
}
for end > 0 && iswhitespace(data[end-1]) {
for end > 0 && data[end-1] == ' ' {
end--
}
if end > i {
Expand Down Expand Up @@ -1824,8 +1824,7 @@ func (p *parser) uliPrefix(data []byte) int {
}

// need a *, +, #, or - followed by a space
if (data[i] != '*' && data[i] != '+' && data[i] != '-' && !iswhitespace(data[i])) ||
!iswhitespace(data[i+1]) {
if (data[i] != '*' && data[i] != '+' && data[i] != '-' && data[i] != ' ') || data[i+1] != ' ' {
return 0
}
return i + 2
Expand All @@ -1850,7 +1849,7 @@ func (p *parser) oliPrefix(data []byte) int {
}

// we need >= 1 digits followed by a dot or brace and a space
if start == i || (data[i] != '.' && data[i] != ')') || !iswhitespace(data[i+1]) {
if start == i || (data[i] != '.' && data[i] != ')') || data[i+1] != ' ' {
return 0
}
return i + 2
Expand All @@ -1875,7 +1874,7 @@ func (p *parser) aliPrefix(data []byte) int {
}

// we need >= 1 letter followed by a dot and two spaces
if start == i || (data[i] != '.' && data[i] != ')') || !iswhitespace(data[i+1]) || !iswhitespace(data[i+2]) {
if start == i || (data[i] != '.' && data[i] != ')') || data[i+1] != ' ' || data[i+2] != ' ' {
return 0
}
if i-start > 2 {
Expand Down Expand Up @@ -1905,7 +1904,7 @@ func (p *parser) aliPrefixU(data []byte) int {
}

// we need >= 1 letter followed by a dot and two spaces
if start == i || (data[i] != '.' && data[i] != ')') || !iswhitespace(data[i+1]) || !iswhitespace(data[i+2]) {
if start == i || (data[i] != '.' && data[i] != ')') || data[i+1] != ' ' || data[i+2] != ' ' {
return 0
}
if i-start > 2 {
Expand Down Expand Up @@ -1934,7 +1933,7 @@ func (p *parser) rliPrefix(data []byte) int {
}

// we need >= 1 letter followed by a dot and two spaces
if start == i || (data[i] != '.' && data[i] != ')') || !iswhitespace(data[i+1]) || !iswhitespace(data[i+2]) {
if start == i || (data[i] != '.' && data[i] != ')') || data[i+1] != ' ' || data[i+2] != ' ' {
return 0
}
return i + 3
Expand All @@ -1959,7 +1958,7 @@ func (p *parser) rliPrefixU(data []byte) int {
}

// we need >= 1 letter followed by a dot and two spaces
if start == i || (data[i] != '.' && data[i] != ')') || !iswhitespace(data[i+1]) || !iswhitespace(data[i+2]) {
if start == i || (data[i] != '.' && data[i] != ')') || data[i+1] != ' ' || data[i+2] != ' ' {
return 0
}
return i + 3
Expand All @@ -1983,7 +1982,7 @@ func (p *parser) dliPrefix(data []byte) int {

// start with up to 3 spaces before :
j := 0
for j < 3 && iswhitespace(data[i+j]) && i+j < len(data) {
for j < 3 && data[i+j] == ' ' && i+j < len(data) {
j++
}
i += j + 1
Expand All @@ -2004,7 +2003,7 @@ func (p *parser) eliPrefix(data []byte) int {
}

// start with up to 3 spaces
for i < 3 && iswhitespace(data[i]) {
for i < 3 && data[i] == ' ' {
i++
}

Expand All @@ -2021,7 +2020,7 @@ func (p *parser) eliPrefix(data []byte) int {
}
}
// now two spaces
if data[i] != ')' || !iswhitespace(data[i+1]) || !iswhitespace(data[i+2]) {
if data[i] != ')' || data[i+1] != ' ' || data[i+2] != ' ' {
return 0
}
return i + 2
Expand Down Expand Up @@ -2074,7 +2073,7 @@ func (p *parser) list(out *bytes.Buffer, data []byte, flags, start int, group []
func (p *parser) listItem(out *bytes.Buffer, data []byte, flags *int) int {
// keep track of the indentation of the first line
itemIndent := 0
for itemIndent < 3 && iswhitespace(data[itemIndent]) {
for itemIndent < 3 && data[itemIndent] == ' ' {
itemIndent++
}

Expand Down Expand Up @@ -2115,7 +2114,7 @@ func (p *parser) listItem(out *bytes.Buffer, data []byte, flags *int) int {
}

// skip leading whitespace on first line
for iswhitespace(data[i]) {
for data[i] == ' ' {
i++
}

Expand Down Expand Up @@ -2264,15 +2263,15 @@ func (p *parser) renderParagraph(out *bytes.Buffer, data []byte) {
}
// trim leading spaces
beg := 0
for iswhitespace(data[beg]) {
for data[beg] == ' ' {
beg++
}

// trim trailing newline
end := len(data) - 1

// trim trailing spaces
for end > beg && iswhitespace(data[end-1]) {
for end > beg && data[end-1] == ' ' {
end--
}

Expand Down Expand Up @@ -2338,10 +2337,10 @@ func (p *parser) paragraph(out *bytes.Buffer, data []byte) int {

// ignore leading and trailing whitespace
eol := i - 1
for prev < eol && iswhitespace(data[prev]) {
for prev < eol && data[prev] == ' ' {
prev++
}
for eol > prev && iswhitespace(data[eol-1]) {
for eol > prev && data[eol-1] == ' ' {
eol--
}

Expand Down
12 changes: 12 additions & 0 deletions issue_test.go
@@ -0,0 +1,12 @@
package mmark

import "testing"

// TestIssueXXX feeds a line made up solely of multi-byte (Cyrillic)
// characters through doTestsBlock together with the same text wrapped in a
// <p> element.
func TestIssueXXX(t *testing.T) {
	// Presumably consumed by doTestsBlock as an input string followed by its
	// expected rendering; confirm against doTestsBlock.
	cases := []string{
		"абвгдеёжзийклмнопрстуфх",
		"<p>абвгдеёжзийклмнопрстуфх</p>\n",
	}
	doTestsBlock(t, cases, 0)
}
2 changes: 0 additions & 2 deletions log.go
@@ -1,5 +1,3 @@
// Functions to parse block-level elements.

package mmark

import "log"
Expand Down
34 changes: 19 additions & 15 deletions markdown.go
Expand Up @@ -6,7 +6,6 @@ import (
"bytes"
"io/ioutil"
"path"
"unicode"
"unicode/utf8"
)

Expand Down Expand Up @@ -561,7 +560,7 @@ func isReference(p *parser, data []byte, tabSize int) int {
return 0
}
i := 0
for i < 3 && data[i] == ' ' { // break tests if this is 'iswhitespace'
for i < 3 && data[i] == ' ' {
i++
}

Expand Down Expand Up @@ -845,22 +844,27 @@ func scanAbbreviation(p *parser, data []byte, i int) (titleOffset, titleEnd, lin
}

// Miscellaneous helper functions

func ispunct(c byte) bool { return unicode.IsPunct(rune(c)) }
func isletter(c byte) bool { return unicode.IsLetter(rune(c)) }
func isalnum(c byte) bool { return (unicode.IsNumber(rune(c)) || unicode.IsLetter(rune(c))) }
func isnum(c byte) bool { return unicode.IsNumber(rune(c)) }
func isspace(c byte) bool { return unicode.IsSpace(rune(c)) }
func isupper(c byte) bool { return unicode.IsUpper(rune(c)) }
func islower(c byte) bool { return !unicode.IsUpper(rune(c)) }

func iswhitespace(c byte) bool { // better name?
if c == '\n' || c == '\r' {
return false
// isspace reports whether c is one of the six ASCII whitespace bytes:
// space, tab, newline, carriage return, form feed or vertical tab.
// Every other byte value, including all bytes >= 0x80, yields false.
func isspace(c byte) bool {
	switch c {
	case ' ', '\t', '\n', '\r', '\f', '\v':
		return true
	}
	return false
}
// ispunct reports whether c is one of the 32 ASCII punctuation characters
//
//	! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ _ ` { | } ~
//
// The test is deliberately byte-based: a byte >= 0x80 may be part of a
// multi-byte UTF-8 sequence, so it is never classified as punctuation here.
func ispunct(c byte) bool {
	switch c {
	case '!', '"', '#', '$', '%', '&', '\'', '(', ')', '*', '+', ',', '-', '.', '/',
		':', ';', '<', '=', '>', '?', '@',
		'[', '\\', ']', '^', '_', '`',
		'{', '|', '}', '~':
		return true
	}
	return false
}

// isupper reports whether c is an ASCII uppercase letter ('A' through 'Z').
func isupper(c byte) bool {
	return 'A' <= c && c <= 'Z'
}
// isletter reports whether c is an ASCII letter, either lowercase
// ('a' through 'z') or uppercase ('A' through 'Z').
func isletter(c byte) bool {
	switch {
	case 'a' <= c && c <= 'z':
		return true
	case 'A' <= c && c <= 'Z':
		return true
	}
	return false
}

// isalnum reports whether c is an ASCII decimal digit ('0' through '9') or
// a letter as decided by isletter.
// TODO: check when this is looking for ASCII alnum and when it should use unicode
func isalnum(c byte) bool {
	if '0' <= c && c <= '9' {
		return true
	}
	return isletter(c)
}
// isnum reports whether c is an ASCII decimal digit ('0' through '9').
func isnum(c byte) bool {
	return '0' <= c && c <= '9'
}

// check if the string only contains, i, v, x, c and l. If uppercase is true, check uppercase version.
func isroman(digit byte, uppercase bool) bool {
if !uppercase {
Expand Down

0 comments on commit 001cc53

Please sign in to comment.