Skip to content
This repository has been archived by the owner on May 27, 2021. It is now read-only.

Commit

Permalink
add our fork of mtibben/confusables
Browse files Browse the repository at this point in the history
  • Loading branch information
slingamn committed Jan 31, 2019
1 parent 77ddc3d commit 0d667e5
Show file tree
Hide file tree
Showing 6 changed files with 6,623 additions and 0 deletions.
2 changes: 2 additions & 0 deletions github.com/oragono/confusables/.gitignore
@@ -0,0 +1,2 @@
/maketables
confusables.txt
28 changes: 28 additions & 0 deletions github.com/oragono/confusables/LICENSE
@@ -0,0 +1,28 @@
Copyright (c) 2013 Michael Tibben. All rights reserved.
Copyright (c) 2014 Filippo Valsorda. All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
17 changes: 17 additions & 0 deletions github.com/oragono/confusables/README.md
@@ -0,0 +1,17 @@
# Unicode confusables

This Go library implements the `Skeleton` algorithm from Unicode TR39.

See http://www.unicode.org/reports/tr39/

### Examples
```
import "github.com/oragono/confusables"
confusables.Skeleton("𝔭𝒶ỿ𝕡𝕒ℓ") # "paypal"
confusables.Confusable("𝔭𝒶ỿ𝕡𝕒ℓ", "paypal") # true
```

*Note on the use of `Skeleton`, from TR39:*

> A skeleton is intended only for internal use for testing confusability of strings; the resulting text is not suitable for display to users, because it will appear to be a hodgepodge of different scripts. In particular, the result of mapping an identifier will not necessarily be an identifier. Thus the confusability mappings can be used to test whether two identifiers are confusable (if their skeletons are the same), but should definitely not be used as a "normalization" of identifiers.
63 changes: 63 additions & 0 deletions github.com/oragono/confusables/confusables.go
@@ -0,0 +1,63 @@
//go:generate go run maketables.go > tables.go

package confusables

import (
"bytes"

"golang.org/x/text/unicode/norm"
)

// TODO: document casefolding approaches
// (suggest to force casefold strings; explain how to catch paypal - pAypal)
// TODO: DOC you might want to store the Skeleton and check against it later
// TODO: implement xidmodifications.txt restricted characters

// lookupReplacement returns the confusablesMap entry for r, or the empty
// string when the table holds no mapping for r.
func lookupReplacement(r rune) string {
	replacement := confusablesMap[r]
	return replacement
}

// Skeleton converts a string to its "skeleton" form
// as described in http://www.unicode.org/reports/tr39/#Confusable_Detection
//
// The input is NFD-normalized, every rune that has an entry in the
// confusables table is replaced by its target string, and the result is
// NFD-normalized once more. Runes without a table entry are copied through
// untouched.
func Skeleton(s string) string {
	// 1. Converting X to NFD format
	s = norm.NFD.String(s)

	// 2. Successively mapping each source character in X to the target string
	// according to the specified data table
	var out bytes.Buffer
	modified := false    // fast path: while false, out is unused and s is kept intact
	runStart := 0        // byte offset of the unmapped text not yet copied to out
	afterMapped := false // the previous rune was replaced, so a new run starts here
	for i, r := range s {
		if afterMapped {
			runStart = i
			afterMapped = false
		}
		mapped := lookupReplacement(r)
		if mapped == "" {
			continue
		}
		// Flush the unmapped text preceding this rune, then its replacement.
		out.WriteString(s[runStart:i])
		out.WriteString(mapped)
		modified = true
		afterMapped = true
	}
	if modified {
		if !afterMapped {
			// Trailing unmapped text after the last replacement.
			out.WriteString(s[runStart:])
		}
		s = out.String()
	}

	// 3. Reapplying NFD
	return norm.NFD.String(s)
}

// Confusable reports whether x and y produce the same skeleton, i.e. whether
// Skeleton(x) and Skeleton(y) are equal strings.
func Confusable(x, y string) bool {
	skelX := Skeleton(x)
	skelY := Skeleton(y)
	return skelX == skelY
}
196 changes: 196 additions & 0 deletions github.com/oragono/confusables/maketables.go
@@ -0,0 +1,196 @@
// +build ignore

// Confusables table generator.
// See http://www.unicode.org/reports/tr39/

package main

import (
"bufio"
"bytes"
"flag"
"fmt"
"go/format"
"io"
"log"
"net/http"
"os"
"strconv"
"strings"
)

// main parses the command-line flags, loads the confusables data file and
// then emits the generated table source.
func main() {
	// Flags must be parsed first: loadUnicodeData reads them via openReader.
	flag.Parse()

	// Fill the package-level confusables slice and header text ...
	loadUnicodeData()

	// ... and render them into tables.go.
	makeTables()
}

// url is the -url flag: the base URL to which openReader appends the data
// file name when downloading. It defaults to the latest Unicode security
// data directory.
var url = flag.String("url",
"https://www.unicode.org/Public/security/latest/",
"URL of Unicode database directory")

// localFiles is the -local flag: when set, openReader opens data files from
// the current directory instead of downloading them.
var localFiles = flag.Bool("local",
false,
"data files have been copied to the current directory; for debugging only")

// confusables.txt has form:
// 309C ; 030A ; SL #* ( ゜ → ̊ ) KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK → COMBINING RING ABOVE # →゚→→゚→
// See http://www.unicode.org/reports/tr39/ for full explanation
// The fields (indices into the slice produced by splitting a data line in
// parseCharacter):
const (
CSourceCodePoint = iota // hex source code point
CTargetCodePoint // space-separated hex target code points
CType // table type tag, followed by the trailing comment text
NumField // number of fields every data line must have

MaxChar = 0x10FFFF // anything above this shouldn't exist
)

// openReader returns a reader for the named data file.
//
// With -local the file is opened from the current directory; otherwise it is
// fetched over HTTP from *url + file. Any failure (open error, request error,
// non-200 response) terminates the program via log.Fatal. The caller is
// responsible for closing the returned reader.
func openReader(file string) (input io.ReadCloser) {
	if *localFiles {
		f, err := os.Open(file)
		if err != nil {
			log.Fatal(err)
		}
		return f
	}

	path := *url + file
	log.Println("Downloading " + path)
	resp, err := http.Get(path)
	if err != nil {
		log.Fatal(err)
	}
	if resp.StatusCode != http.StatusOK {
		resp.Body.Close()
		// Use an explicit format: log.Fatal would concatenate the two string
		// operands with no separator between the path and the status.
		log.Fatalf("bad GET status for %s: %s", path, resp.Status)
	}
	return resp.Body
}

// parsePoint parses pointString as a hexadecimal code point and returns it as
// a rune. line is the full input line and is used only in error messages.
//
// The program is terminated via log.Fatalf when pointString is not valid
// hexadecimal, is zero, or exceeds MaxChar.
func parsePoint(pointString string, line string) rune {
	x, err := strconv.ParseUint(pointString, 16, 64)
	if err != nil {
		log.Fatalf("%.5s...: %s", line, err)
	}
	// Range-check the full-width value before narrowing to rune (int32):
	// converting first would wrap values >= 2^31 to negative numbers or
	// truncate values >= 2^32, letting them slip past the MaxChar check.
	if x == 0 {
		log.Fatalf("%5s: Unknown rune %X", line, x)
	}
	if x > MaxChar {
		log.Fatalf("%5s: Rune %X > MaxChar (%X)", line, x, MaxChar)
	}

	return rune(x)
}

// C encapsulates one mapping parsed from a line of the confusables.txt file.
type C struct {
k rune // source code point
v []rune // target code point sequence that k maps to
}

// confusables collects the parsed mappings in input order; parseCharacter
// appends to it and makeTables renders it.
var confusables []C

// parseCharacter parses one data line of confusables.txt and, when it is an
// "MA" entry, appends the mapping to the package-level confusables slice.
//
// Empty lines and lines starting with '#' are ignored. A line that does not
// split into exactly NumField fields terminates the program.
func parseCharacter(line string) {
	if line == "" || line[0] == '#' {
		return
	}

	fields := strings.Split(line, " ;\t")
	if n := len(fields); n != NumField {
		log.Fatalf("%5s: %d fields (expected %d)\n", line, n, NumField)
	}

	// The MA table is a superset anyway
	if isMA := strings.HasPrefix(fields[2], "MA"); !isMA {
		return
	}

	source := parsePoint(fields[CSourceCodePoint], line)

	// The target is a space-separated sequence of one or more code points.
	var target []rune
	for _, cp := range strings.Split(fields[CTargetCodePoint], " ") {
		target = append(target, parsePoint(cp, line))
	}

	confusables = append(confusables, C{source, target})
}

// originalHeader accumulates the comment header of confusables.txt, rewritten
// as Go line comments, so that it can be reproduced in the generated file.
var originalHeader = []byte(`
// Following is the original header of the source confusables.txt file
//
`)

// parseHeader consumes one line of the leading comment header.
//
// It returns false while line still belongs to the header (it starts with
// '#', optionally preceded by a UTF-8 byte order mark); such a line is
// appended to originalHeader as a "//" comment. It returns true, leaving
// originalHeader untouched, for the first line that is empty or does not
// start with '#', i.e. once the header is over.
func parseHeader(line string) bool {
	// strip BOM
	if len(line) > 3 && bytes.HasPrefix([]byte(line), []byte{0xEF, 0xBB, 0xBF}) {
		line = line[3:]
	}

	if line == "" || line[0] != '#' {
		return true
	}

	// Swap the leading '#' for "//" and keep the remainder of the line.
	originalHeader = append(originalHeader, "//"...)
	originalHeader = append(originalHeader, line[1:]...)
	originalHeader = append(originalHeader, '\n')
	return false
}

// loadUnicodeData reads confusables.txt line by line. Leading header lines
// are handed to parseHeader; the line that ends the header and every line
// after it are handed to parseCharacter. A scanner error terminates the
// program.
func loadUnicodeData() {
	src := openReader("confusables.txt")
	defer src.Close()

	lines := bufio.NewScanner(src)
	headerPending := true
	for lines.Scan() {
		text := lines.Text()
		if headerPending && !parseHeader(text) {
			// Still inside the header; nothing else to do with this line.
			continue
		}
		// The line that terminates the header is itself parsed as data.
		headerPending = false
		parseCharacter(text)
	}
	if err := lines.Err(); err != nil {
		log.Fatal(err)
	}
}

// makeTables renders the collected header and confusables entries as the Go
// source of the confusablesMap variable and writes it to tables.go in
// package confusables.
func makeTables() {
	// Accumulate into a buffer: the table has thousands of entries, and
	// repeated string concatenation would recopy the output on every line.
	var out bytes.Buffer
	out.Write(originalHeader)
	out.WriteByte('\n')
	out.WriteString("var confusablesMap = map[rune]string{\n\n")
	for _, c := range confusables {
		// %+q keeps the generated source pure ASCII by escaping non-ASCII runes.
		fmt.Fprintf(&out, "0x%.8X: %+q,\n", c.k, string(c.v))
	}
	out.WriteString("}\n")

	WriteGoFile("tables.go", "confusables", out.Bytes())
}

// header is the preamble written at the top of every generated file; %s is
// replaced by the package name.
const header = `// This file was generated by go generate; DO NOT EDIT
package %s
`

// WriteGoFile creates filename and writes the generated-file header for
// package pkg followed by the gofmt-formatted contents of b. Leading newlines
// in b are dropped.
//
// Any failure terminates the program via log.Fatalf. If b cannot be
// formatted, the unformatted bytes are still written first so that the
// reported error can be matched against the file contents.
func WriteGoFile(filename, pkg string, b []byte) {
	w, err := os.Create(filename)
	if err != nil {
		log.Fatalf("Could not create file %s: %v", filename, err)
	}
	// No deferred Close: log.Fatalf exits without running deferred calls, so
	// the file is closed explicitly on the paths that need it.
	if _, err := fmt.Fprintf(w, header, pkg); err != nil {
		log.Fatalf("Error writing header: %v", err)
	}
	// Strip leading newlines.
	for len(b) > 0 && b[0] == '\n' {
		b = b[1:]
	}
	formatted, err := format.Source(b)
	if err != nil {
		// Print the original buffer even in case of an error so that the
		// returned error can be meaningfully interpreted.
		if _, werr := w.Write(b); werr != nil {
			log.Printf("Error writing file %s: %v", filename, werr)
		}
		if cerr := w.Close(); cerr != nil {
			log.Printf("Error closing file %s: %v", filename, cerr)
		}
		log.Fatalf("Error formatting file %s: %v", filename, err)
	}
	if _, err := w.Write(formatted); err != nil {
		log.Fatalf("Error writing file %s: %v", filename, err)
	}
	// A failed Close can mean the data never reached the disk.
	if err := w.Close(); err != nil {
		log.Fatalf("Error closing file %s: %v", filename, err)
	}
}

0 comments on commit 0d667e5

Please sign in to comment.