Experimental word-boundary ranking algorithm

See the comment in algo/algo.go for a description.
mjwestcott · Apr 1, 2016 · 60c9d84 · 60c9d84
1 parent 8d6e13b
commit 60c9d84
Show file tree

Hide file tree

Showing 16 changed files with 206 additions and 96 deletions.
diff --git a/src/algo/algo.go b/src/algo/algo.go
@@ -4,7 +4,7 @@ import (
 	"strings"
 	"unicode"
 
-	"github.com/junegunn/fzf/src/util"
+	"github.com/mjwestcott/fzf/src/util"
 )
 
 /*
@@ -22,10 +22,41 @@ func runeAt(runes []rune, index int, max int, forward bool) rune {
 	return runes[max-index-1]
 }
 
+// Result contains the results of running a match function.
+type Result struct {
+	Start int
+	End   int
+
+	// Every result is assigned a penalty based on the distances of the
+	// matching runes from the beginning of their containing words. The basic
+	// idea is to assign a value to each rune in the input text, then add up
+	// the values of the runes matched by the pattern. The only nuance is that
+	// a rune matched immediately after another matched rune adds no penalty.
+	//
+	//     input    "Hello, world! This is a test."
+	//     values    12345--12345--1234-12-1-1234-
+	//     pattern          wo     th        tes
+	//     penalties        10     10        100
+	//     total = 3
+	//
+	// Now an example that should be heavily penalized because many of the
+	// matches occur in the middle of words:
+	//
+	//     input    "/usr/jg/repos/go/src/github.com/junegunn"
+	//     values    -123-12-12345-12-123-123456-123-12345678
+	//     pattern     s       p   g      git            gunn
+	//     penalties   2       3   1      100            5000
+	//     total = 12
+	//
+	// We can then decide how to use that penalty when ranking items. One
+	// simple and effective idea is to rank according to matchlen + penalty.
+	Penalty int32
+}
+
 // FuzzyMatch performs fuzzy-match
-func FuzzyMatch(caseSensitive bool, forward bool, runes []rune, pattern []rune) (int, int) {
+func FuzzyMatch(caseSensitive bool, forward bool, runes []rune, pattern []rune) *Result {
 	if len(pattern) == 0 {
-		return 0, 0
+		return &Result{0, 0, 0}
 	}
 
 	// 0. (FIXME) How to find the shortest match?
@@ -46,6 +77,7 @@ func FuzzyMatch(caseSensitive bool, forward bool, runes []rune, pattern []rune)
 
 	for index := range runes {
 		char := runeAt(runes, index, lenRunes, forward)
+
 		// This is considerably faster than blindly applying strings.ToLower to the
 		// whole string
 		if !caseSensitive {
@@ -90,12 +122,57 @@ func FuzzyMatch(caseSensitive bool, forward bool, runes []rune, pattern []rune)
 				}
 			}
 		}
+
+		// Calculate the penalty. This can't be done at the same time as the
+		// pattern scan above because 'forward' may be false.
+		var fromBoundary int32
+		var totalPenalty int32
+		var consecutive bool
+		var pidx int
+
+		// TODO: start this scan closer to sidx rather than at index 0; the
+		// runes before sidx are only visited to keep fromBoundary up to date.
+		for index := 0; index < eidx; index++ {
+			var penalty int32
+
+			// Calculate current rune penalty.
+			char := runes[index]
+			if unicode.IsLetter(char) || unicode.IsNumber(char) {
+				fromBoundary++
+				penalty = fromBoundary
+			} else {
+				fromBoundary = 0
+			}
+
+			// Calculate totalPenalty of the match.
+			if index >= sidx {
+				if !caseSensitive {
+					if char >= 'A' && char <= 'Z' {
+						char += 32
+					} else if char > unicode.MaxASCII {
+						char = unicode.To(unicode.LowerCase, char)
+					}
+				}
+				pchar := pattern[pidx]
+				if pchar == char {
+					if !consecutive {
+						totalPenalty += penalty
+					}
+					if pidx++; pidx == lenPattern {
+						break
+					}
+					consecutive = true
+				} else {
+					consecutive = false
+				}
+			}
+		}
+
 		if forward {
-			return sidx, eidx
+			return &Result{sidx, eidx, totalPenalty}
 		}
-		return lenRunes - eidx, lenRunes - sidx
+		return &Result{lenRunes - eidx, lenRunes - sidx, totalPenalty}
 	}
-	return -1, -1
+	return &Result{-1, -1, 0}
 }
 
 // ExactMatchNaive is a basic string searching algorithm that handles case
@@ -105,16 +182,17 @@ func FuzzyMatch(caseSensitive bool, forward bool, runes []rune, pattern []rune)
 //
 // We might try to implement better algorithms in the future:
 // http://en.wikipedia.org/wiki/String_searching_algorithm
-func ExactMatchNaive(caseSensitive bool, forward bool, runes []rune, pattern []rune) (int, int) {
+func ExactMatchNaive(caseSensitive bool, forward bool, runes []rune, pattern []rune) *Result {
+	// Note: ExactMatchNaive always returns a zero penalty.
 	if len(pattern) == 0 {
-		return 0, 0
+		return &Result{0, 0, 0}
 	}
 
 	lenRunes := len(runes)
 	lenPattern := len(pattern)
 
 	if lenRunes < lenPattern {
-		return -1, -1
+		return &Result{-1, -1, 0}
 	}
 
 	pidx := 0
@@ -132,22 +210,23 @@ func ExactMatchNaive(caseSensitive bool, forward bool, runes []rune, pattern []r
 			pidx++
 			if pidx == lenPattern {
 				if forward {
-					return index - lenPattern + 1, index + 1
+					return &Result{index - lenPattern + 1, index + 1, 0}
 				}
-				return lenRunes - (index + 1), lenRunes - (index - lenPattern + 1)
+				return &Result{lenRunes - (index + 1), lenRunes - (index - lenPattern + 1), 0}
 			}
 		} else {
 			index -= pidx
 			pidx = 0
 		}
 	}
-	return -1, -1
+	return &Result{-1, -1, 0}
 }
 
 // PrefixMatch performs prefix-match
-func PrefixMatch(caseSensitive bool, forward bool, runes []rune, pattern []rune) (int, int) {
+func PrefixMatch(caseSensitive bool, forward bool, runes []rune, pattern []rune) *Result {
+	// Note: PrefixMatch always returns a zero penalty.
 	if len(runes) < len(pattern) {
-		return -1, -1
+		return &Result{-1, -1, 0}
 	}
 
 	for index, r := range pattern {
@@ -156,44 +235,47 @@ func PrefixMatch(caseSensitive bool, forward bool, runes []rune, pattern []rune)
 			char = unicode.ToLower(char)
 		}
 		if char != r {
-			return -1, -1
+			return &Result{-1, -1, 0}
 		}
 	}
-	return 0, len(pattern)
+	return &Result{0, len(pattern), 0}
 }
 
 // SuffixMatch performs suffix-match
-func SuffixMatch(caseSensitive bool, forward bool, input []rune, pattern []rune) (int, int) {
+func SuffixMatch(caseSensitive bool, forward bool, input []rune, pattern []rune) *Result {
+	// Note: SuffixMatch always returns a zero penalty.
 	runes := util.TrimRight(input)
 	trimmedLen := len(runes)
 	diff := trimmedLen - len(pattern)
 	if diff < 0 {
-		return -1, -1
+		return &Result{-1, -1, 0}
 	}
 
 	for index, r := range pattern {
 		char := runes[index+diff]
+
 		if !caseSensitive {
 			char = unicode.ToLower(char)
 		}
 		if char != r {
-			return -1, -1
+			return &Result{-1, -1, 0}
 		}
 	}
-	return trimmedLen - len(pattern), trimmedLen
+	return &Result{trimmedLen - len(pattern), trimmedLen, 0}
 }
 
 // EqualMatch performs equal-match
-func EqualMatch(caseSensitive bool, forward bool, runes []rune, pattern []rune) (int, int) {
+func EqualMatch(caseSensitive bool, forward bool, runes []rune, pattern []rune) *Result {
+	// Note: EqualMatch always returns a zero penalty.
 	if len(runes) != len(pattern) {
-		return -1, -1
+		return &Result{-1, -1, 0}
 	}
 	runesStr := string(runes)
 	if !caseSensitive {
 		runesStr = strings.ToLower(runesStr)
 	}
 	if runesStr == string(pattern) {
-		return 0, len(pattern)
+		return &Result{0, len(pattern), 0}
 	}
-	return -1, -1
+	return &Result{-1, -1, 0}
 }
diff --git a/src/algo/algo_test.go b/src/algo/algo_test.go
@@ -5,65 +5,82 @@ import (
 	"testing"
 )
 
-func assertMatch(t *testing.T, fun func(bool, bool, []rune, []rune) (int, int), caseSensitive bool, forward bool, input string, pattern string, sidx int, eidx int) {
+func assertMatch(t *testing.T, fun func(bool, bool, []rune, []rune) *Result, caseSensitive, forward bool, input, pattern string, sidx, eidx int, penalty int32) {
 	if !caseSensitive {
 		pattern = strings.ToLower(pattern)
 	}
-	s, e := fun(caseSensitive, forward, []rune(input), []rune(pattern))
-	if s != sidx {
-		t.Errorf("Invalid start index: %d (expected: %d, %s / %s)", s, sidx, input, pattern)
+	res := fun(caseSensitive, forward, []rune(input), []rune(pattern))
+	if res.Start != sidx {
+		t.Errorf("Invalid start index: %d (expected: %d, %s / %s)", res.Start, sidx, input, pattern)
 	}
-	if e != eidx {
-		t.Errorf("Invalid end index: %d (expected: %d, %s / %s)", e, eidx, input, pattern)
+	if res.End != eidx {
+		t.Errorf("Invalid end index: %d (expected: %d, %s / %s)", res.End, eidx, input, pattern)
+	}
+	if res.Penalty != penalty {
+		t.Errorf("Invalid penalty: %d (expected: %d, %s / %s)", res.Penalty, penalty, input, pattern)
 	}
 }
 
 func TestFuzzyMatch(t *testing.T) {
-	assertMatch(t, FuzzyMatch, false, true, "fooBarbaz", "oBZ", 2, 9)
-	assertMatch(t, FuzzyMatch, true, true, "fooBarbaz", "oBZ", -1, -1)
-	assertMatch(t, FuzzyMatch, true, true, "fooBarbaz", "oBz", 2, 9)
-	assertMatch(t, FuzzyMatch, true, true, "fooBarbaz", "fooBarbazz", -1, -1)
+	assertMatch(t, FuzzyMatch, false, true, "fooBarbaz", "oBZ", 2, 9, 12)
+	assertMatch(t, FuzzyMatch, true, true, "fooBarbaz", "oBZ", -1, -1, 0)
+	assertMatch(t, FuzzyMatch, true, true, "fooBarbaz", "oBz", 2, 9, 12)
+	assertMatch(t, FuzzyMatch, true, true, "fooBarbaz", "fooBarbazz", -1, -1, 0)
+
+	assertMatch(t, FuzzyMatch, false, true, "foo bar baz", "fbb", 0, 9, 3)
+	assertMatch(t, FuzzyMatch, false, true, "foo/bar/baz", "fbb", 0, 9, 3)
+	assertMatch(t, FuzzyMatch, false, true, "foo barbaz", "fbb", 0, 8, 6)
+	assertMatch(t, FuzzyMatch, false, true, "fooBar Baz", "foob", 0, 4, 1)
+	assertMatch(t, FuzzyMatch, true, true, "Foo Bar Baz", "fbb", -1, -1, 0)
+	assertMatch(t, FuzzyMatch, true, true, "Foo/Bar/Baz", "FBB", 0, 9, 3)
+	assertMatch(t, FuzzyMatch, true, true, "foo BarBaz", "fBB", 0, 8, 6)
+	assertMatch(t, FuzzyMatch, true, true, "FooBar Baz", "FooB", 0, 4, 1)
 }
 
 func TestFuzzyMatchBackward(t *testing.T) {
-	assertMatch(t, FuzzyMatch, false, true, "foobar fb", "fb", 0, 4)
-	assertMatch(t, FuzzyMatch, false, false, "foobar fb", "fb", 7, 9)
+	assertMatch(t, FuzzyMatch, false, true, "foobar fb", "fb", 0, 4, 5)
+	assertMatch(t, FuzzyMatch, false, false, "foobar fb", "fb", 7, 9, 1)
 }
 
 func TestExactMatchNaive(t *testing.T) {
 	for _, dir := range []bool{true, false} {
-		assertMatch(t, ExactMatchNaive, false, dir, "fooBarbaz", "oBA", 2, 5)
-		assertMatch(t, ExactMatchNaive, true, dir, "fooBarbaz", "oBA", -1, -1)
-		assertMatch(t, ExactMatchNaive, true, dir, "fooBarbaz", "fooBarbazz", -1, -1)
+		assertMatch(t, ExactMatchNaive, false, dir, "fooBarbaz", "oBA", 2, 5, 0)
+		assertMatch(t, ExactMatchNaive, true, dir, "fooBarbaz", "oBA", -1, -1, 0)
+		assertMatch(t, ExactMatchNaive, true, dir, "fooBarbaz", "fooBarbazz", -1, -1, 0)
 	}
 }
 
 func TestExactMatchNaiveBackward(t *testing.T) {
-	assertMatch(t, ExactMatchNaive, false, true, "foobar foob", "oo", 1, 3)
-	assertMatch(t, ExactMatchNaive, false, false, "foobar foob", "oo", 8, 10)
+	assertMatch(t, ExactMatchNaive, false, true, "foobar foob", "oo", 1, 3, 0)
+	assertMatch(t, ExactMatchNaive, false, false, "foobar foob", "oo", 8, 10, 0)
 }
 
 func TestPrefixMatch(t *testing.T) {
 	for _, dir := range []bool{true, false} {
-		assertMatch(t, PrefixMatch, false, dir, "fooBarbaz", "Foo", 0, 3)
-		assertMatch(t, PrefixMatch, true, dir, "fooBarbaz", "Foo", -1, -1)
-		assertMatch(t, PrefixMatch, false, dir, "fooBarbaz", "baz", -1, -1)
+		assertMatch(t, PrefixMatch, false, dir, "fooBarbaz", "Foo", 0, 3, 0)
+		assertMatch(t, PrefixMatch, true, dir, "fooBarbaz", "Foo", -1, -1, 0)
+		assertMatch(t, PrefixMatch, false, dir, "fooBarbaz", "baz", -1, -1, 0)
 	}
 }
 
 func TestSuffixMatch(t *testing.T) {
 	for _, dir := range []bool{true, false} {
-		assertMatch(t, SuffixMatch, false, dir, "fooBarbaz", "Foo", -1, -1)
-		assertMatch(t, SuffixMatch, false, dir, "fooBarbaz", "baz", 6, 9)
-		assertMatch(t, SuffixMatch, true, dir, "fooBarbaz", "Baz", -1, -1)
+		assertMatch(t, SuffixMatch, false, dir, "fooBarbaz", "Foo", -1, -1, 0)
+		assertMatch(t, SuffixMatch, false, dir, "fooBarbaz", "baz", 6, 9, 0)
+		assertMatch(t, SuffixMatch, true, dir, "fooBarbaz", "Baz", -1, -1, 0)
 	}
 }
 
 func TestEmptyPattern(t *testing.T) {
 	for _, dir := range []bool{true, false} {
-		assertMatch(t, FuzzyMatch, true, dir, "foobar", "", 0, 0)
-		assertMatch(t, ExactMatchNaive, true, dir, "foobar", "", 0, 0)
-		assertMatch(t, PrefixMatch, true, dir, "foobar", "", 0, 0)
-		assertMatch(t, SuffixMatch, true, dir, "foobar", "", 6, 6)
+		assertMatch(t, FuzzyMatch, true, dir, "foobar", "", 0, 0, 0)
+		assertMatch(t, ExactMatchNaive, true, dir, "foobar", "", 0, 0, 0)
+		assertMatch(t, PrefixMatch, true, dir, "foobar", "", 0, 0, 0)
+		assertMatch(t, SuffixMatch, true, dir, "foobar", "", 6, 6, 0)
 	}
 }
diff --git a/src/constants.go b/src/constants.go
@@ -3,7 +3,7 @@ package fzf
 import (
 	"time"
 
-	"github.com/junegunn/fzf/src/util"
+	"github.com/mjwestcott/fzf/src/util"
 )
 
 const (

diff --git a/src/core.go b/src/core.go
@@ -31,7 +31,7 @@ import (
 	"runtime"
 	"time"
 
-	"github.com/junegunn/fzf/src/util"
+	"github.com/mjwestcott/fzf/src/util"
 )
 
 func initProcs() {

diff --git a/src/fzf/main.go b/src/fzf/main.go
@@ -1,6 +1,6 @@
 package main
 
-import "github.com/junegunn/fzf/src"
+import "github.com/mjwestcott/fzf/src"
 
 func main() {
 	fzf.Run(fzf.ParseOptions())

diff --git a/src/item.go b/src/item.go
@@ -3,7 +3,7 @@ package fzf
 import (
 	"math"
 
-	"github.com/junegunn/fzf/src/curses"
+	"github.com/mjwestcott/fzf/src/curses"
 )
 
 // Offset holds three 32-bit integers denoting the offsets of a matched substring
@@ -21,6 +21,7 @@ type Item struct {
 	origText    *[]rune
 	transformed []Token
 	offsets     []Offset
+	penalty     int32
 	colors      []ansiOffset
 	rank        [5]int32
 }
@@ -81,7 +82,8 @@ func (item *Item) Rank(cache bool) [5]int32 {
 		var val int32
 		switch criterion {
 		case byMatchLen:
-			val = int32(matchlen)
+			// A simple and effective way to incorporate the penalty.
+			val = int32(matchlen) + item.penalty
 		case byLength:
 			// It is guaranteed that .transformed in not null in normal execution
 			if item.transformed != nil {