-
Notifications
You must be signed in to change notification settings - Fork 36
/
fuzzy_match.c
238 lines (212 loc) · 6.34 KB
/
fuzzy_match.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
#include <ctype.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include "fuzzy_match.h"
#include "utf8.h"
#include "xmalloc.h"
#undef MAX
#define MAX(a, b) ((a) > (b) ? (a) : (b))
static int32_t compute_score(
int32_t jump,
bool first_char,
const char *restrict match);
static int32_t fuzzy_match_recurse(
const char *restrict pattern,
const char *restrict str,
int32_t score,
bool first_match_only,
bool first_char);
/*
* Split patterns into words, and perform simple matching against str for each.
* Returns the sum of substring distances from the start of str.
* If a word is not found, returns INT32_MIN.
*/
int32_t fuzzy_match_simple_words(const char *restrict patterns, const char *restrict str)
{
int32_t score = 0;
char *saveptr = NULL;
char *tmp = utf8_normalize(patterns);
char *pattern = strtok_r(tmp, " ", &saveptr);
while (pattern != NULL) {
char *c = utf8_strcasestr(str, pattern);
if (c == NULL) {
score = INT32_MIN;
break;
} else {
score += str - c;
}
pattern = strtok_r(NULL, " ", &saveptr);
}
free(tmp);
return score;
}
/*
* Split patterns into words, and return the sum of fuzzy_match(word, str).
* If a word is not found, returns INT32_MIN.
*/
int32_t fuzzy_match_words(const char *restrict patterns, const char *restrict str)
{
int32_t score = 0;
char *saveptr = NULL;
char *tmp = utf8_normalize(patterns);
char *pattern = strtok_r(tmp, " ", &saveptr);
while (pattern != NULL) {
int32_t word_score = fuzzy_match(pattern, str);
if (word_score == INT32_MIN) {
score = INT32_MIN;
break;
} else {
score += word_score;
}
pattern = strtok_r(NULL, " ", &saveptr);
}
free(tmp);
return score;
}
/*
* Returns score if each character in pattern is found sequentially within str.
* Returns INT32_MIN otherwise.
*/
int32_t fuzzy_match(const char *restrict pattern, const char *restrict str)
{
const int unmatched_letter_penalty = -1;
const size_t slen = utf8_strlen(str);
const size_t plen = utf8_strlen(pattern);
int32_t score = 0;
if (*pattern == '\0') {
return score;
}
if (slen < plen) {
return INT32_MIN;
}
/* We can already penalise any unused letters. */
score += unmatched_letter_penalty * (int32_t)(slen - plen);
/*
* If the string is more than 100 characters, just find the first fuzzy
* match rather than the best.
*
* This is required as the number of possible matches (for patterns and
* strings all consisting of one letter) scales something like:
*
* slen! / (plen! (slen - plen)!) ~ slen^plen for plen << slen
*
* This quickly grinds everything to a halt. 100 is chosen fairly
* arbitrarily from the following logic:
*
* - e is the most common character in English, at around 13% of
* letters. Depending on the context, let's say this be up to 20%.
* - 100 * 0.20 = 20 repeats of the same character.
* - In the worst case here, 20! / (10! 10!) ~200,000 possible matches,
* which is "slow but not frozen" for my machine.
*
* In reality, this worst case shouldn't be hit, and finding the "best"
* fuzzy match in lines of text > 100 characters isn't really in scope
* for a dmenu clone.
*/
bool first_match_only = slen > 100;
/* Perform the match. */
score = fuzzy_match_recurse(pattern, str, score, first_match_only, true);
return score;
}
/*
* Recursively match the whole of pattern against str.
* The score parameter is the score of the previously matched character.
*
* This reaches a maximum recursion depth of strlen(pattern) + 1. However, the
* stack usage is small (the maximum I've seen on x86_64 is 144 bytes with
* gcc -O3), so this shouldn't matter unless pattern contains thousands of
* characters.
*/
int32_t fuzzy_match_recurse(
const char *restrict pattern,
const char *restrict str,
int32_t score,
bool first_match_only,
bool first_char)
{
if (*pattern == '\0') {
/* We've matched the full pattern. */
return score;
}
const char *match = str;
uint32_t search = utf8_get_char(pattern);
int32_t best_score = INT32_MIN;
/*
* Find all occurrences of the next pattern character in str, and
* recurse on them.
*/
while ((match = utf8_strcasechr(match, search)) != NULL) {
int32_t jump = 0;
for (const char *tmp = str; tmp != match; tmp = utf8_next_char(tmp)) {
jump++;
}
int32_t subscore = fuzzy_match_recurse(
utf8_next_char(pattern),
utf8_next_char(match),
compute_score(jump, first_char, match),
first_match_only,
false);
best_score = MAX(best_score, subscore);
match = utf8_next_char(match);
if (first_match_only) {
break;
}
}
if (best_score == INT32_MIN) {
/* We couldn't match the rest of the pattern. */
return INT32_MIN;
} else {
return score + best_score;
}
}
/*
* Calculate the score for a single matching letter.
* The scoring system is taken from fts_fuzzy_match v0.2.0 by Forrest Smith,
* which is licensed to the public domain.
*
* The factors affecting score are:
* - Bonuses:
* - If there are multiple adjacent matches.
* - If a match occurs after a separator character.
* - If a match is uppercase, and the previous character is lowercase.
*
* - Penalties:
* - If there are letters before the first match.
* - If there are superfluous characters in str (already accounted for).
*/
int32_t compute_score(int32_t jump, bool first_char, const char *restrict match)
{
const int adjacency_bonus = 15;
const int separator_bonus = 30;
const int camel_bonus = 30;
const int first_letter_bonus = 15;
const int leading_letter_penalty = -5;
const int max_leading_letter_penalty = -15;
int32_t score = 0;
const uint32_t cur = utf8_get_char(match);
/* Apply bonuses. */
if (!first_char && jump == 0) {
score += adjacency_bonus;
}
if (!first_char || jump > 0) {
const uint32_t prev = utf8_get_char(utf8_prev_char(match));
if (utf8_isupper(cur) && utf8_islower(prev)) {
score += camel_bonus;
}
if (utf8_isalnum(cur) && !utf8_isalnum(prev)) {
score += separator_bonus;
}
}
if (first_char && jump == 0) {
/* Match at start of string gets separator bonus. */
score += first_letter_bonus;
}
/* Apply penalties. */
if (first_char) {
score += MAX(leading_letter_penalty * jump,
max_leading_letter_penalty);
}
return score;
}