-
Notifications
You must be signed in to change notification settings - Fork 31
/
fuzzy_match.c
207 lines (183 loc) · 5.09 KB
/
fuzzy_match.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
#include <ctype.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include "fuzzy_match.h"
#include "utf8.h"
#include "xmalloc.h"
#undef MAX
#define MAX(a, b) ((a) > (b) ? (a) : (b))
static int32_t compute_score(
int32_t jump,
bool first_char,
const char *restrict match);
static int32_t fuzzy_match_recurse(
const char *restrict pattern,
const char *restrict str,
int32_t score,
bool first_char);
/*
* Split patterns into words, and perform simple matching against str for each.
* Returns the sum of substring distances from the start of str.
* If a word is not found, returns INT32_MIN.
*/
int32_t fuzzy_match_simple_words(const char *restrict patterns, const char *restrict str)
{
int32_t score = 0;
char *saveptr = NULL;
char *tmp = utf8_normalize(patterns);
char *pattern = strtok_r(tmp, " ", &saveptr);
while (pattern != NULL) {
char *c = utf8_strcasestr(str, pattern);
if (c == NULL) {
score = INT32_MIN;
break;
} else {
score += str - c;
}
pattern = strtok_r(NULL, " ", &saveptr);
}
free(tmp);
return score;
}
/*
* Split patterns into words, and return the sum of fuzzy_match(word, str).
* If a word is not found, returns INT32_MIN.
*/
int32_t fuzzy_match_words(const char *restrict patterns, const char *restrict str)
{
int32_t score = 0;
char *saveptr = NULL;
char *tmp = utf8_normalize(patterns);
char *pattern = strtok_r(tmp, " ", &saveptr);
while (pattern != NULL) {
int32_t word_score = fuzzy_match(pattern, str);
if (word_score == INT32_MIN) {
score = INT32_MIN;
break;
} else {
score += word_score;
}
pattern = strtok_r(NULL, " ", &saveptr);
}
free(tmp);
return score;
}
/*
* Returns score if each character in pattern is found sequentially within str.
* Returns INT32_MIN otherwise.
*/
int32_t fuzzy_match(const char *restrict pattern, const char *restrict str)
{
const int unmatched_letter_penalty = -1;
const size_t slen = utf8_strlen(str);
const size_t plen = utf8_strlen(pattern);
int32_t score = 0;
if (*pattern == '\0') {
return score;
}
if (slen < plen) {
return INT32_MIN;
}
/* We can already penalise any unused letters. */
score += unmatched_letter_penalty * (int32_t)(slen - plen);
/* Perform the match. */
score = fuzzy_match_recurse(pattern, str, score, true);
return score;
}
/*
* Recursively match the whole of pattern against str.
* The score parameter is the score of the previously matched character.
*
* This reaches a maximum recursion depth of strlen(pattern) + 1. However, the
* stack usage is small (the maximum I've seen on x86_64 is 144 bytes with
* gcc -O3), so this shouldn't matter unless pattern contains thousands of
* characters.
*/
int32_t fuzzy_match_recurse(
const char *restrict pattern,
const char *restrict str,
int32_t score,
bool first_char)
{
if (*pattern == '\0') {
/* We've matched the full pattern. */
return score;
}
const char *match = str;
uint32_t search = utf8_get_char(pattern);
int32_t best_score = INT32_MIN;
/*
* Find all occurrences of the next pattern character in str, and
* recurse on them.
*/
while ((match = utf8_strcasechr(match, search)) != NULL) {
int32_t jump = 0;
for (const char *tmp = str; tmp != match; tmp = utf8_next_char(tmp)) {
jump++;
}
int32_t subscore = fuzzy_match_recurse(
utf8_next_char(pattern),
utf8_next_char(match),
compute_score(jump, first_char, match),
false);
best_score = MAX(best_score, subscore);
match++;
}
if (best_score == INT32_MIN) {
/* We couldn't match the rest of the pattern. */
return INT32_MIN;
} else {
return score + best_score;
}
}
/*
* Calculate the score for a single matching letter.
* The scoring system is taken from fts_fuzzy_match v0.2.0 by Forrest Smith,
* which is licensed to the public domain.
*
* The factors affecting score are:
* - Bonuses:
* - If there are multiple adjacent matches.
* - If a match occurs after a separator character.
* - If a match is uppercase, and the previous character is lowercase.
*
* - Penalties:
* - If there are letters before the first match.
* - If there are superfluous characters in str (already accounted for).
*/
int32_t compute_score(int32_t jump, bool first_char, const char *restrict match)
{
const int adjacency_bonus = 15;
const int separator_bonus = 30;
const int camel_bonus = 30;
const int first_letter_bonus = 15;
const int leading_letter_penalty = -5;
const int max_leading_letter_penalty = -15;
int32_t score = 0;
const uint32_t cur = utf8_get_char(match);
/* Apply bonuses. */
if (!first_char && jump == 0) {
score += adjacency_bonus;
}
if (!first_char || jump > 0) {
const uint32_t prev = utf8_get_char(utf8_prev_char(match));
if (utf8_isupper(cur) && utf8_islower(prev)) {
score += camel_bonus;
}
if (utf8_isalnum(cur) && !utf8_isalnum(prev)) {
score += separator_bonus;
}
}
if (first_char && jump == 0) {
/* Match at start of string gets separator bonus. */
score += first_letter_bonus;
}
/* Apply penalties. */
if (first_char) {
score += MAX(leading_letter_penalty * jump,
max_leading_letter_penalty);
}
return score;
}