Skip to content

Commit

Permalink
Improve UTF-8 handling.
Browse files Browse the repository at this point in the history
This should allow case-insensitive matching for non-Latin characters,
and fix matching for characters with diacritics.
  • Loading branch information
philj56 committed Oct 18, 2022
1 parent 8872f66 commit 5482f0b
Show file tree
Hide file tree
Showing 9 changed files with 152 additions and 16 deletions.
3 changes: 3 additions & 0 deletions meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ tofi_sources = files(
'src/shm.c',
'src/string_vec.c',
'src/surface.c',
'src/utf8.c',
'src/wlr-layer-shell-unstable-v1.c',
'src/xmalloc.c',
)
Expand All @@ -119,6 +120,7 @@ compgen_sources = files(
'src/log.c',
'src/mkdirp.c',
'src/string_vec.c',
'src/utf8.c',
'src/xmalloc.c'
)

Expand Down Expand Up @@ -183,6 +185,7 @@ executable(
executable(
'tofi-compgen',
compgen_sources,
dependencies: [glib],
install: false
)

Expand Down
6 changes: 5 additions & 1 deletion src/desktop_vec.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include "fuzzy_match.h"
#include "log.h"
#include "string_vec.h"
#include "utf8.h"
#include "xmalloc.h"

static bool match_current_desktop(char * const *desktop_list, gsize length);
Expand Down Expand Up @@ -42,7 +43,10 @@ void desktop_vec_add(
vec->buf = xrealloc(vec->buf, vec->size * sizeof(vec->buf[0]));
}
vec->buf[vec->count].id = xstrdup(id);
vec->buf[vec->count].name = xstrdup(name);
vec->buf[vec->count].name = utf8_normalize(name);
if (vec->buf[vec->count].name == NULL) {
vec->buf[vec->count].name = xstrdup(name);
}
vec->buf[vec->count].path = xstrdup(path);
vec->buf[vec->count].keywords = xstrdup(keywords);
vec->buf[vec->count].search_score = 0;
Expand Down
3 changes: 2 additions & 1 deletion src/entry_backend/harfbuzz.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "../entry.h"
#include "../log.h"
#include "../nelem.h"
#include "../utf8.h"
#include "../xmalloc.h"

/*
Expand Down Expand Up @@ -368,7 +369,7 @@ void entry_backend_harfbuzz_update(struct entry *entry)
char *postmatch = NULL;
cairo_text_extents_t subextents;
if (entry->input_mb_length > 0 && entry->selection_highlight_color.a != 0) {
char *match_pos = strcasestr(prematch, entry->input_mb);
char *match_pos = utf8_strcasestr(prematch, entry->input_mb);
if (match_pos != NULL) {
match = xstrdup(result);
prematch_len = (match_pos - prematch);
Expand Down
3 changes: 2 additions & 1 deletion src/entry_backend/pango.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include "../entry.h"
#include "../log.h"
#include "../nelem.h"
#include "../utf8.h"
#include "../xmalloc.h"

#undef MAX
Expand Down Expand Up @@ -181,7 +182,7 @@ void entry_backend_pango_update(struct entry *entry)
PangoRectangle ink_subrect;
PangoRectangle logical_subrect;
if (entry->input_mb_length > 0 && entry->selection_highlight_color.a != 0) {
char *match_pos = strcasestr(str, entry->input_mb);
char *match_pos = utf8_strcasestr(str, entry->input_mb);
if (match_pos != NULL) {
prematch_len = (match_pos - str);
postmatch_len = strlen(str) - prematch_len - match_len;
Expand Down
32 changes: 20 additions & 12 deletions src/fuzzy_match.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include <string.h>

#include "fuzzy_match.h"
#include "utf8.h"
#include "xmalloc.h"

#undef MAX
Expand All @@ -30,10 +31,10 @@ int32_t fuzzy_match_simple_words(const char *restrict patterns, const char *rest
{
int32_t score = 0;
char *saveptr = NULL;
char *tmp = xstrdup(patterns);
char *tmp = utf8_normalize(patterns);
char *pattern = strtok_r(tmp, " ", &saveptr);
while (pattern != NULL) {
char *c = strcasestr(str, pattern);
char *c = utf8_strcasestr(str, pattern);
if (c == NULL) {
score = INT32_MIN;
break;
Expand All @@ -55,7 +56,7 @@ int32_t fuzzy_match_words(const char *restrict patterns, const char *restrict st
{
int32_t score = 0;
char *saveptr = NULL;
char *tmp = xstrdup(patterns);
char *tmp = utf8_normalize(patterns);
char *pattern = strtok_r(tmp, " ", &saveptr);
while (pattern != NULL) {
int32_t word_score = fuzzy_match(pattern, str);
Expand All @@ -78,8 +79,8 @@ int32_t fuzzy_match_words(const char *restrict patterns, const char *restrict st
int32_t fuzzy_match(const char *restrict pattern, const char *restrict str)
{
const int unmatched_letter_penalty = -1;
const size_t slen = strlen(str);
const size_t plen = strlen(pattern);
const size_t slen = utf8_strlen(str);
const size_t plen = utf8_strlen(pattern);
int32_t score = 0;

if (*pattern == '\0') {
Expand Down Expand Up @@ -119,19 +120,23 @@ int32_t fuzzy_match_recurse(
}

const char *match = str;
const char search[2] = { *pattern, '\0' };
uint32_t search = utf8_get_char(pattern);

int32_t best_score = INT32_MIN;

/*
* Find all occurrences of the next pattern character in str, and
* recurse on them.
*/
while ((match = strcasestr(match, search)) != NULL) {
while ((match = utf8_strcasechr(match, search)) != NULL) {
int32_t jump = 0;
for (const char *tmp = str; tmp != match; tmp = utf8_next_char(tmp)) {
jump++;
}
int32_t subscore = fuzzy_match_recurse(
pattern + 1,
match + 1,
compute_score(match - str, first_char, match),
utf8_next_char(pattern),
utf8_next_char(match),
compute_score(jump, first_char, match),
false);
best_score = MAX(best_score, subscore);
match++;
Expand Down Expand Up @@ -172,15 +177,18 @@ int32_t compute_score(int32_t jump, bool first_char, const char *restrict match)

int32_t score = 0;

const uint32_t cur = utf8_get_char(match);

/* Apply bonuses. */
if (!first_char && jump == 0) {
score += adjacency_bonus;
}
if (!first_char || jump > 0) {
if (isupper(*match) && islower(*(match - 1))) {
const uint32_t prev = utf8_get_char(utf8_prev_char(match));
if (utf8_isupper(cur) && utf8_islower(prev)) {
score += camel_bonus;
}
if (isalnum(*match) && !isalnum(*(match - 1))) {
if (utf8_isalnum(cur) && !utf8_isalnum(prev)) {
score += separator_bonus;
}
}
Expand Down
1 change: 1 addition & 0 deletions src/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include "nelem.h"
#include "shm.h"
#include "string_vec.h"
#include "string_vec.h"
#include "xmalloc.h"

#undef MAX
Expand Down
6 changes: 5 additions & 1 deletion src/string_vec.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include <sys/mman.h>
#include "fuzzy_match.h"
#include "string_vec.h"
#include "utf8.h"
#include "xmalloc.h"

static int cmpstringp(const void *restrict a, const void *restrict b)
Expand Down Expand Up @@ -80,7 +81,10 @@ void string_vec_add(struct string_vec *restrict vec, const char *restrict str)
vec->size *= 2;
vec->buf = xrealloc(vec->buf, vec->size * sizeof(vec->buf[0]));
}
vec->buf[vec->count].string = xstrdup(str);
vec->buf[vec->count].string = utf8_normalize(str);
if (vec->buf[vec->count].string == NULL) {
vec->buf[vec->count].string = xstrdup(str);
}
vec->buf[vec->count].search_score = 0;
vec->buf[vec->count].history_score = 0;
vec->count++;
Expand Down
92 changes: 92 additions & 0 deletions src/utf8.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
#include <string.h>

#include "utf8.h"

/*
 * Report whether the Unicode code point c is an uppercase letter.
 *
 * Forwards to GLib's g_unichar_isupper(); the boolean answer is returned
 * as an integer (zero for false, non-zero for true).
 */
uint32_t utf8_isupper(uint32_t c)
{
	const uint32_t is_upper = (uint32_t)g_unichar_isupper(c);

	return is_upper;
}

/*
 * Report whether the Unicode code point c is a lowercase letter.
 *
 * Forwards to GLib's g_unichar_islower(); the boolean answer is returned
 * as an integer (zero for false, non-zero for true).
 */
uint32_t utf8_islower(uint32_t c)
{
	const uint32_t is_lower = (uint32_t)g_unichar_islower(c);

	return is_lower;
}

/*
 * Report whether the Unicode code point c is alphanumeric.
 *
 * Forwards to GLib's g_unichar_isalnum(); the boolean answer is returned
 * as an integer (zero for false, non-zero for true).
 */
uint32_t utf8_isalnum(uint32_t c)
{
	const uint32_t is_alnum = (uint32_t)g_unichar_isalnum(c);

	return is_alnum;
}

/*
 * Convert the Unicode code point c to uppercase.
 *
 * Forwards to GLib's g_unichar_toupper() and returns whatever code point
 * it produces.
 */
uint32_t utf8_toupper(uint32_t c)
{
	const uint32_t upper = g_unichar_toupper(c);

	return upper;
}

/*
 * Convert the Unicode code point c to lowercase.
 *
 * Forwards to GLib's g_unichar_tolower() and returns whatever code point
 * it produces.
 */
uint32_t utf8_tolower(uint32_t c)
{
	const uint32_t lower = g_unichar_tolower(c);

	return lower;
}

/*
 * Decode the UTF-8 sequence starting at s into a Unicode code point.
 *
 * Forwards to GLib's g_utf8_get_char(), which expects s to point at the
 * first byte of a valid UTF-8 encoded character.
 */
uint32_t utf8_get_char(const char *s)
{
	const uint32_t code_point = g_utf8_get_char(s);

	return code_point;
}

/*
 * Return a pointer to the character following the one that starts at s.
 *
 * Forwards to GLib's g_utf8_next_char(). The result points into the same
 * string as s; nothing is allocated.
 */
char *utf8_next_char(const char *s)
{
	char *const following = (char *)g_utf8_next_char(s);

	return following;
}

/*
 * Return a pointer to the character preceding the one that starts at s.
 *
 * Forwards to GLib's g_utf8_prev_char(). The result points into the same
 * string as s; nothing is allocated. The caller must ensure that s is not
 * already at the start of its string.
 */
char *utf8_prev_char(const char *s)
{
	char *const preceding = (char *)g_utf8_prev_char(s);

	return preceding;
}

/*
 * Find the first occurrence of the code point c in the NUL-terminated
 * UTF-8 string s.
 *
 * Forwards to GLib's g_utf8_strchr() with a length of -1, i.e. the whole
 * string is searched. Returns a pointer into s, or NULL if c is not found.
 */
char *utf8_strchr(const char *s, uint32_t c)
{
	/* A negative length tells GLib that s is NUL-terminated. */
	const int until_nul = -1;
	char *const found = g_utf8_strchr(s, until_nul, c);

	return found;
}

/*
 * Case-insensitive search for a single code point in a UTF-8 string.
 *
 * Both c and each character of s are lowercased with g_unichar_tolower()
 * before being compared. Returns a pointer to the first matching character
 * in s, or NULL if the end of the string is reached without a match.
 */
char *utf8_strcasechr(const char *s, uint32_t c)
{
	const uint32_t wanted = g_unichar_tolower(c);

	for (const char *cursor = s; *cursor != '\0'; cursor = g_utf8_next_char(cursor)) {
		if (g_unichar_tolower(g_utf8_get_char(cursor)) == wanted) {
			return (char *)cursor;
		}
	}

	return NULL;
}

/*
 * Count the characters (not bytes) in the NUL-terminated UTF-8 string s.
 *
 * Forwards to GLib's g_utf8_strlen() with a length of -1, i.e. the whole
 * string is measured.
 */
size_t utf8_strlen(const char *s)
{
	/* A negative length tells GLib that s is NUL-terminated. */
	const long char_count = g_utf8_strlen(s, -1);

	return (size_t)char_count;
}

/*
 * Case-insensitive substring search for UTF-8 strings.
 *
 * Both strings are case-folded with g_utf8_casefold() and the folded needle
 * is looked up in the folded haystack with strstr().
 *
 * Case folding can change the encoded length of a character (for example
 * U+212A KELVIN SIGN folds to a one-byte "k", and U+FB01 folds to "fi"), so a
 * byte offset in the folded haystack is not necessarily the same offset in
 * the original. The match position is therefore translated back by walking
 * the original haystack one character at a time and summing the folded
 * length of each character.
 *
 * Parameters:
 *   haystack - NUL-terminated, valid UTF-8 string to search in.
 *   needle   - NUL-terminated, valid UTF-8 string to search for.
 *
 * Returns a pointer to the start of the character in haystack at which the
 * first match begins, or NULL if there is no match or either argument is
 * NULL. If the match begins part-way through the folded expansion of a
 * single character, a pointer to that character is returned. The returned
 * pointer aliases haystack and must not be freed.
 */
char *utf8_strcasestr(const char * restrict haystack, const char * restrict needle)
{
	if (haystack == NULL || needle == NULL) {
		return NULL;
	}

	char *folded_haystack = g_utf8_casefold(haystack, -1);
	char *folded_needle = g_utf8_casefold(needle, -1);
	char *ret = NULL;

	const char *hit = strstr(folded_haystack, folded_needle);
	if (hit != NULL) {
		/* Byte offset of the match within the *folded* haystack. */
		const size_t target = (size_t)(hit - folded_haystack);
		/* Folded length of everything in haystack that precedes p. */
		size_t folded_pos = 0;
		const char *p = haystack;

		while (*p != '\0') {
			const char *next = g_utf8_next_char(p);
			char *folded_char = g_utf8_casefold(p, next - p);
			const size_t step = strlen(folded_char);
			g_free(folded_char);

			/*
			 * The match starts somewhere inside the folded form of
			 * the character at p, so p is the answer.
			 */
			if (folded_pos + step > target) {
				break;
			}
			folded_pos += step;
			p = next;
		}
		ret = (char *)p;
	}

	/* GLib allocations are released with g_free(), not free(). */
	g_free(folded_haystack);
	g_free(folded_needle);

	return ret;
}

/*
 * Produce a normalized copy of the NUL-terminated UTF-8 string s.
 *
 * Forwards to GLib's g_utf8_normalize() using G_NORMALIZE_DEFAULT and a
 * length of -1 (whole string). The result is a newly allocated string owned
 * by the caller. GLib returns NULL when s is not valid UTF-8, so callers
 * must check the result before using it.
 */
char *utf8_normalize(const char *s)
{
	char *const normalized = g_utf8_normalize(s, -1, G_NORMALIZE_DEFAULT);

	return normalized;
}
22 changes: 22 additions & 0 deletions src/utf8.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#ifndef UTF8_H
#define UTF8_H

#include <glib.h>
#include <stdint.h>

/*
 * Character classification and case conversion for Unicode code points.
 * Each function forwards to the GLib g_unichar_*() function of the same
 * name; the predicates return zero for false and non-zero for true.
 */
uint32_t utf8_isupper(uint32_t c);
uint32_t utf8_islower(uint32_t c);
uint32_t utf8_isalnum(uint32_t c);
uint32_t utf8_toupper(uint32_t c);
uint32_t utf8_tolower(uint32_t c);

/* Decode the character starting at s into a code point. */
uint32_t utf8_get_char(const char *s);
/* Pointer to the character after the one starting at s. */
char *utf8_next_char(const char *s);
/* Pointer to the character before the one starting at s. */
char *utf8_prev_char(const char *s);
/* First occurrence of code point c in s, or NULL if absent. */
char *utf8_strchr(const char *s, uint32_t c);
/*
 * First character of s equal to c once both are lowercased, or NULL if the
 * end of the string is reached without a match.
 */
char *utf8_strcasechr(const char *s, uint32_t c);
/* Length of s in characters rather than bytes. */
size_t utf8_strlen(const char *s);
/*
 * Search for needle in haystack after case-folding both. Returns a pointer
 * derived from haystack (not a new allocation), or NULL if there is no
 * match.
 */
char *utf8_strcasestr(const char * restrict haystack, const char * restrict needle);
/*
 * Normalized copy of s via g_utf8_normalize() with G_NORMALIZE_DEFAULT.
 * Callers in this project treat a NULL result as failure and fall back to
 * a plain copy of the input.
 */
char *utf8_normalize(const char *s);

#endif /* UTF8_H */

0 comments on commit 5482f0b

Please sign in to comment.