Skip to content

Commit

Permalink
Use sqlite for soundex() and soundslike(), support the spellfix1 phon…
Browse files Browse the repository at this point in the history
…etic hash as an option.
  • Loading branch information
shawnw committed May 12, 2018
1 parent ef5d585 commit 61ace7e
Show file tree
Hide file tree
Showing 8 changed files with 112 additions and 120 deletions.
1 change: 1 addition & 0 deletions CHANGES.187.md
Expand Up @@ -59,6 +59,7 @@ Softcode
* `timecalc()` and `secscalc()` for adding/subtracting intervals from times.
* `@suggest` and `suggest()` for user-defined word suggestion dictionaries.
* `connlog()` and `connrecord()` for interfacing with enhanced connection logs.
* `soundex()` and `soundslike()` now support a second phonetic hash besides soundex.

Fixes
-----
Expand Down
1 change: 1 addition & 0 deletions config.h.in
Expand Up @@ -370,6 +370,7 @@ typedef bool _Bool;
#define SQLITE_OMIT_DEPRECATED 1
#define SQLITE_OMIT_DECLTYPE 1
#define SQLITE_OMIT_LOAD_EXTENSION 1
#define SQLITE_SOUNDEX 1
#ifdef HAVE_LIBZ
#define SQLITE_HAVE_ZLIB 1
#endif
Expand Down
14 changes: 8 additions & 6 deletions game/txt/hlp/pennfunc.hlp
Expand Up @@ -3891,7 +3891,7 @@ See also: anonymous attributes, sorting, sortby()

See also: sort(), sortby(), sortkey(), setunion(), setinter(), setdiff()
& SOUNDEX()
soundex(<word>)
soundex(<word>[, <hash type>])

The soundex function returns the soundex pattern for a word. A soundex pattern represents the sound of the word, and similar sounding words should have the same soundex pattern. Soundex patterns consist of an uppercase letter and 3 digits.

Expand All @@ -3917,17 +3917,19 @@ See also: soundslike()
4. All 0's are removed, because vowels are much less important than consonants in distinguishing words. "F16"
5. The string is padded with 0's or truncated to 4 characters. "F160"
That's it. It's not foolproof (enough = "E520", enuf = "E510") but it works pretty well. :)

The optional second argument can be 'soundex' (the default), for the transformation described above, or 'phone', for a different phonetic hash algorithm.
& SOUNDLIKE()
& SOUNDSLIKE()
soundslike(<word>, <word>)
soundlike(<word>, <word>)
soundslike(<word>, <word>[, <hash type>])
soundlike(<word>, <word>[, <hash type>])

The soundslike function returns 1 if the two words have the same hash code (see 'help soundex()' for information), which means, in general, if they sound alike. The hash type can be 'soundex' (the default) or 'phone', a different algorithm that might give better results with some words.

The soundslike function returns 1 if the two words have the same soundex code (see 'help soundex()' for information), which means, in general, if they sound alike.

Examples:
> think soundslike(robin,robbyn)
1
> think soundslike(robin,roebuck)
> think soundslike(robin,roebuck, phone)
0

See also: soundex()
Expand Down
4 changes: 2 additions & 2 deletions src/function.c
Expand Up @@ -779,8 +779,8 @@ FUNTAB flist[] = {
{"SORT", fun_sort, 1, 4, FN_REG},
{"SORTBY", fun_sortby, 2, 4, FN_REG},
{"SORTKEY", fun_sortkey, 2, 5, FN_REG},
{"SOUNDEX", fun_soundex, 1, 1, FN_REG | FN_STRIPANSI},
{"SOUNDSLIKE", fun_soundlike, 2, 2, FN_REG | FN_STRIPANSI},
{"SOUNDEX", fun_soundex, 1, 2, FN_REG | FN_STRIPANSI},
{"SOUNDSLIKE", fun_soundlike, 2, 3, FN_REG | FN_STRIPANSI},
{"SPACE", fun_space, 1, 1, FN_REG | FN_STRIPANSI},
{"SPEAK", fun_speak, 2, 7, FN_REG},
{"SPELLNUM", fun_spellnum, 1, 1, FN_REG | FN_STRIPANSI},
Expand Down
151 changes: 77 additions & 74 deletions src/funmisc.c
Expand Up @@ -35,14 +35,15 @@
#include "gitinfo.h"
#include "tz.h"
#include "version.h"
#include "mushsql.h"
#include "charconv.h"

#ifdef WIN32
#include <windows.h>
#pragma warning(disable : 4761) /* NJG: disable warning re conversion */
#endif

extern FUN flist[];
static char *soundex(char *str);
extern char cf_motd_msg[BUFFER_LEN], cf_wizmotd_msg[BUFFER_LEN],
cf_downmotd_msg[BUFFER_LEN], cf_fullmotd_msg[BUFFER_LEN];
extern HASHTAB htab_function;
Expand Down Expand Up @@ -1174,75 +1175,64 @@ FUNCTION(fun_restarts) { safe_integer(globals.reboot_count, buff, bp); }

extern char soundex_val[UCHAR_MAX + 1];

/* The actual soundex routine */
static char *
soundex(char *str)
{
static char tbuf1[BUFFER_LEN];
char *p;

memset(tbuf1, '\0', 4);

p = tbuf1;

/* First character is just copied */
*p = UPCASE(*str);
str++;
/* Special case for PH->F */
if ((UPCASE(*p) == 'P') && *str && (UPCASE(*str) == 'H')) {
*p = 'F';
str++;
}
p++;
/* Convert letters to soundex values, squash duplicates, skip accents and
* other non-ascii characters */
while (*str) {
if (!isalpha(*str) || *str > 127) {
str++;
continue;
}
*p = soundex_val[*str++];
if (*p != *(p - 1))
p++;
}
*p = '\0';
/* Remove zeros */
p = str = tbuf1;
while (*str) {
if (*str != '0')
*p++ = *str;
str++;
}
*p = '\0';
/* Pad/truncate to 4 chars */
if (tbuf1[1] == '\0')
tbuf1[1] = '0';
if (tbuf1[2] == '\0')
tbuf1[2] = '0';
if (tbuf1[3] == '\0')
tbuf1[3] = '0';
tbuf1[4] = '\0';
return tbuf1;
/** Phonetic hash algorithms accepted by sound_hash(). */
enum sound_hash_type {
  HASH_SOUNDEX, /**< Soundex code, computed with SQLite's soundex(). */
  HASH_PHONE    /**< Hash computed with spellfix1_phonehash(). */
};

char *
sound_hash(const char *str, int len, enum sound_hash_type type) {
sqlite3 *sqldb = get_shared_db();
sqlite3_stmt *hasher;
char *utf8, *result = NULL;
int ulen;
int status;

switch (type) {
case HASH_SOUNDEX:
/* Classic Penn soundex turns a leading ph into f. This makes
sense but isn't typical. */
hasher = prepare_statement(sqldb, "VALUES (soundex(CASE WHEN ?1 LIKE 'ph%' THEN printf('f%s', substr(?1, 3)) ELSE ?1 END))", "hash.soundex");
break;
case HASH_PHONE:
hasher = prepare_statement(sqldb, "VALUES (spellfix1_phonehash(?))",
"hash.phone");
break;
default:
return NULL;
}

utf8 = latin1_to_utf8(str, len, &ulen, "string");
sqlite3_bind_text(hasher, 1, utf8, ulen, free_string);
status = sqlite3_step(hasher);
if (status == SQLITE_ROW) {
result = mush_strdup((const char *)sqlite3_column_text(hasher, 0),
"string");
}
sqlite3_reset(hasher);
return result;
}

/* ARGSUSED */
FUNCTION(fun_soundex)
{
/* Returns the soundex code for a word. This 4-letter code is:
* 1. The first letter of the word (exception: ph -> f)
* 2. Replace each letter with a numeric code from the soundex table
* 3. Remove consecutive numbers that are the same
* 4. Remove 0's
* 5. Truncate to 4 characters or pad with 0's.
* It's actually a bit messier than that to make it faster.
*/
if (!args[0] || !*args[0] || !isalpha(*args[0]) || strchr(args[0], ' ')) {
safe_str(T("#-1 FUNCTION (SOUNDEX) REQUIRES A SINGLE WORD ARGUMENT"), buff,
bp);
return;
enum sound_hash_type type = HASH_SOUNDEX;
char *hashed;

if (nargs == 2) {
if (strcasecmp(args[1], "soundex") == 0) {
type = HASH_SOUNDEX;
} else if (strcasecmp(args[1], "phone") == 0) {
type = HASH_PHONE;
} else {
safe_str("#-1 UNKNOWN HASH TYPE", buff, bp);
return;
}
}
hashed = sound_hash(args[0], arglens[0], type);
if (hashed) {
safe_str(hashed, buff, bp);
mush_free(hashed, "string");
} else {
safe_str("#-1 HASH ERROR", buff, bp);
}
safe_str(soundex(args[0]), buff, bp);
return;
}

/* ARGSUSED */
Expand All @@ -1252,16 +1242,29 @@ FUNCTION(fun_soundlike)
* This can be optimized to go character-by-character, but
* I deem the modularity to be more important. So there.
*/
char tbuf1[5];
if (!*args[0] || !*args[1] || !isalpha(*args[0]) || !isalpha(*args[1]) ||
strchr(args[0], ' ') || strchr(args[1], ' ')) {
safe_str(T("#-1 FUNCTION (SOUNDLIKE) REQUIRES TWO ONE-WORD ARGUMENTS"),
buff, bp);
return;
}
/* soundex uses a static buffer, so we need to save it */
strcpy(tbuf1, soundex(args[0]));
safe_boolean(!strcmp(tbuf1, soundex(args[1])), buff, bp);
enum sound_hash_type type = HASH_SOUNDEX;
char *hash1, *hash2;

if (nargs == 3) {
if (strcasecmp(args[2], "soundex") == 0) {
type = HASH_SOUNDEX;
} else if (strcasecmp(args[2], "phone") == 0) {
type = HASH_PHONE;
} else {
safe_str("#-1 UNKNOWN HASH TYPE", buff, bp);
return;
}
}

hash1 = sound_hash(args[0], arglens[0], type);
hash2 = sound_hash(args[1], arglens[1], type);
if (!hash1 || !hash2) {
safe_str("#-1 HASH ERROR", buff, bp);
} else {
safe_boolean(strcmp(hash1, hash2) == 0, buff, bp);
}
mush_free(hash1, "string");
mush_free(hash2, "string");
}

/* ARGSUSED */
Expand Down
16 changes: 0 additions & 16 deletions src/tables.c
Expand Up @@ -62,22 +62,6 @@ char valid_ansi_codes[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

char soundex_val[256] = {
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 49, 50, 51, 48, 49, 50, 48, 48, 50, 50,
52, 53, 53, 48, 49, 50, 54, 50, 51, 48, 49, 48, 50, 48, 50, 48, 48, 48, 48,
48, 48, 48, 49, 50, 51, 48, 49, 50, 48, 48, 50, 50, 52, 53, 53, 48, 49, 50,
54, 50, 51, 48, 49, 48, 50, 48, 50, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
48, 48, 48, 48, 48, 48, 48, 48, 48};

typedef struct {
const char *base;
const char *entity;
Expand Down
33 changes: 23 additions & 10 deletions test/testsoundex.pl
@@ -1,13 +1,26 @@
run tests:
test('soundex.1', $god, 'think soundex(a)', 'A000');
test('soundex.2', $god, 'think soundex(0)', '#-1 FUNCTION \(SOUNDEX\) REQUIRES A SINGLE WORD ARGUMENT');
test('soundex.3', $god, 'think soundex(fred)', 'F630');
test('soundex.4', $god, 'think soundex(phred)', 'F630');
test('soundex.5', $god, 'think soundex(afford)', 'A163');

test('soundslike.1', $god, 'think soundslike(robin, robbyn)', '1');
test('soundslike.2', $god, 'think soundslike(robin, roebuck)', '0');
test('soundslike.3', $god, 'think soundslike(frick, frack)', 1);
test('soundslike.4', $god, 'think soundslike(glacier, glazier)', 1);
test('soundslike.5', $god, 'think soundslike(rutabega, rototiller)', 0);
test('soundex.2', $god, 'think soundex(fred)', 'F630');
test('soundex.3', $god, 'think soundex(phred, soundex)', 'F630');
test('soundex.4', $god, 'think soundex(afford)', 'A163');

test('soundex.5', $god, 'think soundex(fred, phone)', '^BRD$');
test('soundex.6', $god, 'think soundex(phred, phone)', '^BRD$');
test('soundex.7', $god, 'think soundex(afford, phone)', '^ABRD$');

test('soundex.8', $god, 'think soundex(foo, bad hash)', '^#-1');

test('soundslike.1', $god, 'think soundslike(robin, robbyn)', '^1');
test('soundslike.2', $god, 'think soundslike(robin, roebuck)', '^0');
test('soundslike.3', $god, 'think soundslike(frick, frack)', '^1');
test('soundslike.4', $god, 'think soundslike(glacier, glazier)', '^1');
test('soundslike.5', $god, 'think soundslike(rutabega, rototiller, soundex)', '^0');

test('soundslike.6', $god, 'think soundslike(robin, robbyn, phone)', '^1');
test('soundslike.7', $god, 'think soundslike(robin, roebuck, phone)', '^0');
test('soundslike.8', $god, 'think soundslike(frick, frack, phone)', '^1');
test('soundslike.9', $god, 'think soundslike(glacier, glazier, phone)', '^1');
test('soundslike.10', $god, 'think soundslike(rutabega, rototiller, phone)', '^0');

test('soundslike.11', $god, 'think soundslike(foo, bar, bad hash)', '^#-1');

12 changes: 0 additions & 12 deletions utils/gentables.c
Expand Up @@ -118,17 +118,6 @@ char ansi_codes[UCHAR_MAX + 1] = {
['/'] = 1, ['a'] = 1
};

/* Values used in soundex hashing */
char soundex_codes[UCHAR_MAX + 1] = {
['B'] = 1, ['P'] = 1, ['F'] = 1, ['V'] = 1, ['b'] = 1, ['p'] = 1, ['f'] = 1, ['v'] = 1,
['C'] = 2, ['G'] = 2, ['J'] = 2, ['K'] = 2, ['Q'] = 2, ['S'] = 2, ['X'] = 2, ['Z'] = 2,
['c'] = 2, ['g'] = 2, ['j'] = 2, ['k'] = 2, ['q'] = 2, ['s'] = 2, ['x'] = 2, ['z'] = 2,
['D'] = 3, ['T'] = 3, ['d'] = 3, ['t'] = 3,
['L'] = 4, ['l'] = 4,
['M'] = 5, ['N'] = 5, ['m'] = 5, ['n'] = 5,
['R'] = 6, ['r'] = 6
};

/** Accented characters
*
* The table is for ISO 8859-1 character set.
Expand Down Expand Up @@ -289,7 +278,6 @@ int main(int argc, char *argv[]) {
print_table_bool("char", "valid_timefmt_codes", valid_timefmt_codes, 0);
print_table_bool("char", "escaped_chars", escaped_chars, 0);
print_table_bool("char", "valid_ansi_codes", ansi_codes, 0);
print_table_bool("char", "soundex_val", soundex_codes, '0');
print_entity_table("accent_table", entity_table);
return EXIT_SUCCESS;
}

0 comments on commit 61ace7e

Please sign in to comment.