Inittial checkin.

marcelotoledo · Jun 7, 2017 · ab8bc5a · ab8bc5a
commit ab8bc5a
Show file tree

Hide file tree

Showing 4 changed files with 559 additions and 0 deletions.
diff --git a/README.md b/README.md
diff --git a/spell.c b/spell.c
@@ -0,0 +1,342 @@
+/* 
+ * spell.c --- spell corrector
+ * 
+ * Copyright  (C)  2007  Marcelo Toledo <marcelo@marcelotoledo.com>
+ * 
+ * Version: 1.0
+ * Keywords: spell corrector
+ * Author: Marcelo Toledo <marcelo@marcelotoledo.com>
+ * Maintainer: Marcelo Toledo <marcelo@marcelotoledo.com>
+ * URL: http://marcelotoledo.com
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ * 
+ * Commentary: 
+ * 
+ * See http://www.marcelotoledo.com.
+ * 
+ * Code:
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <search.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#define DICTIONARY "./big.txt"
+#define DICT_SZ    3000000
+
+const char delim[]    = ".,:;`/\"+-_(){}[]<>*&^%$#@!?~/|\\=1234567890 \t\n";
+const char alphabet[] = "abcdefghijklmnopqrstuvwxyz";
+
+static char *strtolower(char *word)
+{
+        char *s;
+
+        for (s = word; *s; s++)
+                *s = tolower(*s);
+
+        return word;
+}
+
+static ENTRY *find(char *word)
+{
+        ENTRY e;
+
+        e.key = word;
+        return hsearch(e, FIND);
+}
+
+static int update(char *word)
+{
+        ENTRY *e = find(word);
+
+        if (!e)
+                return 0;
+
+        e->data++;
+
+        return 1;
+}
+
+static int read_file(ENTRY dict)
+{
+        char *file, *word, *w;
+        FILE *fp = fopen(DICTIONARY, "r");
+        struct stat sb;
+
+        if (!fp)
+                return 0;
+
+        if (stat(DICTIONARY, &sb))
+                return 0;
+
+        file = malloc(sb.st_size);
+        if (!file) {
+                fclose(fp);
+                return 0;
+        }
+
+        fread(file, sizeof(char), sb.st_size, fp);
+
+        word = strtok(file, delim);
+        while(word != NULL) {
+                w = strtolower(strdup(word));
+
+                if (!update(w)) {
+                        dict.key  = w;
+                        dict.data = 0;
+                        hsearch(dict, ENTER);
+                }
+
+                word = strtok(NULL, delim);
+        }
+
+        free(file);
+        fclose(fp);
+
+        return 1;
+}
+
+static char *substr(char *str, int offset, int limit)
+{
+        char *new_str;
+        int str_size = strlen(str);
+
+        if ((limit > str_size) || ((offset + limit) > str_size) || 
+            (str_size < 1) || (limit == 0))
+                return NULL;
+
+        new_str = malloc(limit+1 * sizeof(char));
+        if (!new_str)
+                return NULL;
+
+        strncpy(new_str, str+offset, limit);
+        *(new_str + limit) = '\0';
+
+        return new_str;
+}
+
+static char *concat(char *str1, char *str2)
+{
+        if (!str1) { 
+                str1 = malloc(sizeof(char));
+                *str1 = '\0';
+        }
+
+        if (!str2) { 
+                str2 = malloc(sizeof(char));
+                *str2 = '\0';
+        }
+
+        str1 = realloc(str1, strlen(str1) + strlen(str2) + 1);
+        return strcat(str1, str2);
+}
+
+static int deletion(char *word, char **array, int start_idx)
+{
+        int i, word_len = strlen(word);
+
+        for (i = 0; i < word_len; i++)
+                array[i + start_idx] = concat(substr(word, 0, i), substr(word, i+1, word_len-(i+1)));
+
+        return i;
+}
+
+static int transposition(char *word, char **array, int start_idx)
+{
+        int i, word_len = strlen(word);
+
+        for (i = 0; i < word_len-1; i++)
+                array[i + start_idx] = concat(concat(substr(word, 0, i), 
+                                                     substr(word, i+1, 1)), 
+                                              concat(substr(word, i, 1), 
+                                                     substr(word, i+2, word_len-(i+2))));
+
+        return i;
+}
+
+static int alteration(char *word, char **array, int start_idx)
+{
+        int i, j, k, word_len = strlen(word);
+        char c[2] = { 0, 0 };
+
+        for (i = 0, k = 0; i < word_len; i++)
+                for (j = 0; j < sizeof(alphabet); j++, k++) {
+                        c[0] = alphabet[j];
+                        array[start_idx + k] = concat(concat(substr(word, 0, i), (char *) &c), 
+                                                      substr(word, i+1, word_len - (i+1)));
+                }
+
+        return k;
+}
+
+static int insertion(char *word, char **array, int start_idx)
+{
+        int i, j, k, word_len = strlen(word);
+        char c[2] = { 0, 0 };
+
+        for (i = 0, k = 0; i <= word_len; i++)
+                for (j = 0; j < sizeof(alphabet); j++, k++) {
+                        c[0] = alphabet[j];
+                        array[start_idx + k] = concat(concat(substr(word, 0, i), (char *) &c), 
+                                                      substr(word, i, word_len - i));
+                }
+
+        return k;
+}
+
+static int edits1_rows(char *word)
+{
+        register int size = strlen(word);
+
+        return (size)                          + // deletion
+               (size - 1)                      + // transposition
+               (size * sizeof(alphabet))       + // alteration
+               (size + 1) * sizeof(alphabet);    // insertion
+}
+
+static char **edits1(char *word)
+{
+        int next_idx;
+        char **array = malloc(edits1_rows(word) * sizeof(char *));
+
+        if (!array)
+                return NULL;
+
+        next_idx  = deletion(word, array, 0);
+        next_idx += transposition(word, array, next_idx);
+        next_idx += alteration(word, array, next_idx);
+        insertion(word, array, next_idx);
+
+        return array;
+}
+
+static int array_exist(char **array, int rows, char *word)
+{
+        int i;
+
+        for (i = 0; i < rows; i++)
+                if (!strcmp(array[i], word))
+                        return 1;
+
+        return 0;
+}
+
+static char **known_edits2(char **array, int rows, int *e2_rows)
+{
+        int i, j, res_size, e1_rows;
+        char **res = NULL, **e1;
+
+        for (i = 0, res_size = 0; i < rows; i++) {
+                e1      = edits1(array[i]);
+                e1_rows = edits1_rows(array[i]);
+
+                for (j = 0; j < e1_rows; j++)
+                        if (find(e1[j]) && !array_exist(res, res_size, e1[j])) {
+                                res             = realloc(res, sizeof(char *) * (res_size + 1));
+                                res[res_size++] = e1[j];
+                        }
+        }
+
+        *e2_rows = res_size;
+
+        return res;
+}
+
+static char *max(char **array, int rows)
+{
+        char *max_word = NULL;
+        int i, max_size = 0;
+        ENTRY *e;
+
+        for (i = 0; i < rows; i++) {
+                e = find(array[i]);
+                if (e && ((int) e->data > max_size)) {
+                        max_size = (int) e->data;
+                        max_word = e->key;
+                }
+        }
+
+        return max_word;
+}
+
+static void array_cleanup(char **array, int rows)
+{
+        int i;
+
+        for (i = 0; i < rows; i++)
+                free(array[i]);
+}
+
+static char *correct(char *word)
+{
+        char **e1, **e2, *e1_word, *e2_word, *res_word = word;
+        int e1_rows, e2_rows;
+
+        if (find(word))
+                return word;
+
+        e1_rows = edits1_rows(word);
+        if (e1_rows) {
+                e1      = edits1(word);
+                e1_word = max(e1, e1_rows);
+
+                if (e1_word) {
+                        array_cleanup(e1, e1_rows);
+                        free(e1);
+                        return e1_word;
+                }
+        }
+
+        e2 = known_edits2(e1, e1_rows, &e2_rows);
+        if (e2_rows) {
+                e2_word = max(e2, e2_rows);
+                if (e2_word)
+                        res_word = e2_word;
+        }
+
+        array_cleanup(e1, e1_rows);
+        array_cleanup(e2, e2_rows);
+
+        free(e1);
+        free(e2);
+
+        return res_word;
+}
+
+int main(int argc, char **argv)
+{
+        char *corrected_word;
+        ENTRY dict;
+
+        hcreate(DICT_SZ);
+
+        if (!read_file(dict))
+                return -1;
+
+        corrected_word = correct(argv[1]);
+        if (strcmp(corrected_word, argv[1])) {
+                printf("Did you mean \"%s\"?\n", corrected_word);
+        } else {
+                printf("\"%s\" is correct!\n", argv[1]);
+        }
+
+        return 0;
+}
diff --git a/spell.py b/spell.py
@@ -0,0 +1,33 @@
+# 
+# By Peter Norvig see more @ http://norvig.com/spell-correct.html
+#
+
+import re, collections
+
+def words(text): return re.findall('[a-z]+', text.lower()) 
+
+def train(features):
+    model = collections.defaultdict(lambda: 1)
+    for f in features:
+        model[f] += 1
+    return model
+
+NWORDS = train(words(file('big.txt').read()))
+
+alphabet = 'abcdefghijklmnopqrstuvwxyz'
+
+def edits1(word):
+    n = len(word)
+    return set([word[0:i]+word[i+1:] for i in range(n)] +                     # deletion
+               [word[0:i]+word[i+1]+word[i]+word[i+2:] for i in range(n-1)] + # transposition
+               [word[0:i]+c+word[i+1:] for i in range(n) for c in alphabet] + # alteration
+               [word[0:i]+c+word[i:] for i in range(n+1) for c in alphabet])  # insertion
+
+def known_edits2(word):
+    return set(e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in NWORDS)
+
+def known(words): return set(w for w in words if w in NWORDS)
+
+def correct(word):
+    candidates = known([word]) or known(edits1(word)) or known_edits2(word) or [word]
+    return max(candidates, key=lambda w: NWORDS[w])