-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathspellcheck.lisp
62 lines (52 loc) · 2.66 KB
/
spellcheck.lisp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
;;;; Literal translation of Peter Norvig's spell corrector (http://norvig.com/spell-correct.html)
;;;; by Mikael Jansson <mikael@lisp.se>
;;;;
;;;; At some time, for some test case, on some version of SBCL, I find this to be 1e6 times faster than the Python version on a Quad Xeon 2,6 GHz.
;;;; I can no longer (2015) verify that claim, nor do I remember how I tested it, so the speedup is most likely invalid.
(defpackage :spellcheck
(:use :cl))
(in-package :spellcheck)
(ql:quickload "cl-ppcre")
(ql:quickload "alexandria")
(defun words (string)
(cl-ppcre:all-matches-as-strings "[a-z]+" string))
(defun train (words)
(let ((frequency (make-hash-table :test 'equal)))
(dolist (word words)
;; default 1 to make unknown words OK
(setf (gethash word frequency) (1+ (gethash word frequency 1))))
frequency))
(defvar *freq* (train (words (nstring-downcase (alexandria:read-file-into-string #P"big.txt")))))
(defvar *alphabet* "abcdefghijklmnopqrstuvwxyz")
;;; edits of one character
(defun edits-1 (word)
(let* ((splits (loop for i from 0 upto (length word)
collecting (cons (subseq word 0 i) (subseq word i))))
(deletes (loop for (a . b) in splits
when (not (zerop (length b)))
collect (concatenate 'string a (subseq b 1))))
(transposes (loop for (a . b) in splits
when (> (length b) 1)
collect (concatenate 'string a (subseq b 1 2) (subseq b 0 1) (subseq b 2))))
(replaces (loop for (a . b) in splits
nconcing (loop for c across *alphabet*
when (not (zerop (length b)))
collect (concatenate 'string a (string c) (subseq b 1)))))
(inserts (loop for (a . b) in splits
nconcing (loop for c across *alphabet*
collect (concatenate 'string a (string c) b)))))
(nconc deletes transposes replaces inserts)))
(defun known-edits-2 (word)
(loop for e1 in (edits-1 word) nconcing
(loop for e2 in (edits-1 e1)
when (multiple-value-bind (value pp) (gethash e2 *freq* 1) pp)
collect e2)))
(defun known (words)
(loop for word in words
when (multiple-value-bind (value pp) (gethash word *freq* 1) pp)
collect word))
(defun correct (word)
(loop for word in (or (known (list word)) (known (edits-1 word)) (known-edits-2 word) (list word))
maximizing (gethash word *freq* 1)
finally (return word)))
(time (loop for i from 1 to 1000000 do (correct "something")))