Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
snippets/lisp/spellcheck/spellcheck.lisp
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
62 lines (52 sloc)
2.66 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;;;; Literal translation of Peter Norvig's spell corrector (http://norvig.com/spell-correct.html) | |
;;;; by Mikael Jansson <mikael@lisp.se> | |
;;;; | |
;;;; At some time, for some test case, on some version of SBCL, I find this to be 1e6 times faster than the Python version on a Quad Xeon 2,6 GHz. | |
;;;; I can no longer (2015) verify that claim, nor do I remember how I tested it, so the speedup is most likely invalid. | |
(defpackage :spellcheck | |
(:use :cl)) | |
(in-package :spellcheck) | |
(ql:quickload "cl-ppcre") | |
(ql:quickload "alexandria") | |
(defun words (string) | |
(cl-ppcre:all-matches-as-strings "[a-z]+" string)) | |
(defun train (words) | |
(let ((frequency (make-hash-table :test 'equal))) | |
(dolist (word words) | |
;; default 1 to make unknown words OK | |
(setf (gethash word frequency) (1+ (gethash word frequency 1)))) | |
frequency)) | |
(defvar *freq* (train (words (nstring-downcase (alexandria:read-file-into-string #P"big.txt"))))) | |
(defvar *alphabet* "abcdefghijklmnopqrstuvwxyz") | |
;;; edits of one character | |
(defun edits-1 (word) | |
(let* ((splits (loop for i from 0 upto (length word) | |
collecting (cons (subseq word 0 i) (subseq word i)))) | |
(deletes (loop for (a . b) in splits | |
when (not (zerop (length b))) | |
collect (concatenate 'string a (subseq b 1)))) | |
(transposes (loop for (a . b) in splits | |
when (> (length b) 1) | |
collect (concatenate 'string a (subseq b 1 2) (subseq b 0 1) (subseq b 2)))) | |
(replaces (loop for (a . b) in splits | |
nconcing (loop for c across *alphabet* | |
when (not (zerop (length b))) | |
collect (concatenate 'string a (string c) (subseq b 1))))) | |
(inserts (loop for (a . b) in splits | |
nconcing (loop for c across *alphabet* | |
collect (concatenate 'string a (string c) b))))) | |
(nconc deletes transposes replaces inserts))) | |
(defun known-edits-2 (word) | |
(loop for e1 in (edits-1 word) nconcing | |
(loop for e2 in (edits-1 e1) | |
when (multiple-value-bind (value pp) (gethash e2 *freq* 1) pp) | |
collect e2))) | |
(defun known (words) | |
(loop for word in words | |
when (multiple-value-bind (value pp) (gethash word *freq* 1) pp) | |
collect word)) | |
(defun correct (word) | |
(loop for word in (or (known (list word)) (known (edits-1 word)) (known-edits-2 word) (list word)) | |
maximizing (gethash word *freq* 1) | |
finally (return word))) | |
(time (loop for i from 1 to 1000000 do (correct "something"))) | |