/
segment.clj
48 lines (40 loc) · 1.87 KB
/
segment.clj
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
(ns nlputils.segment)
;; http://www.mieliestronk.com/wordlist.html
(def DICT
  "Set of lower-cased tokens read from the relative path res/words.txt when
  this namespace is loaded. A token is a maximal run of word characters
  and/or apostrophes."
  (->> (slurp "res/words.txt")
       .toLowerCase
       (re-seq #"[\w']+")
       set))
(defn segment-min
  "Perform string segmentation by dictionary with the minimum matching algorithm.

  Scanning left to right, the shortest prefix of the remaining text that is a
  member of `dict` is taken as the next word, then the scan restarts on what
  is left. No backtracking is attempted.

  Returns a vector of the matched words in order, or [] when some remaining
  text has no prefix in `dict`. An empty `string` also yields []."
  [string dict]
  (loop [s string idx 1 res []]
    (cond
      ;; Everything was consumed: the accumulated words are the answer.
      (empty? s) res
      ;; Every prefix of the remaining text was tried without a match.
      (< (count s) idx) []
      :else (let [curstr (subs s 0 idx)]
              (if (contains? dict curstr)
                ;; Accept the word and restart with the shortest prefix.
                (recur (subs s idx) 1 (conj res curstr))
                (recur s (inc idx) res))))))
(defn segment-max
  "Perform string segmentation by dictionary with the maximum matching algorithm.

  Scanning left to right, the longest prefix of the remaining text that is a
  member of `dict` is taken as the next word, then the scan restarts on what
  is left. No backtracking is attempted.

  Returns a vector of the matched words in order, or [] when some remaining
  text has no non-empty prefix in `dict`. An empty `string` also yields []."
  [string dict]
  (loop [s string idx (count s) res []]
    (cond
      ;; Everything was consumed: the accumulated words are the answer.
      (empty? s) res
      ;; Give up once only the empty prefix is left. Testing \"\" would consume
      ;; no input, so a dict containing \"\" would make the loop run forever.
      (< idx 1) []
      :else (let [curstr (subs s 0 idx)]
              (if (contains? dict curstr)
                ;; The remainder has (count s) - idx characters; start again
                ;; from its full length.
                (recur (subs s idx) (- (count s) idx) (conj res curstr))
                (recur s (dec idx) res))))))
(defn- split-string-at
  "Cuts `string` at index `idx` and returns a two-element vector: the part
  before `idx` and the part from `idx` to the end."
  [string idx]
  (let [head (subs string 0 idx)
        tail (subs string idx)]
    [head tail]))
(defn- contains-all?
  "Returns true when every element of `words` is a member of `dict`, false
  otherwise. An empty `words` is vacuously true."
  [words dict]
  ;; every? handles the empty collection and stops at the first miss; a bare
  ;; reduce without an init value throws on empty input.
  (every? #(contains? dict %) words))
(defn- word-candidates-all
  "Enumerates every way of cutting `string` into consecutive non-empty pieces.

  Returns a sequence of candidates, each a sequence of substrings that
  concatenate back to `string`. The empty string has exactly one candidate:
  the empty sequence. `dict` is not consulted here; it is only passed along
  through the recursion."
  [string dict]
  (if (empty? string)
    ;; One candidate holding no words. An empty sequence keeps every candidate
    ;; the same kind of value (a sequence of strings) instead of the string \"\".
    [()]
    (mapcat (fn [i]
              (let [[a b] (split-string-at string i)]
                ;; Prefix the first piece onto every split of the remainder.
                (map #(cons a %) (word-candidates-all b dict))))
            (range 1 (inc (count string))))))
(defn segment-bruteforce
  "Perform string segmentation by dictionary with brute force.

  Generates every candidate split of `string` with word-candidates-all and
  keeps those for which contains-all? reports that every piece is in `dict`.
  Returns a lazy sequence of segmentations, each a sequence of words."
  [string dict]
  (filter #(contains-all? % dict) (word-candidates-all string dict)))
(defn segment-bruteforce-clever
  "Perform string segmentation by dictionary, exploring only dictionary prefixes.

  For each prefix of `string` that is a member of `dict`, recursively segments
  the remainder and prepends the prefix to each result. Prefixes not in
  `dict` are skipped, so their remainders are never explored.

  Returns a sequence of segmentations, each a sequence of words from `dict`
  that concatenate back to `string`; it is empty when no segmentation exists.
  The empty string has exactly one segmentation: the empty sequence."
  [string dict]
  (if (empty? string)
    ;; One segmentation holding no words. An empty sequence keeps every result
    ;; the same kind of value (a sequence of strings) instead of the string \"\".
    [()]
    (apply concat
           (for [i (range 1 (inc (count string)))
                 :let [ss (subs string 0 i)]
                 :when (contains? dict ss)]
             (map #(cons ss %)
                  (segment-bruteforce-clever (subs string i) dict))))))