-
Notifications
You must be signed in to change notification settings - Fork 2
/
stopword.clj
50 lines (43 loc) · 1.54 KB
/
stopword.clj
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
(ns ^{:doc "This namespace provides ways of filtering *stop word* tokens.
To avoid the double negative in function names, *go words* are defined to be
the complement of a vocabulary with a stop word list. Functions
like [[go-word?]] tell whether or not a token is a *go word*, i.e. not a
stop word. Stop words are defined to be:
* stopwords (predefined list)
* punctuation
* numbers
* non-alphabetic characters"
:author "Paul Landes"}
zensols.nlparse.stopword
(:require [clojure.string :as s]))
(def ^:dynamic *stopword-config*
  "Configuration for filtering stop words.  This var is dynamic so it can be
  rebound with `binding` to change how tokens are filtered and normalized.

  Keys
  ---
  * **:pos-tags** set of POS tag strings; a token's `:pos-tag` must be a
  member of this set for [[go-word?]] to accept it (see namespace docs)
  * **:word-form-fn** function run on the token in [[go-word-form]]; for example
  if `#(-> % :lemma s/lower-case)` is given then lemmatization is
  used (i.e. Running -> run)"
  {:pos-tags #{"RB", "JJ", "JJR", "JJS", "MD",
               "NN", "NNS", "NNP", "NNPS",
               "VB", "VBD", "VBG", "VBN", "VBP", "VBZ",
               "PRP", "PDT", "POS", "RP", "FW"}
   ;; default form: the token's surface text, lower-cased
   :word-form-fn #(-> % :text s/lower-case)})
(defn go-word?
  "Return `true` when `token` is a *go* token: its `:stopword` entry is not
  truthy and its `:pos-tag` is a member of the `:pos-tags` set found in
  [[*stopword-config*]].  Otherwise return `false`."
  [token]
  (let [{allowed-tags :pos-tags} *stopword-config*]
    (if (:stopword token)
      ;; flagged as a stop word: never a go word, whatever its POS tag
      false
      (contains? allowed-tags (:pos-tag token)))))
(defn go-word-form
  "Return the canonical string form of `token` used for word counting.  The
  form is produced by applying the `:word-form-fn` function of
  [[*stopword-config*]] to the token."
  [token]
  (let [form-fn (:word-form-fn *stopword-config*)]
    (form-fn token)))
(defn go-word-forms
  "Return a lazy sequence of the *forms* (see [[go-word-form]]) of every
  token in `tokens` that passes [[go-word?]]."
  [tokens]
  (for [token tokens
        ;; same order as the pipeline form: go-word? first, then the nil check
        :when (go-word? token)
        :when (some? token)]
    (go-word-form token)))