/
core.clj
98 lines (83 loc) · 3.86 KB
/
core.clj
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
(ns dj-marky-markov.core
"A simple Markov Chain for text generation in Clojure"
(:require [clojure.string :as cs])
(:gen-class))
(def first-capital-letter-regex
"Regex pattern to find words that start sentences"
(re-pattern "^[A-Z]"))
(def punctuation-regex
"This is a loose regex detection for string terminators. Needs to be more robust"
(re-pattern "[\\.\\?\\!]"))
(defn concat-with-space
"Return `s1` concatenated with `s2` with a space between the two strings"
[s1 s2]
(str s1 " " s2))
(defn search-text
"Take `window-length` words from the end of `sentence`
to use as the look-up for the next markov transition"
[stem window-length]
(let [trailing-part (take-last window-length (cs/split stem #"\s"))]
(cs/trim (reduce concat-with-space trailing-part))))
(defn string-to-sliding-window
"Convert `string` into a series of sliding windows that will be sliced into
a window and a transition word, hence incrementing the partition window."
[string window-length]
(partition (inc window-length) 1 (cs/split string #"\s+")))
(defn single-window-to-tuple
"Split `window` into a look-up and a transition tuple"
[window]
(cons (reduce concat-with-space (butlast window)) (list (last window))))
(defn add-entry
"Update `dictionary` with an `entry` tuple.
If the look-up already exists, cons the current transition to the list of transitions"
[dictionary entry]
(update dictionary (first entry) #(cons (second entry) %)))
(defn build-markov-dictionary
"Collapse `tuple-set` into a look-up dictionary"
[tuple-set]
(reduce add-entry {} tuple-set))
(defn starts-sentence?
"Boolean predicate that returns true iff the first word of `text-window`
starts with a capital letter"
[text-window]
(boolean (re-find first-capital-letter-regex (first text-window))))
(defn sentence-ended?
"Boolean predicate that returns true iff `text` contains terminal punctuation"
[text]
(boolean (re-find punctuation-regex text)))
(defn markov-sentence
"Build a sentence starting with a random element in `sentence-starters`, and
recursively navigate the markov model in `sentence-bodies`, whose transition windows
are sized by `window-length`"
[sentence-starters sentence-bodies window-length]
(loop [sentence (rand-nth (keys sentence-starters))]
(if (sentence-ended? sentence)
;; We want to trim off anything that trails after punctuation
;; e.g. "I went to the store. Hello" -> "I went to the store."
(str (first (cs/split sentence punctuation-regex))
(re-find punctuation-regex sentence))
(let [look-up (search-text sentence window-length)
added-text (rand-nth (get sentence-bodies look-up ["."]))]
(recur (concat-with-space sentence added-text))))))
(defn markov-sentences
"Generate `sentences` repeatedly"
[sentence-starters sentence-bodies window-length sentences]
(take sentences (repeatedly #(markov-sentence sentence-starters sentence-bodies window-length))))
(defn load-data!
"Load the file at `path` and split it into tuples of size `window-length`"
[path window-length]
(map single-window-to-tuple (string-to-sliding-window (slurp path) window-length)))
(defn write-sentences
"Generate `copies` sentences from the `tuples` where each transition window is `window-length`"
[tuples window-length copies]
(let [sentence-starters (build-markov-dictionary (get (group-by starts-sentence? tuples) true))
sentence-bodies (build-markov-dictionary tuples)]
(markov-sentences sentence-starters sentence-bodies window-length copies)))
(defn -main
"Try me out!"
[& args]
(let [window-length (Integer/parseInt (second args))
tuples (load-data! (first args) window-length)
results-to-generate (Integer/parseInt (nth args 2))
generated-text (write-sentences tuples window-length results-to-generate)]
(doseq [sentence generated-text] (println sentence))))