/
MarkovChainBuilder.java
136 lines (104 loc) · 3.49 KB
/
MarkovChainBuilder.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
package ch.retorte.textsynthesizor.builder;
import static com.google.common.base.Joiner.on;
import static com.google.common.collect.Lists.newArrayList;
import java.util.List;
import ch.retorte.textsynthesizor.model.NGram;
import ch.retorte.textsynthesizor.model.NGramFactory;
import ch.retorte.textsynthesizor.model.Token;
import ch.retorte.textsynthesizor.model.TokenFactory;
import ch.retorte.textsynthesizor.tokenizer.Tokenizer;
import ch.retorte.textsynthesizor.utils.RingBuffer;
import com.google.common.annotations.VisibleForTesting;
/**
* The {@link MarkovChainBuilder} manages to put all tokens together to the desired n-grams and puts them into the {@link Chain} appropriately. It then
* constitutes the synthesized text based on these probabilities.
*
* @author nw
*/
public class MarkovChainBuilder {

  private static final String EMPTY_STRING = "";

  /** Splits raw input into tokens and supplies the delimiter used to re-join the output. */
  private final Tokenizer tokenizer;

  /** Tokens produced from the most recently processed input text. */
  private List<Token> tokens;

  /** Sliding window over the n most recently emitted tokens; sized to the n-gram order. */
  private RingBuffer<Token> buffer;

  // Collaborators are never reassigned, so they are declared final.
  /** Transition table mapping each n-gram to its possible successor tokens. */
  private final Chain chain = new Chain();
  private final NGramFactory nGramFactory = new NGramFactory();
  private final TokenFactory tokenFactory = new TokenFactory();

  public MarkovChainBuilder(Tokenizer tokenizer) {
    this.tokenizer = tokenizer;
  }

  /**
   * Builds a Markov chain of order {@code nGram} from {@code input} and synthesizes new text.
   *
   * <p>NOTE(review): {@link #chain} is never cleared, so calling this method twice on the same
   * builder accumulates both inputs into one probability table — confirm this is intended.
   *
   * @param nGram the n-gram order, i.e. the number of look-behind tokens
   * @param outputTokens the number of non-empty tokens the generated text should contain
   * @param input the source text to learn transition probabilities from
   * @return the synthesized text, or the empty string if no transitions were learned
   */
  public String generateTextWith(int nGram, int outputTokens, String input) {
    generateTokensOf(input);
    buildProbabilityTableWith(nGram);
    initializeBufferWithSize(nGram);
    return generateStringOfLength(outputTokens);
  }

  @VisibleForTesting
  void generateTokensOf(String input) {
    tokens = tokenizer.tokenize(input);
  }

  /** Records every (n-gram, successor token) pair of the prepared token list in the chain. */
  @VisibleForTesting
  void buildProbabilityTableWith(int n) {
    List<Token> inputList = prepareTokenList(getTokens(), n);
    // Stop n positions before the end so i + n is always a valid successor index.
    for (int i = 0; i < inputList.size() - n; i++) {
      List<Token> nGramTokens = inputList.subList(i, i + n);
      Token nextToken = inputList.get(i + n);
      addToChain(nGramTokens, nextToken);
    }
  }

  @VisibleForTesting
  List<Token> getTokens() {
    return tokens;
  }

  @VisibleForTesting
  void addToChain(List<Token> nGramTokens, Token nextToken) {
    chain.add(createNGramFrom(nGramTokens), nextToken);
  }

  @VisibleForTesting
  NGram createNGramFrom(List<Token> nGramTokens) {
    return nGramFactory.createFrom(nGramTokens);
  }

  /**
   * Prepares the token list for processing, that is, copies the tokens to a new list which also
   * contains n empty tokens in front of every 'first' token. The empty-token prefix lets the chain
   * associate the all-empty n-gram with tokens that may start a text.
   */
  @VisibleForTesting
  List<Token> prepareTokenList(List<Token> tokenList, int n) {
    List<Token> inputList = newArrayList();
    for (Token t : tokenList) {
      if (t.isFirst()) {
        for (int i = 0; i < n; i++) {
          inputList.add(tokenFactory.createEmptyToken());
        }
      }
      inputList.add(t);
    }
    return inputList;
  }

  @VisibleForTesting
  void initializeBufferWithSize(int n) {
    buffer = new RingBuffer<>(n);
  }

  /**
   * Walks the chain, collecting token contents until {@code tokenCount} non-empty tokens have been
   * produced, then joins them with the tokenizer's delimiter.
   *
   * <p>NOTE(review): if the chain only ever yields empty tokens for the current buffer state, this
   * loop would not terminate — presumably the empty-token seeding in
   * {@code prepareTokenList} guarantees progress; verify against {@code Chain.getNextTokenFor}.
   *
   * @param tokenCount the number of non-empty tokens to emit (renamed from {@code tokens}, which
   *     shadowed the {@code List<Token> tokens} field with a different type)
   * @return the joined output, or the empty string when the chain holds no transitions
   */
  @VisibleForTesting
  String generateStringOfLength(int tokenCount) {
    if (isChainEmpty()) {
      return EMPTY_STRING;
    }
    List<String> result = newArrayList();
    while (result.size() < tokenCount) {
      Token nextToken = extractNextToken();
      // Empty tokens mark boundaries: they advance the buffer but produce no output.
      if (!nextToken.isEmpty()) {
        result.add(nextToken.getContent());
      }
      buffer.addItem(nextToken);
    }
    return on(tokenizer.getDelimiter()).join(result);
  }

  @VisibleForTesting
  boolean isChainEmpty() {
    return chain.isEmpty();
  }

  @VisibleForTesting
  Token extractNextToken() {
    NGram latestNGram = nGramFactory.createFrom(buffer.getLatestItems());
    return chain.getNextTokenFor(latestNGram);
  }
}