-
Notifications
You must be signed in to change notification settings - Fork 727
/
TextToMaryXML.java
133 lines (119 loc) · 4.94 KB
/
TextToMaryXML.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
/**
* Copyright 2000-2006 DFKI GmbH.
* All Rights Reserved. Use is subject to license terms.
*
* This file is part of MARY TTS.
*
* MARY TTS is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, version 3 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package marytts.modules;
import java.util.Locale;
import java.util.regex.Pattern;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import marytts.datatypes.MaryData;
import marytts.datatypes.MaryDataType;
import marytts.datatypes.MaryXML;
import marytts.server.MaryProperties;
import marytts.util.MaryUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Text;
/**
* Embed plain text input into a raw (untokenised) MaryXML document.
*
* @author Marc Schröder
*/
public class TextToMaryXML extends InternalModule
{
    /**
     * Paragraph separator: a newline followed by one or more lines that
     * contain nothing but whitespace. Compiled once and shared, because
     * compiled patterns are immutable and the expression never changes.
     */
    private static final Pattern PARAGRAPH_SEPARATOR = Pattern.compile("\\n(\\s*\\n)+");

    // NOTE(review): both fields are initialised in startup() but are not read
    // anywhere else in this class; kept so that startup behaviour is unchanged.
    private DocumentBuilderFactory factory = null;
    private DocumentBuilder docBuilder = null;

    /**
     * Whether {@link #process(MaryData)} splits the input at empty lines into
     * several paragraphs; read once from the property
     * "texttomaryxml.splitintoparagraphs".
     */
    private final boolean splitIntoParagraphs;

    /**
     * Create the module, converting {@link MaryDataType#TEXT} input into
     * {@link MaryDataType#RAWMARYXML} output.
     */
    public TextToMaryXML()
    {
        super("TextToMaryXML",
              MaryDataType.TEXT, MaryDataType.RAWMARYXML, null);
        splitIntoParagraphs = MaryProperties.getBoolean("texttomaryxml.splitintoparagraphs");
    }

    /**
     * Create the namespace-aware document builder factory and the document
     * builder if they do not exist yet, then run the superclass startup.
     *
     * @throws Exception if the document builder cannot be created or the
     *         superclass startup fails.
     */
    public void startup() throws Exception
    {
        if (factory == null) {
            factory = DocumentBuilderFactory.newInstance();
            factory.setNamespaceAware(true);
        }
        if (docBuilder == null) {
            docBuilder = factory.newDocumentBuilder();
        }
        super.startup();
    }

    /**
     * Wrap the plain text of the input data into a new document of this
     * module's output type. The root element's <code>xml:lang</code> is set
     * from {@link #determineLocale(String, Locale)} applied to the whole text.
     * Depending on the split setting, the text is added either as one
     * paragraph per block separated by empty lines (blank blocks are
     * skipped), or as one single paragraph.
     *
     * @param d the input data holding the plain text and the default locale.
     * @return the newly created data object containing the document.
     * @throws Exception if the text cannot be converted.
     */
    public MaryData process(MaryData d)
    throws Exception
    {
        String plainText = MaryUtils.normaliseUnicodePunctuation(d.getPlainText());
        // NOTE(review): the third argument presumably asks MaryData to create
        // a stub document with a root element -- confirm against MaryData.
        MaryData result = new MaryData(outputType(), d.getLocale(), true);
        Document doc = result.getDocument();
        Element root = doc.getDocumentElement();

        Locale documentLocale = determineLocale(plainText, d.getLocale());
        root.setAttribute("xml:lang", MaryUtils.locale2xmllang(documentLocale));

        if (splitIntoParagraphs) { // Empty lines separate paragraphs
            for (String rawParagraph : PARAGRAPH_SEPARATOR.split(plainText)) {
                String paragraph = rawParagraph.trim();
                if (paragraph.isEmpty()) {
                    continue; // nothing but whitespace between two separators
                }
                appendParagraph(paragraph, root, d.getLocale());
            }
        } else { // The whole text as one single paragraph
            appendParagraph(plainText, root, d.getLocale());
        }

        result.setDocument(doc);
        return result;
    }

    /**
     * Append one paragraph of text to the rawmaryxml document. If the text
     * language (as determined by {@link #determineLocale(String, Locale)})
     * differs from the language set on the root element, the paragraph
     * element is enclosed with a
     * <code>&lt;voice xml:lang="..."&gt;</code> element.
     *
     * @param text the paragraph text.
     * @param root the root node of the rawmaryxml document, where to insert
     *        the paragraph.
     * @param defaultLocale the default locale, in case the language of the
     *        text cannot be determined.
     */
    private void appendParagraph(String text, Element root, Locale defaultLocale) {
        String rootLanguage = root.getAttribute("xml:lang");
        String textLanguage = MaryUtils.locale2xmllang(determineLocale(text, defaultLocale));

        Element parent = root;
        if (!textLanguage.equals(rootLanguage)) {
            // Language switch: wrap the paragraph in a voice element carrying the text's language
            Element voiceElement = MaryXML.appendChildElement(root, MaryXML.VOICE);
            voiceElement.setAttribute("xml:lang", textLanguage);
            parent = voiceElement;
        }
        Element paragraph = MaryXML.appendChildElement(parent, MaryXML.PARAGRAPH);

        // Insert the entire plain text as a single text node
        Text textNode = root.getOwnerDocument().createTextNode(text);
        paragraph.appendChild(textNode);

        // For debugging, read the content back from the node
        logger.debug("textNodeString=`" + textNode.getData() + "'");
    }

    /**
     * Try to determine the locale of the given text. This implementation simply returns the default locale;
     * subclasses can try to do something fancy here.
     *
     * @param text the text whose locale to determine
     * @param defaultLocale the default locale of the document.
     * @return the locale as inferred from the text and the default locale
     */
    protected Locale determineLocale(String text, Locale defaultLocale)
    {
        return defaultLocale;
    }
}