Skip to content

Commit

Permalink
Add support for abbreviations expansion
Browse files Browse the repository at this point in the history
  • Loading branch information
aStereoID committed Apr 10, 2024
1 parent 1a0cb12 commit 83c7855
Showing 1 changed file with 43 additions and 1 deletion.
44 changes: 43 additions & 1 deletion src/main/java/marytts/language/dsb/Preprocess.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
public class Preprocess extends InternalModule {

static final ULocale locale = new ULocale.Builder().setLanguage("dsb").build();
private Map<String, String> abbreviations;
private Map<String, String> symbols;
private RuleBasedNumberFormat ruleBasedNumberFormat;
private NumberFormat numberFormat;
Expand All @@ -39,6 +40,26 @@ public Preprocess() throws MaryConfigurationException {
super("Preprocess", MaryDataType.TOKENS, MaryDataType.WORDS, locale.toLocale());
initNumberExpansion("formatRules.txt");
initSymbolExpansion("symbols.csv");
initAbbreviationExpansion("abbreviations.csv");
}

/**
 * Loads the abbreviation table from a CSV resource located relative to this class.
 * <p>
 * Each record is read as two columns, {@code abbreviation} and {@code expansion}; the
 * resulting mapping replaces the current content of {@link #abbreviations}. The stream,
 * reader and parser are closed before this method returns, whether or not loading succeeds.
 *
 * @param resourceName name of the CSV resource, resolved via {@link Class#getResourceAsStream(String)}
 * @throws MaryConfigurationException if the resource is missing or cannot be read or parsed
 */
private void initAbbreviationExpansion(String resourceName) throws MaryConfigurationException {
    try (InputStream abbreviationsStream = this.getClass().getResourceAsStream(resourceName)) {
        // getResourceAsStream signals a missing resource with null rather than an exception;
        // fail with a descriptive cause instead of an opaque NullPointerException.
        if (abbreviationsStream == null) {
            throw new IllegalStateException("Resource not found: " + resourceName);
        }
        Map<String, String> loadedAbbreviations = new HashMap<>();
        try (InputStreamReader abbreviationsReader = new InputStreamReader(abbreviationsStream, Charsets.UTF_8);
             CSVParser csv = CSVFormat.Builder.create(CSVFormat.DEFAULT)
                     .setHeader("abbreviation", "expansion")
                     .build()
                     .parse(abbreviationsReader)) {
            for (CSVRecord record : csv) {
                String abbreviation = record.get("abbreviation");
                String expansion = record.get("expansion");
                loadedAbbreviations.put(abbreviation, expansion);
            }
        }
        // Publish the table only once it has been read completely.
        abbreviations = loadedAbbreviations;
    } catch (Exception exception) {
        throw new MaryConfigurationException(String.format("Could not load abbreviations from %s.%s", this.getClass().getCanonicalName(), resourceName), exception);
    }
}

private void initSymbolExpansion(String resourceName) throws MaryConfigurationException {
Expand Down Expand Up @@ -73,13 +94,34 @@ private void initNumberExpansion(String resourceName) throws MaryConfigurationEx

/**
 * Runs the preprocessing passes over the document carried by the input data.
 * <p>
 * The passes are applied in a fixed order: abbreviations, then symbols, then numbers.
 * The returned data object carries the same {@link Document} instance that was read
 * from the input.
 *
 * @param d input data whose document is to be preprocessed
 * @return a new {@link MaryData} of this module's output type and the input's locale
 */
public MaryData process(MaryData d) {
    final Document document = d.getDocument();

    // Order matters: abbreviations first, then symbols, then numbers.
    expandAllAbbreviations(document);
    expandAllSymbols(document);
    expandAllNumbers(document);

    final MaryData output = new MaryData(getOutputType(), d.getLocale());
    output.setDocument(document);
    return output;
}

/**
 * Replaces the text of every token element in the document by its abbreviation expansion.
 * <p>
 * Tokens whose text has no expansion are left untouched.
 *
 * @param document the document whose token elements are rewritten in place
 */
private void expandAllAbbreviations(Document document) {
    TreeWalker treeWalker = ((DocumentTraversal) document).createTreeWalker(document, NodeFilter.SHOW_ELEMENT,
            new NameNodeFilter(MaryXML.TOKEN), false);
    Element token;
    while ((token = (Element) treeWalker.nextNode()) != null) {
        String tokenText = MaryDomUtils.tokenText(token);
        String expandedAbbreviation = expandAbbreviation(tokenText);
        // Compare by content, not by reference, and never write a null text into the token.
        if (expandedAbbreviation != null && !expandedAbbreviation.equals(tokenText)) {
            MaryDomUtils.setTokenText(token, expandedAbbreviation);
        }
    }
}

/**
 * Looks up the expansion of a single abbreviation.
 *
 * @param abbreviation the text to look up in the abbreviation table
 * @return the expansion registered for {@code abbreviation}, or {@code abbreviation} itself
 *         (the same instance) when the table has no expansion for it
 */
protected String expandAbbreviation(String abbreviation) {
    // Consult the abbreviation table itself (not the symbol table), with a single lookup.
    String expansion = abbreviations.get(abbreviation);
    return expansion != null ? expansion : abbreviation;
}

private void expandAllSymbols(Document document) {
TreeWalker treeWalker = ((DocumentTraversal) document).createTreeWalker(document, NodeFilter.SHOW_ELEMENT,
new NameNodeFilter(MaryXML.TOKEN), false);
Expand Down Expand Up @@ -126,4 +168,4 @@ protected String spelloutNumber(Number number) {
return ruleBasedNumberFormat.format(number);
}

}
}

0 comments on commit 83c7855

Please sign in to comment.