From 83c78559bd3edd90c6f52461754d1265af824b5d Mon Sep 17 00:00:00 2001
From: aStereoID
Date: Wed, 10 Apr 2024 12:20:17 +0200
Subject: [PATCH] Add support for abbreviations expansion

---
 .../java/marytts/language/dsb/Preprocess.java | 44 ++++++++++++++++++-
 1 file changed, 43 insertions(+), 1 deletion(-)

diff --git a/src/main/java/marytts/language/dsb/Preprocess.java b/src/main/java/marytts/language/dsb/Preprocess.java
index c9a4de9..17d66ff 100644
--- a/src/main/java/marytts/language/dsb/Preprocess.java
+++ b/src/main/java/marytts/language/dsb/Preprocess.java
@@ -31,6 +31,7 @@ public class Preprocess extends InternalModule {
 
     static final ULocale locale = new ULocale.Builder().setLanguage("dsb").build();
 
+    private Map<String, String> abbreviations;
     private Map<String, String> symbols;
     private RuleBasedNumberFormat ruleBasedNumberFormat;
     private NumberFormat numberFormat;
@@ -39,6 +40,26 @@ public Preprocess() throws MaryConfigurationException {
         super("Preprocess", MaryDataType.TOKENS, MaryDataType.WORDS, locale.toLocale());
         initNumberExpansion("formatRules.txt");
         initSymbolExpansion("symbols.csv");
+        initAbbreviationExpansion("abbreviations.csv");
+    }
+
+    /** Loads the abbreviation table from a two-column (abbreviation, expansion) CSV class resource. */
+    private void initAbbreviationExpansion(String resourceName) throws MaryConfigurationException {
+        // try-with-resources closes the stream, the reader and the parser on every path;
+        // a missing resource fails while opening the reader and is reported by the catch below.
+        try (InputStream abbreviationsStream = this.getClass().getResourceAsStream(resourceName);
+                InputStreamReader abbreviationsReader = new InputStreamReader(abbreviationsStream, Charsets.UTF_8);
+                CSVParser csv = CSVFormat.Builder.create(CSVFormat.DEFAULT)
+                        .setHeader("abbreviation", "expansion")
+                        .build()
+                        .parse(abbreviationsReader)) {
+            abbreviations = new HashMap<>();
+            for (CSVRecord record : csv) {
+                abbreviations.put(record.get("abbreviation"), record.get("expansion"));
+            }
+        } catch (Exception exception) {
+            throw new MaryConfigurationException(String.format("Could not load abbreviations from %s.%s", this.getClass().getCanonicalName(), resourceName), exception);
+        }
     }
 
     private void initSymbolExpansion(String resourceName) throws MaryConfigurationException {
@@ -73,6 +94,7 @@ private void initNumberExpansion(String resourceName) throws MaryConfigurationEx
 
     public MaryData process(MaryData d) {
         Document doc = d.getDocument();
+        expandAllAbbreviations(doc);
         expandAllSymbols(doc);
         expandAllNumbers(doc);
         MaryData result = new MaryData(getOutputType(), d.getLocale());
@@ -80,6 +102,26 @@ public MaryData process(MaryData d) {
         return result;
     }
 
+    /** Replaces the text of every token element whose text is a known abbreviation with its expansion. */
+    private void expandAllAbbreviations(Document document) {
+        TreeWalker treeWalker = ((DocumentTraversal) document).createTreeWalker(document, NodeFilter.SHOW_ELEMENT,
+                new NameNodeFilter(MaryXML.TOKEN), false);
+        Element token;
+        while ((token = (Element) treeWalker.nextNode()) != null) {
+            String tokenText = MaryDomUtils.tokenText(token);
+            String expandedAbbreviation = expandAbbreviation(tokenText);
+            if (!tokenText.equals(expandedAbbreviation)) {
+                MaryDomUtils.setTokenText(token, expandedAbbreviation);
+            }
+        }
+    }
+
+    /** Returns the expansion of a known abbreviation, or the given text itself when there is none. */
+    protected String expandAbbreviation(String abbreviation) {
+        String expansion = abbreviations.get(abbreviation);
+        return expansion != null ? expansion : abbreviation;
+    }
+
     private void expandAllSymbols(Document document) {
         TreeWalker treeWalker = ((DocumentTraversal) document).createTreeWalker(document, NodeFilter.SHOW_ELEMENT,
                 new NameNodeFilter(MaryXML.TOKEN), false);
@@ -126,4 +168,4 @@ protected String spelloutNumber(Number number) {
         return ruleBasedNumberFormat.format(number);
     }
 
-}
+}
\ No newline at end of file