From 229a29470c879fc96b0d0a6dfef6f368cf9a0c75 Mon Sep 17 00:00:00 2001 From: aStereoID Date: Wed, 10 Apr 2024 11:42:15 +0200 Subject: [PATCH 1/6] Update resource symbols.csv --- src/main/resources/marytts/language/dsb/symbols.csv | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/main/resources/marytts/language/dsb/symbols.csv b/src/main/resources/marytts/language/dsb/symbols.csv index afaafba..dbd4cd0 100644 --- a/src/main/resources/marytts/language/dsb/symbols.csv +++ b/src/main/resources/marytts/language/dsb/symbols.csv @@ -77,13 +77,8 @@ $,dolarow =,rowna se >,wětše ako @,et -[,wótwórjona rožkata spinka \,beksleš -],zawrjena rožkata spinka ^,wušej -{,wótwórjona wuzgibnjona spinka -|,padorowna smužka -},zawrjena wuzgibnjona spinka ~,tilda °,stopnjow "°C",stopnjow Celsius From b51ad4f0777890fc9b1ac95dc9bf34d2658088a5 Mon Sep 17 00:00:00 2001 From: aStereoID Date: Wed, 10 Apr 2024 11:43:06 +0200 Subject: [PATCH 2/6] Add resource: abbreviations.csv --- .../marytts/language/dsb/abbreviations.csv | 128 ++++++++++++++++++ 1 file changed, 128 insertions(+) create mode 100644 src/main/resources/marytts/language/dsb/abbreviations.csv diff --git a/src/main/resources/marytts/language/dsb/abbreviations.csv b/src/main/resources/marytts/language/dsb/abbreviations.csv new file mode 100644 index 0000000..eafbedd --- /dev/null +++ b/src/main/resources/marytts/language/dsb/abbreviations.csv @@ -0,0 +1,128 @@ +"kwart.","kwartal" +"kw.","kwartal" +"mjas.","mjasac" +"tyź.","tyźeń" +"góź.","góźina" +"min.","minutow" +"sek.","sekundow" +"pś.Kr.n.","pśed Kristusowym naroźenim" +"pś.n.l.c.","pśed našym licenim casa" +"pó Kr.n.","pó Kristusowym naroźenim" +"n.l.c.","našogo licenja casa" +"jan.","januar" +"feb.","februar" +"měr.","měrc" +"apr.","apryl" +"jun.","junij" +"jul.","julij" +"awg.","awgust" +"sep.","september" +"okt.","oktober" +"now.","nowember" +"dec.","december" +"dop.","dopołdnja" +"połd.","połdnjo" +"wótp.","wótpołdnja" +"nje.","njeźela" +"pón.","pónjeźele" 
+"wał.","wałtora" +"srj.","srjoda" +"stw.","stwórtk" +"pě.","pětk" +"sob.","sobota" +"nj.","njeźela" +"pó.","pónjeźele" +"sr.","wu." +"st.","stwórtk" +"tys.","tysac" +"mil.","milionow" +"mrd.","miliardow" +"bil.","bilionow" +"mio.","milionow" +"PLN","pólskich złotych" +"zł","pólskich złotych" +"Kč","českich kronow" +"EUR","eurow" +"DKK","dańskich kronow" +"DM","markow" +"CHF","šwicaŕskich frankow" +"CZK","českich kronow" +"HUF","hungorskich forintow" +"PLZ","pej el cet" +"RUB","rusojskich rublow" +"CNY","chinskich yuanow" +"CN¥","chinskich yuanow" +"JPY","japańskich yenow" +"AUD","awstralskich dolarow" +"NZ$","nowoseelandskich dolarow" +"dn.","dnjow" +"dn.","dnjow" +"cm","centimetrow" +"dm","decimetrow" +"ft","stopow" +"km","kilometrow" +"m","metrow" +"μm","mikrometrow" +"mm","milimetrow" +"nm","nanometrow" +"nmi","nawtiskich milow" +"cm²","kwadratnych centimetrow" +"ft²","kwadratnych stopow" +"in²","kwadratnych cólow" +"km²","kwadratnych kilometrow" +"m²","kwadratnych metrow" +"mi²","kwadratnych milow" +"yd²","kwadratnych yardow" +"cm³","kubiknych centimetrow" +"ft³","kubiknych stopow" +"in³","kubiknych cólow" +"km³","kubiknych kilometrow" +"m³","kubiknych metrow" +"mi³","kubiknych milow" +"yd³","kubiknych yardow" +"fl. oz.","žydkich uncow" +"łž.","łžycow" +"łžk.","łžyckow" +"m/s²","metrow na kwadratnu sekundu" +"km/h","kilometrow na góźinu" +"m/s","metrow na sekundu" +"mph","milow na góźinu" +"kg","kilogramow" +"µg","mikrogramow" +"mg","miligramow" +"oz","uncow" +"oz. 
tr.","trojskich uncow" +"lb","puntow" +"cal","kalorijow" +"kcal","kilokalorijow" +"kJ","kilodžulow" +"kWh","kilowattowych góźin" +"GW","gigawattow" +"PS","kónjecych mócow" +"kW","kilowattow" +"MW","megawattow" +"mW","miliwattow" +"mA","milliamperow" +"Ω","ohmow" +"GHz","gigahertzow" +"Hz","hertzow" +"kHz","kilohertzow" +"MHz","megahertzow" +"hPa","hektopascalow" +"inHg","cólow žywoslobrowego stołpika" +"mbar","milibarow" +"mm Hg","milimetrow žywoslobrowego stołpika" +"°C","stopnjow Celsiusa" +"°F","stopnjow Fahrenheita" +"bit","bitow" +"byte","byteow" +"Gb","gigabitow" +"GB","gigabyteow" +"kb","kilobitow" +"kB","kilobyteow" +"Mb","megabitow" +"MB","megabyteow" +"Tb","terabitow" +"TB","terabyteow" +"l/km","litrow na kilometer" +"mpg","milow na galonu" From 810d7e2852f0b19bbc0608e03666505e61e92fbd Mon Sep 17 00:00:00 2001 From: Ingmar Steiner Date: Tue, 21 May 2024 16:55:31 +0200 Subject: [PATCH 3/6] Init abbreviation integration test --- .../marytts/language/dsb/PreprocessIT.groovy | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/integrationTest/groovy/marytts/language/dsb/PreprocessIT.groovy b/src/integrationTest/groovy/marytts/language/dsb/PreprocessIT.groovy index 972a9dd..6070053 100644 --- a/src/integrationTest/groovy/marytts/language/dsb/PreprocessIT.groovy +++ b/src/integrationTest/groovy/marytts/language/dsb/PreprocessIT.groovy @@ -46,4 +46,18 @@ class PreprocessIT { } assert actual == expected } + + @Test + void 'Given input with abbreviations, When text is converted to words, Then abbreviations are expanded correctly'() { + def input = 'GHz l/km mpg cm³.' + def expected = 'gigahertzow litrow na kilometer milow na galonu kubiknych centimetrow.' 
+ def output = mary.generateXML(input) + def outputStr = output.documentElement.serialize() + def xmlSlurper = new XmlSlurper(false, false) + def tokens = xmlSlurper.parseText(outputStr).depthFirst().findAll { it.name() == 't' } + def actual = tokens.inject('') { result, token -> + (result.isEmpty() || token ==~ /\p{Punct}/) ? result + token : result + ' ' + token + } + assert actual == expected + } } From 8611a264d7539e980b1a428c60e570f25d86421a Mon Sep 17 00:00:00 2001 From: aStereoID Date: Wed, 10 Apr 2024 12:20:17 +0200 Subject: [PATCH 4/6] Add support for abbreviations expansion --- .../java/marytts/language/dsb/Preprocess.java | 44 ++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/src/main/java/marytts/language/dsb/Preprocess.java b/src/main/java/marytts/language/dsb/Preprocess.java index c9a4de9..17d66ff 100644 --- a/src/main/java/marytts/language/dsb/Preprocess.java +++ b/src/main/java/marytts/language/dsb/Preprocess.java @@ -31,6 +31,7 @@ public class Preprocess extends InternalModule { static final ULocale locale = new ULocale.Builder().setLanguage("dsb").build(); + private Map abbreviations; private Map symbols; private RuleBasedNumberFormat ruleBasedNumberFormat; private NumberFormat numberFormat; @@ -39,6 +40,26 @@ public Preprocess() throws MaryConfigurationException { super("Preprocess", MaryDataType.TOKENS, MaryDataType.WORDS, locale.toLocale()); initNumberExpansion("formatRules.txt"); initSymbolExpansion("symbols.csv"); + initAbbreviationExpansion("abbreviations.csv"); + } + + private void initAbbreviationExpansion(String resourceName) throws MaryConfigurationException { + try { + abbreviations = new HashMap<>(); + InputStream abbreviationsStream = this.getClass().getResourceAsStream(resourceName); + InputStreamReader abbreviationsReader = new InputStreamReader(abbreviationsStream, Charsets.UTF_8); + CSVParser csv = CSVFormat.Builder.create(CSVFormat.DEFAULT) + .setHeader("abbreviation", "expansion") + .build() + 
.parse(abbreviationsReader); + for (CSVRecord record : csv) { + String abbreviation = record.get("abbreviation"); + String expansion = record.get("expansion"); + abbreviations.put(abbreviation, expansion); + } + } catch (Exception exception) { + throw new MaryConfigurationException(String.format("Could not load abbreviations from %s.%s", this.getClass().getCanonicalName(), resourceName), exception); + } } private void initSymbolExpansion(String resourceName) throws MaryConfigurationException { @@ -73,6 +94,7 @@ private void initNumberExpansion(String resourceName) throws MaryConfigurationEx public MaryData process(MaryData d) { Document doc = d.getDocument(); + expandAllAbbreviations(doc); expandAllSymbols(doc); expandAllNumbers(doc); MaryData result = new MaryData(getOutputType(), d.getLocale()); @@ -80,6 +102,26 @@ public MaryData process(MaryData d) { return result; } + private void expandAllAbbreviations(Document document) { + TreeWalker treeWalker = ((DocumentTraversal) document).createTreeWalker(document, NodeFilter.SHOW_ELEMENT, + new NameNodeFilter(MaryXML.TOKEN), false); + Element token; + while ((token = (Element) treeWalker.nextNode()) != null) { + String tokenText = MaryDomUtils.tokenText(token); + String expandedAbbreviation = expandAbbreviation(tokenText); + if (expandedAbbreviation != tokenText) { + MaryDomUtils.setTokenText(token, expandedAbbreviation); + } + } + } + + protected String expandAbbreviation(String abbreviation) { + if (symbols.containsKey(abbreviation)) + return abbreviations.get(abbreviation); + else + return abbreviation; + } + private void expandAllSymbols(Document document) { TreeWalker treeWalker = ((DocumentTraversal) document).createTreeWalker(document, NodeFilter.SHOW_ELEMENT, new NameNodeFilter(MaryXML.TOKEN), false); @@ -126,4 +168,4 @@ protected String spelloutNumber(Number number) { return ruleBasedNumberFormat.format(number); } -} +} \ No newline at end of file From db9da4412b1f972e3da2fc745c9375db3c0774a6 Mon Sep 17 
00:00:00 2001 From: Ingmar Steiner Date: Tue, 21 May 2024 17:18:01 +0200 Subject: [PATCH 5/6] Fix Preprocess initialization --- src/main/java/marytts/language/dsb/Preprocess.java | 2 +- src/main/resources/marytts/language/dsb/abbreviations.csv | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main/java/marytts/language/dsb/Preprocess.java b/src/main/java/marytts/language/dsb/Preprocess.java index 17d66ff..d016816 100644 --- a/src/main/java/marytts/language/dsb/Preprocess.java +++ b/src/main/java/marytts/language/dsb/Preprocess.java @@ -116,7 +116,7 @@ private void expandAllAbbreviations(Document document) { } protected String expandAbbreviation(String abbreviation) { - if (symbols.containsKey(abbreviation)) + if (abbreviations.containsKey(abbreviation)) return abbreviations.get(abbreviation); else return abbreviation; diff --git a/src/main/resources/marytts/language/dsb/abbreviations.csv b/src/main/resources/marytts/language/dsb/abbreviations.csv index eafbedd..e852c14 100644 --- a/src/main/resources/marytts/language/dsb/abbreviations.csv +++ b/src/main/resources/marytts/language/dsb/abbreviations.csv @@ -1,3 +1,4 @@ +"abbreviation","expansion" "kwart.","kwartal" "kw.","kwartal" "mjas.","mjasac" From f8e1ac509ae9bae465e01630ef0eaf92f7852bd4 Mon Sep 17 00:00:00 2001 From: Ingmar Steiner Date: Mon, 27 May 2024 19:15:10 +0200 Subject: [PATCH 6/6] Update changelog --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9a79c4b..8cf9236 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,10 @@ Lower Sorbian language component for MaryTTS [Unreleased] ------------ +### Added + +- Abbreviation expansion in preprocessing + ### Changed - Upgraded lexicon to v0.2.0