Skip to content

Commit

Permalink
Merge pull request #18 from psibre/v2
Browse files Browse the repository at this point in the history
Add abbreviation support
  • Loading branch information
psibre committed May 29, 2024
2 parents 5c67574 + f8e1ac5 commit f2c06c8
Show file tree
Hide file tree
Showing 5 changed files with 190 additions and 6 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@ Lower Sorbian language component for MaryTTS
[Unreleased]
------------

### Added

- Abbreviation expansion in preprocessing

### Changed

- Upgraded lexicon to v0.2.0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,4 +46,18 @@ class PreprocessIT {
}
assert actual == expected
}

@Test
void 'Given input with abbreviations, When text is converted to words, Then abbreviations are expanded correctly'() {
    // Unit abbreviations, including one containing a slash and one with a superscript digit.
    def input = 'GHz l/km mpg cm³.'
    def expected = 'gigahertzow litrow na kilometer milow na galonu kubiknych centimetrow.'

    // Serialize the generated XML and re-parse it with a non-validating,
    // non-namespace-aware slurper so that token elements can be matched by the plain name 't'.
    def serialized = mary.generateXML(input).documentElement.serialize()
    def tokens = new XmlSlurper(false, false)
            .parseText(serialized)
            .depthFirst()
            .findAll { node -> node.name() == 't' }

    // Join the token texts with single spaces; a token that is a single punctuation
    // character is appended directly to the preceding text without a separator.
    def actual = ''
    tokens.each { token ->
        def needsSeparator = !actual.isEmpty() && !(token ==~ /\p{Punct}/)
        actual = needsSeparator ? actual + ' ' + token : actual + token
    }

    assert actual == expected
}
}
44 changes: 43 additions & 1 deletion src/main/java/marytts/language/dsb/Preprocess.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
public class Preprocess extends InternalModule {

static final ULocale locale = new ULocale.Builder().setLanguage("dsb").build();
private Map<String, String> abbreviations;
private Map<String, String> symbols;
private RuleBasedNumberFormat ruleBasedNumberFormat;
private NumberFormat numberFormat;
Expand All @@ -39,6 +40,26 @@ public Preprocess() throws MaryConfigurationException {
super("Preprocess", MaryDataType.TOKENS, MaryDataType.WORDS, locale.toLocale());
initNumberExpansion("formatRules.txt");
initSymbolExpansion("symbols.csv");
initAbbreviationExpansion("abbreviations.csv");
}

/**
 * Loads the abbreviation table from a CSV resource and stores it in {@code abbreviations}.
 * <p>
 * The resource is looked up relative to this class and decoded as UTF-8. Each record
 * provides an {@code abbreviation} column (the token text to match) and an
 * {@code expansion} column (the replacement text). The first line of the resource is a
 * header row naming these columns; it is skipped so that it is not stored as a mapping.
 *
 * @param resourceName name of the CSV resource, resolved via {@link Class#getResourceAsStream(String)}
 * @throws MaryConfigurationException if the resource cannot be found, read, or parsed;
 *         the underlying exception is preserved as the cause
 */
private void initAbbreviationExpansion(String resourceName) throws MaryConfigurationException {
    Map<String, String> loaded = new HashMap<>();
    // try-with-resources closes the parser, the reader and the stream (in that order),
    // whether or not parsing succeeds. A missing resource yields a null stream, which
    // makes the reader constructor fail; that failure is wrapped below like any other.
    try (InputStream abbreviationsStream = this.getClass().getResourceAsStream(resourceName);
         InputStreamReader abbreviationsReader = new InputStreamReader(abbreviationsStream, Charsets.UTF_8);
         CSVParser csv = CSVFormat.Builder.create(CSVFormat.DEFAULT)
                 .setHeader("abbreviation", "expansion")
                 // Without this, the header row would be parsed as an ordinary record.
                 .setSkipHeaderRecord(true)
                 .build()
                 .parse(abbreviationsReader)) {
        for (CSVRecord record : csv) {
            String abbreviation = record.get("abbreviation");
            String expansion = record.get("expansion");
            loaded.put(abbreviation, expansion);
        }
    } catch (Exception exception) {
        throw new MaryConfigurationException(String.format("Could not load abbreviations from %s.%s", this.getClass().getCanonicalName(), resourceName), exception);
    }
    // Publish the table only after the whole resource has been read successfully.
    abbreviations = loaded;
}

private void initSymbolExpansion(String resourceName) throws MaryConfigurationException {
Expand Down Expand Up @@ -73,13 +94,34 @@ private void initNumberExpansion(String resourceName) throws MaryConfigurationEx

/**
 * Expands abbreviations, symbols and numbers in the document carried by the input data.
 * <p>
 * The input's document is modified in place and the same document instance is attached
 * to the returned data object.
 *
 * @param d input data whose document is processed
 * @return a new {@code MaryData} of this module's output type, using the input's locale
 *         and holding the processed document
 */
public MaryData process(MaryData d) {
    final Document document = d.getDocument();

    // The passes run in this fixed order: abbreviations, then symbols, then numbers.
    expandAllAbbreviations(document);
    expandAllSymbols(document);
    expandAllNumbers(document);

    final MaryData output = new MaryData(getOutputType(), d.getLocale());
    output.setDocument(document);
    return output;
}

/**
 * Replaces the text of every token element whose text is a known abbreviation.
 * <p>
 * Walks all elements named {@code MaryXML.TOKEN} in the document, looks up each token's
 * text via {@link #expandAbbreviation(String)}, and rewrites the token text only when the
 * lookup produced a different value.
 *
 * @param document the document whose token elements are updated in place
 */
private void expandAllAbbreviations(Document document) {
    TreeWalker treeWalker = ((DocumentTraversal) document).createTreeWalker(document, NodeFilter.SHOW_ELEMENT,
            new NameNodeFilter(MaryXML.TOKEN), false);
    Element token;
    while ((token = (Element) treeWalker.nextNode()) != null) {
        String tokenText = MaryDomUtils.tokenText(token);
        String expandedAbbreviation = expandAbbreviation(tokenText);
        // Compare by value rather than by reference: whether two equal strings are the
        // same object is an implementation detail of the lookup and must not decide
        // whether the token is rewritten. The null guard keeps the comparison safe if
        // the lookup yields no text.
        if (expandedAbbreviation != null && !expandedAbbreviation.equals(tokenText)) {
            MaryDomUtils.setTokenText(token, expandedAbbreviation);
        }
    }
}

/**
 * Looks up the expansion for an abbreviation.
 *
 * @param abbreviation the text to look up in the abbreviation table
 * @return the mapped expansion if the table contains the given text as a key;
 *         otherwise the given text itself (the same instance)
 */
protected String expandAbbreviation(String abbreviation) {
    final boolean known = abbreviations.containsKey(abbreviation);
    return known ? abbreviations.get(abbreviation) : abbreviation;
}

private void expandAllSymbols(Document document) {
TreeWalker treeWalker = ((DocumentTraversal) document).createTreeWalker(document, NodeFilter.SHOW_ELEMENT,
new NameNodeFilter(MaryXML.TOKEN), false);
Expand Down Expand Up @@ -126,4 +168,4 @@ protected String spelloutNumber(Number number) {
return ruleBasedNumberFormat.format(number);
}

}
}
129 changes: 129 additions & 0 deletions src/main/resources/marytts/language/dsb/abbreviations.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
"abbreviation","expansion"
"kwart.","kwartal"
"kw.","kwartal"
"mjas.","mjasac"
"tyź.","tyźeń"
"góź.","góźina"
"min.","minutow"
"sek.","sekundow"
"pś.Kr.n.","pśed Kristusowym naroźenim"
"pś.n.l.c.","pśed našym licenim casa"
"pó Kr.n.","pó Kristusowym naroźenim"
"n.l.c.","našogo licenja casa"
"jan.","januar"
"feb.","februar"
"měr.","měrc"
"apr.","apryl"
"jun.","junij"
"jul.","julij"
"awg.","awgust"
"sep.","september"
"okt.","oktober"
"now.","nowember"
"dec.","december"
"dop.","dopołdnja"
"połd.","połdnjo"
"wótp.","wótpołdnja"
"nje.","njeźela"
"pón.","pónjeźele"
"wał.","wałtora"
"srj.","srjoda"
"stw.","stwórtk"
"pě.","pětk"
"sob.","sobota"
"nj.","njeźela"
"pó.","pónjeźele"
"sr.","srjoda"
"st.","stwórtk"
"tys.","tysac"
"mil.","milionow"
"mrd.","miliardow"
"bil.","bilionow"
"mio.","milionow"
"PLN","pólskich złotych"
"zł","pólskich złotych"
"Kč","českich kronow"
"EUR","eurow"
"DKK","dańskich kronow"
"DM","markow"
"CHF","šwicaŕskich frankow"
"CZK","českich krónow"
"HUF","hungorskich forintow"
"PLZ","pej el cet"
"RUB","rusojskich rublow"
"CNY","chinskich yuanow"
"CN¥","chinskich yuanow"
"JPY","japańskich yenow"
"AUD","awstralskich dolarow"
"NZ$","nowoseelandskich dolarow"
"dn.","dnjow"
"dn.","dnjow"
"cm","centimetrow"
"dm","decimetrow"
"ft","stopow"
"km","kilometrow"
"m","metrow"
"μm","mikrometrow"
"mm","milimetrow"
"nm","nanometrow"
"nmi","nawtiskich milow"
"cm²","kwadratnych centimetrow"
"ft²","kwadratnych stopow"
"in²","kwadratnych cólow"
"km²","kwadratnych kilometrow"
"m²","kwadratnych metrow"
"mi²","kwadratnych milow"
"yd²","kwadratnych yardow"
"cm³","kubiknych centimetrow"
"ft³","kubiknych stopow"
"in³","kubiknych cólow"
"km³","kubiknych kilometrow"
"m³","kubiknych metrow"
"mi³","kubiknych milow"
"yd³","kubiknych yardow"
"fl. oz.","žydkich uncow"
"łž.","łžycow"
"łžk.","łžyckow"
"m/s²","metrow na kwadratnu sekundu"
"km/h","kilometrow na góźinu"
"m/s","metrow na sekundu"
"mph","milow na góźinu"
"kg","kilogramow"
"µg","mikrogramow"
"mg","miligramow"
"oz","uncow"
"oz. tr.","trojskich uncow"
"lb","puntow"
"cal","kalorijow"
"kcal","kilokalorijow"
"kJ","kilodžulow"
"kWh","kilowattowych góźin"
"GW","gigawattow"
"PS","kónjecych mócow"
"kW","kilowattow"
"MW","megawattow"
"mW","miliwattow"
"mA","miliamperow"
"Ω","ohmow"
"GHz","gigahertzow"
"Hz","hertzow"
"kHz","kilohertzow"
"MHz","megahertzow"
"hPa","hektopascalow"
"inHg","cólow žywoslobrowego stołpika"
"mbar","milibarow"
"mm Hg","milimetrow žywoslobrowego stołpika"
"°C","stopnjow Celsiusa"
"°F","stopnjow Fahrenheita"
"bit","bitow"
"byte","byteow"
"Gb","gigabitow"
"GB","gigabyteow"
"kb","kilobitow"
"kB","kilobyteow"
"Mb","megabitow"
"MB","megabyteow"
"Tb","terabitow"
"TB","terabyteow"
"l/km","litrow na kilometer"
"mpg","milow na galonu"
5 changes: 0 additions & 5 deletions src/main/resources/marytts/language/dsb/symbols.csv
Original file line number Diff line number Diff line change
Expand Up @@ -77,13 +77,8 @@ $,dolarow
=,rowna se
>,wětše ako
@,et
[,wótwórjona rožkata spinka
\,beksleš
],zawrjena rožkata spinka
^,wušej
{,wótwórjona wuzgibnjona spinka
|,padorowna smužka
},zawrjena wuzgibnjona spinka
~,tilda
°,stopnjow
"°C",stopnjow Celsius
Expand Down

0 comments on commit f2c06c8

Please sign in to comment.