"""Derive an associated-phrases data file from a sorted phrase data file.

Every usable multi-character phrase in the source file is rewritten so that
each character is paired with its reading, the phrases are grouped by their
first character/reading pair, the best-scoring phrases of each group are
kept, and the result is written out sorted by UTF-8 byte order.
"""
import argparse
import unicodedata
from collections import defaultdict
from dataclasses import dataclass, field
from typing import List, Optional, Tuple

PRAGMA = "# format org.openvanilla.mcbopomofo.sorted"
MAX_ENTRIES_PER_PREFIX = 60
EMOJI_SCORE = -8.0


@dataclass
class Entry:
    """One `reading value score` record of the source data file."""

    reading: str
    value: str
    score: float

    # Memoisation state for zipped_readings_and_values(). A separate flag is
    # needed because None is itself a valid cached result. Both fields are
    # excluded from __eq__ and __repr__ so that querying an entry does not
    # change how it compares or prints.
    _zipped_readings_and_values_computed: bool = field(
        default=False, repr=False, compare=False
    )
    _cached_zipped_readings_and_values: Optional[List[Tuple[str, str]]] = field(
        default=None, repr=False, compare=False
    )

    def associated_phrase_line(self) -> Optional[str]:
        """Return this entry as a line in the associated phrase format.

        An original entry of `ㄙˋ-ㄗˋ-ㄕㄡˊ-ㄩˇ 四字熟語 -7.28009397`
        yields the line `四-ㄙˋ-字-ㄗˋ-熟-ㄕㄡˊ-語-ㄩˇ -7.2801` (the score is
        formatted with four decimal places).

        Returns:
            The formatted line, or None when the entry is not usable (see
            zipped_readings_and_values) or has fewer than two characters.
        """
        pairs = self.zipped_readings_and_values()
        if not pairs or len(pairs) < 2:
            return None

        joined = "-".join(f"{value}-{reading}" for reading, value in pairs)
        return f"{joined} {self.score:.4f}"

    def zipped_readings_and_values(self) -> Optional[List[Tuple[str, str]]]:
        """Return `(reading, character)` pairs for this entry, memoised.

        Returns:
            A list with one `(reading, character)` tuple per character of
            `value`, or None when the entry is filtered out: its reading
            starts with "_", its score is at or below EMOJI_SCORE, its value
            contains a character outside Unicode category "Lo" (Letter,
            other), or the number of "-"-separated readings differs from the
            number of characters.
        """
        if not self._zipped_readings_and_values_computed:
            self._cached_zipped_readings_and_values = self._zip_readings_and_values()
            self._zipped_readings_and_values_computed = True
        return self._cached_zipped_readings_and_values

    def _zip_readings_and_values(self) -> Optional[List[Tuple[str, str]]]:
        """Compute the uncached result of zipped_readings_and_values()."""
        if self.reading.startswith("_"):  # No punctuations
            return None

        if self.score <= EMOJI_SCORE:  # No emojis or other symbols
            return None

        # Everything needs to be a Unicode "Lo" character.
        if any(unicodedata.category(c) != "Lo" for c in self.value):
            return None

        reading_parts = self.reading.split("-")
        if len(reading_parts) != len(self.value):
            return None

        return list(zip(reading_parts, self.value))

    @classmethod
    def from_line(cls, line: str) -> "Entry":
        """Parse one `reading value score` line separated by single spaces.

        Raises:
            ValueError: if the line does not have exactly three fields or the
                score is not a valid float.
        """
        parts = line.strip().split(" ")
        if len(parts) != 3:
            raise ValueError(f"Expected 'reading value score', got: {line!r}")
        reading, value, score = parts
        return cls(reading, value, float(score))


def main(source_file, target_file):
    """Convert `source_file` into an associated-phrases file at `target_file`.

    Args:
        source_file: path of the input file; its first line must be PRAGMA.
        target_file: path of the output file, which is overwritten.

    Raises:
        ValueError: if the source file does not start with PRAGMA or contains
            a malformed record.
    """
    # The data is CJK/Bopomofo text and the output is ordered by UTF-8 bytes,
    # so the encoding is pinned instead of relying on the platform default.
    with open(source_file, "r", encoding="utf-8") as f:
        if f.readline().strip() != PRAGMA:
            raise ValueError("Invalid source file")
        # readline() has already consumed the header, so every remaining line
        # is data. Blank lines and "#" lines carry no record and are skipped.
        entries = [
            Entry.from_line(line)
            for line in map(str.strip, f)
            if line and not line.startswith("#")
        ]

    # Group the usable multi-character entries by their first
    # character/reading pair.
    prefix_entry_map = defaultdict(list)
    for entry in entries:
        pairs = entry.zipped_readings_and_values()
        if not pairs or len(pairs) < 2:
            continue
        first_reading, first_value = pairs[0]
        prefix_entry_map[f"{first_value}-{first_reading}"].append(entry)

    # Keep only the highest-scoring entries of each group. The order in which
    # groups are visited is irrelevant because of the global sort below.
    output_lines = []
    for group in prefix_entry_map.values():
        best = sorted(group, key=lambda e: e.score, reverse=True)
        output_lines.extend(
            e.associated_phrase_line() for e in best[:MAX_ENTRIES_PER_PREFIX]
        )
    output_lines.sort(key=lambda text: text.encode("utf-8"))

    with open(target_file, "w", encoding="utf-8") as f:
        print(PRAGMA, file=f)
        for line in output_lines:
            print(line, file=f)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("source", help="source file")
    parser.add_argument("target", help="target file")
    args = parser.parse_args()

    main(args.source, args.target)
= parser.parse_args() - - main(args.source, args.target) diff --git a/src/Engine/AssociatedPhrasesV2.cpp b/src/Engine/AssociatedPhrasesV2.cpp index e78a2be..bd6f8c4 100644 --- a/src/Engine/AssociatedPhrasesV2.cpp +++ b/src/Engine/AssociatedPhrasesV2.cpp @@ -121,6 +121,8 @@ void AssociatedPhrasesV2::close() { mmapedFile_.close(); } +bool AssociatedPhrasesV2::isLoaded() const { return db_ != nullptr; } + bool AssociatedPhrasesV2::open(std::unique_ptr db) { if (db_ != nullptr) { return false; diff --git a/src/Engine/AssociatedPhrasesV2.h b/src/Engine/AssociatedPhrasesV2.h index 60622eb..916bfce 100644 --- a/src/Engine/AssociatedPhrasesV2.h +++ b/src/Engine/AssociatedPhrasesV2.h @@ -40,6 +40,7 @@ class AssociatedPhrasesV2 { bool open(const char* path); void close(); + bool isLoaded() const; // Allows the use of existing in-memory db. bool open(std::unique_ptr db); diff --git a/src/Engine/McBopomofoLM.cpp b/src/Engine/McBopomofoLM.cpp index 8cb1edd..0bb3107 100644 --- a/src/Engine/McBopomofoLM.cpp +++ b/src/Engine/McBopomofoLM.cpp @@ -43,7 +43,9 @@ void McBopomofoLM::loadLanguageModel(const char* languageModelDataPath) { } } -bool McBopomofoLM::isDataModelLoaded() { return languageModel_.isLoaded(); } +bool McBopomofoLM::isDataModelLoaded() const { + return languageModel_.isLoaded(); +} void McBopomofoLM::loadAssociatedPhrasesV2(const char* associatedPhrasesPath) { if (associatedPhrasesPath) { @@ -64,6 +66,10 @@ void McBopomofoLM::loadUserPhrases(const char* userPhrasesDataPath, } } +bool McBopomofoLM::isAssociatedPhrasesV2Loaded() const { + return associatedPhrasesV2_.isLoaded(); +} + void McBopomofoLM::loadPhraseReplacementMap(const char* phraseReplacementPath) { if (phraseReplacementPath) { phraseReplacement_.close(); diff --git a/src/Engine/McBopomofoLM.h b/src/Engine/McBopomofoLM.h index 1492628..888cdd2 100644 --- a/src/Engine/McBopomofoLM.h +++ b/src/Engine/McBopomofoLM.h @@ -67,11 +67,13 @@ class McBopomofoLM : public Formosa::Gramambular2::LanguageModel { // 
Loads (or reloads, if already loaded) the primary language model data file. void loadLanguageModel(const char* languageModelDataPath); - bool isDataModelLoaded(); + bool isDataModelLoaded() const; // Loads (or reloads if already loaded) the associated phrases data file. void loadAssociatedPhrasesV2(const char* associatedPhrasesPath); + bool isAssociatedPhrasesV2Loaded() const; + // Loads (or reloads if already loaded) both the user phrases and the excluded // phrases files. If one argument is passed a nullptr, that file will not // be loaded or reloaded. diff --git a/src/Engine/ParselessLM.cpp b/src/Engine/ParselessLM.cpp index 4dd5afb..2cd6211 100644 --- a/src/Engine/ParselessLM.cpp +++ b/src/Engine/ParselessLM.cpp @@ -34,7 +34,7 @@ namespace McBopomofo { -bool ParselessLM::isLoaded() { return mmapedFile_.data() != nullptr; } +bool ParselessLM::isLoaded() const { return db_ != nullptr; } bool ParselessLM::open(const char* path) { if (!mmapedFile_.open(path)) { diff --git a/src/Engine/ParselessLM.h b/src/Engine/ParselessLM.h index cf9ba91..892bfd2 100644 --- a/src/Engine/ParselessLM.h +++ b/src/Engine/ParselessLM.h @@ -42,7 +42,7 @@ class ParselessLM : public Formosa::Gramambular2::LanguageModel { ParselessLM& operator=(const ParselessLM&) = delete; ParselessLM& operator=(ParselessLM&&) = delete; - bool isLoaded(); + bool isLoaded() const; bool open(const char* path); void close(); diff --git a/src/KeyHandler.h b/src/KeyHandler.h index 3d19165..073d92a 100644 --- a/src/KeyHandler.h +++ b/src/KeyHandler.h @@ -235,9 +235,9 @@ class KeyHandler { // contain the prefix. This allows the following two scenarios: // // (1) the current walk is 得 and we want to pin the phrase 得到; in this - // case, - // the prefixReading is ㄉㄜˊ and prefixValue is 得, and the associated - // phrase's reading and value are ㄉㄜˊ-ㄉㄠˋ and 得到 respectively. 
+ // case, the prefixReading is ㄉㄜˊ and prefixValue is 得, and the + // associated phrase's reading and value are ㄉㄜˊ-ㄉㄠˋ and 得到 + // respectively. // (2) the current walk is 得 but we want to pin the phrase 德性, coming from // the choosing-candidate state; in this case, the prefix reading and // value is now ㄉㄜˊ and 德, and the associated phrase is ㄉㄜˊ-ㄒㄧㄥˋ