Add basic/preliminary support for Revised Romanization (RR); RR transliteration is still a work in progress (WIP)

mooniker · Sep 30, 2019 · f649e80 · f649e80
1 parent 6202a08
commit f649e80
Show file tree

Hide file tree

Showing 3 changed files with 144 additions and 45 deletions.
diff --git a/jamo.js b/jamo.js
@@ -32,16 +32,17 @@ const jamoMapper = jamoSet => ({ jamo, roman }, idx) => {
   return Object.assign(jamoSet[idx], { roman, compatJamo, compatJamoHex });
 };
 
+// initial consonants
 const choseong = [
   { jamo: "ㄱ", roman: "g" },
-  { jamo: "ㄲ", roman: "gg" },
+  { jamo: "ㄲ", roman: "kk" },
   { jamo: "ㄴ", roman: "n" },
   { jamo: "ㄷ", roman: "d" },
   { jamo: "ㄸ", roman: "dd" },
-  { jamo: "ㄹ", roman: "r" },
+  { jamo: "ㄹ", roman: { default: "r", RRT: "l" } },
   { jamo: "ㅁ", roman: "m" },
   { jamo: "ㅂ", roman: "b" },
-  { jamo: "ㅃ", roman: "bb" },
+  { jamo: "ㅃ", roman: "pp" },
   { jamo: "ㅅ", roman: "s" },
   { jamo: "ㅆ", roman: "ss" },
   { jamo: "ㅇ", roman: "" },
@@ -54,6 +55,7 @@ const choseong = [
   { jamo: "ㅎ", roman: "h" }
 ].map(jamoMapper(initialConsonants));
 
+// medial vowels
 const jungseong = [
   { jamo: "ㅏ", roman: "a" },
   { jamo: "ㅐ", roman: "ae" },
@@ -78,15 +80,16 @@ const jungseong = [
   { jamo: "ㅣ", roman: "i" }
 ].map(jamoMapper(medialVowels));
 
+// final consonants
 const jongseong = [
   { jamo: null, roman: "" },
-  { jamo: "ㄱ", roman: "k" },
-  { jamo: "ㄲ", roman: "k" },
+  { jamo: "ㄱ", roman: { default: "k", vowelNext: "g", RRT: "g" } },
+  { jamo: "ㄲ", roman: "kk" },
   { jamo: "ㄳ", roman: "k" },
   { jamo: "ㄴ", roman: "n" },
   { jamo: "ㄵ", roman: "n" },
   { jamo: "ㄶ", roman: "n" },
-  { jamo: "ㄷ", roman: "d" },
+  { jamo: "ㄷ", roman: { default: "t", vowelNext: "d", RRT: "d" } },
   { jamo: "ㄹ", roman: "l" },
   { jamo: "ㄺ", roman: "r" },
   { jamo: "ㄻ", roman: "lm" },
@@ -96,17 +99,17 @@ const jongseong = [
   { jamo: "ㄿ", roman: "lp" },
   { jamo: "ㅀ", roman: "lh" },
   { jamo: "ㅁ", roman: "m" },
-  { jamo: "ㅂ", roman: "b" },
+  { jamo: "ㅂ", roman: { default: "p", vowelNext: "b", RRT: "b" } },
   { jamo: "ㅄ", roman: "bs" },
   { jamo: "ㅅ", roman: "s" },
   { jamo: "ㅆ", roman: "ss" },
   { jamo: "ㅇ", roman: "ng" },
-  { jamo: "ㅈ", roman: "j" },
-  { jamo: "ㅊ", roman: "ch" },
+  { jamo: "ㅈ", roman: { default: "t", vowelNext: "j" } },
+  { jamo: "ㅊ", roman: { default: "t", vowelNext: "ch", RRT: "ch" } },
   { jamo: "ㅋ", roman: "k" },
   { jamo: "ㅌ", roman: "t" },
   { jamo: "ㅍ", roman: "p" },
-  { jamo: "ㅎ", roman: "h" }
+  { jamo: "ㅎ", roman: { default: "t", RRT: "h" } }
 ].map(jamoMapper(finalConsonants));
 
 module.exports = [choseong, jungseong, jongseong];
diff --git a/romanize.js b/romanize.js
@@ -6,49 +6,71 @@ const _ = require("lodash");
 const getJamoDictionary = (jamo, idx) =>
   _.find(jamos[idx], { jamo }) || _.find(jamos[idx], { compatJamo: jamo });
 
-function searchJamo(node) {
+function searchJamo(node, params, prevNode) {
+  const { method, vowelNext } = params || {};
+  if (typeof node === "string") {
+    return node;
+  }
+
   if (!node) {
-    throw new Error("No node found.");
+    console.warn(prevNode);
+    throw new Error("No node found:" + node);
   }
 
-  if (typeof node === "string") {
-    return node;
+  // treat empty string (initial silent ieung/ㅇ) as truthy
+  if (node.roman || typeof node.roman === "string") {
+    return next(node.roman);
   }
 
-  if (node.roman) {
-    return searchJamo(node.roman);
+  if (method && (node[method] || typeof node[method] === "string")) {
+    return next(node[method]);
   }
 
-  throw new Error("unimplemented");
-}
+  // console.log(params, vowelNext, node.vowelNext, node);
+  if (vowelNext && (node.vowelNext || typeof node.vowelNext === "string")) {
+    return next(node.vowelNext);
+  }
 
-function romanize(text) {
-  return replaceHangul(text, romanizeWord);
+  if (node.default || typeof node.default === "string") {
+    return next(node.default);
+  }
+
+  console.warn(prevNode);
+  throw new Error("Unimplemented: " + JSON.stringify(node, null, 2));
+
+  function next(nextNode) {
+    return searchJamo(nextNode, params, node);
+  }
 }
 
-function parseSyllable(syllable, idx, syllabary) {
-  // next subsequent initial consonant (choseong)
-  const next = idx + 1 < syllabary.length ? syllabary[idx + 1][0] : null;
+const syllableParser = (method = "RR") =>
+  function(syllable, idx, word) {
+    // initial consonant (choseong) of the following syllable, if any
+    const next = idx + 1 < word.length ? word[idx + 1][0] : undefined;
+    const vowelNext = next === 0x110b || next === "ᄋ";
 
-  // previous adjacent trailing consonant (jongseong)
-  const prev = idx > 0 ? syllabary[idx - 1][2] : null;
+    // previous adjacent trailing consonant (jongseong)
+    // const prev = idx > 0 ? word[idx - 1][2] : undefined;
 
-  return syllable.map((jamo, idx, syllable) => {
-    const dict =
-      getJamoDictionary(jamo, idx) ||
-      getJamoDictionary(String.fromCodePoint(jamo), idx);
-    if (!dict) {
-      throw new Error("missing dict " + jamo);
-    }
+    return syllable.map((jamo, jamoIdx) => {
+      const dict =
+        getJamoDictionary(jamo, jamoIdx) ||
+        getJamoDictionary(String.fromCodePoint(jamo), jamoIdx);
 
-    return searchJamo(dict);
-  });
-}
+      if (!dict) {
+        throw new Error("missing dict " + jamo);
+      }
+
+      return searchJamo(dict, { method, vowelNext });
+    });
+  };
 
-const romanizeWord = word =>
+const romanizeWord = (word, method = "RR") =>
   decomposeHangul(word)
-    .map(parseSyllable)
+    .map(syllableParser(method))
     .reduce((acc, val) => acc.concat(val), [])
     .join("");
 
-module.exports = { romanizeWord, romanize };
+const romanize = (text, options) => replaceHangul(text, romanizeWord);
+
+module.exports = { syllableParser, romanizeWord, romanize };
diff --git a/romanize.test.js b/romanize.test.js
@@ -1,27 +1,101 @@
-const { romanize, romanizeWord } = require("./romanize");
+const { syllableParser, romanize, romanizeWord } = require("./romanize");
 const translations = require("./translations");
 
-const testWords = {
+const simpleWords = {
   가: "ga",
   나: "na",
   다: "da",
   로마자: "romaja",
+  표기법: "pyogibeop",
+  // 국어의: "gugeoui",
   만남: "mannam",
   동무: "dongmu"
 };
 
+const plosiveCases = {
+  구미: "Gumi",
+  영동: "Yeongdong",
+  백암: "Baegam",
+  옥천: "Okcheon",
+  합덕: "Hapdeok",
+  호법: "Hobeop",
+  월곶: "Wolgot", // [월곧]
+  벚꽃: "beotkkot", // [벋꼳]
+  한밭: "Hanbat" // [한받]
+};
+
+const wordsWithAdjacentConsonantAssimilation = {
+  백마: "Baengma", // [뱅마]
+  신문로: "Sinmunno", // [신문노]
+  종로: "Jongno", // [종노]
+  왕십리: "Wangsimni", // [왕심니]
+  별내: "Byeollae", // [별래]
+  신라: "Silla" // [실라]
+};
+
+const transliterationCases = {
+  집: "jib",
+  짚: "jip",
+  밖: "bakk",
+  값: "gabs",
+  붓꽃: "buskkoch",
+  먹는: "meogneun",
+  독립: "doglib",
+  문리: "munli",
+  // 물엿: "mul-yeos",
+  // 굳이: "gud-i",
+  좋다: "johda",
+  가곡: "gagog",
+  조랑말: "jolangmal"
+  // 없었습니다: "eobs-eoss-seubnida"
+};
+
 describe("romanizeWord function", () => {
-  Object.entries(testWords).forEach(([hangeul, romaja]) => {
-    test(`should romanize ${hangeul} to ${romaja}`, () => {
-      expect(romanizeWord(hangeul)).toBe(romaja);
+  describe("should romanize simple words", () => {
+    Object.entries(simpleWords).forEach(([hangulWord, expectedRomaja]) => {
+      test(`${hangulWord} to ${expectedRomaja}`, () => {
+        expect(romanizeWord(hangulWord)).toBe(expectedRomaja);
+      });
+    });
+  });
+
+  describe("should transcribe plosives/stops ㄱ, ㄷ, and ㅂ as 'g', 'd', and 'b' before a vowel and as 'k', 't', and 'p' when before another consonant or as the last sound of a word", () => {
+    Object.entries(plosiveCases).forEach(([hangulWord, expectedRomaja]) => {
+      test(`${hangulWord} to ${expectedRomaja}`, () => {
+        expect(romanizeWord(hangulWord)).toBe(expectedRomaja.toLowerCase());
+      });
     });
   });
+
+  describe("should transliterate", () => {
+    Object.entries(transliterationCases).forEach(
+      ([hangulWord, expectedRomaja]) => {
+        test(`${hangulWord} to ${expectedRomaja}`, () => {
+          expect(romanizeWord(hangulWord, "RRT")).toBe(expectedRomaja);
+        });
+      }
+    );
+  });
+
+  // describe("should romanize adjacent consonant assimilation", () => {
+  //   Object.entries(wordsWithAdjacentConsonantAssimilation).forEach(
+  //     ([hangulWord, expectedRomaja]) => {
+  //       test(`in ${hangulWord} to ${expectedRomaja.toLowerCase()}`, () => {
+  //         expect(romanizeWord(hangulWord)).toBe(expectedRomaja.toLowerCase());
+  //       });
+  //     }
+  //   );
+  // });
 });
 
 describe("romanize function", () => {
+  // test("should romanize Hangul string with spaces", () => {
+  //   expect(romanize("국어의 로마자 표기법")).toBe("gugeoui romaja pyogibeop");
+  // });
+
   test("should romanize 로마자 as romaja", () => {
-    expect(romanize("로마자 is the Korean word for Latin letters.")).toBe(
-      "romaja is the Korean word for Latin letters."
+    expect(romanize("The Korean word for Latin letters is 로마자.")).toBe(
+      "The Korean word for Latin letters is romaja."
     );
   });
 });