Add Unicode-based Hangul decomposition functions

mooniker · Sep 21, 2019 · e355468 · e355468
1 parent 9f3d966
commit e355468
Show file tree

Hide file tree

Showing 2 changed files with 118 additions and 0 deletions.
diff --git a/hangul.decompose.js b/hangul.decompose.js
@@ -0,0 +1,60 @@
+// Hangul syllable constants from the Unicode Standard, section 3.12 "Conjoining Jamo Behavior"
+
+const SBase = 0xac00; // code point of syllable index 0 (s - SBase yields the syllable index)
+const LBase = 0x1100; // code point of leading-consonant index 0
+const VBase = 0x1161; // code point of vowel index 0
+const TBase = 0x11a7; // one less than the beginning of the range of trailing consonants
+const LCount = 19; // number of leading consonants
+// TCount (declared just below VCount) is one more than the number of trailing consonants relevant to the decomposition algorithm: (0x11C2 - 0x11A8 + 1) + 1
+const VCount = 21; // number of vowels
+const TCount = 28;
+// number of precomposed Hangul syllables starting with the same leading consonant, counting both the LV_Syllables and the LVT_Syllables for each possible trailing consonant
+const NCount = VCount * TCount; // 588
+const SCount = LCount * NCount; // 11172 - total number of precomposed Hangul syllables
+
+function arithmeticDecompositionMapingLV(s) {
+  const SIndex = s - SBase;
+
+  const LIndex = Math.floor(SIndex / NCount); // integer division rounded down
+  const VIndex = Math.floor((SIndex % NCount) / TCount);
+  const LPart = LBase + LIndex;
+  const VPart = VBase + VIndex;
+  return [LPart, VPart];
+}
+
+function arithmeticDecompositionMapingLVT(s) {
+  const SIndex = s - SBase;
+
+  const LVIndex = (SIndex / TCount) * TCount;
+  const TIndex = SIndex % TCount;
+  const LVPart = SBase + LVIndex;
+  const TPart = TBase + TIndex;
+
+  return [LVPart, TPart];
+}
+
+function fullCanonicalDecomposition(s) {
+  const SIndex = s.charCodeAt(0) - SBase;
+
+  const LIndex = Math.floor(SIndex / NCount);
+  const VIndex = Math.floor((SIndex % NCount) / TCount);
+  const TIndex = SIndex % TCount;
+  const LPart = LBase + LIndex;
+  const VPart = VBase + VIndex;
+
+  if (TIndex > 0) {
+    const TPart = TBase + TIndex;
+    return [LPart, VPart, TPart];
+  }
+
+  return [LPart, VPart];
+}
+
+// const hexToUnicodeChar = hex => String.fromCodePoint(hex);
+
+module.exports = {
+  arithmeticDecompositionMapingLV,
+  arithmeticDecompositionMapingLVT,
+  fullCanonicalDecomposition
+  // hexToUnicodeChar is left out because its definition above is commented out
+};
diff --git a/hangul.decompose.test.js b/hangul.decompose.test.js
@@ -0,0 +1,58 @@
+const {
+  //   hexToUnicodeChar,
+  fullCanonicalDecomposition,
+  arithmeticDecompositionMapingLV,
+  arithmeticDecompositionMapingLVT
+} = require("./hangul.decompose");
+
+const hexCases = { // character -> code point; in the visible code only the commented-out hexToUnicodeChar suite reads this
+  A: 0x41,
+  Z: 0x5a,
+  ㅎ: 0x1112, // NOTE(review): the jamo keys look like compatibility jamo, while the values are conjoining-jamo code points — confirm before re-enabling the suite
+  ㅏ: 0x1161,
+  ㄴ: 0x11ab
+};
+
+// describe("hexToUnicodeChar function", () => {
+//   Object.entries(hexCases).forEach(([char, hex]) => {
+//     test(`should render ${hex} as ${char}`, () => {
+//       expect(hexToUnicodeChar(hex)).toBe(char);
+//     });
+//   });
+// });
+
+const hangulHexCases = { // Hangul syllable string -> jamo code points expected from fullCanonicalDecomposition
+  [String.fromCodePoint(0xd55c)]: [0x1112, 0x1161, 0x11ab],
+  한: [0x1112, 0x1161, 0x11ab], // NOTE(review): if this literal is the precomposed U+D55C it is the same key as the line above, so both collapse into one entry — confirm intent
+  [String.fromCodePoint(0xd4db)]: [0x1111, 0x1171, 0x11b6],
+  퓛: [0x1111, 0x1171, 0x11b6], // NOTE(review): same possible key collision with the computed U+D4DB key above — confirm intent
+  서: [0x1109, 0x1165], // two code points: no trailing consonant expected
+  울: [0x110b, 0x116e, 0x11af],
+  평: [0x1111, 0x1167, 0x11bc],
+  양: [0x110b, 0x1163, 0x11bc]
+};
+
+describe("Hangul Unicode fullCanonicalDecomposition", () => {
+  Object.entries(hangulHexCases).forEach(([hangul, charCodes]) => {
+    test(`should decompose ${hangul} to character codes [${charCodes.join(
+      ","
+    )}]`, () => {
+      expect(fullCanonicalDecomposition(hangul)).toStrictEqual(charCodes);
+    });
+  });
+});
+
+describe("arithmeticDecompositionMapingLV", () => {
+  test("should pull out correct code points for ㅍ and ㅟ from 퓛", () => {
+    expect(arithmeticDecompositionMapingLV(0xd4db)).toStrictEqual([
+      0x1111,
+      0x1171
+    ]);
+  });
+});
+
+describe("arithmeticDecompositionMapingLVT", () => {
+  test("should pull out correct code point for trailing ㄹㅎ in 퓛", () => {
+    expect(arithmeticDecompositionMapingLVT(0xd4db)[1]).toBe(0x11b6);
+  });
+});