Skip to content

Commit

Permalink
Add Unicode-based Hangul decomposition functions
Browse files Browse the repository at this point in the history
  • Loading branch information
mooniker committed Sep 21, 2019
1 parent 9f3d966 commit e355468
Show file tree
Hide file tree
Showing 2 changed files with 118 additions and 0 deletions.
60 changes: 60 additions & 0 deletions hangul.decompose.js
@@ -0,0 +1,60 @@
// Constants for Hangul syllable decomposition, following
// The Unicode Standard, Section 3.12 "Conjoining Jamo Behavior".

const SBase = 0xac00; // base of the precomposed syllable range; subtracted from a syllable to get its index
const LBase = 0x1100; // base code point added to a leading-consonant index
const VBase = 0x1161; // base code point added to a vowel index
const TBase = 0x11a7; // one less than the beginning of the range of trailing consonants
const LCount = 19; // number of leading consonants
const VCount = 21; // number of vowels
// one more than the number of trailing consonants relevant to the decomposition algorithm: (0x11C2 - 0x11A8 + 1) + 1
const TCount = 28;
// number of precomposed Hangul syllables starting with the same leading consonant, counting both the LV_Syllables and the LVT_Syllables for each possible trailing consonant
const NCount = VCount * TCount; // 588
const SCount = LCount * NCount; // 11172 - total number of precomposed Hangul syllables

/**
 * Maps a precomposed Hangul syllable to its leading consonant and vowel.
 *
 * Any trailing consonant encoded in the syllable is ignored. No range check
 * is performed: the caller is expected to pass a code point inside the
 * precomposed syllable block that starts at SBase.
 *
 * @param {number} s - Code point of a precomposed Hangul syllable.
 * @returns {number[]} `[LPart, VPart]`: code points of the leading consonant
 *   and the vowel.
 */
function arithmeticDecompositionMapingLV(s) {
  // Zero-based position of the syllable inside the precomposed block.
  const offset = s - SBase;

  // Each leading consonant owns NCount consecutive syllables; within that
  // run, each vowel owns TCount consecutive syllables.
  const vowelSlot = Math.floor((offset % NCount) / TCount);
  const leadSlot = Math.floor(offset / NCount);

  return [LBase + leadSlot, VBase + vowelSlot];
}

/**
 * Splits a precomposed Hangul LVT syllable into its LV syllable and its
 * trailing consonant.
 *
 * No range check is performed. If `s` is already an LV syllable (no trailing
 * consonant), the second element equals TBase, which is one below the first
 * trailing consonant and therefore not a real jamo.
 *
 * @param {number} s - Code point of a precomposed Hangul syllable.
 * @returns {number[]} `[LVPart, TPart]`: code point of the LV syllable and
 *   code point of the trailing consonant.
 */
function arithmeticDecompositionMapingLVT(s) {
  const SIndex = s - SBase;

  const TIndex = SIndex % TCount;
  // The algorithm specifies *integer* division here. JavaScript's `/` is
  // floating-point, so without Math.floor the expression collapses back to
  // SIndex and LVPart would just be the input syllable.
  const LVIndex = Math.floor(SIndex / TCount) * TCount;

  const LVPart = SBase + LVIndex;
  const TPart = TBase + TIndex;

  return [LVPart, TPart];
}

/**
 * Fully decomposes the first character of `s` into conjoining jamo code
 * points.
 *
 * @param {string} s - String whose first character is decomposed; any
 *   further characters are ignored.
 * @returns {number[]} `[L, V]` for an LV syllable, `[L, V, T]` for an LVT
 *   syllable, `[codePoint]` unchanged when the first character is not a
 *   precomposed Hangul syllable, and `[]` for an empty string.
 */
function fullCanonicalDecomposition(s) {
  const codePoint = s.codePointAt(0);

  // Empty string: there is nothing to decompose.
  if (codePoint === undefined) {
    return [];
  }

  const SIndex = codePoint - SBase;

  // Outside the precomposed syllable block the arithmetic below would
  // produce meaningless jamo values, so the character is returned as-is.
  if (SIndex < 0 || SIndex >= SCount) {
    return [codePoint];
  }

  const LIndex = Math.floor(SIndex / NCount);
  const VIndex = Math.floor((SIndex % NCount) / TCount);
  const TIndex = SIndex % TCount;
  const LPart = LBase + LIndex;
  const VPart = VBase + VIndex;

  // TIndex 0 means "no trailing consonant" (an LV syllable).
  if (TIndex > 0) {
    return [LPart, VPart, TBase + TIndex];
  }

  return [LPart, VPart];
}

// const hexToUnicodeChar = hex => String.fromCodePoint(hex);

// Public API of this module (CommonJS). hexToUnicodeChar is currently
// commented out above and therefore not exported.
module.exports = {
arithmeticDecompositionMapingLV,
arithmeticDecompositionMapingLVT,
fullCanonicalDecomposition
// hexToUnicodeChar
};
58 changes: 58 additions & 0 deletions hangul.decompose.test.js
@@ -0,0 +1,58 @@
const {
// hexToUnicodeChar,
fullCanonicalDecomposition,
arithmeticDecompositionMapingLV,
arithmeticDecompositionMapingLVT
} = require("./hangul.decompose");

// Character -> code point fixtures. Within this file they are referenced
// only by the commented-out hexToUnicodeChar tests below.
// NOTE(review): the jamo keys look like Hangul *compatibility* jamo (U+31xx),
// while the values are *conjoining* jamo (U+11xx) - confirm the keys match
// String.fromCodePoint(value) before re-enabling those tests.
const hexCases = {
A: 0x41,
Z: 0x5a,
ㅎ: 0x1112,
ㅏ: 0x1161,
ㄴ: 0x11ab
};

// describe("hexToUnicodeChar function", () => {
// Object.entries(hexCases).forEach(([char, hex]) => {
// test(`should render ${hex} as ${char}`, () => {
// expect(hexToUnicodeChar(hex)).toBe(char);
// });
// });
// });

// Precomposed Hangul syllable -> expected array of jamo code points
// (leading consonant, vowel, and optional trailing consonant).
// NOTE(review): the computed keys (0xd55c, 0xd4db) appear to be the same
// characters as the literal keys that follow them; duplicate object keys
// collapse into a single entry, so each such pair yields only one case.
const hangulHexCases = {
[String.fromCodePoint(0xd55c)]: [0x1112, 0x1161, 0x11ab],
한: [0x1112, 0x1161, 0x11ab],
[String.fromCodePoint(0xd4db)]: [0x1111, 0x1171, 0x11b6],
퓛: [0x1111, 0x1171, 0x11b6],
서: [0x1109, 0x1165],
울: [0x110b, 0x116e, 0x11af],
평: [0x1111, 0x1167, 0x11bc],
양: [0x110b, 0x1163, 0x11bc]
};

// Generates one test per fixture in hangulHexCases: the syllable must
// decompose to exactly the listed code points.
describe("Hangul Unicode fullCanonicalDecomposition", () => {
  for (const [hangul, charCodes] of Object.entries(hangulHexCases)) {
    const expectedList = charCodes.join(",");

    test(`should decompose ${hangul} to character codes [${expectedList}]`, () => {
      const actual = fullCanonicalDecomposition(hangul);
      expect(actual).toStrictEqual(charCodes);
    });
  }
});

// Checks the leading-consonant / vowel pair extracted from U+D4DB.
describe("arithmeticDecompositionMapingLV", () => {
  test("should pull out correct code points for ㅍ and ㅟ from 퓛", () => {
    const expected = [0x1111, 0x1171];
    const actual = arithmeticDecompositionMapingLV(0xd4db);

    expect(actual).toStrictEqual(expected);
  });
});

// Checks the trailing-consonant element of the LVT mapping for U+D4DB.
// Only the second element of the returned pair is asserted here.
describe("arithmeticDecompositionMapingLVT", () => {
  test("should pull out correct code point for trailing ㄹㅎ in 퓛", () => {
    const [, trailing] = arithmeticDecompositionMapingLVT(0xd4db);

    expect(trailing).toBe(0x11b6);
  });
});

0 comments on commit e355468

Please sign in to comment.