Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add Unicode-based Hangul decomposition functions
- Loading branch information
Showing
2 changed files
with
118 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
// Constants for Hangul syllable (de)composition, from the Unicode Standard,
// section 3.12 "Conjoining Jamo Behavior".

const SBase = 0xac00; // first precomposed Hangul syllable
const LBase = 0x1100; // first leading consonant
const VBase = 0x1161; // first vowel
const TBase = 0x11a7; // one less than the beginning of the range of trailing consonants
const LCount = 19; // number of leading consonants
const VCount = 21; // number of vowels
// one more than the number of trailing consonants relevant to the decomposition algorithm: (0x11C2 - 0x11A8 + 1) + 1
const TCount = 28;
// number of precomposed Hangul syllables starting with the same leading consonant, counting both the LV_Syllables and the LVT_Syllables for each possible trailing consonant
const NCount = VCount * TCount; // 588
const SCount = LCount * NCount; // 11172 - total number of precomposed Hangul syllables
|
||
/**
 * Extracts the leading consonant and vowel of a precomposed Hangul syllable
 * using the arithmetic mapping from the Unicode Standard, section 3.12.
 *
 * @param {number} s - Code point of a precomposed Hangul syllable
 *   (SBase .. SBase + SCount - 1).
 * @returns {number[]} `[LPart, VPart]` as conjoining jamo code points, or
 *   `[s]` unchanged when `s` is not a precomposed Hangul syllable.
 */
function arithmeticDecompositionMapingLV(s) {
  const SIndex = s - SBase;

  // Outside the precomposed syllable block there is nothing to decompose;
  // without this guard the arithmetic below yields meaningless code points.
  if (!Number.isInteger(SIndex) || SIndex < 0 || SIndex >= SCount) {
    return [s];
  }

  const LIndex = Math.floor(SIndex / NCount); // integer division rounded down
  const VIndex = Math.floor((SIndex % NCount) / TCount);
  const LPart = LBase + LIndex;
  const VPart = VBase + VIndex;
  return [LPart, VPart];
}
|
||
/**
 * Splits a precomposed LVT Hangul syllable into its LV syllable and trailing
 * consonant using the arithmetic mapping from the Unicode Standard,
 * section 3.12.
 *
 * @param {number} s - Code point of a precomposed Hangul syllable.
 * @returns {number[]} `[LVPart, TPart]`, where `LVPart` is the precomposed
 *   LV syllable and `TPart` the trailing consonant jamo. Returns `[s]`
 *   unchanged when `s` has no trailing consonant or is not a precomposed
 *   Hangul syllable.
 */
function arithmeticDecompositionMapingLVT(s) {
  const SIndex = s - SBase;

  // Not a precomposed Hangul syllable: nothing to decompose.
  if (!Number.isInteger(SIndex) || SIndex < 0 || SIndex >= SCount) {
    return [s];
  }

  const TIndex = SIndex % TCount;

  // An LV syllable has no trailing consonant; TBase itself is only a
  // sentinel (one below the first trailing consonant), so never emit it.
  if (TIndex === 0) {
    return [s];
  }

  // Integer division rounded down, then scaled back up: the index of the
  // LV syllable that shares this syllable's leading consonant and vowel.
  const LVIndex = Math.floor(SIndex / TCount) * TCount;
  const LVPart = SBase + LVIndex;
  const TPart = TBase + TIndex;

  return [LVPart, TPart];
}
|
||
/**
 * Fully decomposes the first character of `s` into conjoining jamo code
 * points, following the Hangul syllable decomposition algorithm of the
 * Unicode Standard, section 3.12.
 *
 * @param {string} s - String whose first character is decomposed.
 * @returns {number[]} `[LPart, VPart]` for an LV syllable,
 *   `[LPart, VPart, TPart]` for an LVT syllable, the character's own code
 *   point in a one-element array when it is not a precomposed Hangul
 *   syllable, and `[]` for an empty string.
 */
function fullCanonicalDecomposition(s) {
  const codePoint = s.codePointAt(0);

  // Empty string: there is no character to decompose.
  if (codePoint === undefined) {
    return [];
  }

  const SIndex = codePoint - SBase;

  // Characters outside the precomposed syllable block decompose to themselves.
  if (SIndex < 0 || SIndex >= SCount) {
    return [codePoint];
  }

  const LIndex = Math.floor(SIndex / NCount);
  const VIndex = Math.floor((SIndex % NCount) / TCount);
  const TIndex = SIndex % TCount;
  const LPart = LBase + LIndex;
  const VPart = VBase + VIndex;

  // TIndex 0 means "no trailing consonant" (an LV syllable).
  if (TIndex > 0) {
    const TPart = TBase + TIndex;
    return [LPart, VPart, TPart];
  }

  return [LPart, VPart];
}
|
||
// const hexToUnicodeChar = hex => String.fromCodePoint(hex); | ||
|
||
module.exports = { | ||
arithmeticDecompositionMapingLV, | ||
arithmeticDecompositionMapingLVT, | ||
fullCanonicalDecomposition | ||
// hexToUnicodeChar | ||
}; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
const { | ||
// hexToUnicodeChar, | ||
fullCanonicalDecomposition, | ||
arithmeticDecompositionMapingLV, | ||
arithmeticDecompositionMapingLVT | ||
} = require("./hangul.decompose"); | ||
|
||
// Sample characters paired with their Unicode code points.
// The Hangul keys are written as escapes so that they are unambiguously the
// conjoining jamo (U+1100 block) the values refer to; the visually similar
// Hangul compatibility jamo (U+3130 block) are different characters.
const hexCases = {
  A: 0x41,
  Z: 0x5a,
  "\u1112": 0x1112, // HANGUL CHOSEONG HIEUH
  "\u1161": 0x1161, // HANGUL JUNGSEONG A
  "\u11ab": 0x11ab, // HANGUL JONGSEONG NIEUN
};
|
||
// describe("hexToUnicodeChar function", () => { | ||
// Object.entries(hexCases).forEach(([char, hex]) => { | ||
// test(`should render ${hex} as ${char}`, () => { | ||
// expect(hexToUnicodeChar(hex)).toBe(char); | ||
// }); | ||
// }); | ||
// }); | ||
|
||
// Precomposed Hangul syllables mapped to the conjoining jamo code points
// they are expected to decompose into.
// Each syllable appears once: a computed key and a literal key for the same
// character would name the same property and silently collapse into one.
const hangulHexCases = {
  [String.fromCodePoint(0xd55c)]: [0x1112, 0x1161, 0x11ab], // 한
  [String.fromCodePoint(0xd4db)]: [0x1111, 0x1171, 0x11b6], // 퓛
  서: [0x1109, 0x1165],
  울: [0x110b, 0x116e, 0x11af],
  평: [0x1111, 0x1167, 0x11bc],
  양: [0x110b, 0x1163, 0x11bc],
};
|
||
// Generates one test per entry of hangulHexCases: the syllable (key) must
// decompose into exactly the listed code points (value).
describe("Hangul Unicode fullCanonicalDecomposition", () => {
  for (const [hangul, charCodes] of Object.entries(hangulHexCases)) {
    const expectedList = charCodes.join(",");

    test(`should decompose ${hangul} to character codes [${expectedList}]`, () => {
      const decomposed = fullCanonicalDecomposition(hangul);
      expect(decomposed).toStrictEqual(charCodes);
    });
  }
});
|
||
// Checks the leading consonant / vowel pair extracted from U+D4DB.
describe("arithmeticDecompositionMapingLV", () => {
  test("should pull out correct code points for ㅍ and ㅟ from 퓛", () => {
    const syllable = 0xd4db;
    const expectedLeadingAndVowel = [0x1111, 0x1171];

    const actual = arithmeticDecompositionMapingLV(syllable);

    expect(actual).toStrictEqual(expectedLeadingAndVowel);
  });
});
|
||
// Checks only the trailing consonant (second element) extracted from U+D4DB.
describe("arithmeticDecompositionMapingLVT", () => {
  test("should pull out correct code point for trailing ㄹㅎ in 퓛", () => {
    const [, trailing] = arithmeticDecompositionMapingLVT(0xd4db);

    expect(trailing).toBe(0x11b6);
  });
});