/
unicode.cpp
113 lines (103 loc) · 4.42 KB
/
unicode.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#include <nodec/unicode.hpp>
namespace nodec {
/*
* Unicode
* "
* The Unicode Standard defines a codespace of numerical values
* ranging from 0 through 10FFFF, called code points and
* denoted as U+0000 through U+10FFFF respectively.
* <https://en.wikipedia.org/wiki/Unicode>
* "
*
*/
namespace unicode {
namespace details {
/**
* UTF-8
* The x characters are replaced by the bits of the code point.
*
* |UTF-8 Number of bytes | Byte 1 | Byte 2 | Byte 3 | Byte 4 |
* +----------------------+-----------+-----------+-----------+-----------+
* | 1 | 0xxx xxxx | | | |
* | 2 | 110x xxxx | 10xx xxxx | | |
* | 3 | 111x xxxx | 10xx xxxx | 10xx xxxx | |
* | 4 | 1111 0xxx | 10xx xxxx | 10xx xxxx | 10xx xxxx |
*
* <https://en.wikipedia.org/wiki/UTF-8>
*/
/**
* UTF-16
* U+0000 to U+D7FF and U+E000 to U+FFFF
* Both UTF-16 and UCS-2 encode code points in this range as single 16-bit code units
* that are numerically equal to the corresponding code points.
* Code points from U+010000 to U+10FFFF
* U' = yyyyyyyyyyxxxxxxxxxx // U - 0x10000
* W1 = 110110yyyyyyyyyy // 0xD800 + yyyyyyyyyy
* W2 = 110111xxxxxxxxxx // 0xDC00 + xxxxxxxxxx
* U+D800 to U+DFFF
* The Unicode standard permanently reserves these code point values
* for UTF-16 encoding of the high and low surrogates,
* and they will never be assigned a character,
* so there should be no reason to encode them.
*
* | U+0000 |
* | ~ |
* | U+D7FF |
* | U+D800 | high_begin
* | ~ |
* | U+DBFF | hight_end
* | U+DC00 | low_begin
* | ~ |
* | U+DFFF | low_end
* | U+E000 |
* | ~ |
* | U+FFFF | max_bmp
* | ~ |
* | U+10FFFF |
*
* <https://en.wikipedia.org/wiki/UTF-16>
*/
const std::array<uint8_t, 7> FIRST_BYTE_MARK = {
0x00, // 0: 0000 0000
0x00, // 1: 0000 0000
0xC0, // 2: 1100 0000
0xE0, // 3: 1110 0000
0xF0, // 4: 1111 0000
0xF8, // 5: 1111 1000
0xFC // 6: 1111 1100
};
const std::array<uint8_t, 256> UTF8_BYTES = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0000 0000 ~
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0010 0000 ~
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0100 0000 ~
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0110 0000 ~
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1000 0000 ~
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1010 0000 ~
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1100 0000 ~
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 1110 0000 ~
3, 3, 3, 3, 3, 3, 3, 3, // 1111 0000 ~
4, 4, 4, 4, // 1111 1000 ~
5, 5, 5, 5 // 1111 1100 ~ 1111 1111
};
const std::array<uint32_t, 6> UTF8_OFFSETS = {
0x00000000UL, // 0: 0000 0000 0000 0000 0000 0000 0000 0000
0x00003080UL, // 1: 0000 0000 0000 0000 0011 0000 1000 0000
0x000E2080UL, // 2: 0000 0000 0000 1110 0010 0000 1000 0000
0x03C82080UL, // 3: 0000 0011 1100 1000 0010 0000 1000 0000
0xFA082080UL, // 4: 1111 1010 0000 1000 0010 0000 1000 0000
0x82082080UL // 5: 1000 0010 0000 1000 0010 0000 1000 0000
};
uint32_t check_strict(bool strict) {
/**
* Specials is a short Unicode block allocated at the very end of the Basic Multilingual Plane, at U+FFF0-FFFF.
* U+FFFD REPLACEMENT CHARACTER used to replace an unknown, unrecognized or unrepresentable character
*/
constexpr uint32_t replacement = 0x0000FFFD;
if (strict) {
details::throw_illegal_character_exception(__FILE__, __LINE__);
}
return replacement;
}
} // namespace details
} // namespace unicode
} // namespace nodec