From cffd1aa30832875d568495a7cad77f512f50b9e7 Mon Sep 17 00:00:00 2001 From: Jorropo Date: Sat, 4 Dec 2021 03:52:13 +0100 Subject: [PATCH] feat: add base256emoji --- multibase.csv | 51 ++-- rfcs/Base256Emoji.md | 541 ++++++++++++++++++++++++++++++++++++ tests/basic.csv | 1 + tests/leading_zero.csv | 1 + tests/two_leading_zeros.csv | 1 + 5 files changed, 570 insertions(+), 25 deletions(-) create mode 100644 rfcs/Base256Emoji.md diff --git a/multibase.csv b/multibase.csv index 33f4f09..7c7549d 100644 --- a/multibase.csv +++ b/multibase.csv @@ -1,25 +1,26 @@ -encoding, code, description, status -identity, 0x00, 8-bit binary (encoder and decoder keeps data unmodified), default -base2, 0, binary (01010101), candidate -base8, 7, octal, draft -base10, 9, decimal, draft -base16, f, hexadecimal, default -base16upper, F, hexadecimal, default -base32hex, v, rfc4648 case-insensitive - no padding - highest char, candidate -base32hexupper, V, rfc4648 case-insensitive - no padding - highest char, candidate -base32hexpad, t, rfc4648 case-insensitive - with padding, candidate -base32hexpadupper, T, rfc4648 case-insensitive - with padding, candidate -base32, b, rfc4648 case-insensitive - no padding, default -base32upper, B, rfc4648 case-insensitive - no padding, default -base32pad, c, rfc4648 case-insensitive - with padding, candidate -base32padupper, C, rfc4648 case-insensitive - with padding, candidate -base32z, h, z-base-32 (used by Tahoe-LAFS), draft -base36, k, base36 [0-9a-z] case-insensitive - no padding, draft -base36upper, K, base36 [0-9a-z] case-insensitive - no padding, draft -base58btc, z, base58 bitcoin, default -base58flickr, Z, base58 flicker, candidate -base64, m, rfc4648 no padding, default -base64pad, M, rfc4648 with padding - MIME encoding, candidate -base64url, u, rfc4648 no padding, default -base64urlpad, U, rfc4648 with padding, default -proquint, p, PRO-QUINT https://arxiv.org/html/0901.4016, draft +encoding, code, description, status +identity, 0x00, 8-bit 
binary (encoder and decoder keeps data unmodified), default +base2, 0, binary (01010101), candidate +base8, 7, octal, draft +base10, 9, decimal, draft +base16, f, hexadecimal, default +base16upper, F, hexadecimal, default +base32hex, v, rfc4648 case-insensitive - no padding - highest char, candidate +base32hexupper, V, rfc4648 case-insensitive - no padding - highest char, candidate +base32hexpad, t, rfc4648 case-insensitive - with padding, candidate +base32hexpadupper, T, rfc4648 case-insensitive - with padding, candidate +base32, b, rfc4648 case-insensitive - no padding, default +base32upper, B, rfc4648 case-insensitive - no padding, default +base32pad, c, rfc4648 case-insensitive - with padding, candidate +base32padupper, C, rfc4648 case-insensitive - with padding, candidate +base32z, h, z-base-32 (used by Tahoe-LAFS), draft +base36, k, base36 [0-9a-z] case-insensitive - no padding, draft +base36upper, K, base36 [0-9a-z] case-insensitive - no padding, draft +base58btc, z, base58 bitcoin, default +base58flickr, Z, base58 flicker, candidate +base64, m, rfc4648 no padding, default +base64pad, M, rfc4648 with padding - MIME encoding, candidate +base64url, u, rfc4648 no padding, default +base64urlpad, U, rfc4648 with padding, default +proquint, p, PRO-QUINT https://arxiv.org/html/0901.4016, draft +base256emoji, 🚀, base256 with custom alphabet using variable-sized-codepoints, draft diff --git a/rfcs/Base256Emoji.md b/rfcs/Base256Emoji.md new file mode 100644 index 0000000..54ec658 --- /dev/null +++ b/rfcs/Base256Emoji.md @@ -0,0 +1,541 @@ +# Base256Emoji + +This base is a benchmark / test / torture for implementations that want +to support Unicode. + +## Encoding + +Since both a byte and a base256 symbol have 256 possible values, the +encoding is trivial: there is a one-to-one correspondence between one UTF-32 +character and one byte value, and you don't need to deal with any overflow or +padding. 
+ +First allocate a UTF-32 output string with a code point length equal to the byte length of your input +buffer. + +Then, for each index, look up the correspondence table using the current byte +value as the index, and write the code point you found to your output buffer at the +same index. + +You can find the correspondence in this table: + +Emoji|UTF Code|Byte Value +-+-+- +🚀|U+1F680|0 +-+-+- +🪐|U+1FA90|1 +-+-+- +☄|U+2604|2 +-+-+- +🛰|U+1F6F0|3 +-+-+- +🌌|U+1F30C|4 +-+-+- +🌑|U+1F311|5 +-+-+- +🌒|U+1F312|6 +-+-+- +🌓|U+1F313|7 +-+-+- +🌔|U+1F314|8 +-+-+- +🌕|U+1F315|9 +-+-+- +🌖|U+1F316|10 +-+-+- +🌗|U+1F317|11 +-+-+- +🌘|U+1F318|12 +-+-+- +🌍|U+1F30D|13 +-+-+- +🌏|U+1F30F|14 +-+-+- +🌎|U+1F30E|15 +-+-+- +☉|U+2609|16 +-+-+- +☀|U+2600|17 +-+-+- +💻|U+1F4BB|18 +-+-+- +🖥|U+1F5A5|19 +-+-+- +💾|U+1F4BE|20 +-+-+- +💿|U+1F4BF|21 +-+-+- +😂|U+1F602|22 +-+-+- +❤|U+2764|23 +-+-+- +😍|U+1F60D|24 +-+-+- +🤣|U+1F923|25 +-+-+- +😊|U+1F60A|26 +-+-+- +🙏|U+1F64F|27 +-+-+- +💕|U+1F495|28 +-+-+- +😭|U+1F62D|29 +-+-+- +😘|U+1F618|30 +-+-+- +👍|U+1F44D|31 +-+-+- +😅|U+1F605|32 +-+-+- +👏|U+1F44F|33 +-+-+- +😁|U+1F601|34 +-+-+- +🔥|U+1F525|35 +-+-+- +🥰|U+1F970|36 +-+-+- +💔|U+1F494|37 +-+-+- +💖|U+1F496|38 +-+-+- +💙|U+1F499|39 +-+-+- +😢|U+1F622|40 +-+-+- +🤔|U+1F914|41 +-+-+- +😆|U+1F606|42 +-+-+- +🙄|U+1F644|43 +-+-+- +💪|U+1F4AA|44 +-+-+- +😉|U+1F609|45 +-+-+- +☺|U+263A|46 +-+-+- +👌|U+1F44C|47 +-+-+- +🤗|U+1F917|48 +-+-+- +💜|U+1F49C|49 +-+-+- +😔|U+1F614|50 +-+-+- +😎|U+1F60E|51 +-+-+- +😇|U+1F607|52 +-+-+- +🌹|U+1F339|53 +-+-+- +🤦|U+1F926|54 +-+-+- +🎉|U+1F389|55 +-+-+- +💞|U+1F49E|56 +-+-+- +✌|U+270C|57 +-+-+- +✨|U+2728|58 +-+-+- +🤷|U+1F937|59 +-+-+- +😱|U+1F631|60 +-+-+- +😌|U+1F60C|61 +-+-+- +🌸|U+1F338|62 +-+-+- +🙌|U+1F64C|63 +-+-+- +😋|U+1F60B|64 +-+-+- +💗|U+1F497|65 +-+-+- +💚|U+1F49A|66 +-+-+- +😏|U+1F60F|67 +-+-+- +💛|U+1F49B|68 +-+-+- 
+🙂|U+1F642|69 +-+-+- +💓|U+1F493|70 +-+-+- +🤩|U+1F929|71 +-+-+- +😄|U+1F604|72 +-+-+- +😀|U+1F600|73 +-+-+- +🖤|U+1F5A4|74 +-+-+- +😃|U+1F603|75 +-+-+- +💯|U+1F4AF|76 +-+-+- +🙈|U+1F648|77 +-+-+- +👇|U+1F447|78 +-+-+- +🎶|U+1F3B6|79 +-+-+- +😒|U+1F612|80 +-+-+- +🤭|U+1F92D|81 +-+-+- +❣|U+2763|82 +-+-+- +😜|U+1F61C|83 +-+-+- +💋|U+1F48B|84 +-+-+- +👀|U+1F440|85 +-+-+- +😪|U+1F62A|86 +-+-+- +😑|U+1F611|87 +-+-+- +💥|U+1F4A5|88 +-+-+- +🙋|U+1F64B|89 +-+-+- +😞|U+1F61E|90 +-+-+- +😩|U+1F629|91 +-+-+- +😡|U+1F621|92 +-+-+- +🤪|U+1F92A|93 +-+-+- +👊|U+1F44A|94 +-+-+- +🥳|U+1F973|95 +-+-+- +😥|U+1F625|96 +-+-+- +🤤|U+1F924|97 +-+-+- +👉|U+1F449|98 +-+-+- +💃|U+1F483|99 +-+-+- +😳|U+1F633|100 +-+-+- +✋|U+270B|101 +-+-+- +😚|U+1F61A|102 +-+-+- +😝|U+1F61D|103 +-+-+- +😴|U+1F634|104 +-+-+- +🌟|U+1F31F|105 +-+-+- +😬|U+1F62C|106 +-+-+- +🙃|U+1F643|107 +-+-+- +🍀|U+1F340|108 +-+-+- +🌷|U+1F337|109 +-+-+- +😻|U+1F63B|110 +-+-+- +😓|U+1F613|111 +-+-+- +⭐|U+2B50|112 +-+-+- +✅|U+2705|113 +-+-+- +🥺|U+1F97A|114 +-+-+- +🌈|U+1F308|115 +-+-+- +😈|U+1F608|116 +-+-+- +🤘|U+1F918|117 +-+-+- +💦|U+1F4A6|118 +-+-+- +✔|U+2714|119 +-+-+- +😣|U+1F623|120 +-+-+- +🏃|U+1F3C3|121 +-+-+- +💐|U+1F490|122 +-+-+- +☹|U+2639|123 +-+-+- +🎊|U+1F38A|124 +-+-+- +💘|U+1F498|125 +-+-+- +😠|U+1F620|126 +-+-+- +☝|U+261D|127 +-+-+- +😕|U+1F615|128 +-+-+- +🌺|U+1F33A|129 +-+-+- +🎂|U+1F382|130 +-+-+- +🌻|U+1F33B|131 +-+-+- +😐|U+1F610|132 +-+-+- +🖕|U+1F595|133 +-+-+- +💝|U+1F49D|134 +-+-+- +🙊|U+1F64A|135 +-+-+- +😹|U+1F639|136 +-+-+- +🗣|U+1F5E3|137 +-+-+- +💫|U+1F4AB|138 +-+-+- +💀|U+1F480|139 +-+-+- +👑|U+1F451|140 +-+-+- +🎵|U+1F3B5|141 +-+-+- +🤞|U+1F91E|142 +-+-+- +😛|U+1F61B|143 +-+-+- +🔴|U+1F534|144 +-+-+- +😤|U+1F624|145 +-+-+- +🌼|U+1F33C|146 +-+-+- +😫|U+1F62B|147 +-+-+- +⚽|U+26BD|148 +-+-+- +🤙|U+1F919|149 +-+-+- +☕|U+2615|150 
+-+-+- +🏆|U+1F3C6|151 +-+-+- +🤫|U+1F92B|152 +-+-+- +👈|U+1F448|153 +-+-+- +😮|U+1F62E|154 +-+-+- +🙆|U+1F646|155 +-+-+- +🍻|U+1F37B|156 +-+-+- +🍃|U+1F343|157 +-+-+- +🐶|U+1F436|158 +-+-+- +💁|U+1F481|159 +-+-+- +😲|U+1F632|160 +-+-+- +🌿|U+1F33F|161 +-+-+- +🧡|U+1F9E1|162 +-+-+- +🎁|U+1F381|163 +-+-+- +⚡|U+26A1|164 +-+-+- +🌞|U+1F31E|165 +-+-+- +🎈|U+1F388|166 +-+-+- +❌|U+274C|167 +-+-+- +✊|U+270A|168 +-+-+- +👋|U+1F44B|169 +-+-+- +😰|U+1F630|170 +-+-+- +🤨|U+1F928|171 +-+-+- +😶|U+1F636|172 +-+-+- +🤝|U+1F91D|173 +-+-+- +🚶|U+1F6B6|174 +-+-+- +💰|U+1F4B0|175 +-+-+- +🍓|U+1F353|176 +-+-+- +💢|U+1F4A2|177 +-+-+- +🤟|U+1F91F|178 +-+-+- +🙁|U+1F641|179 +-+-+- +🚨|U+1F6A8|180 +-+-+- +💨|U+1F4A8|181 +-+-+- +🤬|U+1F92C|182 +-+-+- +✈|U+2708|183 +-+-+- +🎀|U+1F380|184 +-+-+- +🍺|U+1F37A|185 +-+-+- +🤓|U+1F913|186 +-+-+- +😙|U+1F619|187 +-+-+- +💟|U+1F49F|188 +-+-+- +🌱|U+1F331|189 +-+-+- +😖|U+1F616|190 +-+-+- +👶|U+1F476|191 +-+-+- +🥴|U+1F974|192 +-+-+- +▶|U+25B6|193 +-+-+- +➡|U+27A1|194 +-+-+- +❓|U+2753|195 +-+-+- +💎|U+1F48E|196 +-+-+- +💸|U+1F4B8|197 +-+-+- +⬇|U+2B07|198 +-+-+- +😨|U+1F628|199 +-+-+- +🌚|U+1F31A|200 +-+-+- +🦋|U+1F98B|201 +-+-+- +😷|U+1F637|202 +-+-+- +🕺|U+1F57A|203 +-+-+- +⚠|U+26A0|204 +-+-+- +🙅|U+1F645|205 +-+-+- +😟|U+1F61F|206 +-+-+- +😵|U+1F635|207 +-+-+- +👎|U+1F44E|208 +-+-+- +🤲|U+1F932|209 +-+-+- +🤠|U+1F920|210 +-+-+- +🤧|U+1F927|211 +-+-+- +📌|U+1F4CC|212 +-+-+- +🔵|U+1F535|213 +-+-+- +💅|U+1F485|214 +-+-+- +🧐|U+1F9D0|215 +-+-+- +🐾|U+1F43E|216 +-+-+- +🍒|U+1F352|217 +-+-+- +😗|U+1F617|218 +-+-+- +🤑|U+1F911|219 +-+-+- +🌊|U+1F30A|220 +-+-+- +🤯|U+1F92F|221 +-+-+- +🐷|U+1F437|222 +-+-+- +☎|U+260E|223 +-+-+- +💧|U+1F4A7|224 +-+-+- +😯|U+1F62F|225 +-+-+- +💆|U+1F486|226 +-+-+- +👆|U+1F446|227 +-+-+- +🎤|U+1F3A4|228 +-+-+- +🙇|U+1F647|229 +-+-+- +🍑|U+1F351|230 +-+-+- 
+โ„|U+2744|231 +-+-+- +๐ŸŒด|U+1F334|232 +-+-+- +๐Ÿ’ฃ|U+1F4A3|233 +-+-+- +๐Ÿธ|U+1F438|234 +-+-+- +๐Ÿ’Œ|U+1F48C|235 +-+-+- +๐Ÿ“|U+1F4CD|236 +-+-+- +๐Ÿฅ€|U+1F940|237 +-+-+- +๐Ÿคข|U+1F922|238 +-+-+- +๐Ÿ‘…|U+1F445|239 +-+-+- +๐Ÿ’ก|U+1F4A1|240 +-+-+- +๐Ÿ’ฉ|U+1F4A9|241 +-+-+- +๐Ÿ‘|U+1F450|242 +-+-+- +๐Ÿ“ธ|U+1F4F8|243 +-+-+- +๐Ÿ‘ป|U+1F47B|244 +-+-+- +๐Ÿค|U+1F910|245 +-+-+- +๐Ÿคฎ|U+1F92E|246 +-+-+- +๐ŸŽผ|U+1F3BC|247 +-+-+- +๐Ÿฅต|U+1F975|248 +-+-+- +๐Ÿšฉ|U+1F6A9|249 +-+-+- +๐ŸŽ|U+1F34E|250 +-+-+- +๐ŸŠ|U+1F34A|251 +-+-+- +๐Ÿ‘ผ|U+1F47C|252 +-+-+- +๐Ÿ’|U+1F48D|253 +-+-+- +๐Ÿ“ฃ|U+1F4E3|254 +-+-+- +๐Ÿฅ‚|U+1F942|255 + +## Decoding + +It is the same as encoding but the other way around. + +Note it is not recomanded to use a 8 gigabytes `UTF-32 codepoint` -> +`struct {bool, byte}`, it might be wise to a hash map instead. diff --git a/tests/basic.csv b/tests/basic.csv index 2b944a2..8c7a892 100644 --- a/tests/basic.csv +++ b/tests/basic.csv @@ -21,3 +21,4 @@ base64, "meWVzIG1hbmkgIQ" base64pad, "MeWVzIG1hbmkgIQ==" base64url, "ueWVzIG1hbmkgIQ" base64urlpad, "UeWVzIG1hbmkgIQ==" +base256emoji, "๐Ÿš€๐Ÿƒโœ‹๐ŸŒˆ๐Ÿ˜…๐ŸŒท๐Ÿคค๐Ÿ˜ป๐ŸŒŸ๐Ÿ˜…๐Ÿ‘" diff --git a/tests/leading_zero.csv b/tests/leading_zero.csv index eb3d91f..c9bfc7e 100644 --- a/tests/leading_zero.csv +++ b/tests/leading_zero.csv @@ -21,3 +21,4 @@ base64, "mAHllcyBtYW5pICE" base64pad, "MAHllcyBtYW5pICE=" base64url, "uAHllcyBtYW5pICE" base64urlpad, "UAHllcyBtYW5pICE=" +base256emoji, "๐Ÿš€๐Ÿš€๐Ÿƒโœ‹๐ŸŒˆ๐Ÿ˜…๐ŸŒท๐Ÿคค๐Ÿ˜ป๐ŸŒŸ๐Ÿ˜…๐Ÿ‘" diff --git a/tests/two_leading_zeros.csv b/tests/two_leading_zeros.csv index 44e6b26..f591063 100644 --- a/tests/two_leading_zeros.csv +++ b/tests/two_leading_zeros.csv @@ -21,3 +21,4 @@ base64, "mAAB5ZXMgbWFuaSAh" base64pad, "MAAB5ZXMgbWFuaSAh" base64url, "uAAB5ZXMgbWFuaSAh" base64urlpad, "UAAB5ZXMgbWFuaSAh" +base256emoji, "๐Ÿš€๐Ÿš€๐Ÿš€๐Ÿƒโœ‹๐ŸŒˆ๐Ÿ˜…๐ŸŒท๐Ÿคค๐Ÿ˜ป๐ŸŒŸ๐Ÿ˜…๐Ÿ‘"