Skip to content

Commit

Permalink
Convert AsUtf8/16/32 properties to methods
Browse files Browse the repository at this point in the history
Since we might support different endianness in the future, it would
be best to change these to methods so we can just add an overload
that specifies the endianness without breaking everyone's code.
  • Loading branch information
mqudsi committed May 11, 2017
1 parent a609b01 commit 3b0bd18
Show file tree
Hide file tree
Showing 6 changed files with 109 additions and 134 deletions.
6 changes: 3 additions & 3 deletions tests/ConversionTests.cs
Expand Up @@ -36,7 +36,7 @@ public void Utf32ByteConversion()
foreach (var test in _tests)
{
var sequence = test.AsUnicodeSequence();
var utf32 = sequence.AsUtf32Bytes;
var utf32 = sequence.AsUtf32Bytes();

Assert.AreEqual(test, Encoding.UTF32.GetString(utf32.ToArray()));
}
Expand All @@ -48,7 +48,7 @@ public void Utf16ByteConversion()
foreach (var test in _tests)
{
var sequence = test.AsUnicodeSequence();
var utf16 = sequence.AsUtf16Bytes;
var utf16 = sequence.AsUtf16Bytes();

Assert.AreEqual(test, Encoding.Unicode.GetString(utf16.ToArray()));
}
Expand All @@ -60,7 +60,7 @@ public void Utf8ByteConversion()
foreach (var test in _tests)
{
var sequence = test.AsUnicodeSequence();
var utf8 = sequence.AsUtf8;
var utf8 = sequence.AsUtf8();

var encoding = new UTF8Encoding(false);
Assert.AreEqual(test, encoding.GetString(utf8.ToArray()));
Expand Down
158 changes: 73 additions & 85 deletions unicode/Codepoint.cs
Expand Up @@ -36,116 +36,104 @@ public Codepoint(string hexValue)
}
}

public UInt32 AsUtf32 => Value;
public uint AsUtf32() => Value;

/// <summary>
/// Returns an iterator that will enumerate over the big endian bytes in the UTF32 encoding of this codepoint.
/// </summary>
public IEnumerable<byte> AsUtf32Bytes
public IEnumerable<byte> AsUtf32Bytes()
{
get
{
//from highest to lowest
var utf32 = AsUtf32;
var b1 = (byte) (utf32 >> 24);
yield return b1;
var b2 = (byte) ((utf32 & 0x00FFFFFF) >> 16);
yield return b2;
var b3 = (byte) (((UInt16) utf32) >> 8);
yield return b3;
var b4 = (byte) utf32;
yield return b4;
}
//from highest to lowest
var utf32 = AsUtf32();
var b1 = (byte) (utf32 >> 24);
yield return b1;
var b2 = (byte) ((utf32 & 0x00FFFFFF) >> 16);
yield return b2;
var b3 = (byte) (((UInt16) utf32) >> 8);
yield return b3;
var b4 = (byte) utf32;
yield return b4;
}

//https://en.wikipedia.org/wiki/UTF-16
public IEnumerable<UInt16> AsUtf16
public IEnumerable<ushort> AsUtf16()
{
get
//U+0000 to U+D7FF and U+E000 to U+FFFF (note: the check below does not itself exclude the surrogate range U+D800 to U+DFFF)
if (Value <= 0xFFFF)
{
//U+0000 to U+D7FF and U+E000 to U+FFFF
if (Value <= 0xFFFF)
{
yield return (UInt16)Value;
}
//U+10000 to U+10FFFF
else if (Value >= 0x10000 && Value <= 0x10FFFF)
{
UInt32 newVal = Value - 0x010000; //leaving 20 bits
UInt16 high = (UInt16) ((newVal >> 10) + 0xD800);
System.Diagnostics.Debug.Assert(high <= 0xDBFF && high >= 0xD800);
yield return high;

UInt16 low = (UInt16) ((newVal & 0x03FF) + 0xDC00);
System.Diagnostics.Debug.Assert(low <= 0xDFFF && low >= 0xDC00);
yield return low;
}
else
{
throw new UnsupportedCodepointException();
}
yield return (UInt16) Value;
}
//U+10000 to U+10FFFF
else if (Value >= 0x10000 && Value <= 0x10FFFF)
{
UInt32 newVal = Value - 0x010000; //leaving 20 bits
UInt16 high = (UInt16) ((newVal >> 10) + 0xD800);
System.Diagnostics.Debug.Assert(high <= 0xDBFF && high >= 0xD800);
yield return high;

UInt16 low = (UInt16) ((newVal & 0x03FF) + 0xDC00);
System.Diagnostics.Debug.Assert(low <= 0xDFFF && low >= 0xDC00);
yield return low;
}
else
{
throw new UnsupportedCodepointException();
}
}

/// <summary>
/// Returns an iterator that will enumerate over the big endian bytes in the UTF16 encoding of this codepoint.
/// </summary>
public IEnumerable<byte> AsUtf16Bytes
public IEnumerable<byte> AsUtf16Bytes()
{
get
var utf16 = AsUtf16();
foreach (var u16 in utf16)
{
var utf16 = AsUtf16;
foreach (var u16 in utf16)
{
var high = (byte) (u16 >> 8);
yield return high;
var low = (byte) u16;
yield return low;
}
var high = (byte) (u16 >> 8);
yield return high;
var low = (byte) u16;
yield return low;
}
}

//https://en.wikipedia.org/wiki/UTF-8
public IEnumerable<byte> AsUtf8
public IEnumerable<byte> AsUtf8()
{
get
//up to 7 bits
if (Value <= 0x007F)
{
//up to 7 bits
if (Value <= 0x007F)
{
yield return (byte)Value;
yield break;
}

//up to 11 bits
if (Value <= 0x07FF)
{
yield return (byte)(0b11000000 | (0b00011111 & (Value >> 6))); //tag + upper 5 bits
yield return (byte)(0b10000000 | (0b00111111 & Value)); //tag + lower 6 bits
yield break;
}

//up to 16 bits
if (Value <= 0x0FFFF)
{
yield return (byte)(0b11100000 | (0b00001111 & (Value >> 12))); //tag + upper 4 bits
yield return (byte)(0b10000000 | (0b00111111 & (Value >> 6))); //tag + next 6 bits
yield return (byte)(0b10000000 | (0b00111111 & Value)); //tag + last 6 bits
yield break;
}

//up to 21 bits
if (Value <= 0x1FFFFF)
{
yield return (byte)(0b11110000 | (0b00000111 & (Value >> 18))); //tag + upper 3 bits
yield return (byte)(0b10000000 | (0b00111111 & (Value >> 12))); //tag + next 6 bits
yield return (byte)(0b10000000 | (0b00111111 & (Value >> 6))); //tag + next 6 bits
yield return (byte)(0b10000000 | (0b00111111 & Value)); //tag + last 6 bits
yield break;
}
yield return (byte) Value;
yield break;
}

throw new UnsupportedCodepointException();
//up to 11 bits
if (Value <= 0x07FF)
{
yield return (byte) (0b11000000 | (0b00011111 & (Value >> 6))); //tag + upper 5 bits
yield return (byte) (0b10000000 | (0b00111111 & Value)); //tag + lower 6 bits
yield break;
}

//up to 16 bits
if (Value <= 0x0FFFF)
{
yield return (byte) (0b11100000 | (0b00001111 & (Value >> 12))); //tag + upper 4 bits
yield return (byte) (0b10000000 | (0b00111111 & (Value >> 6))); //tag + next 6 bits
yield return (byte) (0b10000000 | (0b00111111 & Value)); //tag + last 6 bits
yield break;
}

//up to 21 bits
if (Value <= 0x1FFFFF)
{
yield return (byte) (0b11110000 | (0b00000111 & (Value >> 18))); //tag + upper 3 bits
yield return (byte) (0b10000000 | (0b00111111 & (Value >> 12))); //tag + next 6 bits
yield return (byte) (0b10000000 | (0b00111111 & (Value >> 6))); //tag + next 6 bits
yield return (byte) (0b10000000 | (0b00111111 & Value)); //tag + last 6 bits
yield break;
}

throw new UnsupportedCodepointException();
}

public int CompareTo(Codepoint other)
Expand Down Expand Up @@ -224,7 +212,7 @@ public override string ToString()

public string AsString()
{
return Encoding.UTF8.GetString(AsUtf8.ToArray());
return Encoding.UTF8.GetString(AsUtf8().ToArray());
}

public bool IsIn(Range range)
Expand Down
6 changes: 3 additions & 3 deletions unicode/Range.cs
Expand Up @@ -53,7 +53,7 @@ public IEnumerable<UInt32> AsUtf32Sequence
{
for (UInt32 i = 0; Begin + i <= End; ++i)
{
yield return new Codepoint(Begin + i).AsUtf32;
yield return new Codepoint(Begin + i).AsUtf32();
}
}
}
Expand All @@ -64,7 +64,7 @@ public IEnumerable<UInt16> AsUtf16Sequence
{
for (var i = 0; Begin + i <= End; ++i)
{
foreach (var utf16 in new Codepoint(Begin + i).AsUtf16)
foreach (var utf16 in new Codepoint(Begin + i).AsUtf16())
{
yield return utf16;
}
Expand All @@ -78,7 +78,7 @@ public IEnumerable<byte> AsUtf8Sequence
{
for (var i = 0; Begin + i <= End; ++i)
{
foreach (var utf8 in new Codepoint(Begin + i).AsUtf8)
foreach (var utf8 in new Codepoint(Begin + i).AsUtf8())
{
yield return utf8;
}
Expand Down
2 changes: 1 addition & 1 deletion unicode/SingleEmoji.cs
Expand Up @@ -63,7 +63,7 @@ public override int GetHashCode()

public override string ToString()
{
return Encoding.Unicode.GetString(Sequence.AsUtf16Bytes.ToArray());
return Encoding.Unicode.GetString(Sequence.AsUtf16Bytes().ToArray());
}
}
}
4 changes: 3 additions & 1 deletion unicode/Unicode.csproj
Expand Up @@ -6,7 +6,7 @@
<RootNamespace>NeoSmart.Unicode</RootNamespace>
<GeneratePackageOnBuild>True</GeneratePackageOnBuild>
<PackageId>Unicode.net</PackageId>
<Version>0.1.1</Version>
<Version>0.1.2</Version>
<Authors>NeoSmart Technologies, Mahmoud Al-Qudsi</Authors>
<Company>NeoSmart Technologies</Company>
<Description>A Unicode and emoji text-processing library for .NET, supporting UTF-8, UTF-16, and UTF-32 on .NET 2.0+ and .NET Standard (Core/UWP), with emoji support! 🔥🌶️😁🎉</Description>
Expand All @@ -17,6 +17,8 @@
<RepositoryType>git</RepositoryType>
<PackageTags>unicode, text processing, encoding, utf8, utf-8, utf-16, utf16, utf-32, utf32, emoji, emojis</PackageTags>
<PackageReleaseNotes>Initial release, including emoji support 🔥🌶️😁🎉!</PackageReleaseNotes>
<AssemblyVersion>0.1.2.0</AssemblyVersion>
<FileVersion>0.1.2.0</FileVersion>
</PropertyGroup>

<ItemGroup Condition="'$(TargetFramework)' == 'net20'">
Expand Down
67 changes: 26 additions & 41 deletions unicode/UnicodeSequence.cs
Expand Up @@ -60,74 +60,59 @@ public bool Contains(Codepoint codepoint)
return codepoint.In(_codepoints);
}

public IEnumerable<UInt32> AsUtf32
public IEnumerable<uint> AsUtf32()
{
get
foreach (var cp in _codepoints)
{
foreach (var cp in _codepoints)
{
yield return cp.AsUtf32;
}
yield return cp.AsUtf32();
}
}

public IEnumerable<byte> AsUtf32Bytes
public IEnumerable<byte> AsUtf32Bytes()
{
get
foreach (var u32 in AsUtf32())
{
foreach (var u32 in AsUtf32)
{
//little endian byte order
yield return (byte)(u32 & 0xFF);
yield return (byte)((u32 >> 8) & 0xFF);
yield return (byte)((u32 >> 16) & 0xFF);
yield return (byte)(u32 >> 24);
}
//little endian byte order
yield return (byte) (u32 & 0xFF);
yield return (byte) ((u32 >> 8) & 0xFF);
yield return (byte) ((u32 >> 16) & 0xFF);
yield return (byte) (u32 >> 24);
}
}

public IEnumerable<UInt16> AsUtf16
public IEnumerable<ushort> AsUtf16()
{
get
foreach (var cp in _codepoints)
{
foreach (var cp in _codepoints)
foreach (var us in cp.AsUtf16())
{
foreach (var us in cp.AsUtf16)
{
yield return us;
}
yield return us;
}
}
}

public IEnumerable<byte> AsUtf16Bytes
public IEnumerable<byte> AsUtf16Bytes()
{
get
foreach (var us in AsUtf16())
{
foreach (var us in AsUtf16)
{
//little endian byte order
yield return (byte)(us & 0xFF);
yield return (byte)(us >> 8);
}
//little endian byte order
yield return (byte) (us & 0xFF);
yield return (byte) (us >> 8);
}
}

public IEnumerable<byte> AsUtf8
public IEnumerable<byte> AsUtf8()
{
get
foreach (var cp in _codepoints)
{
foreach (var cp in _codepoints)
foreach (var b in cp.AsUtf8())
{
foreach (var b in cp.AsUtf8)
{
yield return b;
}
yield return b;
}
}
}

public string AsString => Encoding.Unicode.GetString(AsUtf16Bytes.ToArray());
public string AsString => Encoding.Unicode.GetString(AsUtf16Bytes().ToArray());

public int CompareTo(UnicodeSequence other)
{
Expand All @@ -145,9 +130,9 @@ public int CompareTo(UnicodeSequence other)
}
if (_codepoints.Length < other._codepoints.Length)
{
return -(int)other._codepoints[_codepoints.Length].AsUtf32;
return -(int)other._codepoints[_codepoints.Length].AsUtf32();
}
return (int)_codepoints[other._codepoints.Length].AsUtf32;
return (int)_codepoints[other._codepoints.Length].AsUtf32();
}

public bool Equals(UnicodeSequence other)
Expand Down

0 comments on commit 3b0bd18

Please sign in to comment.