Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refine file encoding helper. #2774

Merged
merged 7 commits into from
Feb 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions ReleaseHistory.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
* DEP: Remove spurious references to `System.Collections.Immutable`.
* DEP: Update `Microsoft.Data.SqlClient` reference from 2.1.2 to 2.1.7 in `WorkItems` and `Sarif.Multitool.Library` to resolve [CVE-2024-0056](https://github.com/advisories/GHSA-98g6-xh36-x2p7).
* DEP: Update `System.Data.SqlClient` reference from 4.8.5 to 4.8.6 in `WorkItems` to resolve [CVE-2024-0056](https://github.com/advisories/GHSA-98g6-xh36-x2p7).
* BUG: Improve `FileEncoding.IsTextualData` method to detect binary files.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: What specifically did it not do well/properly before? Are you fixing a particular case, or improving performance…?

* BUG: Update `Stack.Create` method to populate missing `PhysicalLocation` instances when stack frames reference relative file paths.
* BUG: Fix `UnsupportedOperationException` in `ZipArchiveArtifact`.
* BUG: Fix `MultithreadedAnalyzeCommandBase` to return rich return code with the `--rich-return-code` option.
Expand Down
72 changes: 22 additions & 50 deletions src/Sarif/FileEncoding.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,8 @@
// Licensed under the MIT license. See LICENSE file in the project root for full license information.

using System;
using System.IO;
using System.Linq;
using System.Runtime.InteropServices;
using System.Text;

using FastSerialization;

namespace Microsoft.CodeAnalysis.Sarif
{
Expand All @@ -28,79 +24,55 @@ public static bool IsTextualData(byte[] bytes)

/// <summary>
/// Detects if the provided byte array contains textual data. The heuristic applied
/// is to allow .NET to attempt to decode the byte array as a sequence of characters.
/// If that decoding generates does not generate a Unicode replacement character,
/// the data is classified as binary rather than text.
/// is to first examine the data for any control characters (<0x20). If none whatsoever
/// are found, the data is classified as textual. Otherwise, if the inspected bytes
/// decode successfully as either Unicode or UTF32, the data is classified as textual.
/// This helper does not currently properly handled detected UnicodeBigEndian data.
/// </summary>
/// <param name="bytes">The raw data expressed as bytes.</param>
/// <param name="start">The starting position to being classification.</param>
/// <param name="count">The maximal count of characters to decode.</param>
public static bool IsTextualData(byte[] bytes, int start, int count)
{
Windows1252 = Windows1252 ?? Encoding.GetEncoding(1252);
bytes = bytes ?? throw new ArgumentNullException(nameof(bytes));

if (start >= bytes.Length)
{
throw new ArgumentOutOfRangeException(nameof(start), $"Buffer size ({bytes.Length}) not valid for start ({start}) argument.");
}

foreach (Encoding encoding in new[] { Encoding.UTF8, Windows1252, Encoding.UTF32 })
{
if (count < bytes.Length)
{
var span = new ReadOnlySpan<byte>(bytes, start, count);
bytes = span.ToArray();
}

using var reader = new StreamReader(new MemoryStream(bytes), encoding, detectEncodingFromByteOrderMarks: true);
reader.BaseStream.Seek(start, SeekOrigin.Begin);
Windows1252 = Windows1252 ?? Encoding.GetEncoding(1252);

bool isTextual = true;
bool continueProcessing = true;
bool containsControlCharacters = false;

for (int i = start; i < count; i++)
{
int ch = reader.Read();
for (int i = 0; i < bytes.Length; i++)
{
containsControlCharacters |= bytes[i] < 0x20;
}

if (ch == -1)
{
break;
}
if (!containsControlCharacters)
{
return true;
}

// Because we enable 'detectEncodingFromByteOrderMarks' we will skip past any NUL
// characters in the file that might result from being BOM-prefixed. So any
// evidence of this character is an indicator of binary data.
if (encoding != Encoding.UTF8 && ch == '\0')
{
// This condition indicates binary data in all cases, when encountered for
// Windows 1252 or UTF32. For UTF8, we determine data is binary by observing
// that a character has been dropped in favor of the Unicode replacement char.
isTextual = false;
continueProcessing = false;
break;
}
foreach (Encoding encoding in new[] { Encoding.UTF32, Encoding.Unicode })
{
bool encodingSucceeded = true;

// Unicode REPLACEMENT CHARACTER (U+FFFD)
if (encoding == Encoding.UTF8 && ch == 65533)
foreach (char c in encoding.GetChars(bytes, start, count))
{
if (c == 0xfffd)
{
isTextual = false;
encodingSucceeded = false;
break;
}
}

if (!continueProcessing)
{
return isTextual;
}

if (isTextual)
if (encodingSucceeded)
{
return true;
}

// In this code path, a single encoding determined that the data was *not* textual,
// but we think we should continue to examine other text encodings to see the result.
}

return false;
Expand Down
78 changes: 53 additions & 25 deletions src/Test.UnitTests.Sarif/FileEncodingTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@

using System;
using System.Collections.Generic;
using System.Collections.Immutable;
using System.IO;
using System.Text;

using FluentAssertions;
using FluentAssertions.Execution;

using Microsoft.CodeAnalysis.Sarif.Driver;

using Xunit;

Expand All @@ -32,42 +34,74 @@ public void FileEncoding_StartExceedsBufferLength()
[Fact]
public void FileEncoding_FileEncoding_IsBinary()
{
ValidateIsBinary("Sarif.dll");
ValidateIsBinary("Sarif.pdb");
var assertionScope = new AssertionScope();

var binaryExtensions = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
{
".cer",
".der",
".pfx",
".dll",
".exe",
};

var observedExtensions = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

var fileSpecifier = new FileSpecifier("*", recurse: false);

foreach (string fileName in fileSpecifier.Files)
{
string extension = Path.GetExtension(fileName);
if (observedExtensions.Contains(extension)) { continue; }
observedExtensions.Add(extension);

using FileStream reader = File.OpenRead(fileName);
int bufferSize = 1024;
byte[] bytes = new byte[bufferSize];
reader.Read(bytes, 0, bufferSize);
bool isTextual = FileEncoding.IsTextualData(bytes);

if (!isTextual)
{
binaryExtensions.Should().Contain(extension, because: $"'{fileName}' was classified as a binary file");
}
}
}

private void ValidateIsBinary(string fileName)
{
if (!File.Exists(fileName))
{
fileName = Path.Combine(Environment.CurrentDirectory, fileName);
}

fileName = Path.Combine(Environment.CurrentDirectory, fileName);
using FileStream reader = File.OpenRead(fileName);
int bufferSize = 1024;
byte[] bytes = new byte[bufferSize];
reader.Read(bytes, 0, bufferSize);
FileEncoding.IsTextualData(bytes).Should().BeFalse();
FileEncoding.IsTextualData(bytes).Should().BeFalse(because: $"{fileName} is a binary file");
}

[Fact]
public void FileEncoding_UnicodeDataIsTextual()
{
using var assertionScope = new AssertionScope();

var sb = new StringBuilder();
string unicodeText = "американец";

foreach (Encoding encoding in new[] { Encoding.Unicode, Encoding.UTF8, Encoding.BigEndianUnicode, Encoding.UTF32 })
{
byte[] input = encoding.GetBytes(unicodeText);
if (!FileEncoding.IsTextualData(input))
{
sb.AppendLine($"\tThe '{encoding.EncodingName}' encoding classified unicode text '{unicodeText}' as binary data.");
}
FileEncoding.IsTextualData(input).Should().BeTrue(because: $"'{unicodeText}' encoded as '{encoding.EncodingName}' should not be classified as binary data");
}

sb.Length.Should().Be(0, because: $"all unicode strings should be classified as textual:{Environment.NewLine}{sb}");
}

[Fact]
public void FileEncoding_BinaryDataIsBinary()
{
var sb = new StringBuilder();
using var assertionScope = new AssertionScope();

foreach (string binaryName in new[] { "Certificate.cer", "Certificate.der", "PasswordProtected.pfx" })
{
Expand All @@ -80,13 +114,8 @@ public void FileEncoding_BinaryDataIsBinary()
Stream = resource,
};

if (artifact.Bytes == null)
{
sb.AppendLine($"\tBinary file '{binaryName}' was classified as textual data.");
}
artifact.Bytes.Should().NotBeNull(because: $"'{binaryName}' should be classified as binary data.");
}

sb.Length.Should().Be(0, because: $"all binary files should be classified as binary:{Environment.NewLine}{sb}");
}

[Fact]
Expand All @@ -104,7 +133,8 @@ public void FileEncoding_AllUtf8EncodingsAreTextual()

private static void ValidateEncoding(string encodingName, Encoding encoding)
{
var sb = new StringBuilder(65536 * 100);
using var assertionScope = new AssertionScope();

Stream resource = typeof(FileEncodingTests).Assembly.GetManifestResourceStream($"Test.UnitTests.Sarif.TestData.FileEncoding.{encodingName}.txt");
using var reader = new StreamReader(resource, Encoding.UTF8);

Expand All @@ -113,14 +143,12 @@ private static void ValidateEncoding(string encodingName, Encoding encoding)
{
char ch = (char)current;
byte[] input = encoding.GetBytes(new[] { ch });
if (!FileEncoding.IsTextualData(input))
{
string unicodeText = "\\u" + ((int)ch).ToString("d4");
sb.AppendLine($"\t{encodingName} character '{unicodeText}' ({encoding.GetString(input)}) was classified as binary data.");
}
}

sb.Length.Should().Be(0, because: $"all {encodingName} encodable character should be classified as textual:{Environment.NewLine}{sb}");
if (ch < 0x20) { continue; }

string unicodeText = "\\u" + ((int)ch).ToString("d4");
FileEncoding.IsTextualData(input).Should().BeTrue(because: $"{encodingName} character '{unicodeText}' ({encoding.GetString(input)}) should not classify as binary data.");
}
}
}
}