Skip to content

Commit

Permalink
Fixing textual classifier's handling of misaligned data (#2780)
Browse files Browse the repository at this point in the history
* this sucked

* fix

* tightening change

* ut

* fix ut

* fix overscanning for control chars

* opt

* fix release notes
  • Loading branch information
scottoneil-ms committed Feb 9, 2024
1 parent 77dfb3a commit e7d897a
Show file tree
Hide file tree
Showing 6 changed files with 27 additions and 13 deletions.
2 changes: 1 addition & 1 deletion ReleaseHistory.md
Expand Up @@ -5,7 +5,7 @@
* DEP: Remove spurious references to `System.Collections.Immutable`.
* DEP: Update `Microsoft.Data.SqlClient` reference from 2.1.2 to 2.1.7 in `WorkItems` and `Sarif.Multitool.Library` to resolve [CVE-2024-0056](https://github.com/advisories/GHSA-98g6-xh36-x2p7).
* DEP: Update `System.Data.SqlClient` reference from 4.8.5 to 4.8.6 in `WorkItems` to resolve [CVE-2024-0056](https://github.com/advisories/GHSA-98g6-xh36-x2p7).
* BUG: Improve `FileEncoding.IsTextualData` method to detect binary files.
* BUG: Improve `FileEncoding.IsTextualData` method for detecting binary files.
* BUG: Update `Stack.Create` method to populate missing `PhysicalLocation` instances when stack frames reference relative file paths.
* BUG: Fix `UnsupportedOperationException` in `ZipArchiveArtifact`.
* BUG: Fix `MultithreadedAnalyzeCommandBase` to return rich return code with the `--rich-return-code` option.
Expand Down
5 changes: 3 additions & 2 deletions src/Sarif/EnumeratedArtifact.cs
Expand Up @@ -104,8 +104,9 @@ private void RetrieveDataFromStream()
}

byte[] header = new byte[BinarySniffingHeaderSizeBytes];
int length = this.Stream.Read(header, 0, header.Length);
bool isText = FileEncoding.IsTextualData(header, 0, length);
int readLength = this.Stream.Read(header, 0, header.Length);

bool isText = FileEncoding.IsTextualData(header, 0, readLength);

TryRewindStream();

Expand Down
12 changes: 8 additions & 4 deletions src/Sarif/FileEncoding.cs
Expand Up @@ -31,7 +31,7 @@ public static bool IsTextualData(byte[] bytes)
/// </summary>
/// <param name="bytes">The raw data expressed as bytes.</param>
/// <param name="start">The starting position to being classification.</param>
/// <param name="count">The maximal count of characters to decode.</param>
/// <param name="count">The maximal count of bytes to decode.</param>
public static bool IsTextualData(byte[] bytes, int start, int count)
{
bytes = bytes ?? throw new ArgumentNullException(nameof(bytes));
Expand All @@ -41,12 +41,11 @@ public static bool IsTextualData(byte[] bytes, int start, int count)
throw new ArgumentOutOfRangeException(nameof(start), $"Buffer size ({bytes.Length}) not valid for start ({start}) argument.");
}


Windows1252 = Windows1252 ?? Encoding.GetEncoding(1252);

bool containsControlCharacters = false;

for (int i = 0; i < bytes.Length; i++)
for (int i = 0; i < count; i++)
{
containsControlCharacters |= bytes[i] < 0x20;
}
Expand All @@ -56,11 +55,16 @@ public static bool IsTextualData(byte[] bytes, int start, int count)
return true;
}

// Ensure count % 4 == 0 to guarantee we do not attempt a misaligned decoding
// at the end of the buffer, under all tested encodings.
count = (count / 4) * 4;

foreach (Encoding encoding in new[] { Encoding.UTF32, Encoding.Unicode })
{
bool encodingSucceeded = true;

foreach (char c in encoding.GetChars(bytes, start, count))
char[] chars = encoding.GetChars(bytes, start, count);
foreach (char c in chars)
{
if (c == 0xfffd)
{
Expand Down
5 changes: 3 additions & 2 deletions src/Sarif/ZipArchiveArtifact.cs
Expand Up @@ -95,8 +95,9 @@ public byte[] Bytes
var peekable = new PeekableStream(this.Stream, PeekWindowBytes);

byte[] header = new byte[PeekWindowBytes];
int length = this.Stream.Read(header, 0, header.Length);
bool isText = FileEncoding.IsTextualData(header, 0, length);
int readLength = this.Stream.Read(header, 0, header.Length);

bool isText = FileEncoding.IsTextualData(header, 0, readLength);

peekable.Rewind();

Expand Down
1 change: 1 addition & 0 deletions src/Test.UnitTests.Sarif/EnumeratedArtifactTests.cs
Expand Up @@ -216,6 +216,7 @@ public void EnumeratedArtifact_FileSizeReturnedForInMemoryStream()

var stream = new MemoryStream();
stream.Write(contentBytes, 0, contentBytes.Length);
stream.Position = 0;

var enumeratedArtifact = new EnumeratedArtifact(FileSystem.Instance)
{
Expand Down
15 changes: 11 additions & 4 deletions src/Test.UnitTests.Sarif/FileEncodingTests.cs
Expand Up @@ -89,12 +89,19 @@ public void FileEncoding_UnicodeDataIsTextual()
using var assertionScope = new AssertionScope();

var sb = new StringBuilder();
string unicodeText = "американец";
string[] unicodeTexts = new[]
{
"американец",
"Generates a warning and an error for each of : foo foo \r\n" // Challenging for the classifer; found by accident.
};

foreach (Encoding encoding in new[] { Encoding.Unicode, Encoding.UTF8, Encoding.BigEndianUnicode, Encoding.UTF32 })
foreach (string unicodeText in unicodeTexts)
{
byte[] input = encoding.GetBytes(unicodeText);
FileEncoding.IsTextualData(input).Should().BeTrue(because: $"'{unicodeText}' encoded as '{encoding.EncodingName}' should not be classified as binary data");
foreach (Encoding encoding in new[] { Encoding.Unicode, Encoding.UTF8, Encoding.BigEndianUnicode, Encoding.UTF32 })
{
byte[] input = encoding.GetBytes(unicodeText);
FileEncoding.IsTextualData(input).Should().BeTrue(because: $"'{unicodeText}' encoded as '{encoding.EncodingName}' should not be classified as binary data");
}
}
}

Expand Down

0 comments on commit e7d897a

Please sign in to comment.