From 997c9ed1b286d7e28b0c29e0084b8fcf1d9d4415 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 19 Nov 2025 22:33:21 +0000 Subject: [PATCH 1/6] Initial plan From 879e0a2f8df6942423d853ff6c801eff4a959539 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 19 Nov 2025 22:43:02 +0000 Subject: [PATCH 2/6] Add custom extractor support with interfaces and tests Co-authored-by: gfs <98900+gfs@users.noreply.github.com> --- .../ExtractorTests/CustomExtractorTests.cs | 320 ++++++++++++++++++ RecursiveExtractor/Extractor.cs | 124 ++++++- .../CustomAsyncExtractorInterface.cs | 20 ++ 3 files changed, 461 insertions(+), 3 deletions(-) create mode 100644 RecursiveExtractor.Tests/ExtractorTests/CustomExtractorTests.cs create mode 100644 RecursiveExtractor/Extractors/CustomAsyncExtractorInterface.cs diff --git a/RecursiveExtractor.Tests/ExtractorTests/CustomExtractorTests.cs b/RecursiveExtractor.Tests/ExtractorTests/CustomExtractorTests.cs new file mode 100644 index 0000000..701a918 --- /dev/null +++ b/RecursiveExtractor.Tests/ExtractorTests/CustomExtractorTests.cs @@ -0,0 +1,320 @@ +using Microsoft.CST.RecursiveExtractor; +using Microsoft.CST.RecursiveExtractor.Extractors; +using Microsoft.VisualStudio.TestTools.UnitTesting; +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Threading.Tasks; + +namespace RecursiveExtractor.Tests.ExtractorTests; + +[TestClass] +public class CustomExtractorTests +{ + /// + /// A simple test custom extractor that extracts files with a specific magic number + /// For testing purposes, it recognizes files starting with "CUSTOM1" + /// + private class TestCustomExtractor : CustomAsyncExtractorInterface + { + private readonly Extractor context; + private static readonly byte[] MAGIC_BYTES = System.Text.Encoding.ASCII.GetBytes("CUSTOM1"); + + public TestCustomExtractor(Extractor ctx) + { + context = ctx; + } + + public bool CanExtract(Stream stream) + { + if (stream == null || !stream.CanRead || !stream.CanSeek || stream.Length < MAGIC_BYTES.Length) + { + return false; + } + + var initialPosition = stream.Position; + try + { + stream.Position = 0; + var buffer = new byte[MAGIC_BYTES.Length]; + var bytesRead = stream.Read(buffer, 0, MAGIC_BYTES.Length); + + if (bytesRead == MAGIC_BYTES.Length && buffer.SequenceEqual(MAGIC_BYTES)) + { + return true; + } + return false; + } + finally + { + stream.Position = initialPosition; + } + } + + public IEnumerable Extract(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor, bool topLevel = true) + { + // For this test, we just return a synthetic file entry showing the custom extractor worked + var content = new MemoryStream(System.Text.Encoding.UTF8.GetBytes("Extracted by TestCustomExtractor")); + yield return new FileEntry("extracted_from_custom.txt", content, fileEntry); + } + + public async IAsyncEnumerable ExtractAsync(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor, bool topLevel = true) + { + // For this test, we just return a synthetic file entry showing the custom extractor worked + var content = new MemoryStream(System.Text.Encoding.UTF8.GetBytes("Extracted by TestCustomExtractor")); + yield return new FileEntry("extracted_from_custom.txt", content, fileEntry); + await Task.CompletedTask; + } + } + + /// + /// A second test custom extractor that recognizes files starting with "CUSTOM2" + /// + private class SecondTestCustomExtractor : CustomAsyncExtractorInterface + { + private readonly Extractor context; + private static readonly byte[] MAGIC_BYTES = System.Text.Encoding.ASCII.GetBytes("CUSTOM2"); + + public SecondTestCustomExtractor(Extractor ctx) + { + context = ctx; + } + + public bool CanExtract(Stream stream) + { + if (stream == null || !stream.CanRead || !stream.CanSeek || stream.Length < MAGIC_BYTES.Length) + { + return false; + } + + var initialPosition = stream.Position; + try + { + stream.Position = 0; + var buffer = new byte[MAGIC_BYTES.Length]; + var bytesRead = stream.Read(buffer, 0, MAGIC_BYTES.Length); + + if (bytesRead == MAGIC_BYTES.Length && buffer.SequenceEqual(MAGIC_BYTES)) + { + return true; + } + return false; + } + finally + { + stream.Position = initialPosition; + } + } + + public IEnumerable Extract(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor, bool topLevel = true) + { + var content = new MemoryStream(System.Text.Encoding.UTF8.GetBytes("Extracted by SecondTestCustomExtractor")); + yield return new FileEntry("extracted_from_second_custom.txt", content, fileEntry); + } + + public async IAsyncEnumerable ExtractAsync(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor, bool topLevel = true) + { + var content = new MemoryStream(System.Text.Encoding.UTF8.GetBytes("Extracted by SecondTestCustomExtractor")); + yield return new FileEntry("extracted_from_second_custom.txt", content, fileEntry); + await Task.CompletedTask; + } + } + + [TestMethod] + public void AddCustomExtractor_ValidExtractor_ReturnsTrue() + { + var extractor = new Extractor(); + var customExtractor = new TestCustomExtractor(extractor); + + var result = extractor.AddCustomExtractor(customExtractor); + + Assert.IsTrue(result); + Assert.AreEqual(1, extractor.CustomExtractors.Count); + } + + [TestMethod] + public void AddCustomExtractor_DuplicateExtractor_ReturnsFalse() + { + var extractor = new Extractor(); + var customExtractor = new TestCustomExtractor(extractor); + + extractor.AddCustomExtractor(customExtractor); + var result = extractor.AddCustomExtractor(customExtractor); + + Assert.IsFalse(result); + Assert.AreEqual(1, extractor.CustomExtractors.Count); + } + + [TestMethod] + public void AddCustomExtractor_NullExtractor_ThrowsArgumentNullException() + { + var extractor = new Extractor(); + + Assert.ThrowsException(() => extractor.AddCustomExtractor(null!)); + } + + [TestMethod] + public void RemoveCustomExtractor_ExistingExtractor_ReturnsTrue() + { + var extractor = new Extractor(); + var customExtractor = new TestCustomExtractor(extractor); + extractor.AddCustomExtractor(customExtractor); + + var result = extractor.RemoveCustomExtractor(customExtractor); + + Assert.IsTrue(result); + Assert.AreEqual(0, extractor.CustomExtractors.Count); + } + + [TestMethod] + public void RemoveCustomExtractor_NonExistentExtractor_ReturnsFalse() + { + var extractor = new Extractor(); + var customExtractor = new TestCustomExtractor(extractor); + + var result = extractor.RemoveCustomExtractor(customExtractor); + + Assert.IsFalse(result); + Assert.AreEqual(0, extractor.CustomExtractors.Count); + } + + [TestMethod] + public void RemoveCustomExtractor_NullExtractor_ThrowsArgumentNullException() + { + var extractor = new Extractor(); + + Assert.ThrowsException(() => extractor.RemoveCustomExtractor(null!)); + } + + [TestMethod] + public void ClearCustomExtractors_RemovesAllExtractors() + { + var extractor = new Extractor(); + extractor.AddCustomExtractor(new TestCustomExtractor(extractor)); + extractor.AddCustomExtractor(new SecondTestCustomExtractor(extractor)); + + Assert.AreEqual(2, extractor.CustomExtractors.Count); + + extractor.ClearCustomExtractors(); + + Assert.AreEqual(0, extractor.CustomExtractors.Count); + } + + [TestMethod] + public void Extract_WithMatchingCustomExtractor_UsesCustomExtractor() + { + var extractor = new Extractor(); + var customExtractor = new TestCustomExtractor(extractor); + extractor.AddCustomExtractor(customExtractor); + + // Create a test file with the custom magic bytes + var testData = System.Text.Encoding.ASCII.GetBytes("CUSTOM1 This is test data"); + var results = extractor.Extract("test.custom", testData).ToList(); + + Assert.AreEqual(1, results.Count); + Assert.AreEqual("extracted_from_custom.txt", results[0].Name); + + // Read the content to verify it was processed by our custom extractor + using var reader = new StreamReader(results[0].Content); + results[0].Content.Position = 0; + var content = reader.ReadToEnd(); + Assert.AreEqual("Extracted by TestCustomExtractor", content); + } + + [TestMethod] + public async Task ExtractAsync_WithMatchingCustomExtractor_UsesCustomExtractor() + { + var extractor = new Extractor(); + var customExtractor = new TestCustomExtractor(extractor); + extractor.AddCustomExtractor(customExtractor); + + // Create a test file with the custom magic bytes + var testData = System.Text.Encoding.ASCII.GetBytes("CUSTOM1 This is test data"); + var results = await extractor.ExtractAsync("test.custom", testData).ToListAsync(); + + Assert.AreEqual(1, results.Count); + Assert.AreEqual("extracted_from_custom.txt", results[0].Name); + + // Read the content to verify it was processed by our custom extractor + using var reader = new StreamReader(results[0].Content); + results[0].Content.Position = 0; + var content = reader.ReadToEnd(); + Assert.AreEqual("Extracted by TestCustomExtractor", content); + } + + [TestMethod] + public void Extract_WithoutMatchingCustomExtractor_ReturnsOriginalFile() + { + var extractor = new Extractor(); + var customExtractor = new TestCustomExtractor(extractor); + extractor.AddCustomExtractor(customExtractor); + + // Create a test file that doesn't match the custom magic bytes + var testData = System.Text.Encoding.ASCII.GetBytes("NOTCUSTOM This is test data"); + var results = extractor.Extract("test.txt", testData).ToList(); + + // Should return the original file since no custom extractor matched + Assert.AreEqual(1, results.Count); + Assert.AreEqual("test.txt", results[0].Name); + + // Verify it's the original content + using var reader = new StreamReader(results[0].Content); + results[0].Content.Position = 0; + var content = reader.ReadToEnd(); + Assert.AreEqual("NOTCUSTOM This is test data", content); + } + + [TestMethod] + public void Extract_MultipleCustomExtractors_UsesCorrectOne() + { + var extractor = new Extractor(); + extractor.AddCustomExtractor(new TestCustomExtractor(extractor)); + extractor.AddCustomExtractor(new SecondTestCustomExtractor(extractor)); + + // Test with first custom format + var testData1 = System.Text.Encoding.ASCII.GetBytes("CUSTOM1 data"); + var results1 = extractor.Extract("test1.custom", testData1).ToList(); + Assert.AreEqual(1, results1.Count); + Assert.AreEqual("extracted_from_custom.txt", results1[0].Name); + + // Test with second custom format + var testData2 = System.Text.Encoding.ASCII.GetBytes("CUSTOM2 data"); + var results2 = extractor.Extract("test2.custom", testData2).ToList(); + Assert.AreEqual(1, results2.Count); + Assert.AreEqual("extracted_from_second_custom.txt", results2[0].Name); + } + + [TestMethod] + public void Extract_NoCustomExtractors_ReturnsOriginalFile() + { + var extractor = new Extractor(); + + // Don't add any custom extractors + var testData = System.Text.Encoding.ASCII.GetBytes("CUSTOM1 This is test data"); + var results = extractor.Extract("test.custom", testData).ToList(); + + // Should return the original file since no custom extractor is registered + Assert.AreEqual(1, results.Count); + Assert.AreEqual("test.custom", results[0].Name); + } + + [TestMethod] + public void Extract_CustomExtractorForKnownFormat_UsesBuiltInExtractor() + { + var extractor = new Extractor(); + var customExtractor = new TestCustomExtractor(extractor); + extractor.AddCustomExtractor(customExtractor); + + // Test with a real ZIP file - should use built-in extractor, not custom + var path = Path.Combine(Directory.GetCurrentDirectory(), "TestData", "TestDataArchives", "EmptyFile.txt.zip"); + if (File.Exists(path)) + { + var results = extractor.Extract(path).ToList(); + + // Should extract the ZIP normally, not use the custom extractor + Assert.IsTrue(results.Count > 0); + Assert.IsTrue(results.Any(r => r.Name.Contains("EmptyFile"))); + } + } +} diff --git a/RecursiveExtractor/Extractor.cs b/RecursiveExtractor/Extractor.cs index 20564b5..e0b2979 100644 --- a/RecursiveExtractor/Extractor.cs +++ b/RecursiveExtractor/Extractor.cs @@ -41,6 +41,12 @@ public Extractor() internal Dictionary Extractors { get; } = new Dictionary(); + /// + /// Collection of custom extractors that can handle file types not natively supported. + /// These are checked when a file type is detected as UNKNOWN. + /// + public HashSet CustomExtractors { get; } = new HashSet(); + /// /// Set up the Default Extractors compatible with this platform. /// @@ -97,6 +103,43 @@ public void ClearExtractors() Extractors.Clear(); } + /// + /// Add a custom extractor that can handle file types not natively supported. + /// Custom extractors are checked in the order they were added when a file type is UNKNOWN. + /// + /// The custom extractor implementation to add. + /// True if the extractor was added, false if it was already present. + public bool AddCustomExtractor(CustomAsyncExtractorInterface customExtractor) + { + if (customExtractor == null) + { + throw new ArgumentNullException(nameof(customExtractor)); + } + return CustomExtractors.Add(customExtractor); + } + + /// + /// Remove a custom extractor. + /// + /// The custom extractor to remove. + /// True if the extractor was removed, false if it was not found. + public bool RemoveCustomExtractor(CustomAsyncExtractorInterface customExtractor) + { + if (customExtractor == null) + { + throw new ArgumentNullException(nameof(customExtractor)); + } + return CustomExtractors.Remove(customExtractor); + } + + /// + /// Remove all custom extractors. + /// + public void ClearCustomExtractors() + { + CustomExtractors.Clear(); + } + /// /// Check if the two files are identical (i.e. Extraction is a quine) /// @@ -308,6 +351,31 @@ public async IAsyncEnumerable ExtractAsync(string filename, byte[] ar /// private readonly NLog.Logger Logger = NLog.LogManager.GetCurrentClassLogger(); + /// + /// Finds a custom extractor that can handle the given file entry. + /// + /// The file entry to check. + /// A custom extractor that can handle the file, or null if none found. + private CustomAsyncExtractorInterface? FindMatchingCustomExtractor(FileEntry fileEntry) + { + foreach (var customExtractor in CustomExtractors) + { + try + { + if (customExtractor.CanExtract(fileEntry.Content)) + { + Logger.Debug("Custom extractor {0} matched for file {1}", customExtractor.GetType().Name, fileEntry.FullPath); + return customExtractor; + } + } + catch (Exception e) + { + Logger.Debug("Custom extractor {0} threw exception when checking {1}: {2}", customExtractor.GetType().Name, fileEntry.FullPath, e.Message); + } + } + return null; + } + /// /// Extract asynchronously from a FileEntry. /// @@ -348,13 +416,39 @@ public async IAsyncEnumerable ExtractAsync(FileEntry fileEntry, Extra var type = fileEntry.ArchiveType; if (options.IsAcceptableType(type)) { - if (((opts?.RawExtensions?.Any(x => Path.GetExtension(fileEntry.FullPath).Equals(x)) ?? false) || type == ArchiveFileType.UNKNOWN || !Extractors.ContainsKey(type))) + // If this file should be treated as a raw file based on extension, just yield it + if (opts?.RawExtensions?.Any(x => Path.GetExtension(fileEntry.FullPath).Equals(x)) ?? false) { if (options.FileNamePasses(fileEntry.FullPath)) { yield return fileEntry; } } + // If type is UNKNOWN or no extractor is registered, check custom extractors + else if (type == ArchiveFileType.UNKNOWN || !Extractors.ContainsKey(type)) + { + // Try to find a custom extractor that can handle this file + var customExtractor = FindMatchingCustomExtractor(fileEntry); + if (customExtractor != null) + { + // Use the custom extractor + await foreach (var result in customExtractor.ExtractAsync(fileEntry, options, resourceGovernor, false)) + { + if (options.FileNamePasses(result.FullPath)) + { + yield return result; + } + } + } + else + { + // No custom extractor found, yield as raw file + if (options.FileNamePasses(fileEntry.FullPath)) + { + yield return fileEntry; + } + } + } else { await foreach (var result in Extractors[type].ExtractAsync(fileEntry, options, resourceGovernor, false)) @@ -640,14 +734,38 @@ public IEnumerable Extract(FileEntry fileEntry, ExtractorOptions? opt resourceGovernor.AdjustRemainingBytes(-fileEntry.Content.Length); // If this file should be treated as a raw file, and not extracted, just yield it - if (options.RawExtensions.Any(x => Path.GetExtension(fileEntry.FullPath).Equals(x)) || - type == ArchiveFileType.UNKNOWN || !Extractors.ContainsKey(type)) + if (options.RawExtensions.Any(x => Path.GetExtension(fileEntry.FullPath).Equals(x))) { if (options.FileNamePasses(fileEntry.FullPath)) { yield return fileEntry; } } + // If type is UNKNOWN or no extractor is registered, check custom extractors + else if (type == ArchiveFileType.UNKNOWN || !Extractors.ContainsKey(type)) + { + // Try to find a custom extractor that can handle this file + var customExtractor = FindMatchingCustomExtractor(fileEntry); + if (customExtractor != null) + { + // Use the custom extractor + foreach (var extractedResult in customExtractor.Extract(fileEntry, options, resourceGovernor, false)) + { + if (options.FileNamePasses(extractedResult.FullPath)) + { + yield return extractedResult; + } + } + } + else + { + // No custom extractor found, yield as raw file + if (options.FileNamePasses(fileEntry.FullPath)) + { + yield return fileEntry; + } + } + } // Otherwise yield all the results from its extraction else { diff --git a/RecursiveExtractor/Extractors/CustomAsyncExtractorInterface.cs b/RecursiveExtractor/Extractors/CustomAsyncExtractorInterface.cs new file mode 100644 index 0000000..d4ccd19 --- /dev/null +++ b/RecursiveExtractor/Extractors/CustomAsyncExtractorInterface.cs @@ -0,0 +1,20 @@ +using System.IO; + +namespace Microsoft.CST.RecursiveExtractor.Extractors +{ + /// + /// An interface for custom extractors that can determine if they can handle a given stream. + /// This allows library users to extend the extractor with support for additional archive types. + /// + public interface CustomAsyncExtractorInterface : AsyncExtractorInterface + { + /// + /// Determines if this extractor can extract the given stream based on binary signatures or other criteria. + /// This method should check the stream's content (similar to how MiniMagic works) and return true if this + /// extractor supports the file format. + /// + /// The stream to check. The implementation should preserve the stream's original position. + /// True if this extractor can handle the stream, false otherwise. + bool CanExtract(Stream stream); + } +} From cf7a481ec5a822805ca7086c9c9aceb2f85dd15a Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 19 Nov 2025 22:47:22 +0000 Subject: [PATCH 3/6] Add documentation for custom extractors to README Co-authored-by: gfs <98900+gfs@users.noreply.github.com> --- README.md | 82 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/README.md b/README.md index fea318c..d020f58 100644 --- a/README.md +++ b/README.md @@ -162,6 +162,88 @@ catch(OverflowException) ``` +
+Custom Extractors for Additional File Types +
+You can extend RecursiveExtractor with custom extractors to support additional archive or file formats not natively supported. This is useful for formats like MSI, MSP, or other proprietary archive formats. + +To create a custom extractor, implement the `CustomAsyncExtractorInterface` interface and register it with the extractor: + +```csharp +using Microsoft.CST.RecursiveExtractor; +using Microsoft.CST.RecursiveExtractor.Extractors; +using System.IO; +using System.Collections.Generic; +using System.Linq; + +// Example: Custom extractor for a hypothetical archive format with magic bytes "MYARC" +public class MyCustomExtractor : CustomAsyncExtractorInterface +{ + private readonly Extractor context; + private static readonly byte[] MAGIC_BYTES = System.Text.Encoding.ASCII.GetBytes("MYARC"); + + public MyCustomExtractor(Extractor ctx) + { + context = ctx; + } + + // Check if this extractor can handle the file based on binary signatures + public bool CanExtract(Stream stream) + { + if (stream == null || !stream.CanRead || !stream.CanSeek || stream.Length < MAGIC_BYTES.Length) + { + return false; + } + + var initialPosition = stream.Position; + try + { + stream.Position = 0; + var buffer = new byte[MAGIC_BYTES.Length]; + var bytesRead = stream.Read(buffer, 0, MAGIC_BYTES.Length); + + return bytesRead == MAGIC_BYTES.Length && buffer.SequenceEqual(MAGIC_BYTES); + } + finally + { + // Always restore the original position + stream.Position = initialPosition; + } + } + + // Implement extraction logic + public IEnumerable Extract(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor, bool topLevel = true) + { + // Your extraction logic here + // For example, parse the archive and yield FileEntry objects for each contained file + yield break; + } + + public async IAsyncEnumerable ExtractAsync(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor, bool topLevel = true) + { + // Your async extraction logic here + yield break; + } +} + +// Register the custom extractor +var extractor = new Extractor(); +var customExtractor = new MyCustomExtractor(extractor); +extractor.AddCustomExtractor(customExtractor); + +// Now the extractor will use your custom extractor for files matching your CanExtract criteria +var results = extractor.Extract("path/to/custom/archive.myarc"); +``` + +Key points: +- The `CanExtract` method should check the stream's binary signature (like MiniMagic does) and return true if this extractor can handle the format +- Always preserve the stream's original position in `CanExtract` +- Custom extractors are only checked when the file type is UNKNOWN (not recognized by built-in extractors) +- Multiple custom extractors can be registered; they are checked in the order they were added +- Custom extractors are invoked for both synchronous and asynchronous extraction paths + +
+ ## Exceptions RecursiveExtractor protects against [ZipSlip](https://snyk.io/research/zip-slip-vulnerability), [Quines, and Zip Bombs](https://en.wikipedia.org/wiki/Zip_bomb). Calls to Extract will throw an `OverflowException` when a Quine or Zip bomb is detected and a `TimeOutException` if `EnableTiming` is set and the specified time period has elapsed before completion. From be008825160ea790e1950b6739f73e766c83ad19 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 20 Nov 2025 16:57:47 +0000 Subject: [PATCH 4/6] Make CustomExtractors property internal per code review feedback Co-authored-by: gfs <98900+gfs@users.noreply.github.com> --- RecursiveExtractor/Extractor.cs | 2 +- RecursiveExtractor/RecursiveExtractor.csproj | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/RecursiveExtractor/Extractor.cs b/RecursiveExtractor/Extractor.cs index e0b2979..700ca2a 100644 --- a/RecursiveExtractor/Extractor.cs +++ b/RecursiveExtractor/Extractor.cs @@ -45,7 +45,7 @@ public Extractor() /// Collection of custom extractors that can handle file types not natively supported. /// These are checked when a file type is detected as UNKNOWN. /// - public HashSet CustomExtractors { get; } = new HashSet(); + internal HashSet CustomExtractors { get; } = new HashSet(); /// /// Set up the Default Extractors compatible with this platform. diff --git a/RecursiveExtractor/RecursiveExtractor.csproj b/RecursiveExtractor/RecursiveExtractor.csproj index 09b0c90..9fdec44 100644 --- a/RecursiveExtractor/RecursiveExtractor.csproj +++ b/RecursiveExtractor/RecursiveExtractor.csproj @@ -24,6 +24,12 @@ snupkg + + + <_Parameter1>RecursiveExtractor.Tests + + + From 387d57981b5562629c69dfe6e8df2e56316323d0 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 21 Nov 2025 19:06:06 +0000 Subject: [PATCH 5/6] Rename CustomAsyncExtractorInterface to ICustomAsyncExtractor per code review Co-authored-by: gfs <98900+gfs@users.noreply.github.com> --- README.md | 4 ++-- .../ExtractorTests/CustomExtractorTests.cs | 4 ++-- RecursiveExtractor/Extractor.cs | 8 ++++---- ...syncExtractorInterface.cs => ICustomAsyncExtractor.cs} | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) rename RecursiveExtractor/Extractors/{CustomAsyncExtractorInterface.cs => ICustomAsyncExtractor.cs} (92%) diff --git a/README.md b/README.md index d020f58..7c66029 100644 --- a/README.md +++ b/README.md @@ -167,7 +167,7 @@ catch(OverflowException)
You can extend RecursiveExtractor with custom extractors to support additional archive or file formats not natively supported. This is useful for formats like MSI, MSP, or other proprietary archive formats. -To create a custom extractor, implement the `CustomAsyncExtractorInterface` interface and register it with the extractor: +To create a custom extractor, implement the `ICustomAsyncExtractor` interface and register it with the extractor: ```csharp using Microsoft.CST.RecursiveExtractor; @@ -177,7 +177,7 @@ using System.Collections.Generic; using System.Linq; // Example: Custom extractor for a hypothetical archive format with magic bytes "MYARC" -public class MyCustomExtractor : CustomAsyncExtractorInterface +public class MyCustomExtractor : ICustomAsyncExtractor { private readonly Extractor context; private static readonly byte[] MAGIC_BYTES = System.Text.Encoding.ASCII.GetBytes("MYARC"); diff --git a/RecursiveExtractor.Tests/ExtractorTests/CustomExtractorTests.cs b/RecursiveExtractor.Tests/ExtractorTests/CustomExtractorTests.cs index 701a918..cc3e289 100644 --- a/RecursiveExtractor.Tests/ExtractorTests/CustomExtractorTests.cs +++ b/RecursiveExtractor.Tests/ExtractorTests/CustomExtractorTests.cs @@ -16,7 +16,7 @@ public class CustomExtractorTests /// A simple test custom extractor that extracts files with a specific magic number /// For testing purposes, it recognizes files starting with "CUSTOM1" ///
- private class TestCustomExtractor : CustomAsyncExtractorInterface + private class TestCustomExtractor : ICustomAsyncExtractor { private readonly Extractor context; private static readonly byte[] MAGIC_BYTES = System.Text.Encoding.ASCII.GetBytes("CUSTOM1"); @@ -71,7 +71,7 @@ public async IAsyncEnumerable ExtractAsync(FileEntry fileEntry, Extra /// /// A second test custom extractor that recognizes files starting with "CUSTOM2" /// - private class SecondTestCustomExtractor : CustomAsyncExtractorInterface + private class SecondTestCustomExtractor : ICustomAsyncExtractor { private readonly Extractor context; private static readonly byte[] MAGIC_BYTES = System.Text.Encoding.ASCII.GetBytes("CUSTOM2"); diff --git a/RecursiveExtractor/Extractor.cs b/RecursiveExtractor/Extractor.cs index 700ca2a..69b9d7c 100644 --- a/RecursiveExtractor/Extractor.cs +++ b/RecursiveExtractor/Extractor.cs @@ -45,7 +45,7 @@ public Extractor() /// Collection of custom extractors that can handle file types not natively supported. /// These are checked when a file type is detected as UNKNOWN. /// - internal HashSet CustomExtractors { get; } = new HashSet(); + internal HashSet CustomExtractors { get; } = new HashSet(); /// /// Set up the Default Extractors compatible with this platform. @@ -109,7 +109,7 @@ public void ClearExtractors() /// /// The custom extractor implementation to add. /// True if the extractor was added, false if it was already present. - public bool AddCustomExtractor(CustomAsyncExtractorInterface customExtractor) + public bool AddCustomExtractor(ICustomAsyncExtractor customExtractor) { if (customExtractor == null) { @@ -123,7 +123,7 @@ public bool AddCustomExtractor(CustomAsyncExtractorInterface customExtractor) /// /// The custom extractor to remove. /// True if the extractor was removed, false if it was not found. - public bool RemoveCustomExtractor(CustomAsyncExtractorInterface customExtractor) + public bool RemoveCustomExtractor(ICustomAsyncExtractor customExtractor) { if (customExtractor == null) { @@ -356,7 +356,7 @@ public async IAsyncEnumerable ExtractAsync(string filename, byte[] ar /// /// The file entry to check. /// A custom extractor that can handle the file, or null if none found. - private CustomAsyncExtractorInterface? FindMatchingCustomExtractor(FileEntry fileEntry) + private ICustomAsyncExtractor? FindMatchingCustomExtractor(FileEntry fileEntry) { foreach (var customExtractor in CustomExtractors) { diff --git a/RecursiveExtractor/Extractors/CustomAsyncExtractorInterface.cs b/RecursiveExtractor/Extractors/ICustomAsyncExtractor.cs similarity index 92% rename from RecursiveExtractor/Extractors/CustomAsyncExtractorInterface.cs rename to RecursiveExtractor/Extractors/ICustomAsyncExtractor.cs index d4ccd19..64f2dc0 100644 --- a/RecursiveExtractor/Extractors/CustomAsyncExtractorInterface.cs +++ b/RecursiveExtractor/Extractors/ICustomAsyncExtractor.cs @@ -6,7 +6,7 @@ namespace Microsoft.CST.RecursiveExtractor.Extractors /// An interface for custom extractors that can determine if they can handle a given stream. /// This allows library users to extend the extractor with support for additional archive types. /// - public interface CustomAsyncExtractorInterface : AsyncExtractorInterface + public interface ICustomAsyncExtractor : AsyncExtractorInterface { /// /// Determines if this extractor can extract the given stream based on binary signatures or other criteria. From 199d46abe77031526c4cedb07a069cd123146e59 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 21 Nov 2025 19:15:07 +0000 Subject: [PATCH 6/6] Change CustomExtractors to ICollection and use constructor-based injection Co-authored-by: gfs <98900+gfs@users.noreply.github.com> --- README.md | 10 +- .../ExtractorTests/CustomExtractorTests.cs | 100 +++++------------- RecursiveExtractor/Extractor.cs | 57 ++++------ 3 files changed, 51 insertions(+), 116 deletions(-) diff --git a/README.md b/README.md index 7c66029..65cccf5 100644 --- a/README.md +++ b/README.md @@ -226,10 +226,9 @@ public class MyCustomExtractor : ICustomAsyncExtractor } } -// Register the custom extractor -var extractor = new Extractor(); -var customExtractor = new MyCustomExtractor(extractor); -extractor.AddCustomExtractor(customExtractor); +// Register the custom extractor via constructor +var customExtractor = new MyCustomExtractor(null); +var extractor = new Extractor(new[] { customExtractor }); // Now the extractor will use your custom extractor for files matching your CanExtract criteria var results = extractor.Extract("path/to/custom/archive.myarc"); @@ -238,8 +237,9 @@ var results = extractor.Extract("path/to/custom/archive.myarc"); Key points: - The `CanExtract` method should check the stream's binary signature (like MiniMagic does) and return true if this extractor can handle the format - Always preserve the stream's original position in `CanExtract` +- Custom extractors are provided via the constructor as an `IEnumerable` - Custom extractors are only checked when the file type is UNKNOWN (not recognized by built-in extractors) -- Multiple custom extractors can be registered; they are checked in the order they were added +- Multiple custom extractors can be registered; they are checked in the order provided - Custom extractors are invoked for both synchronous and asynchronous extraction paths diff --git a/RecursiveExtractor.Tests/ExtractorTests/CustomExtractorTests.cs b/RecursiveExtractor.Tests/ExtractorTests/CustomExtractorTests.cs index cc3e289..547ba87 100644 --- a/RecursiveExtractor.Tests/ExtractorTests/CustomExtractorTests.cs +++ b/RecursiveExtractor.Tests/ExtractorTests/CustomExtractorTests.cs @@ -122,81 +122,37 @@ public async IAsyncEnumerable ExtractAsync(FileEntry fileEntry, Extra } [TestMethod] - public void AddCustomExtractor_ValidExtractor_ReturnsTrue() + public void Constructor_WithCustomExtractors_RegistersExtractors() { - var extractor = new Extractor(); - var customExtractor = new TestCustomExtractor(extractor); - - var result = extractor.AddCustomExtractor(customExtractor); - - Assert.IsTrue(result); - Assert.AreEqual(1, extractor.CustomExtractors.Count); - } - - [TestMethod] - public void AddCustomExtractor_DuplicateExtractor_ReturnsFalse() - { - var extractor = new Extractor(); - var customExtractor = new TestCustomExtractor(extractor); - - extractor.AddCustomExtractor(customExtractor); - var result = extractor.AddCustomExtractor(customExtractor); + var customExtractor = new TestCustomExtractor(null!); + var extractor = new Extractor(new[] { customExtractor }); - Assert.IsFalse(result); Assert.AreEqual(1, extractor.CustomExtractors.Count); } [TestMethod] - public void AddCustomExtractor_NullExtractor_ThrowsArgumentNullException() + public void Constructor_WithMultipleCustomExtractors_RegistersAll() { - var extractor = new Extractor(); - - Assert.ThrowsException(() => extractor.AddCustomExtractor(null!)); - } - - [TestMethod] - public void RemoveCustomExtractor_ExistingExtractor_ReturnsTrue() - { - var extractor = new Extractor(); - var customExtractor = new TestCustomExtractor(extractor); - extractor.AddCustomExtractor(customExtractor); - - var result = extractor.RemoveCustomExtractor(customExtractor); + var customExtractor1 = new TestCustomExtractor(null!); + var customExtractor2 = new SecondTestCustomExtractor(null!); + var extractor = new Extractor(new ICustomAsyncExtractor[] { customExtractor1, customExtractor2 }); - Assert.IsTrue(result); - Assert.AreEqual(0, extractor.CustomExtractors.Count); - } - - [TestMethod] - public void RemoveCustomExtractor_NonExistentExtractor_ReturnsFalse() - { - var extractor = new Extractor(); - var customExtractor = new TestCustomExtractor(extractor); - - var result = extractor.RemoveCustomExtractor(customExtractor); - - Assert.IsFalse(result); - Assert.AreEqual(0, extractor.CustomExtractors.Count); + Assert.AreEqual(2, extractor.CustomExtractors.Count); } [TestMethod] - public void RemoveCustomExtractor_NullExtractor_ThrowsArgumentNullException() + public void Constructor_WithNullInCollection_IgnoresNull() { - var extractor = new Extractor(); + var customExtractor = new TestCustomExtractor(null!); + var extractor = new Extractor(new ICustomAsyncExtractor[] { customExtractor, null! }); - Assert.ThrowsException(() => extractor.RemoveCustomExtractor(null!)); + Assert.AreEqual(1, extractor.CustomExtractors.Count); } [TestMethod] - public void ClearCustomExtractors_RemovesAllExtractors() + public void Constructor_WithNullCollection_CreatesEmptyExtractor() { - var extractor = new Extractor(); - extractor.AddCustomExtractor(new TestCustomExtractor(extractor)); - extractor.AddCustomExtractor(new SecondTestCustomExtractor(extractor)); - - Assert.AreEqual(2, extractor.CustomExtractors.Count); - - extractor.ClearCustomExtractors(); + var extractor = new Extractor((IEnumerable)null!); Assert.AreEqual(0, extractor.CustomExtractors.Count); } @@ -204,9 +160,8 @@ public void ClearCustomExtractors_RemovesAllExtractors() [TestMethod] public void Extract_WithMatchingCustomExtractor_UsesCustomExtractor() { - var extractor = new Extractor(); - var customExtractor = new TestCustomExtractor(extractor); - extractor.AddCustomExtractor(customExtractor); + var customExtractor = new TestCustomExtractor(null!); + var extractor = new Extractor(new[] { customExtractor }); // Create a test file with the custom magic bytes var testData = System.Text.Encoding.ASCII.GetBytes("CUSTOM1 This is test data"); @@ -225,9 +180,8 @@ public void Extract_WithMatchingCustomExtractor_UsesCustomExtractor() [TestMethod] public async Task ExtractAsync_WithMatchingCustomExtractor_UsesCustomExtractor() { - var extractor = new Extractor(); - var customExtractor = new TestCustomExtractor(extractor); - extractor.AddCustomExtractor(customExtractor); + var customExtractor = new TestCustomExtractor(null!); + var extractor = new Extractor(new[] { customExtractor }); // Create a test file with the custom magic bytes var testData = System.Text.Encoding.ASCII.GetBytes("CUSTOM1 This is test data"); @@ -246,9 +200,8 @@ public async Task ExtractAsync_WithMatchingCustomExtractor_UsesCustomExtractor() [TestMethod] public void Extract_WithoutMatchingCustomExtractor_ReturnsOriginalFile() { - var extractor = new Extractor(); - var customExtractor = new TestCustomExtractor(extractor); - extractor.AddCustomExtractor(customExtractor); + var customExtractor = new TestCustomExtractor(null!); + var extractor = new Extractor(new[] { customExtractor }); // Create a test file that doesn't match the custom magic bytes var testData = System.Text.Encoding.ASCII.GetBytes("NOTCUSTOM This is test data"); @@ -268,9 +221,11 @@ public void Extract_WithoutMatchingCustomExtractor_ReturnsOriginalFile() [TestMethod] public void Extract_MultipleCustomExtractors_UsesCorrectOne() { - var extractor = new Extractor(); - extractor.AddCustomExtractor(new TestCustomExtractor(extractor)); - extractor.AddCustomExtractor(new SecondTestCustomExtractor(extractor)); + var extractor = new Extractor(new ICustomAsyncExtractor[] + { + new TestCustomExtractor(null!), + new SecondTestCustomExtractor(null!) + }); // Test with first custom format var testData1 = System.Text.Encoding.ASCII.GetBytes("CUSTOM1 data"); @@ -302,9 +257,8 @@ public void Extract_NoCustomExtractors_ReturnsOriginalFile() [TestMethod] public void Extract_CustomExtractorForKnownFormat_UsesBuiltInExtractor() { - var extractor = new Extractor(); - var customExtractor = new TestCustomExtractor(extractor); - extractor.AddCustomExtractor(customExtractor); + var customExtractor = new TestCustomExtractor(null!); + var extractor = new Extractor(new[] { customExtractor }); // Test with a real ZIP file - should use built-in extractor, not custom var path = Path.Combine(Directory.GetCurrentDirectory(), "TestData", "TestDataArchives", "EmptyFile.txt.zip"); diff --git a/RecursiveExtractor/Extractor.cs b/RecursiveExtractor/Extractor.cs index 69b9d7c..15dafc5 100644 --- a/RecursiveExtractor/Extractor.cs +++ b/RecursiveExtractor/Extractor.cs @@ -39,13 +39,31 @@ public Extractor() SetDefaultExtractors(); } + /// + /// Instantiate an extractor with the default extractors and custom extractors. + /// + /// Custom extractors to register for handling file types not natively supported. + public Extractor(IEnumerable customExtractors) : this() + { + if (customExtractors != null) + { + foreach (var extractor in customExtractors) + { + if (extractor != null) + { + ((HashSet)CustomExtractors).Add(extractor); + } + } + } + } + internal Dictionary Extractors { get; } = new Dictionary(); /// /// Collection of custom extractors that can handle file types not natively supported. /// These are checked when a file type is detected as UNKNOWN. /// - internal HashSet CustomExtractors { get; } = new HashSet(); + internal ICollection CustomExtractors { get; } = new HashSet(); /// /// Set up the Default Extractors compatible with this platform. @@ -103,43 +121,6 @@ public void ClearExtractors() Extractors.Clear(); } - /// - /// Add a custom extractor that can handle file types not natively supported. - /// Custom extractors are checked in the order they were added when a file type is UNKNOWN. - /// - /// The custom extractor implementation to add. - /// True if the extractor was added, false if it was already present. - public bool AddCustomExtractor(ICustomAsyncExtractor customExtractor) - { - if (customExtractor == null) - { - throw new ArgumentNullException(nameof(customExtractor)); - } - return CustomExtractors.Add(customExtractor); - } - - /// - /// Remove a custom extractor. - /// - /// The custom extractor to remove. - /// True if the extractor was removed, false if it was not found. - public bool RemoveCustomExtractor(ICustomAsyncExtractor customExtractor) - { - if (customExtractor == null) - { - throw new ArgumentNullException(nameof(customExtractor)); - } - return CustomExtractors.Remove(customExtractor); - } - - /// - /// Remove all custom extractors. - /// - public void ClearCustomExtractors() - { - CustomExtractors.Clear(); - } - /// /// Check if the two files are identical (i.e. Extraction is a quine) ///