From 997c9ed1b286d7e28b0c29e0084b8fcf1d9d4415 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 19 Nov 2025 22:33:21 +0000
Subject: [PATCH 1/6] Initial plan
From 879e0a2f8df6942423d853ff6c801eff4a959539 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 19 Nov 2025 22:43:02 +0000
Subject: [PATCH 2/6] Add custom extractor support with interfaces and tests
Co-authored-by: gfs <98900+gfs@users.noreply.github.com>
---
.../ExtractorTests/CustomExtractorTests.cs | 320 ++++++++++++++++++
RecursiveExtractor/Extractor.cs | 124 ++++++-
.../CustomAsyncExtractorInterface.cs | 20 ++
3 files changed, 461 insertions(+), 3 deletions(-)
create mode 100644 RecursiveExtractor.Tests/ExtractorTests/CustomExtractorTests.cs
create mode 100644 RecursiveExtractor/Extractors/CustomAsyncExtractorInterface.cs
diff --git a/RecursiveExtractor.Tests/ExtractorTests/CustomExtractorTests.cs b/RecursiveExtractor.Tests/ExtractorTests/CustomExtractorTests.cs
new file mode 100644
index 0000000..701a918
--- /dev/null
+++ b/RecursiveExtractor.Tests/ExtractorTests/CustomExtractorTests.cs
@@ -0,0 +1,320 @@
+using Microsoft.CST.RecursiveExtractor;
+using Microsoft.CST.RecursiveExtractor.Extractors;
+using Microsoft.VisualStudio.TestTools.UnitTesting;
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Threading.Tasks;
+
+namespace RecursiveExtractor.Tests.ExtractorTests;
+
+[TestClass]
+public class CustomExtractorTests
+{
+ ///
+ /// A simple test custom extractor that extracts files with a specific magic number
+ /// For testing purposes, it recognizes files starting with "CUSTOM1"
+ ///
+ private class TestCustomExtractor : CustomAsyncExtractorInterface
+ {
+ private readonly Extractor context;
+ private static readonly byte[] MAGIC_BYTES = System.Text.Encoding.ASCII.GetBytes("CUSTOM1");
+
+ public TestCustomExtractor(Extractor ctx)
+ {
+ context = ctx;
+ }
+
+ public bool CanExtract(Stream stream)
+ {
+ if (stream == null || !stream.CanRead || !stream.CanSeek || stream.Length < MAGIC_BYTES.Length)
+ {
+ return false;
+ }
+
+ var initialPosition = stream.Position;
+ try
+ {
+ stream.Position = 0;
+ var buffer = new byte[MAGIC_BYTES.Length];
+ var bytesRead = stream.Read(buffer, 0, MAGIC_BYTES.Length);
+
+ if (bytesRead == MAGIC_BYTES.Length && buffer.SequenceEqual(MAGIC_BYTES))
+ {
+ return true;
+ }
+ return false;
+ }
+ finally
+ {
+ stream.Position = initialPosition;
+ }
+ }
+
+ public IEnumerable Extract(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor, bool topLevel = true)
+ {
+ // For this test, we just return a synthetic file entry showing the custom extractor worked
+ var content = new MemoryStream(System.Text.Encoding.UTF8.GetBytes("Extracted by TestCustomExtractor"));
+ yield return new FileEntry("extracted_from_custom.txt", content, fileEntry);
+ }
+
+ public async IAsyncEnumerable ExtractAsync(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor, bool topLevel = true)
+ {
+ // For this test, we just return a synthetic file entry showing the custom extractor worked
+ var content = new MemoryStream(System.Text.Encoding.UTF8.GetBytes("Extracted by TestCustomExtractor"));
+ yield return new FileEntry("extracted_from_custom.txt", content, fileEntry);
+ await Task.CompletedTask;
+ }
+ }
+
+ ///
+ /// A second test custom extractor that recognizes files starting with "CUSTOM2"
+ ///
+ private class SecondTestCustomExtractor : CustomAsyncExtractorInterface
+ {
+ private readonly Extractor context;
+ private static readonly byte[] MAGIC_BYTES = System.Text.Encoding.ASCII.GetBytes("CUSTOM2");
+
+ public SecondTestCustomExtractor(Extractor ctx)
+ {
+ context = ctx;
+ }
+
+ public bool CanExtract(Stream stream)
+ {
+ if (stream == null || !stream.CanRead || !stream.CanSeek || stream.Length < MAGIC_BYTES.Length)
+ {
+ return false;
+ }
+
+ var initialPosition = stream.Position;
+ try
+ {
+ stream.Position = 0;
+ var buffer = new byte[MAGIC_BYTES.Length];
+ var bytesRead = stream.Read(buffer, 0, MAGIC_BYTES.Length);
+
+ if (bytesRead == MAGIC_BYTES.Length && buffer.SequenceEqual(MAGIC_BYTES))
+ {
+ return true;
+ }
+ return false;
+ }
+ finally
+ {
+ stream.Position = initialPosition;
+ }
+ }
+
+ public IEnumerable Extract(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor, bool topLevel = true)
+ {
+ var content = new MemoryStream(System.Text.Encoding.UTF8.GetBytes("Extracted by SecondTestCustomExtractor"));
+ yield return new FileEntry("extracted_from_second_custom.txt", content, fileEntry);
+ }
+
+ public async IAsyncEnumerable ExtractAsync(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor, bool topLevel = true)
+ {
+ var content = new MemoryStream(System.Text.Encoding.UTF8.GetBytes("Extracted by SecondTestCustomExtractor"));
+ yield return new FileEntry("extracted_from_second_custom.txt", content, fileEntry);
+ await Task.CompletedTask;
+ }
+ }
+
+ [TestMethod]
+ public void AddCustomExtractor_ValidExtractor_ReturnsTrue()
+ {
+ var extractor = new Extractor();
+ var customExtractor = new TestCustomExtractor(extractor);
+
+ var result = extractor.AddCustomExtractor(customExtractor);
+
+ Assert.IsTrue(result);
+ Assert.AreEqual(1, extractor.CustomExtractors.Count);
+ }
+
+ [TestMethod]
+ public void AddCustomExtractor_DuplicateExtractor_ReturnsFalse()
+ {
+ var extractor = new Extractor();
+ var customExtractor = new TestCustomExtractor(extractor);
+
+ extractor.AddCustomExtractor(customExtractor);
+ var result = extractor.AddCustomExtractor(customExtractor);
+
+ Assert.IsFalse(result);
+ Assert.AreEqual(1, extractor.CustomExtractors.Count);
+ }
+
+ [TestMethod]
+ public void AddCustomExtractor_NullExtractor_ThrowsArgumentNullException()
+ {
+ var extractor = new Extractor();
+
+ Assert.ThrowsException(() => extractor.AddCustomExtractor(null!));
+ }
+
+ [TestMethod]
+ public void RemoveCustomExtractor_ExistingExtractor_ReturnsTrue()
+ {
+ var extractor = new Extractor();
+ var customExtractor = new TestCustomExtractor(extractor);
+ extractor.AddCustomExtractor(customExtractor);
+
+ var result = extractor.RemoveCustomExtractor(customExtractor);
+
+ Assert.IsTrue(result);
+ Assert.AreEqual(0, extractor.CustomExtractors.Count);
+ }
+
+ [TestMethod]
+ public void RemoveCustomExtractor_NonExistentExtractor_ReturnsFalse()
+ {
+ var extractor = new Extractor();
+ var customExtractor = new TestCustomExtractor(extractor);
+
+ var result = extractor.RemoveCustomExtractor(customExtractor);
+
+ Assert.IsFalse(result);
+ Assert.AreEqual(0, extractor.CustomExtractors.Count);
+ }
+
+ [TestMethod]
+ public void RemoveCustomExtractor_NullExtractor_ThrowsArgumentNullException()
+ {
+ var extractor = new Extractor();
+
+ Assert.ThrowsException(() => extractor.RemoveCustomExtractor(null!));
+ }
+
+ [TestMethod]
+ public void ClearCustomExtractors_RemovesAllExtractors()
+ {
+ var extractor = new Extractor();
+ extractor.AddCustomExtractor(new TestCustomExtractor(extractor));
+ extractor.AddCustomExtractor(new SecondTestCustomExtractor(extractor));
+
+ Assert.AreEqual(2, extractor.CustomExtractors.Count);
+
+ extractor.ClearCustomExtractors();
+
+ Assert.AreEqual(0, extractor.CustomExtractors.Count);
+ }
+
+ [TestMethod]
+ public void Extract_WithMatchingCustomExtractor_UsesCustomExtractor()
+ {
+ var extractor = new Extractor();
+ var customExtractor = new TestCustomExtractor(extractor);
+ extractor.AddCustomExtractor(customExtractor);
+
+ // Create a test file with the custom magic bytes
+ var testData = System.Text.Encoding.ASCII.GetBytes("CUSTOM1 This is test data");
+ var results = extractor.Extract("test.custom", testData).ToList();
+
+ Assert.AreEqual(1, results.Count);
+ Assert.AreEqual("extracted_from_custom.txt", results[0].Name);
+
+ // Read the content to verify it was processed by our custom extractor
+ using var reader = new StreamReader(results[0].Content);
+ results[0].Content.Position = 0;
+ var content = reader.ReadToEnd();
+ Assert.AreEqual("Extracted by TestCustomExtractor", content);
+ }
+
+ [TestMethod]
+ public async Task ExtractAsync_WithMatchingCustomExtractor_UsesCustomExtractor()
+ {
+ var extractor = new Extractor();
+ var customExtractor = new TestCustomExtractor(extractor);
+ extractor.AddCustomExtractor(customExtractor);
+
+ // Create a test file with the custom magic bytes
+ var testData = System.Text.Encoding.ASCII.GetBytes("CUSTOM1 This is test data");
+ var results = await extractor.ExtractAsync("test.custom", testData).ToListAsync();
+
+ Assert.AreEqual(1, results.Count);
+ Assert.AreEqual("extracted_from_custom.txt", results[0].Name);
+
+ // Read the content to verify it was processed by our custom extractor
+ using var reader = new StreamReader(results[0].Content);
+ results[0].Content.Position = 0;
+ var content = reader.ReadToEnd();
+ Assert.AreEqual("Extracted by TestCustomExtractor", content);
+ }
+
+ [TestMethod]
+ public void Extract_WithoutMatchingCustomExtractor_ReturnsOriginalFile()
+ {
+ var extractor = new Extractor();
+ var customExtractor = new TestCustomExtractor(extractor);
+ extractor.AddCustomExtractor(customExtractor);
+
+ // Create a test file that doesn't match the custom magic bytes
+ var testData = System.Text.Encoding.ASCII.GetBytes("NOTCUSTOM This is test data");
+ var results = extractor.Extract("test.txt", testData).ToList();
+
+ // Should return the original file since no custom extractor matched
+ Assert.AreEqual(1, results.Count);
+ Assert.AreEqual("test.txt", results[0].Name);
+
+ // Verify it's the original content
+ using var reader = new StreamReader(results[0].Content);
+ results[0].Content.Position = 0;
+ var content = reader.ReadToEnd();
+ Assert.AreEqual("NOTCUSTOM This is test data", content);
+ }
+
+ [TestMethod]
+ public void Extract_MultipleCustomExtractors_UsesCorrectOne()
+ {
+ var extractor = new Extractor();
+ extractor.AddCustomExtractor(new TestCustomExtractor(extractor));
+ extractor.AddCustomExtractor(new SecondTestCustomExtractor(extractor));
+
+ // Test with first custom format
+ var testData1 = System.Text.Encoding.ASCII.GetBytes("CUSTOM1 data");
+ var results1 = extractor.Extract("test1.custom", testData1).ToList();
+ Assert.AreEqual(1, results1.Count);
+ Assert.AreEqual("extracted_from_custom.txt", results1[0].Name);
+
+ // Test with second custom format
+ var testData2 = System.Text.Encoding.ASCII.GetBytes("CUSTOM2 data");
+ var results2 = extractor.Extract("test2.custom", testData2).ToList();
+ Assert.AreEqual(1, results2.Count);
+ Assert.AreEqual("extracted_from_second_custom.txt", results2[0].Name);
+ }
+
+ [TestMethod]
+ public void Extract_NoCustomExtractors_ReturnsOriginalFile()
+ {
+ var extractor = new Extractor();
+
+ // Don't add any custom extractors
+ var testData = System.Text.Encoding.ASCII.GetBytes("CUSTOM1 This is test data");
+ var results = extractor.Extract("test.custom", testData).ToList();
+
+ // Should return the original file since no custom extractor is registered
+ Assert.AreEqual(1, results.Count);
+ Assert.AreEqual("test.custom", results[0].Name);
+ }
+
+ [TestMethod]
+ public void Extract_CustomExtractorForKnownFormat_UsesBuiltInExtractor()
+ {
+ var extractor = new Extractor();
+ var customExtractor = new TestCustomExtractor(extractor);
+ extractor.AddCustomExtractor(customExtractor);
+
+ // Test with a real ZIP file - should use built-in extractor, not custom
+ var path = Path.Combine(Directory.GetCurrentDirectory(), "TestData", "TestDataArchives", "EmptyFile.txt.zip");
+ if (File.Exists(path))
+ {
+ var results = extractor.Extract(path).ToList();
+
+ // Should extract the ZIP normally, not use the custom extractor
+ Assert.IsTrue(results.Count > 0);
+ Assert.IsTrue(results.Any(r => r.Name.Contains("EmptyFile")));
+ }
+ }
+}
diff --git a/RecursiveExtractor/Extractor.cs b/RecursiveExtractor/Extractor.cs
index 20564b5..e0b2979 100644
--- a/RecursiveExtractor/Extractor.cs
+++ b/RecursiveExtractor/Extractor.cs
@@ -41,6 +41,12 @@ public Extractor()
internal Dictionary Extractors { get; } = new Dictionary();
+ ///
+ /// Collection of custom extractors that can handle file types not natively supported.
+ /// These are checked when a file type is detected as UNKNOWN.
+ ///
+ public HashSet CustomExtractors { get; } = new HashSet();
+
///
/// Set up the Default Extractors compatible with this platform.
///
@@ -97,6 +103,43 @@ public void ClearExtractors()
Extractors.Clear();
}
+ ///
+ /// Add a custom extractor that can handle file types not natively supported.
+ /// Custom extractors are checked in the order they were added when a file type is UNKNOWN.
+ ///
+ /// The custom extractor implementation to add.
+ /// True if the extractor was added, false if it was already present.
+ public bool AddCustomExtractor(CustomAsyncExtractorInterface customExtractor)
+ {
+ if (customExtractor == null)
+ {
+ throw new ArgumentNullException(nameof(customExtractor));
+ }
+ return CustomExtractors.Add(customExtractor);
+ }
+
+ ///
+ /// Remove a custom extractor.
+ ///
+ /// The custom extractor to remove.
+ /// True if the extractor was removed, false if it was not found.
+ public bool RemoveCustomExtractor(CustomAsyncExtractorInterface customExtractor)
+ {
+ if (customExtractor == null)
+ {
+ throw new ArgumentNullException(nameof(customExtractor));
+ }
+ return CustomExtractors.Remove(customExtractor);
+ }
+
+ ///
+ /// Remove all custom extractors.
+ ///
+ public void ClearCustomExtractors()
+ {
+ CustomExtractors.Clear();
+ }
+
///
/// Check if the two files are identical (i.e. Extraction is a quine)
///
@@ -308,6 +351,31 @@ public async IAsyncEnumerable ExtractAsync(string filename, byte[] ar
///
private readonly NLog.Logger Logger = NLog.LogManager.GetCurrentClassLogger();
+ ///
+ /// Finds a custom extractor that can handle the given file entry.
+ ///
+ /// The file entry to check.
+ /// A custom extractor that can handle the file, or null if none found.
+ private CustomAsyncExtractorInterface? FindMatchingCustomExtractor(FileEntry fileEntry)
+ {
+ foreach (var customExtractor in CustomExtractors)
+ {
+ try
+ {
+ if (customExtractor.CanExtract(fileEntry.Content))
+ {
+ Logger.Debug("Custom extractor {0} matched for file {1}", customExtractor.GetType().Name, fileEntry.FullPath);
+ return customExtractor;
+ }
+ }
+ catch (Exception e)
+ {
+ Logger.Debug("Custom extractor {0} threw exception when checking {1}: {2}", customExtractor.GetType().Name, fileEntry.FullPath, e.Message);
+ }
+ }
+ return null;
+ }
+
///
/// Extract asynchronously from a FileEntry.
///
@@ -348,13 +416,39 @@ public async IAsyncEnumerable ExtractAsync(FileEntry fileEntry, Extra
var type = fileEntry.ArchiveType;
if (options.IsAcceptableType(type))
{
- if (((opts?.RawExtensions?.Any(x => Path.GetExtension(fileEntry.FullPath).Equals(x)) ?? false) || type == ArchiveFileType.UNKNOWN || !Extractors.ContainsKey(type)))
+ // If this file should be treated as a raw file based on extension, just yield it
+ if (opts?.RawExtensions?.Any(x => Path.GetExtension(fileEntry.FullPath).Equals(x)) ?? false)
{
if (options.FileNamePasses(fileEntry.FullPath))
{
yield return fileEntry;
}
}
+ // If type is UNKNOWN or no extractor is registered, check custom extractors
+ else if (type == ArchiveFileType.UNKNOWN || !Extractors.ContainsKey(type))
+ {
+ // Try to find a custom extractor that can handle this file
+ var customExtractor = FindMatchingCustomExtractor(fileEntry);
+ if (customExtractor != null)
+ {
+ // Use the custom extractor
+ await foreach (var result in customExtractor.ExtractAsync(fileEntry, options, resourceGovernor, false))
+ {
+ if (options.FileNamePasses(result.FullPath))
+ {
+ yield return result;
+ }
+ }
+ }
+ else
+ {
+ // No custom extractor found, yield as raw file
+ if (options.FileNamePasses(fileEntry.FullPath))
+ {
+ yield return fileEntry;
+ }
+ }
+ }
else
{
await foreach (var result in Extractors[type].ExtractAsync(fileEntry, options, resourceGovernor, false))
@@ -640,14 +734,38 @@ public IEnumerable Extract(FileEntry fileEntry, ExtractorOptions? opt
resourceGovernor.AdjustRemainingBytes(-fileEntry.Content.Length);
// If this file should be treated as a raw file, and not extracted, just yield it
- if (options.RawExtensions.Any(x => Path.GetExtension(fileEntry.FullPath).Equals(x)) ||
- type == ArchiveFileType.UNKNOWN || !Extractors.ContainsKey(type))
+ if (options.RawExtensions.Any(x => Path.GetExtension(fileEntry.FullPath).Equals(x)))
{
if (options.FileNamePasses(fileEntry.FullPath))
{
yield return fileEntry;
}
}
+ // If type is UNKNOWN or no extractor is registered, check custom extractors
+ else if (type == ArchiveFileType.UNKNOWN || !Extractors.ContainsKey(type))
+ {
+ // Try to find a custom extractor that can handle this file
+ var customExtractor = FindMatchingCustomExtractor(fileEntry);
+ if (customExtractor != null)
+ {
+ // Use the custom extractor
+ foreach (var extractedResult in customExtractor.Extract(fileEntry, options, resourceGovernor, false))
+ {
+ if (options.FileNamePasses(extractedResult.FullPath))
+ {
+ yield return extractedResult;
+ }
+ }
+ }
+ else
+ {
+ // No custom extractor found, yield as raw file
+ if (options.FileNamePasses(fileEntry.FullPath))
+ {
+ yield return fileEntry;
+ }
+ }
+ }
// Otherwise yield all the results from its extraction
else
{
diff --git a/RecursiveExtractor/Extractors/CustomAsyncExtractorInterface.cs b/RecursiveExtractor/Extractors/CustomAsyncExtractorInterface.cs
new file mode 100644
index 0000000..d4ccd19
--- /dev/null
+++ b/RecursiveExtractor/Extractors/CustomAsyncExtractorInterface.cs
@@ -0,0 +1,20 @@
+using System.IO;
+
+namespace Microsoft.CST.RecursiveExtractor.Extractors
+{
+ ///
+ /// An interface for custom extractors that can determine if they can handle a given stream.
+ /// This allows library users to extend the extractor with support for additional archive types.
+ ///
+ public interface CustomAsyncExtractorInterface : AsyncExtractorInterface
+ {
+ ///
+ /// Determines if this extractor can extract the given stream based on binary signatures or other criteria.
+ /// This method should check the stream's content (similar to how MiniMagic works) and return true if this
+ /// extractor supports the file format.
+ ///
+ /// The stream to check. The implementation should preserve the stream's original position.
+ /// True if this extractor can handle the stream, false otherwise.
+ bool CanExtract(Stream stream);
+ }
+}
From cf7a481ec5a822805ca7086c9c9aceb2f85dd15a Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 19 Nov 2025 22:47:22 +0000
Subject: [PATCH 3/6] Add documentation for custom extractors to README
Co-authored-by: gfs <98900+gfs@users.noreply.github.com>
---
README.md | 82 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 82 insertions(+)
diff --git a/README.md b/README.md
index fea318c..d020f58 100644
--- a/README.md
+++ b/README.md
@@ -162,6 +162,88 @@ catch(OverflowException)
```
+
+Custom Extractors for Additional File Types
+
+You can extend RecursiveExtractor with custom extractors to support additional archive or file formats not natively supported. This is useful for formats like MSI, MSP, or other proprietary archive formats.
+
+To create a custom extractor, implement the `CustomAsyncExtractorInterface` interface and register it with the extractor:
+
+```csharp
+using Microsoft.CST.RecursiveExtractor;
+using Microsoft.CST.RecursiveExtractor.Extractors;
+using System.IO;
+using System.Collections.Generic;
+using System.Linq;
+
+// Example: Custom extractor for a hypothetical archive format with magic bytes "MYARC"
+public class MyCustomExtractor : CustomAsyncExtractorInterface
+{
+ private readonly Extractor context;
+ private static readonly byte[] MAGIC_BYTES = System.Text.Encoding.ASCII.GetBytes("MYARC");
+
+ public MyCustomExtractor(Extractor ctx)
+ {
+ context = ctx;
+ }
+
+ // Check if this extractor can handle the file based on binary signatures
+ public bool CanExtract(Stream stream)
+ {
+ if (stream == null || !stream.CanRead || !stream.CanSeek || stream.Length < MAGIC_BYTES.Length)
+ {
+ return false;
+ }
+
+ var initialPosition = stream.Position;
+ try
+ {
+ stream.Position = 0;
+ var buffer = new byte[MAGIC_BYTES.Length];
+ var bytesRead = stream.Read(buffer, 0, MAGIC_BYTES.Length);
+
+ return bytesRead == MAGIC_BYTES.Length && buffer.SequenceEqual(MAGIC_BYTES);
+ }
+ finally
+ {
+ // Always restore the original position
+ stream.Position = initialPosition;
+ }
+ }
+
+ // Implement extraction logic
+ public IEnumerable<FileEntry> Extract(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor, bool topLevel = true)
+ {
+ // Your extraction logic here
+ // For example, parse the archive and yield FileEntry objects for each contained file
+ yield break;
+ }
+
+ public async IAsyncEnumerable<FileEntry> ExtractAsync(FileEntry fileEntry, ExtractorOptions options, ResourceGovernor governor, bool topLevel = true)
+ {
+ // Your async extraction logic here
+ yield break;
+ }
+}
+
+// Register the custom extractor
+var extractor = new Extractor();
+var customExtractor = new MyCustomExtractor(extractor);
+extractor.AddCustomExtractor(customExtractor);
+
+// Now the extractor will use your custom extractor for files matching your CanExtract criteria
+var results = extractor.Extract("path/to/custom/archive.myarc");
+```
+
+Key points:
+- The `CanExtract` method should check the stream's binary signature (like MiniMagic does) and return true if this extractor can handle the format
+- Always preserve the stream's original position in `CanExtract`
+- Custom extractors are only checked when the file type is UNKNOWN (not recognized by built-in extractors)
+- Multiple custom extractors can be registered; they are checked in the order they were added
+- Custom extractors are invoked for both synchronous and asynchronous extraction paths
+
+
+
## Exceptions
RecursiveExtractor protects against [ZipSlip](https://snyk.io/research/zip-slip-vulnerability), [Quines, and Zip Bombs](https://en.wikipedia.org/wiki/Zip_bomb).
Calls to Extract will throw an `OverflowException` when a Quine or Zip bomb is detected and a `TimeOutException` if `EnableTiming` is set and the specified time period has elapsed before completion.
From be008825160ea790e1950b6739f73e766c83ad19 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 20 Nov 2025 16:57:47 +0000
Subject: [PATCH 4/6] Make CustomExtractors property internal per code review
feedback
Co-authored-by: gfs <98900+gfs@users.noreply.github.com>
---
RecursiveExtractor/Extractor.cs | 2 +-
RecursiveExtractor/RecursiveExtractor.csproj | 6 ++++++
2 files changed, 7 insertions(+), 1 deletion(-)
diff --git a/RecursiveExtractor/Extractor.cs b/RecursiveExtractor/Extractor.cs
index e0b2979..700ca2a 100644
--- a/RecursiveExtractor/Extractor.cs
+++ b/RecursiveExtractor/Extractor.cs
@@ -45,7 +45,7 @@ public Extractor()
/// Collection of custom extractors that can handle file types not natively supported.
/// These are checked when a file type is detected as UNKNOWN.
///
- public HashSet CustomExtractors { get; } = new HashSet();
+ internal HashSet CustomExtractors { get; } = new HashSet();
///
/// Set up the Default Extractors compatible with this platform.
diff --git a/RecursiveExtractor/RecursiveExtractor.csproj b/RecursiveExtractor/RecursiveExtractor.csproj
index 09b0c90..9fdec44 100644
--- a/RecursiveExtractor/RecursiveExtractor.csproj
+++ b/RecursiveExtractor/RecursiveExtractor.csproj
@@ -24,6 +24,12 @@
snupkg
+
+
+ <_Parameter1>RecursiveExtractor.Tests
+
+
+
From 387d57981b5562629c69dfe6e8df2e56316323d0 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 21 Nov 2025 19:06:06 +0000
Subject: [PATCH 5/6] Rename CustomAsyncExtractorInterface to
ICustomAsyncExtractor per code review
Co-authored-by: gfs <98900+gfs@users.noreply.github.com>
---
README.md | 4 ++--
.../ExtractorTests/CustomExtractorTests.cs | 4 ++--
RecursiveExtractor/Extractor.cs | 8 ++++----
...syncExtractorInterface.cs => ICustomAsyncExtractor.cs} | 2 +-
4 files changed, 9 insertions(+), 9 deletions(-)
rename RecursiveExtractor/Extractors/{CustomAsyncExtractorInterface.cs => ICustomAsyncExtractor.cs} (92%)
diff --git a/README.md b/README.md
index d020f58..7c66029 100644
--- a/README.md
+++ b/README.md
@@ -167,7 +167,7 @@ catch(OverflowException)
You can extend RecursiveExtractor with custom extractors to support additional archive or file formats not natively supported. This is useful for formats like MSI, MSP, or other proprietary archive formats.
-To create a custom extractor, implement the `CustomAsyncExtractorInterface` interface and register it with the extractor:
+To create a custom extractor, implement the `ICustomAsyncExtractor` interface and register it with the extractor:
```csharp
using Microsoft.CST.RecursiveExtractor;
@@ -177,7 +177,7 @@ using System.Collections.Generic;
using System.Linq;
// Example: Custom extractor for a hypothetical archive format with magic bytes "MYARC"
-public class MyCustomExtractor : CustomAsyncExtractorInterface
+public class MyCustomExtractor : ICustomAsyncExtractor
{
private readonly Extractor context;
private static readonly byte[] MAGIC_BYTES = System.Text.Encoding.ASCII.GetBytes("MYARC");
diff --git a/RecursiveExtractor.Tests/ExtractorTests/CustomExtractorTests.cs b/RecursiveExtractor.Tests/ExtractorTests/CustomExtractorTests.cs
index 701a918..cc3e289 100644
--- a/RecursiveExtractor.Tests/ExtractorTests/CustomExtractorTests.cs
+++ b/RecursiveExtractor.Tests/ExtractorTests/CustomExtractorTests.cs
@@ -16,7 +16,7 @@ public class CustomExtractorTests
/// A simple test custom extractor that extracts files with a specific magic number
/// For testing purposes, it recognizes files starting with "CUSTOM1"
///
- private class TestCustomExtractor : CustomAsyncExtractorInterface
+ private class TestCustomExtractor : ICustomAsyncExtractor
{
private readonly Extractor context;
private static readonly byte[] MAGIC_BYTES = System.Text.Encoding.ASCII.GetBytes("CUSTOM1");
@@ -71,7 +71,7 @@ public async IAsyncEnumerable ExtractAsync(FileEntry fileEntry, Extra
///
/// A second test custom extractor that recognizes files starting with "CUSTOM2"
///
- private class SecondTestCustomExtractor : CustomAsyncExtractorInterface
+ private class SecondTestCustomExtractor : ICustomAsyncExtractor
{
private readonly Extractor context;
private static readonly byte[] MAGIC_BYTES = System.Text.Encoding.ASCII.GetBytes("CUSTOM2");
diff --git a/RecursiveExtractor/Extractor.cs b/RecursiveExtractor/Extractor.cs
index 700ca2a..69b9d7c 100644
--- a/RecursiveExtractor/Extractor.cs
+++ b/RecursiveExtractor/Extractor.cs
@@ -45,7 +45,7 @@ public Extractor()
/// Collection of custom extractors that can handle file types not natively supported.
/// These are checked when a file type is detected as UNKNOWN.
///
- internal HashSet CustomExtractors { get; } = new HashSet();
+ internal HashSet CustomExtractors { get; } = new HashSet();
///
/// Set up the Default Extractors compatible with this platform.
@@ -109,7 +109,7 @@ public void ClearExtractors()
///
/// The custom extractor implementation to add.
/// True if the extractor was added, false if it was already present.
- public bool AddCustomExtractor(CustomAsyncExtractorInterface customExtractor)
+ public bool AddCustomExtractor(ICustomAsyncExtractor customExtractor)
{
if (customExtractor == null)
{
@@ -123,7 +123,7 @@ public bool AddCustomExtractor(CustomAsyncExtractorInterface customExtractor)
///
/// The custom extractor to remove.
/// True if the extractor was removed, false if it was not found.
- public bool RemoveCustomExtractor(CustomAsyncExtractorInterface customExtractor)
+ public bool RemoveCustomExtractor(ICustomAsyncExtractor customExtractor)
{
if (customExtractor == null)
{
@@ -356,7 +356,7 @@ public async IAsyncEnumerable ExtractAsync(string filename, byte[] ar
///
/// The file entry to check.
/// A custom extractor that can handle the file, or null if none found.
- private CustomAsyncExtractorInterface? FindMatchingCustomExtractor(FileEntry fileEntry)
+ private ICustomAsyncExtractor? FindMatchingCustomExtractor(FileEntry fileEntry)
{
foreach (var customExtractor in CustomExtractors)
{
diff --git a/RecursiveExtractor/Extractors/CustomAsyncExtractorInterface.cs b/RecursiveExtractor/Extractors/ICustomAsyncExtractor.cs
similarity index 92%
rename from RecursiveExtractor/Extractors/CustomAsyncExtractorInterface.cs
rename to RecursiveExtractor/Extractors/ICustomAsyncExtractor.cs
index d4ccd19..64f2dc0 100644
--- a/RecursiveExtractor/Extractors/CustomAsyncExtractorInterface.cs
+++ b/RecursiveExtractor/Extractors/ICustomAsyncExtractor.cs
@@ -6,7 +6,7 @@ namespace Microsoft.CST.RecursiveExtractor.Extractors
/// An interface for custom extractors that can determine if they can handle a given stream.
/// This allows library users to extend the extractor with support for additional archive types.
///
- public interface CustomAsyncExtractorInterface : AsyncExtractorInterface
+ public interface ICustomAsyncExtractor : AsyncExtractorInterface
{
///
/// Determines if this extractor can extract the given stream based on binary signatures or other criteria.
From 199d46abe77031526c4cedb07a069cd123146e59 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 21 Nov 2025 19:15:07 +0000
Subject: [PATCH 6/6] Change CustomExtractors to ICollection and use
constructor-based injection
Co-authored-by: gfs <98900+gfs@users.noreply.github.com>
---
README.md | 10 +-
.../ExtractorTests/CustomExtractorTests.cs | 100 +++++-------------
RecursiveExtractor/Extractor.cs | 57 ++++------
3 files changed, 51 insertions(+), 116 deletions(-)
diff --git a/README.md b/README.md
index 7c66029..65cccf5 100644
--- a/README.md
+++ b/README.md
@@ -226,10 +226,9 @@ public class MyCustomExtractor : ICustomAsyncExtractor
}
}
-// Register the custom extractor
-var extractor = new Extractor();
-var customExtractor = new MyCustomExtractor(extractor);
-extractor.AddCustomExtractor(customExtractor);
+// Register the custom extractor via constructor
+var customExtractor = new MyCustomExtractor(null);
+var extractor = new Extractor(new[] { customExtractor });
// Now the extractor will use your custom extractor for files matching your CanExtract criteria
var results = extractor.Extract("path/to/custom/archive.myarc");
@@ -238,8 +237,9 @@ var results = extractor.Extract("path/to/custom/archive.myarc");
Key points:
- The `CanExtract` method should check the stream's binary signature (like MiniMagic does) and return true if this extractor can handle the format
- Always preserve the stream's original position in `CanExtract`
+- Custom extractors are provided via the constructor as an `IEnumerable<ICustomAsyncExtractor>`
- Custom extractors are only checked when the file type is UNKNOWN (not recognized by built-in extractors)
-- Multiple custom extractors can be registered; they are checked in the order they were added
+- Multiple custom extractors can be registered; they are checked in the order provided
- Custom extractors are invoked for both synchronous and asynchronous extraction paths
diff --git a/RecursiveExtractor.Tests/ExtractorTests/CustomExtractorTests.cs b/RecursiveExtractor.Tests/ExtractorTests/CustomExtractorTests.cs
index cc3e289..547ba87 100644
--- a/RecursiveExtractor.Tests/ExtractorTests/CustomExtractorTests.cs
+++ b/RecursiveExtractor.Tests/ExtractorTests/CustomExtractorTests.cs
@@ -122,81 +122,37 @@ public async IAsyncEnumerable ExtractAsync(FileEntry fileEntry, Extra
}
[TestMethod]
- public void AddCustomExtractor_ValidExtractor_ReturnsTrue()
+ public void Constructor_WithCustomExtractors_RegistersExtractors()
{
- var extractor = new Extractor();
- var customExtractor = new TestCustomExtractor(extractor);
-
- var result = extractor.AddCustomExtractor(customExtractor);
-
- Assert.IsTrue(result);
- Assert.AreEqual(1, extractor.CustomExtractors.Count);
- }
-
- [TestMethod]
- public void AddCustomExtractor_DuplicateExtractor_ReturnsFalse()
- {
- var extractor = new Extractor();
- var customExtractor = new TestCustomExtractor(extractor);
-
- extractor.AddCustomExtractor(customExtractor);
- var result = extractor.AddCustomExtractor(customExtractor);
+ var customExtractor = new TestCustomExtractor(null!);
+ var extractor = new Extractor(new[] { customExtractor });
- Assert.IsFalse(result);
Assert.AreEqual(1, extractor.CustomExtractors.Count);
}
[TestMethod]
- public void AddCustomExtractor_NullExtractor_ThrowsArgumentNullException()
+ public void Constructor_WithMultipleCustomExtractors_RegistersAll()
{
- var extractor = new Extractor();
-
- Assert.ThrowsException<ArgumentNullException>(() => extractor.AddCustomExtractor(null!));
- }
-
- [TestMethod]
- public void RemoveCustomExtractor_ExistingExtractor_ReturnsTrue()
- {
- var extractor = new Extractor();
- var customExtractor = new TestCustomExtractor(extractor);
- extractor.AddCustomExtractor(customExtractor);
-
- var result = extractor.RemoveCustomExtractor(customExtractor);
+ var customExtractor1 = new TestCustomExtractor(null!);
+ var customExtractor2 = new SecondTestCustomExtractor(null!);
+ var extractor = new Extractor(new ICustomAsyncExtractor[] { customExtractor1, customExtractor2 });
- Assert.IsTrue(result);
- Assert.AreEqual(0, extractor.CustomExtractors.Count);
- }
-
- [TestMethod]
- public void RemoveCustomExtractor_NonExistentExtractor_ReturnsFalse()
- {
- var extractor = new Extractor();
- var customExtractor = new TestCustomExtractor(extractor);
-
- var result = extractor.RemoveCustomExtractor(customExtractor);
-
- Assert.IsFalse(result);
- Assert.AreEqual(0, extractor.CustomExtractors.Count);
+ Assert.AreEqual(2, extractor.CustomExtractors.Count);
}
[TestMethod]
- public void RemoveCustomExtractor_NullExtractor_ThrowsArgumentNullException()
+ public void Constructor_WithNullInCollection_IgnoresNull()
{
- var extractor = new Extractor();
+ var customExtractor = new TestCustomExtractor(null!);
+ var extractor = new Extractor(new ICustomAsyncExtractor[] { customExtractor, null! });
- Assert.ThrowsException<ArgumentNullException>(() => extractor.RemoveCustomExtractor(null!));
+ Assert.AreEqual(1, extractor.CustomExtractors.Count);
}
[TestMethod]
- public void ClearCustomExtractors_RemovesAllExtractors()
+ public void Constructor_WithNullCollection_CreatesEmptyExtractor()
{
- var extractor = new Extractor();
- extractor.AddCustomExtractor(new TestCustomExtractor(extractor));
- extractor.AddCustomExtractor(new SecondTestCustomExtractor(extractor));
-
- Assert.AreEqual(2, extractor.CustomExtractors.Count);
-
- extractor.ClearCustomExtractors();
+ var extractor = new Extractor((IEnumerable<ICustomAsyncExtractor>)null!);
Assert.AreEqual(0, extractor.CustomExtractors.Count);
}
@@ -204,9 +160,8 @@ public void ClearCustomExtractors_RemovesAllExtractors()
[TestMethod]
public void Extract_WithMatchingCustomExtractor_UsesCustomExtractor()
{
- var extractor = new Extractor();
- var customExtractor = new TestCustomExtractor(extractor);
- extractor.AddCustomExtractor(customExtractor);
+ var customExtractor = new TestCustomExtractor(null!);
+ var extractor = new Extractor(new[] { customExtractor });
// Create a test file with the custom magic bytes
var testData = System.Text.Encoding.ASCII.GetBytes("CUSTOM1 This is test data");
@@ -225,9 +180,8 @@ public void Extract_WithMatchingCustomExtractor_UsesCustomExtractor()
[TestMethod]
public async Task ExtractAsync_WithMatchingCustomExtractor_UsesCustomExtractor()
{
- var extractor = new Extractor();
- var customExtractor = new TestCustomExtractor(extractor);
- extractor.AddCustomExtractor(customExtractor);
+ var customExtractor = new TestCustomExtractor(null!);
+ var extractor = new Extractor(new[] { customExtractor });
// Create a test file with the custom magic bytes
var testData = System.Text.Encoding.ASCII.GetBytes("CUSTOM1 This is test data");
@@ -246,9 +200,8 @@ public async Task ExtractAsync_WithMatchingCustomExtractor_UsesCustomExtractor()
[TestMethod]
public void Extract_WithoutMatchingCustomExtractor_ReturnsOriginalFile()
{
- var extractor = new Extractor();
- var customExtractor = new TestCustomExtractor(extractor);
- extractor.AddCustomExtractor(customExtractor);
+ var customExtractor = new TestCustomExtractor(null!);
+ var extractor = new Extractor(new[] { customExtractor });
// Create a test file that doesn't match the custom magic bytes
var testData = System.Text.Encoding.ASCII.GetBytes("NOTCUSTOM This is test data");
@@ -268,9 +221,11 @@ public void Extract_WithoutMatchingCustomExtractor_ReturnsOriginalFile()
[TestMethod]
public void Extract_MultipleCustomExtractors_UsesCorrectOne()
{
- var extractor = new Extractor();
- extractor.AddCustomExtractor(new TestCustomExtractor(extractor));
- extractor.AddCustomExtractor(new SecondTestCustomExtractor(extractor));
+ var extractor = new Extractor(new ICustomAsyncExtractor[]
+ {
+ new TestCustomExtractor(null!),
+ new SecondTestCustomExtractor(null!)
+ });
// Test with first custom format
var testData1 = System.Text.Encoding.ASCII.GetBytes("CUSTOM1 data");
@@ -302,9 +257,8 @@ public void Extract_NoCustomExtractors_ReturnsOriginalFile()
[TestMethod]
public void Extract_CustomExtractorForKnownFormat_UsesBuiltInExtractor()
{
- var extractor = new Extractor();
- var customExtractor = new TestCustomExtractor(extractor);
- extractor.AddCustomExtractor(customExtractor);
+ var customExtractor = new TestCustomExtractor(null!);
+ var extractor = new Extractor(new[] { customExtractor });
// Test with a real ZIP file - should use built-in extractor, not custom
var path = Path.Combine(Directory.GetCurrentDirectory(), "TestData", "TestDataArchives", "EmptyFile.txt.zip");
diff --git a/RecursiveExtractor/Extractor.cs b/RecursiveExtractor/Extractor.cs
index 69b9d7c..15dafc5 100644
--- a/RecursiveExtractor/Extractor.cs
+++ b/RecursiveExtractor/Extractor.cs
@@ -39,13 +39,31 @@ public Extractor()
SetDefaultExtractors();
}
+ /// <summary>
+ /// Instantiate an extractor with the default extractors and custom extractors.
+ /// </summary>
+ /// <param name="customExtractors">Custom extractors to register for handling file types not natively supported.</param>
+ public Extractor(IEnumerable<ICustomAsyncExtractor> customExtractors) : this()
+ {
+ if (customExtractors != null)
+ {
+ foreach (var extractor in customExtractors)
+ {
+ if (extractor != null)
+ {
+ ((HashSet<ICustomAsyncExtractor>)CustomExtractors).Add(extractor);
+ }
+ }
+ }
+ }
+
internal Dictionary Extractors { get; } = new Dictionary();
/// <summary>
/// Collection of custom extractors that can handle file types not natively supported.
/// These are checked when a file type is detected as UNKNOWN.
/// </summary>
- internal HashSet<ICustomAsyncExtractor> CustomExtractors { get; } = new HashSet<ICustomAsyncExtractor>();
+ internal ICollection<ICustomAsyncExtractor> CustomExtractors { get; } = new HashSet<ICustomAsyncExtractor>();
///
/// Set up the Default Extractors compatible with this platform.
@@ -103,43 +121,6 @@ public void ClearExtractors()
Extractors.Clear();
}
- /// <summary>
- /// Add a custom extractor that can handle file types not natively supported.
- /// Custom extractors are checked in the order they were added when a file type is UNKNOWN.
- /// </summary>
- /// <param name="customExtractor">The custom extractor implementation to add.</param>
- /// <returns>True if the extractor was added, false if it was already present.</returns>
- public bool AddCustomExtractor(ICustomAsyncExtractor customExtractor)
- {
- if (customExtractor == null)
- {
- throw new ArgumentNullException(nameof(customExtractor));
- }
- return CustomExtractors.Add(customExtractor);
- }
-
- /// <summary>
- /// Remove a custom extractor.
- /// </summary>
- /// <param name="customExtractor">The custom extractor to remove.</param>
- /// <returns>True if the extractor was removed, false if it was not found.</returns>
- public bool RemoveCustomExtractor(ICustomAsyncExtractor customExtractor)
- {
- if (customExtractor == null)
- {
- throw new ArgumentNullException(nameof(customExtractor));
- }
- return CustomExtractors.Remove(customExtractor);
- }
-
- /// <summary>
- /// Remove all custom extractors.
- /// </summary>
- public void ClearCustomExtractors()
- {
- CustomExtractors.Clear();
- }
-
///
/// Check if the two files are identical (i.e. Extraction is a quine)
///