-
Notifications
You must be signed in to change notification settings - Fork 735
Determine Azure directories more accurately using metadata and blob properties #6428
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
6cc7cc5
5e0709c
78ff466
7b93b04
4b3971a
3c931af
eb16a3a
4dfec66
b90fab7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -58,23 +58,42 @@ class AzFileAttributes implements BasicFileAttributes { | |
objectId = "/${client.containerName}/${client.blobName}" | ||
creationTime = time(props.getCreationTime()) | ||
updateTime = time(props.getLastModified()) | ||
directory = client.blobName.endsWith('/') | ||
size = props.getBlobSize() | ||
|
||
// Support for Azure Data Lake Storage Gen2 with hierarchical namespace enabled | ||
|
||
// Determine if this is a directory using metadata only (most reliable): | ||
final meta = props.getMetadata() | ||
if( meta.containsKey("hdi_isfolder") && size == 0 ){ | ||
directory = meta.get("hdi_isfolder") | ||
if( meta != null && meta.containsKey("hdi_isfolder") && meta.get("hdi_isfolder") == "true" ){ | ||
directory = true | ||
size = 0 | ||
} | ||
else { | ||
// Without metadata, default to treating as file | ||
// This aligns with Azure SDK's approach where explicit directory markers are required | ||
directory = false | ||
size = props.getBlobSize() | ||
adamrtalbot marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
} | ||
|
||
AzFileAttributes(String containerName, BlobItem item) { | ||
objectId = "/${containerName}/${item.name}" | ||
directory = item.name.endsWith('/') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @bentsherman what will break if we incorrectly classify something as a file when it's a pseudo-directory? What's the impact of getting it wrong? |
||
if( !directory ) { | ||
creationTime = time(item.properties.getCreationTime()) | ||
updateTime = time(item.properties.getLastModified()) | ||
size = item.properties.getContentLength() | ||
|
||
// Determine if this is a directory using reliable methods only: | ||
// 1. Check if it's marked as a prefix (virtual directory) - Most reliable | ||
if( item.isPrefix() != null && item.isPrefix() ) { | ||
directory = true | ||
// Virtual directories don't have properties like creation time | ||
size = 0 | ||
} | ||
// 2. Check metadata for hierarchical namespace (ADLS Gen2) | ||
else if( item.getMetadata() != null && item.getMetadata().containsKey("hdi_isfolder") && item.getMetadata().get("hdi_isfolder") == "true" ) { | ||
directory = true | ||
size = 0 | ||
} | ||
// 3. Default: treat as file | ||
else { | ||
directory = false | ||
creationTime = time(item.getProperties().getCreationTime()) | ||
updateTime = time(item.getProperties().getLastModified()) | ||
size = item.getProperties().getContentLength() | ||
adamrtalbot marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
} | ||
|
||
|
@@ -92,9 +111,41 @@ class AzFileAttributes implements BasicFileAttributes { | |
|
||
protected AzFileAttributes(BlobContainerClient client, String blobName) { | ||
objectId = "/$client.blobContainerName/$blobName" | ||
directory = blobName.endsWith('/') | ||
|
||
if (blobName.endsWith('/')) { | ||
directory = true | ||
size = 0 | ||
return | ||
} | ||
|
||
def blobClient = client.getBlobClient(blobName) | ||
if (blobClient.exists()) { | ||
def props = blobClient.getProperties() | ||
def metadata = props.getMetadata() | ||
|
||
creationTime = time(props.getCreationTime()) | ||
updateTime = time(props.getLastModified()) | ||
|
||
// Determine directory status using multiple indicators | ||
def blobSize = props.getBlobSize() | ||
def hasHdiFolderMetadata = metadata != null && metadata.containsKey("hdi_isfolder") && metadata.get("hdi_isfolder") == "true" | ||
def isZeroByteBlob = blobSize == 0 | ||
|
||
// Check for directory indicators in order of reliability | ||
directory = hasHdiFolderMetadata || (isZeroByteBlob && hasChildrenInStorage(client, blobName)) | ||
size = directory ? 0 : blobSize | ||
} else { | ||
// Virtual directory - check if it has children | ||
directory = hasChildrenInStorage(client, blobName) | ||
size = 0 | ||
} | ||
adamrtalbot marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
|
||
private static boolean hasChildrenInStorage(BlobContainerClient client, String blobName) { | ||
def prefix = blobName.endsWith('/') ? blobName : blobName + '/' | ||
def opts = new com.azure.storage.blob.models.ListBlobsOptions().setPrefix(prefix).setMaxResultsPerPage(1) | ||
return client.listBlobs(opts, null).stream().findFirst().isPresent() | ||
} | ||
|
||
static protected FileTime time(Long millis) { | ||
millis ? FileTime.from(millis, TimeUnit.MILLISECONDS) : null | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -26,6 +26,7 @@ import java.nio.file.WatchService | |
import com.azure.storage.blob.BlobClient | ||
import com.azure.storage.blob.BlobContainerClient | ||
import com.azure.storage.blob.models.BlobItem | ||
import com.azure.storage.blob.models.ListBlobsOptions | ||
import groovy.transform.CompileStatic | ||
import groovy.transform.EqualsAndHashCode | ||
import groovy.transform.PackageScope | ||
|
@@ -94,7 +95,34 @@ class AzPath implements Path { | |
} | ||
|
||
boolean isDirectory() { | ||
return directory | ||
if (directory) { | ||
return true | ||
} | ||
|
||
def blobNameStr = blobName() | ||
if (blobNameStr) { | ||
def containerClient = containerClient() | ||
def blobClient = containerClient.getBlobClient(blobNameStr) | ||
|
||
if (blobClient.exists()) { | ||
def props = blobClient.getProperties() | ||
def metadata = props.getMetadata() | ||
def hasHdiFolderMetadata = metadata != null && metadata.containsKey("hdi_isfolder") && metadata.get("hdi_isfolder") == "true" | ||
def isZeroByteBlob = props.getBlobSize() == 0 | ||
|
||
return hasHdiFolderMetadata || (isZeroByteBlob && hasChildrenInStorage(containerClient, blobNameStr)) | ||
} else { | ||
return hasChildrenInStorage(containerClient, blobNameStr) | ||
} | ||
} | ||
|
||
return false | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Bug: Azure API Calls Cause Performance IssuesThe There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Bug: Trailing Slash Bug Affects Directory DetectionThe Additional Locations (4)
|
||
} | ||
|
||
private boolean hasChildrenInStorage(BlobContainerClient containerClient, String blobNameStr) { | ||
def prefix = blobNameStr.endsWith('/') ? blobNameStr : blobNameStr + '/' | ||
def opts = new com.azure.storage.blob.models.ListBlobsOptions().setPrefix(prefix).setMaxResultsPerPage(1) | ||
return containerClient.listBlobs(opts, null).stream().findFirst().isPresent() | ||
} | ||
|
||
String checkContainerName() { | ||
|
@@ -214,8 +242,17 @@ class AzPath implements Path { | |
|
||
@Override | ||
AzPath resolve(String other) { | ||
if( other.startsWith('/') ) | ||
return (AzPath)fs.provider().getPath(new URI("$AzFileSystemProvider.SCHEME:/$other")) | ||
if( other.startsWith('/') ) { | ||
// For absolute paths, throw exception if cross-container rather than creating invalid paths | ||
def otherPath = Paths.get(other) | ||
if( otherPath.isAbsolute() && otherPath.nameCount > 0 ) { | ||
def otherContainer = otherPath.getName(0).toString() | ||
if( otherContainer != fs.containerName ) { | ||
throw new IllegalArgumentException("Cannot resolve cross-container path `$other` from container `${fs.containerName}`") | ||
} | ||
} | ||
return new AzPath(fs, other) | ||
} | ||
|
||
def dir = other.endsWith('/') | ||
def newPath = path.resolve(other) | ||
|
@@ -229,19 +266,35 @@ class AzPath implements Path { | |
|
||
final that = (AzPath)other | ||
def newPath = path.resolveSibling(that.path) | ||
if( newPath.isAbsolute() ) | ||
fs.getPath(newPath.toString()) | ||
else | ||
new AzPath(fs, newPath, false) | ||
if( newPath.isAbsolute() ) { | ||
// Check for cross-container paths and throw exception instead of session context error | ||
if( newPath.nameCount > 0 ) { | ||
def otherContainer = newPath.getName(0).toString() | ||
if( otherContainer != fs.containerName ) { | ||
throw new IllegalArgumentException("Cannot resolve cross-container path `${newPath}` from container `${fs.containerName}`") | ||
} | ||
} | ||
return new AzPath(fs, newPath.toString()) | ||
} else { | ||
return new AzPath(fs, newPath, false) | ||
} | ||
} | ||
|
||
@Override | ||
Path resolveSibling(String other) { | ||
def newPath = path.resolveSibling(other) | ||
if( newPath.isAbsolute() ) | ||
fs.getPath(newPath.toString()) | ||
else | ||
new AzPath(fs, newPath, false) | ||
if( newPath.isAbsolute() ) { | ||
// Check for cross-container paths and throw exception instead of session context error | ||
if( newPath.nameCount > 0 ) { | ||
def otherContainer = newPath.getName(0).toString() | ||
if( otherContainer != fs.containerName ) { | ||
throw new IllegalArgumentException("Cannot resolve cross-container path `${newPath}` from container `${fs.containerName}`") | ||
} | ||
} | ||
return new AzPath(fs, newPath.toString()) | ||
} else { | ||
return new AzPath(fs, newPath, false) | ||
} | ||
} | ||
|
||
@Override | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
package nextflow.cloud.azure.nio | ||
|
||
import com.azure.storage.blob.BlobContainerClient | ||
import spock.lang.Specification | ||
import spock.lang.Unroll | ||
|
||
/** | ||
* Unit tests for AzFileAttributes class | ||
*/ | ||
class AzFileAttributesTest extends Specification { | ||
|
||
def 'should create root attributes correctly'() { | ||
when: | ||
def attrs = AzFileAttributes.root() | ||
|
||
then: | ||
attrs.isDirectory() | ||
!attrs.isRegularFile() | ||
attrs.size() == 0 | ||
attrs.fileKey() == '/' | ||
} | ||
|
||
@Unroll | ||
def 'should validate directory detection with blobName: #blobName'() { | ||
when: | ||
def attrs = new AzFileAttributes() | ||
attrs.objectId = "/test-container/$blobName" | ||
attrs.directory = expectedDirectory | ||
attrs.size = expectedDirectory ? 0 : 100 | ||
|
||
then: | ||
attrs.isDirectory() == expectedDirectory | ||
attrs.isRegularFile() != expectedDirectory | ||
attrs.fileKey().endsWith("/$blobName") | ||
|
||
where: | ||
blobName | expectedDirectory | comment | ||
'problematic-file.txt/' | true | 'Path with trailing slash is directory' | ||
'directory/' | true | 'Path with trailing slash is directory' | ||
'file.log/' | true | 'Path with trailing slash is directory' | ||
'path/to/file.dat/' | true | 'Path with trailing slash is directory' | ||
'/' | true | 'Root slash is directory' | ||
'multiple///' | true | 'Path ending with slashes is directory' | ||
'has.extension.txt/' | true | 'Path with slash is directory regardless of extension' | ||
'log.2024-01-01.txt/' | true | 'Path with slash is directory regardless of extension' | ||
} | ||
|
||
|
||
|
||
def 'should verify equality and hashCode methods work correctly'() { | ||
given: | ||
def attrs1 = AzFileAttributes.root() | ||
def attrs2 = AzFileAttributes.root() | ||
|
||
when: | ||
def equals1 = attrs1.equals(attrs2) | ||
def hash1 = attrs1.hashCode() | ||
def hash2 = attrs2.hashCode() | ||
|
||
then: | ||
equals1 == true | ||
hash1 == hash2 | ||
} | ||
} |
Uh oh!
There was an error while loading. Please reload this page.