Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -58,23 +58,42 @@ class AzFileAttributes implements BasicFileAttributes {
objectId = "/${client.containerName}/${client.blobName}"
creationTime = time(props.getCreationTime())
updateTime = time(props.getLastModified())
directory = client.blobName.endsWith('/')
size = props.getBlobSize()

// Support for Azure Data Lake Storage Gen2 with hierarchical namespace enabled

// Determine if this is a directory using metadata only (most reliable):
final meta = props.getMetadata()
if( meta.containsKey("hdi_isfolder") && size == 0 ){
directory = meta.get("hdi_isfolder")
if( meta != null && meta.containsKey("hdi_isfolder") && meta.get("hdi_isfolder") == "true" ){
directory = true
size = 0
}
else {
// Without metadata, default to treating as file
// This aligns with Azure SDK's approach where explicit directory markers are required
directory = false
size = props.getBlobSize()
}
}

AzFileAttributes(String containerName, BlobItem item) {
objectId = "/${containerName}/${item.name}"
directory = item.name.endsWith('/')
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@bentsherman what will break if we incorrectly classify something as a file when it's a pseudo-directory? What's the impact of getting it wrong?

if( !directory ) {
creationTime = time(item.properties.getCreationTime())
updateTime = time(item.properties.getLastModified())
size = item.properties.getContentLength()

// Determine if this is a directory using reliable methods only:
// 1. Check if it's marked as a prefix (virtual directory) - Most reliable
if( item.isPrefix() != null && item.isPrefix() ) {
directory = true
// Virtual directories don't have properties like creation time
size = 0
}
// 2. Check metadata for hierarchical namespace (ADLS Gen2)
else if( item.getMetadata() != null && item.getMetadata().containsKey("hdi_isfolder") && item.getMetadata().get("hdi_isfolder") == "true" ) {
directory = true
size = 0
}
// 3. Default: treat as file
else {
directory = false
creationTime = time(item.getProperties().getCreationTime())
updateTime = time(item.getProperties().getLastModified())
size = item.getProperties().getContentLength()
}
}

Expand All @@ -92,9 +111,41 @@ class AzFileAttributes implements BasicFileAttributes {

protected AzFileAttributes(BlobContainerClient client, String blobName) {
objectId = "/$client.blobContainerName/$blobName"
directory = blobName.endsWith('/')

if (blobName.endsWith('/')) {
directory = true
size = 0
return
}

def blobClient = client.getBlobClient(blobName)
if (blobClient.exists()) {
def props = blobClient.getProperties()
def metadata = props.getMetadata()

creationTime = time(props.getCreationTime())
updateTime = time(props.getLastModified())

// Determine directory status using multiple indicators
def blobSize = props.getBlobSize()
def hasHdiFolderMetadata = metadata != null && metadata.containsKey("hdi_isfolder") && metadata.get("hdi_isfolder") == "true"
def isZeroByteBlob = blobSize == 0

// Check for directory indicators in order of reliability
directory = hasHdiFolderMetadata || (isZeroByteBlob && hasChildrenInStorage(client, blobName))
size = directory ? 0 : blobSize
} else {
// Virtual directory - check if it has children
directory = hasChildrenInStorage(client, blobName)
size = 0
}
}

private static boolean hasChildrenInStorage(BlobContainerClient client, String blobName) {
def prefix = blobName.endsWith('/') ? blobName : blobName + '/'
def opts = new com.azure.storage.blob.models.ListBlobsOptions().setPrefix(prefix).setMaxResultsPerPage(1)
return client.listBlobs(opts, null).stream().findFirst().isPresent()
}

static protected FileTime time(Long millis) {
millis ? FileTime.from(millis, TimeUnit.MILLISECONDS) : null
Expand Down
75 changes: 64 additions & 11 deletions plugins/nf-azure/src/main/nextflow/cloud/azure/nio/AzPath.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import java.nio.file.WatchService
import com.azure.storage.blob.BlobClient
import com.azure.storage.blob.BlobContainerClient
import com.azure.storage.blob.models.BlobItem
import com.azure.storage.blob.models.ListBlobsOptions
import groovy.transform.CompileStatic
import groovy.transform.EqualsAndHashCode
import groovy.transform.PackageScope
Expand Down Expand Up @@ -94,7 +95,34 @@ class AzPath implements Path {
}

boolean isDirectory() {
return directory
if (directory) {
return true
}

def blobNameStr = blobName()
if (blobNameStr) {
def containerClient = containerClient()
def blobClient = containerClient.getBlobClient(blobNameStr)

if (blobClient.exists()) {
def props = blobClient.getProperties()
def metadata = props.getMetadata()
def hasHdiFolderMetadata = metadata != null && metadata.containsKey("hdi_isfolder") && metadata.get("hdi_isfolder") == "true"
def isZeroByteBlob = props.getBlobSize() == 0

return hasHdiFolderMetadata || (isZeroByteBlob && hasChildrenInStorage(containerClient, blobNameStr))
} else {
return hasChildrenInStorage(containerClient, blobNameStr)
}
}

return false
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Azure API Calls Cause Performance Issues

The isDirectory() method now makes multiple uncached Azure API calls (exists(), getProperties(), listBlobs()) on every invocation when the directory field is false. This causes a significant performance regression. It also has a logic flaw that can lead to incorrect directory detection and lacks exception handling for these network calls.

Fix in Cursor Fix in Web

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Trailing Slash Bug Affects Directory Detection

The AzPath constructor's handling of trailing slashes causes isDirectory() to return early, bypassing its new Azure-querying logic for directory detection. This also creates unreachable code in the AzFileAttributes constructor and leads to conflicting test expectations regarding paths ending with a slash.

Additional Locations (4)

Fix in Cursor Fix in Web

}

private boolean hasChildrenInStorage(BlobContainerClient containerClient, String blobNameStr) {
def prefix = blobNameStr.endsWith('/') ? blobNameStr : blobNameStr + '/'
def opts = new com.azure.storage.blob.models.ListBlobsOptions().setPrefix(prefix).setMaxResultsPerPage(1)
return containerClient.listBlobs(opts, null).stream().findFirst().isPresent()
}

String checkContainerName() {
Expand Down Expand Up @@ -214,8 +242,17 @@ class AzPath implements Path {

@Override
AzPath resolve(String other) {
if( other.startsWith('/') )
return (AzPath)fs.provider().getPath(new URI("$AzFileSystemProvider.SCHEME:/$other"))
if( other.startsWith('/') ) {
// For absolute paths, throw exception if cross-container rather than creating invalid paths
def otherPath = Paths.get(other)
if( otherPath.isAbsolute() && otherPath.nameCount > 0 ) {
def otherContainer = otherPath.getName(0).toString()
if( otherContainer != fs.containerName ) {
throw new IllegalArgumentException("Cannot resolve cross-container path `$other` from container `${fs.containerName}`")
}
}
return new AzPath(fs, other)
}

def dir = other.endsWith('/')
def newPath = path.resolve(other)
Expand All @@ -229,19 +266,35 @@ class AzPath implements Path {

final that = (AzPath)other
def newPath = path.resolveSibling(that.path)
if( newPath.isAbsolute() )
fs.getPath(newPath.toString())
else
new AzPath(fs, newPath, false)
if( newPath.isAbsolute() ) {
// Check for cross-container paths and throw exception instead of session context error
if( newPath.nameCount > 0 ) {
def otherContainer = newPath.getName(0).toString()
if( otherContainer != fs.containerName ) {
throw new IllegalArgumentException("Cannot resolve cross-container path `${newPath}` from container `${fs.containerName}`")
}
}
return new AzPath(fs, newPath.toString())
} else {
return new AzPath(fs, newPath, false)
}
}

@Override
Path resolveSibling(String other) {
def newPath = path.resolveSibling(other)
if( newPath.isAbsolute() )
fs.getPath(newPath.toString())
else
new AzPath(fs, newPath, false)
if( newPath.isAbsolute() ) {
// Check for cross-container paths and throw exception instead of session context error
if( newPath.nameCount > 0 ) {
def otherContainer = newPath.getName(0).toString()
if( otherContainer != fs.containerName ) {
throw new IllegalArgumentException("Cannot resolve cross-container path `${newPath}` from container `${fs.containerName}`")
}
}
return new AzPath(fs, newPath.toString())
} else {
return new AzPath(fs, newPath, false)
}
}

@Override
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
package nextflow.cloud.azure.nio

import com.azure.storage.blob.BlobContainerClient
import spock.lang.Specification
import spock.lang.Unroll

/**
* Unit tests for AzFileAttributes class
*/
class AzFileAttributesTest extends Specification {

def 'should create root attributes correctly'() {
when:
def attrs = AzFileAttributes.root()

then:
attrs.isDirectory()
!attrs.isRegularFile()
attrs.size() == 0
attrs.fileKey() == '/'
}

@Unroll
def 'should validate directory detection with blobName: #blobName'() {
when:
def attrs = new AzFileAttributes()
attrs.objectId = "/test-container/$blobName"
attrs.directory = expectedDirectory
attrs.size = expectedDirectory ? 0 : 100

then:
attrs.isDirectory() == expectedDirectory
attrs.isRegularFile() != expectedDirectory
attrs.fileKey().endsWith("/$blobName")

where:
blobName | expectedDirectory | comment
'problematic-file.txt/' | true | 'Path with trailing slash is directory'
'directory/' | true | 'Path with trailing slash is directory'
'file.log/' | true | 'Path with trailing slash is directory'
'path/to/file.dat/' | true | 'Path with trailing slash is directory'
'/' | true | 'Root slash is directory'
'multiple///' | true | 'Path ending with slashes is directory'
'has.extension.txt/' | true | 'Path with slash is directory regardless of extension'
'log.2024-01-01.txt/' | true | 'Path with slash is directory regardless of extension'
}



def 'should verify equality and hashCode methods work correctly'() {
given:
def attrs1 = AzFileAttributes.root()
def attrs2 = AzFileAttributes.root()

when:
def equals1 = attrs1.equals(attrs2)
def hash1 = attrs1.hashCode()
def hash2 = attrs2.hashCode()

then:
equals1 == true
hash1 == hash2
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -928,4 +928,51 @@ class AzNioTest extends Specification implements AzBaseSpec {
deleteBucket(bucket1)
}

def 'should detect directory with hdi_isfolder metadata' () {
given:
def bucketName = createBucket()
def dirPath = "$bucketName/test-dir"

when:
// Create a directory marker with hdi_isfolder metadata
def containerClient = storageClient.getBlobContainerClient(bucketName)
def blobClient = containerClient.getBlobClient("test-dir/")
blobClient.upload(new ByteArrayInputStream(new byte[0]), 0)
blobClient.setMetadata(['hdi_isfolder': 'true'])

and:
def path = Paths.get(new URI("az://$dirPath/"))
def attrs = Files.readAttributes(path, BasicFileAttributes)

then:
attrs.isDirectory()
!attrs.isRegularFile()

cleanup:
deleteBucket(bucketName)
}

def 'should not treat file with trailing slash as directory without metadata' () {
given:
def bucketName = createBucket()

when:
// Create a file with trailing slash but no directory metadata
def containerClient = storageClient.getBlobContainerClient(bucketName)
def blobClient = containerClient.getBlobClient("test-file/")
blobClient.upload(new ByteArrayInputStream("content".bytes), "content".length())

and:
def path = Paths.get(new URI("az://$bucketName/test-file/"))
def attrs = Files.readAttributes(path, BasicFileAttributes)

then:
// Without metadata or isPrefix, it should be treated as a file
attrs.isRegularFile()
!attrs.isDirectory()

cleanup:
deleteBucket(bucketName)
}

}
Loading