Skip to content

Commit

Permalink
vfs: fix unicode normalization on macOS - fixes rclone#7072
Browse files Browse the repository at this point in the history
Before this change, the VFS layer did not properly handle unicode normalization,
which caused problems particularly for users of macOS. While attempts were made
to handle it with various `-o modules=iconv` combinations, this was an imperfect
solution, as no one combination allowed both NFC and NFD content to
simultaneously be both visible and editable via Finder.

After this change, the VFS supports `--no-unicode-normalization` (default `false`)
via the existing `--vfs-case-insensitive` logic, which is extended to apply to both
case insensitivity and unicode normalization form.

This change also adds a flag, `--vfs-block-norm-dupes`, to address a rare but
possible scenario where a directory contains filenames that become duplicates
of one another after applying case and unicode normalization
settings. In such a scenario, this flag (disabled by default) hides the
duplicates. This comes with a performance tradeoff, as rclone will have to scan
the entire directory for duplicates when listing a directory. For this reason,
it is recommended to leave this disabled if not needed. However, macOS users may
wish to consider using it, as otherwise, if a remote directory contains both NFC
and NFD versions of the same filename, an odd situation will occur: both
versions of the file will be visible in the mount, and both will appear to be
editable, however, editing either version will actually result in only the NFD
version getting edited under the hood. `--vfs-block-norm-dupes` prevents this
confusion by detecting this scenario, hiding the duplicates, and logging an
error, similar to how this is handled in `rclone sync`.
  • Loading branch information
nielash committed Mar 4, 2024
1 parent 91b54aa commit 77c8596
Show file tree
Hide file tree
Showing 7 changed files with 108 additions and 24 deletions.
7 changes: 0 additions & 7 deletions cmd/cmount/mount.go
Expand Up @@ -116,13 +116,6 @@ func mountOptions(VFS *vfs.VFS, device string, mountpoint string, opt *mountlib.
for _, option := range opt.ExtraFlags {
options = append(options, option)
}
if runtime.GOOS == "darwin" {
if !findOption("modules=iconv", options) {
iconv := "modules=iconv,from_code=UTF-8,to_code=UTF-8-MAC"
options = append(options, "-o", iconv)
fs.Debugf(nil, "Adding \"-o %s\" for macOS", iconv)
}
}
return options
}

Expand Down
16 changes: 6 additions & 10 deletions cmd/mountlib/mount.md
Expand Up @@ -257,7 +257,12 @@ Mounting on macOS can be done either via [built-in NFS server](/commands/rclone_
FUSE driver utilizing a macOS kernel extension (kext). FUSE-T is an alternative FUSE system
which "mounts" via an NFSv4 local server.

## NFS mount
##### Unicode Normalization

It is highly recommended to keep the default of `--no-unicode-normalization=false`
for all `mount` and `serve` commands on macOS. For details, see [vfs-case-sensitivity](https://rclone.org/commands/rclone_mount/#vfs-case-sensitivity).

#### NFS mount

This method spins up an NFS server using [serve nfs](/commands/rclone_serve_nfs/) command and mounts
it to the specified mountpoint. If you run this in background mode using |--daemon|, you will need to
Expand Down Expand Up @@ -290,15 +295,6 @@ As per the [FUSE-T wiki](https://github.com/macos-fuse-t/fuse-t/wiki#caveats):
This means that viewing files with various tools, notably macOS Finder, will cause rclone
to update the modification time of the file. This may make rclone upload a full new copy
of the file.

##### Unicode Normalization

Rclone includes flags for unicode normalization with macFUSE that should be updated
for FUSE-T. See [this forum post](https://forum.rclone.org/t/some-unicode-forms-break-mount-on-macos-with-fuse-t/36403)
and [FUSE-T issue #16](https://github.com/macos-fuse-t/fuse-t/issues/16). The following
flag should be added to the `rclone mount` command.

-o modules=iconv,from_code=UTF-8,to_code=UTF-8

##### Read Only mounts

Expand Down
50 changes: 43 additions & 7 deletions vfs/dir.go
Expand Up @@ -18,6 +18,7 @@ import (
"github.com/rclone/rclone/fs/operations"
"github.com/rclone/rclone/fs/walk"
"github.com/rclone/rclone/vfs/vfscommon"
"golang.org/x/text/unicode/norm"
)

// Dir represents a directory entry
Expand Down Expand Up @@ -466,7 +467,6 @@ func (d *Dir) AddVirtual(leaf string, size int64, isDir bool) {
node = f
}
d.addObject(node)

}

// delObject removes an object from the directory
Expand Down Expand Up @@ -514,6 +514,38 @@ func (d *Dir) _readDir() error {
return err
}

if d.vfs.Opt.BlockNormDupes { // do this only if requested, as it will have a performance hit
ctx, ci := fs.AddConfig(context.TODO())
if d.vfs.Opt.CaseInsensitive {
ci.IgnoreCaseSync = true
}

// sort entries such that NFD comes before NFC of same name
sort.Slice(entries, func(i, j int) bool {
if entries[i] != entries[j] && fs.DirEntryType(entries[i]) == fs.DirEntryType(entries[j]) && norm.NFC.String(entries[i].Remote()) == norm.NFC.String(entries[j].Remote()) {
if norm.NFD.IsNormalString(entries[i].Remote()) && !norm.NFD.IsNormalString(entries[j].Remote()) {
return true
}
}
return entries.Less(i, j)
})

// detect dupes, remove them from the list and log an error
normalizedNames := make(map[string]struct{}, entries.Len())
filteredEntries := make(fs.DirEntries, 0)
for _, e := range entries {
normName := fmt.Sprintf("%s-%T", operations.ApplyTransforms(ctx, e.Remote()), e) // include type to track objects and dirs separately
_, found := normalizedNames[normName]
if found {
fs.Errorf(e.Remote(), "duplicate normalized names detected - skipping")
continue
}
normalizedNames[normName] = struct{}{}
filteredEntries = append(filteredEntries, e)
}
entries = filteredEntries
}

err = d._readDirFromEntries(entries, nil, time.Time{})
if err != nil {
return err
Expand Down Expand Up @@ -767,15 +799,19 @@ func (d *Dir) stat(leaf string) (Node, error) {
}
item, ok := d.items[leaf]

if !ok && d.vfs.Opt.CaseInsensitive {
leafLower := strings.ToLower(leaf)
ctx, ci := fs.AddConfig(context.TODO())
if !ok && (d.vfs.Opt.CaseInsensitive || ci.IgnoreCaseSync || !ci.NoUnicodeNormalization) {
if d.vfs.Opt.CaseInsensitive {
ci.IgnoreCaseSync = true
}
leafNormalized := operations.ApplyTransforms(ctx, leaf) // this handles both case and unicode normalization
for name, node := range d.items {
if strings.ToLower(name) == leafLower {
if operations.ApplyTransforms(ctx, name) == leafNormalized {
if ok {
// duplicate case insensitive match is an error
return nil, fmt.Errorf("duplicate filename %q detected with --vfs-case-insensitive set", leaf)
// duplicate normalized match is an error
return nil, fmt.Errorf("duplicate filename %q detected with case/unicode normalization settings", leaf)
}
// found a case insensitive match
// found a normalized match
ok = true
item = node
}
Expand Down
22 changes: 22 additions & 0 deletions vfs/vfs.md
Expand Up @@ -309,6 +309,28 @@ If the flag is not provided on the command line, then its default value depends
on the operating system where rclone runs: "true" on Windows and macOS, "false"
otherwise. If the flag is provided without a value, then it is "true".

The `--no-unicode-normalization` flag controls whether a similar "fixup" is
performed for filenames that differ but are [canonically
equivalent](https://en.wikipedia.org/wiki/Unicode_equivalence) with respect to
unicode. Unicode normalization can be particularly helpful for users of macOS,
which prefers form NFD instead of the NFC used by most other platforms. It is
therefore highly recommended to keep the default of `false` on macOS, to avoid
encoding compatibility issues.

In the (probably unlikely) event that a directory has multiple duplicate
filenames after applying case and unicode normalization, the `--vfs-block-norm-dupes`
flag allows hiding these duplicates. This comes with a performance tradeoff, as
rclone will have to scan the entire directory for duplicates when listing a
directory. For this reason, it is recommended to leave this disabled if not
needed. However, macOS users may wish to consider using it, as otherwise, if a
remote directory contains both NFC and NFD versions of the same filename, an odd
situation will occur: both versions of the file will be visible in the mount,
and both will appear to be editable; however, editing either version will
actually result in only the NFD version getting edited under the hood.
`--vfs-block-norm-dupes` prevents this confusion by detecting this scenario,
hiding the duplicates, and logging an error, similar to how this is handled in
`rclone sync`.

### VFS Disk Options

This flag allows you to manually set the statistics about the filing system.
Expand Down
35 changes: 35 additions & 0 deletions vfs/vfs_case_test.go
Expand Up @@ -5,10 +5,12 @@ import (
"os"
"testing"

"github.com/rclone/rclone/fs"
"github.com/rclone/rclone/fstest"
"github.com/rclone/rclone/vfs/vfscommon"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"golang.org/x/text/unicode/norm"
)

func TestCaseSensitivity(t *testing.T) {
Expand Down Expand Up @@ -160,3 +162,36 @@ func assertFileAbsentVFS(t *testing.T, vfs *VFS, name string) {
assert.Error(t, err)
assert.Equal(t, err, ENOENT)
}

// TestUnicodeNormalization verifies that the VFS finds files by names that
// differ only in unicode normalization form, and that this matching stops
// once NoUnicodeNormalization is enabled in the global config.
func TestUnicodeNormalization(t *testing.T) {
	run := fstest.NewRun(t)
	ctx := context.Background()

	// The same non-ASCII name in composed (NFC) and decomposed (NFD) form,
	// plus an ASCII-only name.
	const plainName = "normal name with no special characters.txt"
	composed := norm.NFC.String(norm.NFD.String("測試_Русский___ě_áñ"))
	decomposed := norm.NFD.String(composed)

	// Write the two files to the remote; the non-ASCII one is written under
	// its NFC spelling only.
	plainFile := run.WriteObject(ctx, plainName, "data1", t1)
	composedFile := run.WriteObject(ctx, composed, "data2", t2)
	run.CheckRemoteItems(t, plainFile, composedFile)

	// Create a VFS with the default options on top of the remote.
	opt := vfscommon.DefaultOpt
	v := New(run.Fremote, &opt)
	defer cleanupVFS(t, v)

	// With the default config, looking up the NFD-normalized names must
	// resolve to both files.
	assertFileDataVFS(t, v, norm.NFD.String(plainName), "data1")
	assertFileDataVFS(t, v, decomposed, "data2")

	// The *Dir methods don't take a ctx param, so the setting has to be
	// flipped on the global config; the previous value is put back when the
	// test finishes.
	globalConfig := fs.GetConfig(ctx)
	previous := globalConfig.NoUnicodeNormalization
	defer func() { fs.GetConfig(ctx).NoUnicodeNormalization = previous }()
	globalConfig.NoUnicodeNormalization = true

	// With normalization switched off, only the ASCII-named file is still
	// found; the NFD lookup of the NFC-named file must fail.
	assertFileDataVFS(t, v, norm.NFD.String(plainName), "data1")
	assertFileAbsentVFS(t, v, decomposed)
}
1 change: 1 addition & 0 deletions vfs/vfscommon/options.go
Expand Up @@ -30,6 +30,7 @@ type Options struct {
CacheMinFreeSpace fs.SizeSuffix
CachePollInterval time.Duration
CaseInsensitive bool
BlockNormDupes bool
WriteWait time.Duration // time to wait for in-sequence write
ReadWait time.Duration // time to wait for in-sequence read
WriteBack time.Duration // time to wait before writing back dirty files
Expand Down
1 change: 1 addition & 0 deletions vfs/vfsflags/vfsflags.go
Expand Up @@ -35,6 +35,7 @@ func AddFlags(flagSet *pflag.FlagSet) {
flags.FVarP(flagSet, DirPerms, "dir-perms", "", "Directory permissions", "VFS")
flags.FVarP(flagSet, FilePerms, "file-perms", "", "File permissions", "VFS")
flags.BoolVarP(flagSet, &Opt.CaseInsensitive, "vfs-case-insensitive", "", Opt.CaseInsensitive, "If a file name not found, find a case insensitive match", "VFS")
flags.BoolVarP(flagSet, &Opt.BlockNormDupes, "vfs-block-norm-dupes", "", Opt.BlockNormDupes, "If duplicate filenames exist in the same directory (after normalization), log an error and hide the duplicates (may have a performance cost)", "VFS")
flags.DurationVarP(flagSet, &Opt.WriteWait, "vfs-write-wait", "", Opt.WriteWait, "Time to wait for in-sequence write before giving error", "VFS")
flags.DurationVarP(flagSet, &Opt.ReadWait, "vfs-read-wait", "", Opt.ReadWait, "Time to wait for in-sequence read before seeking", "VFS")
flags.DurationVarP(flagSet, &Opt.WriteBack, "vfs-write-back", "", Opt.WriteBack, "Time to writeback files after last use when using cache", "VFS")
Expand Down

0 comments on commit 77c8596

Please sign in to comment.