Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

curvefs: add a new distributed transaction model to improve rename performance #2884

Merged
merged 2 commits into from
Dec 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions curvefs/conf/client.conf
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,12 @@ fuseClient.getThreadPool=4
# it guarantees the consistency of files after rename, otherwise you should
# disable it for performance.
fuseClient.enableMultiMountPointRename=true

# the rename transaction models are different between version 1 and version 2
# the v2 version greatly improves the performance of rename, especially in concurrent scenarios.
# Note: v1 and v2 are incompatible; a v1 cluster cannot be upgraded directly to v2.
fuseClient.txVersion=1

# splice will bring higher performance in some cases
# but there might be a kernel issue that will cause kernel panic when enabling it
# see https://lore.kernel.org/all/CAAmZXrsGg2xsP1CK+cbuEMumtrqdvD-NKnWzhNcvn71RV3c1yw@mail.gmail.com/
Expand Down
10 changes: 9 additions & 1 deletion curvefs/conf/metaserver.conf
Original file line number Diff line number Diff line change
Expand Up @@ -258,11 +258,17 @@ storage.rocksdb.unordered_write_buffer_size=67108864
# for store inode which exclude its s3chunkinfo list (default: 3)
storage.rocksdb.unordered_max_write_buffer_number=3
# rocksdb column family's write_buffer_size
# for store dentry and inode's s3chunkinfo list (unit: bytes, default: 128MB)
# for store dentry and inode's s3chunkinfo list (unit: bytes, default: 64MB)
storage.rocksdb.ordered_write_buffer_size=67108864
# rocksdb column family's max_write_buffer_number
# for store dentry and inode's s3chunkinfo list (default: 3)
storage.rocksdb.ordered_max_write_buffer_number=3
# rocksdb column family's write_buffer_size
# for store tx lock and write (unit: bytes, default: 64MB)
storage.rocksdb.tx_cf_write_buffer_size=67108864
# rocksdb column family's max_write_buffer_number
# for store tx lock and write (default: 3)
storage.rocksdb.tx_cf_max_write_buffer_number=3
# The target number of write history bytes to hold in memory (default: 20MB)
storage.rocksdb.max_write_buffer_size_to_maintain=20971520
# rocksdb memtable prefix bloom size ratio (size=write_buffer_size*memtable_prefix_bloom_size_ratio)
Expand All @@ -286,6 +292,8 @@ storage.rocksdb.perf_sampling_ratio=0
# we will send them with rpc streaming instead of
# padding them into inode (default: 25000, about 25000 * 41 (byte) = 1MB)
storage.s3_meta_inside_inode.limit_size=25000
# TTL(millisecond) for tx lock
storage.tx_lock_ttl_ms=5000

# recycle options
# metaserver scan recycle period, default 1h
Expand Down
2 changes: 1 addition & 1 deletion curvefs/proto/common.proto
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ message PartitionInfo {
// partition manage inodeid range [start, end]
required uint64 start = 5;
required uint64 end = 6;
required uint64 txId = 7;
optional uint64 txId = 7;
optional uint64 nextId = 8;
// status can change from READWRITE to READONLY, but can not change from READONLY to READWRITE
// READWRITE/READONLY can change to DELETING, but DELETING can not change to READWRITE/READONLY
Expand Down
10 changes: 10 additions & 0 deletions curvefs/proto/mds.proto
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,14 @@ message SetClientMdsAddrsOverrideResponse {
required FSStatusCode statusCode = 1;
}

// Request of the MdsService Tso RPC; it carries no fields.
message TsoRequest {}

// Response of the MdsService Tso RPC.
message TsoResponse {
// Result code of the request.
required FSStatusCode statusCode = 1;
optional uint64 ts = 2; // transaction sequence number
// NOTE(review): presumably the MDS-side clock value taken when ts was
// allocated; the unit is not shown in this file — confirm.
optional uint64 timestamp = 3;
}

service MdsService {
// fs interface
rpc CreateFs(CreateFsRequest) returns (CreateFsResponse);
Expand All @@ -259,6 +267,8 @@ service MdsService {
rpc GetLatestTxId(GetLatestTxIdRequest) returns (GetLatestTxIdResponse);
rpc CommitTx(CommitTxRequest) returns (CommitTxResponse);

rpc Tso(TsoRequest) returns (TsoResponse);

// client lease
rpc RefreshSession(RefreshSessionRequest) returns (RefreshSessionResponse);

Expand Down
106 changes: 102 additions & 4 deletions curvefs/proto/metaserver.proto
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,15 @@ enum MetaStatusCode {
RPC_STREAM_ERROR = 25;
INODE_S3_META_TOO_LARGE = 26;
STORAGE_CLOSED = 27;
// tx v2 related
TX_FAILED = 28;
TX_WRITE_CONFLICT = 29;
TX_KEY_LOCKED = 30;
TX_COMMITTED = 31;
TX_ROLLBACKED = 32;
TX_TIMEOUT = 33;
TX_INPROGRESS = 34;
TX_MISMATCH = 35;
}

// dentry interface
Expand All @@ -59,7 +68,7 @@ message GetDentryRequest {
required uint32 fsId = 4;
required uint64 parentInodeId = 5;
required string name = 6;
required uint64 txId = 7;
optional uint64 txId = 7;
optional uint64 appliedIndex = 8;
}

Expand All @@ -74,7 +83,8 @@ message Dentry {
required uint64 inodeId = 2;
required uint64 parentInodeId = 3;
required string name = 4;
required uint64 txId = 5;
// reused txId as ts in tx v2 for compatibility in metaserver
optional uint64 txId = 5;
optional uint32 flag = 6;
optional FsFileType type = 7;
optional uint64 txSequence = 8;
Expand All @@ -88,6 +98,7 @@ message GetDentryResponse {
required MetaStatusCode statusCode = 1;
optional Dentry dentry = 2;
optional uint64 appliedIndex = 3;
optional TxLock txLock = 4;
}

message ListDentryRequest {
Expand All @@ -96,7 +107,7 @@ message ListDentryRequest {
required uint32 partitionId = 3;
required uint32 fsId = 4;
required uint64 dirInodeId = 5;
required uint64 txId = 6;
optional uint64 txId = 6;
optional string last = 7; // the name of last entry
optional uint32 count = 8; // the number of entry required
optional bool onlyDir = 9;
Expand All @@ -107,6 +118,7 @@ message ListDentryResponse {
required MetaStatusCode statusCode = 1;
repeated Dentry dentrys = 2;
optional uint64 appliedIndex = 3;
optional TxLock txLock = 4;
}

message CreateDentryRequest {
Expand All @@ -120,14 +132,15 @@ message CreateDentryRequest {
// Response of the MetaServerService CreateDentry RPC.
message CreateDentryResponse {
// Result code of the request.
required MetaStatusCode statusCode = 1;
optional uint64 appliedIndex = 2;
// NOTE(review): presumably the tx v2 lock found on the dentry key when the
// request is rejected with a TX_* status — confirm against the metaserver.
optional TxLock txLock = 3;
}

message DeleteDentryRequest {
required uint32 poolId = 1;
required uint32 copysetId = 2;
required uint32 partitionId = 3;
required uint32 fsId = 4;
required uint64 txId = 5;
optional uint64 txId = 5;
required uint64 parentInodeId = 6;
required string name = 7;
optional FsFileType type = 8;
Expand All @@ -137,6 +150,7 @@ message DeleteDentryRequest {
// Response of the MetaServerService DeleteDentry RPC.
message DeleteDentryResponse {
// Result code of the request.
required MetaStatusCode statusCode = 1;
optional uint64 appliedIndex = 2;
// NOTE(review): presumably the tx v2 lock found on the dentry key when the
// request is rejected with a TX_* status — confirm against the metaserver.
optional TxLock txLock = 3;
}

message PrepareRenameTxRequest {
Expand All @@ -160,6 +174,85 @@ message PrepareRenameTxResponse {
optional uint64 appliedIndex = 2;
}

// Lock record of the tx v2 rename model. It is sent in
// PrewriteRenameTxRequest and can be returned in the dentry and prewrite
// responses that declare a txLock field.
message TxLock {
// NOTE(review): presumably the key of the transaction's primary dentry;
// CheckTxStatusRequest carries the same primaryKey/startTs pair — confirm.
required string primaryKey = 1;
// NOTE(review): presumably the start ts of the owning transaction — confirm.
required uint64 startTs = 2;
// NOTE(review): presumably the time the lock was created, used together
// with ttl for expiry; unit not shown here — confirm.
required uint64 timestamp = 3;
// NOTE(review): meaning is not evident from this file — confirm.
optional uint32 index = 4;
// NOTE(review): presumably the lock time-to-live in milliseconds
// (cf. storage.tx_lock_ttl_ms in metaserver.conf) — confirm.
optional int32 ttl = 5;
}

// Kind stored in TxWrite.kind.
// NOTE(review): presumably records whether the transaction was committed or
// rolled back — confirm.
enum TxWriteKind {
Commit = 1;
Rollback = 2;
}

// Wrapper message holding a single uint64 ts value.
// NOTE(review): where this message is used is not visible in this file.
message TS {
required uint64 ts = 1;
}

// Record pairing a transaction's startTs with a TxWriteKind.
// NOTE(review): presumably the "write" record kept next to TxLock in the
// metaserver's tx column family — confirm.
message TxWrite {
required uint64 startTs = 1;
required TxWriteKind kind = 2;
}

// Request of the MetaServerService PrewriteRenameTx RPC (tx v2).
message PrewriteRenameTxRequest {
// poolId, copysetId and partitionId name the partition the request
// is addressed to.
required uint32 poolId = 1;
required uint32 copysetId = 2;
required uint32 partitionId = 3;
// Dentries carried by this prewrite; may be empty on the wire.
repeated Dentry dentrys = 4;
// Lock sent together with the dentries.
required TxLock txLock = 5;
}

// Response of the MetaServerService PrewriteRenameTx RPC (tx v2).
message PrewriteRenameTxResponse {
// Result code of the request.
required MetaStatusCode statusCode = 1;
// NOTE(review): which dentries are returned here is not shown in this
// file — confirm against the metaserver implementation.
repeated Dentry dentrys = 2;
// NOTE(review): presumably the lock that blocked the prewrite (e.g. on
// TX_KEY_LOCKED) — confirm.
optional TxLock txLock = 3;
optional uint64 appliedIndex = 4;
}

// Request of the MetaServerService CheckTxStatus RPC (tx v2).
message CheckTxStatusRequest {
// poolId, copysetId and partitionId name the partition the request
// is addressed to.
required uint32 poolId = 1;
required uint32 copysetId = 2;
required uint32 partitionId = 3;
// primaryKey and startTs have the same names and types as the TxLock fields.
// NOTE(review): presumably copied from the lock being checked — confirm.
required string primaryKey = 4;
required uint64 startTs = 5;
// NOTE(review): presumably the caller's current time, compared with the
// lock's timestamp and ttl to detect expiry — confirm.
required uint64 curTimestamp = 6;
}

// Response of the MetaServerService CheckTxStatus RPC (tx v2).
message CheckTxStatusResponse {
// Result code of the request.
// NOTE(review): presumably one of the TX_* codes describing the
// transaction's state — confirm.
required MetaStatusCode statusCode = 1;
optional uint64 appliedIndex = 2;
}

// Request of the MetaServerService ResolveTxLock RPC (tx v2).
message ResolveTxLockRequest {
// poolId, copysetId and partitionId name the partition the request
// is addressed to.
required uint32 poolId = 1;
required uint32 copysetId = 2;
required uint32 partitionId = 3;
// Dentry whose lock is to be resolved.
required Dentry dentry = 4;
// NOTE(review): presumably the start ts of the transaction that owns the
// lock — confirm.
required uint64 startTs = 5;
// NOTE(review): presumably the commit ts to apply; how a rollback is
// signalled is not shown in this file — confirm.
required uint64 commitTs = 6;
}

// Response of the MetaServerService ResolveTxLock RPC (tx v2).
message ResolveTxLockResponse {
// Result code of the request.
required MetaStatusCode statusCode = 1;
optional uint64 appliedIndex = 2;
}

// Request of the MetaServerService CommitTx RPC (tx v2).
message CommitTxRequest {
// poolId, copysetId and partitionId name the partition the request
// is addressed to.
required uint32 poolId = 1;
required uint32 copysetId = 2;
required uint32 partitionId = 3;
// Dentries covered by this commit; may be empty on the wire.
repeated Dentry dentrys = 4;
// NOTE(review): presumably the start ts used in the matching prewrite —
// confirm.
required uint64 startTs = 5;
// NOTE(review): presumably the commit ts obtained for this transaction —
// confirm.
required uint64 commitTs = 6;
}

// Response of the MetaServerService CommitTx RPC (tx v2).
message CommitTxResponse {
// Result code of the request.
required MetaStatusCode statusCode = 1;
optional uint64 appliedIndex = 2;
}

// inode interface
message GetInodeRequest {
required uint32 poolId = 1;
Expand Down Expand Up @@ -539,6 +632,11 @@ service MetaServerService {
rpc CreateDentry(CreateDentryRequest) returns (CreateDentryResponse);
rpc DeleteDentry(DeleteDentryRequest) returns (DeleteDentryResponse);
rpc PrepareRenameTx(PrepareRenameTxRequest) returns (PrepareRenameTxResponse);
// tx v2
rpc PrewriteRenameTx(PrewriteRenameTxRequest) returns (PrewriteRenameTxResponse);
rpc CheckTxStatus(CheckTxStatusRequest) returns (CheckTxStatusResponse);
rpc ResolveTxLock(ResolveTxLockRequest) returns (ResolveTxLockResponse);
rpc CommitTx(CommitTxRequest) returns (CommitTxResponse);

// inode interface
rpc GetInode(GetInodeRequest) returns (GetInodeResponse);
Expand Down
1 change: 0 additions & 1 deletion curvefs/proto/topology.proto
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@ message ClusterInfoData {
required string clusterId = 1;
// <fsId, partitionIndex of this fs>
map<uint32, uint32> partitionIndexs = 2;

}

message PoolData {
Expand Down
1 change: 1 addition & 0 deletions curvefs/src/client/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ cc_library(
"//curvefs/src/common:metric_utils",
"//curvefs/src/common:dynamic_vlog",
"//curvefs/src/common:threading",
"//curvefs/src/metaserver:metaserver_storage_conv",
"@com_google_absl//absl/memory",
"@com_google_absl//absl/strings",
"@com_google_absl//absl/synchronization",
Expand Down
Loading