Skip to content

Commit

Permalink
curvefs: add a new distributed transaction model to improve rename pe…
Browse files Browse the repository at this point in the history
…rformance

Signed-off-by: wanghai01 <seanhaizi@163.com>
  • Loading branch information
SeanHai committed Nov 30, 2023
1 parent a2f8403 commit a920cd9
Show file tree
Hide file tree
Showing 93 changed files with 4,370 additions and 1,293 deletions.
6 changes: 6 additions & 0 deletions curvefs/conf/client.conf
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,12 @@ fuseClient.getThreadPool=4
# it guarantees the consistency of files after rename, otherwise you should
# disable it for performance.
fuseClient.enableMultiMountPointRename=true

# the rename transaction models are different between version 1 and version 2
# the v2 version greatly improves the performance of rename, especially in concurrent scenarios.
# Note: v1 and v2 are incompatible; a v1 cluster cannot be upgraded directly to v2.
fuseClient.txVersion=1

# splice will bring higher performance in some cases
# but there might be a kernel issue that will cause kernel panic when enabling it
# see https://lore.kernel.org/all/CAAmZXrsGg2xsP1CK+cbuEMumtrqdvD-NKnWzhNcvn71RV3c1yw@mail.gmail.com/
Expand Down
10 changes: 9 additions & 1 deletion curvefs/conf/metaserver.conf
Original file line number Diff line number Diff line change
Expand Up @@ -258,11 +258,17 @@ storage.rocksdb.unordered_write_buffer_size=67108864
# for store inode which exclude its s3chunkinfo list (default: 3)
storage.rocksdb.unordered_max_write_buffer_number=3
# rocksdb column family's write_buffer_size
# for store dentry and inode's s3chunkinfo list (unit: bytes, default: 128MB)
# for store dentry and inode's s3chunkinfo list (unit: bytes, default: 64MB)
storage.rocksdb.ordered_write_buffer_size=67108864
# rocksdb column family's max_write_buffer_number
# for store dentry and inode's s3chunkinfo list (default: 3)
storage.rocksdb.ordered_max_write_buffer_number=3
# rocksdb column family's write_buffer_size
# for store tx lock and write (unit: bytes, default: 64MB)
storage.rocksdb.tx_cf_write_buffer_size=67108864
# rocksdb column family's max_write_buffer_number
# for store tx lock and write (default: 3)
storage.rocksdb.tx_cf_max_write_buffer_number=3
# The target number of write history bytes to hold in memory (default: 20MB)
storage.rocksdb.max_write_buffer_size_to_maintain=20971520
# rocksdb memtable prefix bloom size ratio (size=write_buffer_size*memtable_prefix_bloom_size_ratio)
Expand All @@ -286,6 +292,8 @@ storage.rocksdb.perf_sampling_ratio=0
# we will sending its with rpc streaming instead of
# padding its into inode (default: 25000, about 25000 * 41 (byte) = 1MB)
storage.s3_meta_inside_inode.limit_size=25000
# TTL(millisecond) for tx lock
storage.rocksdb.tx_lock_ttl_ms=5000

# recycle options
# metaserver scan recycle period, default 1h
Expand Down
2 changes: 1 addition & 1 deletion curvefs/proto/common.proto
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ message PartitionInfo {
// partition manage inodeid range [start, end]
required uint64 start = 5;
required uint64 end = 6;
required uint64 txId = 7;
optional uint64 txId = 7;
optional uint64 nextId = 8;
// status can change from READWRITE to READONLY, but can not change from READONLY to READWRITE
// READWRITE/READONLY can change to DELETING, but DELETING can not change to READWRITE/READONLY
Expand Down
10 changes: 10 additions & 0 deletions curvefs/proto/mds.proto
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,14 @@ message CommitTxResponse {
required FSStatusCode statusCode = 1;
}

// Request of the MdsService.Tso rpc. It carries no fields.
message TsoRequest {}

// Response of the MdsService.Tso rpc.
message TsoResponse {
// result of the request
required FSStatusCode statusCode = 1;
// NOTE(review): presumably a sequence number issued together with the
// timestamp -- confirm against the mds implementation
optional uint64 sn = 2;
// NOTE(review): presumably the timestamp handed out by mds for tx v2
// (startTs / commitTs); unit is not visible here -- confirm
optional uint64 timestamp = 3;
}

service MdsService {
// fs interface
rpc CreateFs(CreateFsRequest) returns (CreateFsResponse);
Expand All @@ -249,6 +257,8 @@ service MdsService {
rpc GetLatestTxId(GetLatestTxIdRequest) returns (GetLatestTxIdResponse);
rpc CommitTx(CommitTxRequest) returns (CommitTxResponse);

rpc Tso(TsoRequest) returns (TsoResponse);

// client lease
rpc RefreshSession(RefreshSessionRequest) returns (RefreshSessionResponse);
}
106 changes: 102 additions & 4 deletions curvefs/proto/metaserver.proto
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,15 @@ enum MetaStatusCode {
RPC_STREAM_ERROR = 25;
INODE_S3_META_TOO_LARGE = 26;
STORAGE_CLOSED = 27;
// tx v2 related
TX_FAILED = 28;
TX_WRITE_CONFLICT = 29;
TX_KEY_LOCKED = 30;
TX_COMMITTED = 31;
TX_ROLLBACKED = 32;
TX_TIMEOUT = 33;
TX_INPROGRESS = 34;
TX_MISMATCH = 35;
}

// dentry interface
Expand All @@ -59,7 +68,7 @@ message GetDentryRequest {
required uint32 fsId = 4;
required uint64 parentInodeId = 5;
required string name = 6;
required uint64 txId = 7;
optional uint64 txId = 7;
optional uint64 appliedIndex = 8;
}

Expand All @@ -74,7 +83,8 @@ message Dentry {
required uint64 inodeId = 2;
required uint64 parentInodeId = 3;
required string name = 4;
required uint64 txId = 5;
// in tx v2, txId is reused as ts for compatibility in metaserver
optional uint64 txId = 5;
optional uint32 flag = 6;
optional FsFileType type = 7;
optional uint64 txSequence = 8;
Expand All @@ -88,6 +98,7 @@ message GetDentryResponse {
required MetaStatusCode statusCode = 1;
optional Dentry dentry = 2;
optional uint64 appliedIndex = 3;
optional TxLock txLock = 4;
}

message ListDentryRequest {
Expand All @@ -96,7 +107,7 @@ message ListDentryRequest {
required uint32 partitionId = 3;
required uint32 fsId = 4;
required uint64 dirInodeId = 5;
required uint64 txId = 6;
optional uint64 txId = 6;
optional string last = 7; // the name of last entry
optional uint32 count = 8; // the number of entry required
optional bool onlyDir = 9;
Expand All @@ -107,6 +118,7 @@ message ListDentryResponse {
required MetaStatusCode statusCode = 1;
repeated Dentry dentrys = 2;
optional uint64 appliedIndex = 3;
optional TxLock txLock = 4;
}

message CreateDentryRequest {
Expand All @@ -120,14 +132,15 @@ message CreateDentryRequest {
// Response of the MetaServerService.CreateDentry rpc.
message CreateDentryResponse {
// result of the request
required MetaStatusCode statusCode = 1;
optional uint64 appliedIndex = 2;
// NOTE(review): presumably the tx v2 lock found on the dentry key when the
// request could not proceed (cf. TX_KEY_LOCKED) -- confirm
optional TxLock txLock = 3;
}

message DeleteDentryRequest {
required uint32 poolId = 1;
required uint32 copysetId = 2;
required uint32 partitionId = 3;
required uint32 fsId = 4;
required uint64 txId = 5;
optional uint64 txId = 5;
required uint64 parentInodeId = 6;
required string name = 7;
optional FsFileType type = 8;
Expand All @@ -137,6 +150,7 @@ message DeleteDentryRequest {
// Response of the MetaServerService.DeleteDentry rpc.
message DeleteDentryResponse {
// result of the request
required MetaStatusCode statusCode = 1;
optional uint64 appliedIndex = 2;
// NOTE(review): presumably the tx v2 lock found on the dentry key when the
// request could not proceed (cf. TX_KEY_LOCKED) -- confirm
optional TxLock txLock = 3;
}

message PrepareRenameTxRequest {
Expand All @@ -160,6 +174,85 @@ message PrepareRenameTxResponse {
optional uint64 appliedIndex = 2;
}

// Lock record of a tx v2 transaction. It is sent in PrewriteRenameTxRequest
// and returned in several dentry responses.
// NOTE(review): the per-field notes below are inferred from names -- confirm
// against the metaserver implementation.
message TxLock {
// key of the primary lock; CheckTxStatusRequest carries the same
// primaryKey / startTs pair
required string primaryKey = 1;
// presumably the start ts of the transaction owning this lock
required uint64 startTs = 2;
// presumably the time the lock was written, used together with ttl to
// decide expiry -- unit not visible here, confirm
required uint64 timestamp = 3;
// NOTE(review): meaning not visible here -- confirm
optional uint32 index = 4;
// lock time-to-live; presumably milliseconds, matching
// storage.rocksdb.tx_lock_ttl_ms in metaserver.conf -- confirm
optional int32 ttl = 5;
}

// Kind of a TxWrite record (see TxWrite.kind).
enum TxWriteKind {
// presumably: the transaction was committed
Commit = 1;
// presumably: the transaction was rolled back
Rollback = 2;
}

// Wrapper message holding a single uint64 ts value.
// NOTE(review): presumably used to persist a timestamp in storage -- confirm
message TS {
required uint64 ts = 1;
}

// Write record of a tx v2 transaction: the transaction's startTs plus the
// kind of outcome recorded for it.
// NOTE(review): presumably stored keyed by commit ts -- confirm
message TxWrite {
required uint64 startTs = 1;
required TxWriteKind kind = 2;
}

// Request of the MetaServerService.PrewriteRenameTx rpc (tx v2).
message PrewriteRenameTxRequest {
// pool / copyset / partition the request is addressed to
required uint32 poolId = 1;
required uint32 copysetId = 2;
required uint32 partitionId = 3;
// dentries involved in the rename on this partition
repeated Dentry dentrys = 4;
// lock to be written for this transaction
// NOTE(review): presumably one lock applied to every dentry above -- confirm
required TxLock txLock = 5;
}

// Response of the MetaServerService.PrewriteRenameTx rpc (tx v2).
message PrewriteRenameTxResponse {
// result of the request
required MetaStatusCode statusCode = 1;
// NOTE(review): presumably the current dentries read back during prewrite
// -- confirm
repeated Dentry dentrys = 2;
// NOTE(review): presumably the conflicting lock when the key is already
// locked (cf. TX_KEY_LOCKED) -- confirm
optional TxLock txLock = 3;
optional uint64 appliedIndex = 4;
}

// Request of the MetaServerService.CheckTxStatus rpc (tx v2).
message CheckTxStatusRequest {
// pool / copyset / partition the request is addressed to
required uint32 poolId = 1;
required uint32 copysetId = 2;
required uint32 partitionId = 3;
// primary key and start ts identifying the transaction to check
// (same names as TxLock.primaryKey / TxLock.startTs)
required string primaryKey = 4;
required uint64 startTs = 5;
// NOTE(review): presumably the caller's current timestamp, compared with
// the lock's timestamp + ttl to detect an expired lock -- confirm
required uint64 curTimestamp = 6;
}

// Response of the MetaServerService.CheckTxStatus rpc (tx v2).
message CheckTxStatusResponse {
// result of the check
// NOTE(review): presumably reports the transaction state through the TX_*
// codes of MetaStatusCode (TX_COMMITTED, TX_ROLLBACKED, TX_INPROGRESS, ...)
// -- confirm
required MetaStatusCode statusCode = 1;
optional uint64 appliedIndex = 2;
}

// Request of the MetaServerService.ResolveTxLock rpc (tx v2).
message ResolveTxLockRequest {
// pool / copyset / partition the request is addressed to
required uint32 poolId = 1;
required uint32 copysetId = 2;
required uint32 partitionId = 3;
// dentry whose lock should be resolved
required Dentry dentry = 4;
// start ts of the transaction that owns the lock
required uint64 startTs = 5;
// commit ts of that transaction
// NOTE(review): how a rollback is expressed is not visible here -- confirm
required uint64 commitTs = 6;
}

// Response of the MetaServerService.ResolveTxLock rpc (tx v2).
message ResolveTxLockResponse {
// result of the request
required MetaStatusCode statusCode = 1;
optional uint64 appliedIndex = 2;
}

// Request of the MetaServerService.CommitTx rpc (tx v2).
message CommitTxRequest {
// pool / copyset / partition the request is addressed to
required uint32 poolId = 1;
required uint32 copysetId = 2;
required uint32 partitionId = 3;
// dentries of the transaction to commit on this partition
repeated Dentry dentrys = 4;
// start ts and commit ts of the transaction
required uint64 startTs = 5;
required uint64 commitTs = 6;
}

// Response of the MetaServerService.CommitTx rpc (tx v2).
message CommitTxResponse {
// result of the request
required MetaStatusCode statusCode = 1;
optional uint64 appliedIndex = 2;
}

// inode interface
message GetInodeRequest {
required uint32 poolId = 1;
Expand Down Expand Up @@ -539,6 +632,11 @@ service MetaServerService {
rpc CreateDentry(CreateDentryRequest) returns (CreateDentryResponse);
rpc DeleteDentry(DeleteDentryRequest) returns (DeleteDentryResponse);
rpc PrepareRenameTx(PrepareRenameTxRequest) returns (PrepareRenameTxResponse);
// tx v2
rpc PrewriteRenameTx(PrewriteRenameTxRequest) returns (PrewriteRenameTxResponse);
rpc CheckTxStatus(CheckTxStatusRequest) returns (CheckTxStatusResponse);
rpc ResolveTxLock(ResolveTxLockRequest) returns (ResolveTxLockResponse);
rpc CommitTx(CommitTxRequest) returns (CommitTxResponse);

// inode interface
rpc GetInode(GetInodeRequest) returns (GetInodeResponse);
Expand Down
1 change: 0 additions & 1 deletion curvefs/proto/topology.proto
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@ message ClusterInfoData {
required string clusterId = 1;
// <fsId, partitionIndex of this fs>
map<uint32, uint32> partitionIndexs = 2;

}

message PoolData {
Expand Down
1 change: 1 addition & 0 deletions curvefs/src/client/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ cc_library(
"//curvefs/src/common:metric_utils",
"//curvefs/src/common:dynamic_vlog",
"//curvefs/src/common:threading",
"//curvefs/src/metaserver:metaserver_storage_conv",
"@com_google_absl//absl/memory",
"@com_google_absl//absl/strings",
"@com_google_absl//absl/synchronization",
Expand Down
Loading

0 comments on commit a920cd9

Please sign in to comment.