Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[TC57] Unit test to verify deletion of rebuild_snap once rebuild finished #200

Merged
merged 5 commits into from Feb 19, 2019
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
11 changes: 11 additions & 0 deletions include/sys/uzfs_zvol.h
Expand Up @@ -59,6 +59,14 @@ typedef struct zvol_rebuild_info {

/* peer replica cnt whose rebuild is done and failure */
uint16_t rebuild_failed_cnt;

/*
* Does a stale clone exist?
* If stale_clone_exist is non-zero, the timer thread will delete
* the clone and its related snapshot.
* The rebuilding thread increments stale_clone_exist when it fails
* to destroy the internal clone.
*/
uint8_t stale_clone_exist;
} zvol_rebuild_info_t;

/*
Expand Down Expand Up @@ -113,6 +121,9 @@ typedef struct zvol_state zvol_state_t;
/*
 * Evaluates to non-zero when the rebuild status stored in zv->rebuild_info
 * is ZVOL_REBUILDING_FAILED. The argument is parenthesized so that any
 * pointer-valued expression (e.g. &obj or zinfo->main_zv) can be passed.
 */
#define ZVOL_IS_REBUILDING_FAILED(zv) \
	((zv)->rebuild_info.zv_rebuild_status == ZVOL_REBUILDING_FAILED)

/*
 * Evaluates to the stale_clone_exist counter of zv->rebuild_info; non-zero
 * means a stale rebuild clone is still waiting to be destroyed. The argument
 * is parenthesized so that any pointer-valued expression can be passed.
 */
#define ZVOL_HAS_STALE_CLONE(zv) \
	((zv)->rebuild_info.stale_clone_exist)

extern int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio);
const char *rebuild_status_to_str(zvol_rebuild_status_t status);

Expand Down
2 changes: 1 addition & 1 deletion include/uzfs_rebuilding.h
Expand Up @@ -64,7 +64,7 @@ int uzfs_zvol_release_internal_clone(zvol_state_t *zv,
/*
* To remove all internal snapshots of a dataset
*/
int uzfs_destroy_internal_all_snap(zvol_state_t *zv);
int uzfs_destroy_all_internal_snapshots(zvol_state_t *zv);
boolean_t is_stale_clone(zvol_state_t *);

#ifdef __cplusplus
Expand Down
6 changes: 6 additions & 0 deletions include/zrepl_mgmt.h
Expand Up @@ -76,6 +76,7 @@ typedef struct inject_delay_s {
int downgraded_replica_rebuild_size_set;
int io_receiver_exit;
int helping_replica_rebuild_complete;
int rebuild_complete;
} inject_delay_t;

typedef struct inject_rebuild_error_s {
Expand Down Expand Up @@ -284,6 +285,11 @@ uzfs_zinfo_take_refcnt(zvol_info_t *zinfo)
atomic_inc_64(&zinfo->refcnt);
}

/*
 * Remove the stale internal rebuild clone of zinfo's main volume together
 * with the snapshot backing it.
 *
 * Returns 0 when zinfo has no main volume. A non-zero return means the
 * stale clone could not be removed on this attempt.
 */
int uzfs_zinfo_destroy_stale_clone(zvol_info_t *zinfo);

/*
* ZAP key for io sequence number
*/
Expand Down
2 changes: 1 addition & 1 deletion lib/libzpool/uzfs_rebuilding.c
Expand Up @@ -399,7 +399,7 @@ uzfs_zvol_get_or_create_internal_clone(zvol_state_t *zv,
* on a dataset
*/
int
uzfs_destroy_internal_all_snap(zvol_state_t *zv)
uzfs_destroy_all_internal_snapshots(zvol_state_t *zv)
{
int ret;
char snapname[MAXNAMELEN];
Expand Down
64 changes: 62 additions & 2 deletions lib/libzpool/zrepl_mgmt.c
Expand Up @@ -537,6 +537,11 @@ uzfs_zvol_destroy_snapshot_clone(zvol_state_t *zv, zvol_state_t *snap_zv,
int ret1 = 0;
char *clonename;

if (snap_zv == NULL) {
VERIFY(clone_zv != NULL);
return (0);
}

clonename = kmem_asprintf("%s/%s_%s", spa_name(zv->zv_spa),
strchr(zv->zv_name, '/') + 1,
REBUILD_SNAPSHOT_CLONENAME);
Expand All @@ -545,12 +550,16 @@ uzfs_zvol_destroy_snapshot_clone(zvol_state_t *zv, zvol_state_t *snap_zv,
clone_zv->zv_name, clonename, zv->zv_name);

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here as well, we are using clone_zv and snap_zv, so these two can't be NULL.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. This function has no NULL checks for the clone and snapshot zv. Error checking should be added to it; I will update the function to check every zv.


/* Destroy clone's snapshot */
ret = uzfs_destroy_internal_all_snap(clone_zv);
ret = uzfs_destroy_all_internal_snapshots(clone_zv);
if (ret != 0) {

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should break if ret is nonzero. Any reason why we are not breaking from here?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I meant - return if ret is nonzero

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it should return to the caller if uzfs_destroy_all_internal_snapshots returns non-zero.

LOG_ERR("Rebuild_clone snap destroy failed on:%s"
" with err:%d", clone_zv->zv_name, ret);
" with err:%d", zv->zv_name, ret);
}

/*
* We need to release the snapshot zv so that next hold
* on dataset doesn't fail
*/
uzfs_zvol_release_internal_clone(zv, snap_zv, clone_zv);

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If clone_zv is NULL, there can be a panic in uzfs_zvol_release_internal_clone.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All the callers of uzfs_zvol_release_internal_clone are setting snapshot_zv and clone_zv to NULL, so the release needs to happen. Let's add this as a comment for later reference.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Even if deletion of all the snapshots fails, we need to release and exit.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok


// try_clone_delete_again:
Expand Down Expand Up @@ -607,3 +616,54 @@ uzfs_zinfo_destroy_internal_clone(zvol_info_t *zinfo)
clone_zv);
return (ret);
}

/*
 * Delete the stale rebuild clone of zinfo's main volume along with the
 * snapshot backing it.
 *
 * The snapshot (REBUILD_SNAPSHOT_SNAPNAME) and the clone
 * (<volume>_<REBUILD_SNAPSHOT_CLONENAME>) are looked up and handed to
 * uzfs_zvol_destroy_snapshot_clone(). stale_clone_exist is cleared only
 * when that call reports success, so a failed attempt leaves the flag set
 * and the caller can try again later.
 *
 * Review outcome folded into this function: if opening or holding the
 * clone fails, the snapshot reference is released and the error is
 * returned instead of attempting the destroy, because
 * uzfs_zvol_destroy_snapshot_clone() dereferences clone_zv and must not be
 * handed a NULL clone.
 *
 * Returns 0 when there is no main volume or when the destroy succeeded,
 * otherwise the error from the step that failed.
 */
int
uzfs_zinfo_destroy_stale_clone(zvol_info_t *zinfo)
{
	int ret = 0;
	char *clone_subname = NULL;
	zvol_state_t *l_snap_zv = NULL, *l_clone_zv = NULL;
	zvol_state_t *zv;

	if (!zinfo->main_zv)
		return (0);

	zv = zinfo->main_zv;

	ret = get_snapshot_zv(zv, REBUILD_SNAPSHOT_SNAPNAME,
	    &l_snap_zv, B_FALSE, B_TRUE);
	if (ret != 0) {
		LOG_ERR("Failed to get info about %s@%s",
		    zv->zv_name, REBUILD_SNAPSHOT_SNAPNAME);
		return (ret);
	}

	/* Clone name relative to the pool: "<volume>_<clone suffix>" */
	clone_subname = kmem_asprintf("%s_%s", strchr(zv->zv_name, '/') + 1,
	    REBUILD_SNAPSHOT_CLONENAME);

	ret = uzfs_open_dataset(zv->zv_spa, clone_subname, &l_clone_zv);
	if (ret != 0) {
		/*
		 * Without an open clone there is nothing safe to pass to
		 * the destroy routine; give up for now and retry later.
		 */
		LOG_ERR("Failed to open clone %s: %d", clone_subname, ret);
		goto release_snap;
	}

	/*
	 * If hold on clone dataset fails then we will
	 * try to delete the clone after sometime.
	 */
	ret = uzfs_hold_dataset(l_clone_zv);
	if (ret != 0) {
		LOG_ERR("Failed to hold clone: %d", ret);
		uzfs_close_dataset(l_clone_zv);
		goto release_snap;
	}

	/*
	 * The destroy routine releases snap/clone itself, so no release
	 * is done here afterwards. Its result is propagated to the caller.
	 */
	ret = uzfs_zvol_destroy_snapshot_clone(zv, l_snap_zv, l_clone_zv);
	if (ret == 0)
		zv->rebuild_info.stale_clone_exist = 0;
	goto out;

release_snap:
	uzfs_close_dataset(l_snap_zv);
out:
	/* Freed on every path; the early return used to leak this buffer */
	strfree(clone_subname);

	return (ret);
}
22 changes: 18 additions & 4 deletions lib/libzrepl/data_conn.c
Expand Up @@ -894,12 +894,21 @@ uzfs_zvol_rebuild_dw_replica(void *arg)
close(sfd);
}

if (wquiesce)
uzfs_zinfo_destroy_internal_clone(zinfo);
#ifdef DEBUG
if (inject_error.delay.rebuild_complete == 1)
sleep(10);
#endif

if (wquiesce) {
if (uzfs_zinfo_destroy_internal_clone(zinfo)) {

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the else branch, it is better to set stale_clone_exist to 0.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok.

mutex_enter(&zinfo->main_zv->rebuild_mtx);
zinfo->main_zv->rebuild_info.stale_clone_exist++;
mutex_exit(&zinfo->main_zv->rebuild_mtx);
}
}

/* Parent thread have taken refcount, drop it now */
uzfs_zinfo_drop_refcnt(zinfo);

zk_thread_exit();
}

Expand Down Expand Up @@ -963,6 +972,10 @@ uzfs_zvol_timer_thread(void)
next_check = now +
zinfo->update_ionum_interval;
}

if (ZVOL_HAS_STALE_CLONE(zinfo->main_zv)) {
uzfs_zinfo_destroy_stale_clone(zinfo);
}
} else if (uzfs_zvol_get_status(zinfo->main_zv) ==
ZVOL_STATUS_DEGRADED &&
zinfo->main_zv->zv_objset) {
Expand Down Expand Up @@ -1014,6 +1027,7 @@ uzfs_zvol_timer_thread(void)
node_next);
kmem_free(n_zinfo, sizeof (*n_zinfo));
}
zk_thread_exit();
}

/*
Expand Down Expand Up @@ -1515,7 +1529,6 @@ uzfs_zvol_rebuild_scanner(void *arg)
}
if (ZINFO_IS_DEGRADED(zinfo))
zv = zinfo->clone_zv;

rc = uzfs_zvol_create_internal_snapshot(zv,
&snap_zv, metadata.io_num);
if (rc != 0) {
Expand Down Expand Up @@ -1652,6 +1665,7 @@ reinitialize_zv_state(zvol_state_t *zv)

uzfs_zvol_set_status(zv, ZVOL_STATUS_DEGRADED);
uzfs_zvol_set_rebuild_status(zv, ZVOL_REBUILDING_INIT);
zv->rebuild_info.stale_clone_exist = 0;
}

/*
Expand Down
85 changes: 80 additions & 5 deletions tests/cbtest/gtest/test_uzfs.cc
Expand Up @@ -2059,6 +2059,85 @@ void verify_ios_from_two_replica(void *arg)
zk_thread_exit();
}

/*
 * Walk the snapshots of zv's objset looking for one named `snap`.
 *
 * Returns 1 if a snapshot with that name is listed, 0 if the listing ends
 * (ENOENT) without a match, -1 if zv or its objset is NULL, and otherwise
 * the error reported by dmu_snapshot_list_next().
 */
static int
check_if_snap_exist(zvol_state_t *zv, char *snap)
{
	char name[MAXNAMELEN];
	uint64_t id = 0, offset = 0;
	objset_t *objset;
	int err;

	if (zv == NULL || zv->zv_objset == NULL)
		return (-1);

	objset = zv->zv_objset;
	for (;;) {
		/* The pool config is entered only around each listing step */
		dsl_pool_config_enter(spa_get_dsl(zv->zv_spa), FTAG);
		err = dmu_snapshot_list_next(objset, sizeof (name) - 1,
		    name, &id, &offset, NULL);
		dsl_pool_config_exit(spa_get_dsl(zv->zv_spa), FTAG);

		/* End of list is reported as "not found", not as an error */
		if (err != 0)
			return (err == ENOENT ? 0 : err);

		if (strcmp(name, snap) == 0)
			return (1);
	}
}

/*
 * Verify that the rebuild snapshot is gone from the main volume once a
 * rebuild has finished.
 *
 * Phase 1 runs a rebuild with uzfs_mock_rebuild_scanner_abrupt_conn_close
 * as the scanner and expects it to end in ZVOL_REBUILDING_FAILED, then
 * takes two snapshots on the internal clone.
 * Phase 2 runs a rebuild with the real scanner, expects
 * ZVOL_REBUILDING_DONE and quorum == 1, and finally checks that
 * REBUILD_SNAPSHOT_SNAPNAME is no longer listed on the main volume.
 */
TEST(uZFSRebuild, TestRebuildSnapDeletion) {
	int data_conn_fd1, data_conn_fd3;

	rebuild_scanner = &uzfs_mock_rebuild_scanner_abrupt_conn_close;
	dw_replica_fn = &uzfs_zvol_rebuild_dw_replica;
	io_receiver = &uzfs_zvol_io_receiver;

	zinfo->main_zv->zv_status = ZVOL_STATUS_DEGRADED;
	zvol_rebuild_step_size = (1024ULL * 1024ULL * 1024ULL) / 2 + 1000;
	do_data_connection(data_conn_fd1, "127.0.0.1", IO_SERVER_PORT, "vol1");
	do_data_connection(data_conn_fd3, "127.0.0.1", IO_SERVER_PORT, "vol3");

	/* Quorum must still be off before any rebuild has completed */
	uint64_t quorum = -1;
	EXPECT_EQ(0, dsl_prop_get_integer(zinfo->main_zv->zv_name,
	    zfs_prop_to_name(ZFS_PROP_QUORUM), &quorum, NULL));
	EXPECT_EQ(quorum, 0);

	/* The thread that helps rebuilding exits abruptly right after it connects */
	execute_rebuild_test_case("rebuild abrupt", 1, ZVOL_REBUILDING_SNAP,
	    ZVOL_REBUILDING_FAILED, 4, "vol3");
	close(data_conn_fd1);

	/* The internal clone must exist; it is dereferenced just below */
	EXPECT_TRUE(zinfo->clone_zv != NULL);
	EXPECT_EQ(0, dmu_objset_snapshot_one(zinfo->clone_zv->zv_name, ".io_snap100.2"));
	EXPECT_EQ(0, dmu_objset_snapshot_one(zinfo->clone_zv->zv_name, ".io_snap100.1"));

	rebuild_scanner = &uzfs_zvol_rebuild_scanner;
	zinfo->main_zv->zv_status = ZVOL_STATUS_DEGRADED;
#ifdef DEBUG
	/* Makes uzfs_zvol_rebuild_dw_replica sleep before its clone cleanup */
	inject_error.delay.rebuild_complete = 1;
#endif
	do_data_connection(data_conn_fd1, "127.0.0.1", IO_SERVER_PORT, "vol1");
	execute_rebuild_test_case("complete rebuild with data conn", 15,
	    ZVOL_REBUILDING_SNAP, ZVOL_REBUILDING_DONE, 4, "vol3");

	/* Presumably chosen to outlast the injected 10 second delay */
	sleep(12);
#ifdef DEBUG
	/* Reset under the same guard that protects the assignment above */
	inject_error.delay.rebuild_complete = 0;
#endif

	EXPECT_EQ(0, dsl_prop_get_integer(zinfo->main_zv->zv_name,
	    zfs_prop_to_name(ZFS_PROP_QUORUM), &quorum, NULL));
	EXPECT_EQ(quorum, 1);

	/* 0 from check_if_snap_exist means the rebuild snapshot is not listed */
	EXPECT_EQ(0, check_if_snap_exist(zinfo->main_zv, (char *)REBUILD_SNAPSHOT_SNAPNAME));
	close(data_conn_fd1);
	close(data_conn_fd3);
	/* NOTE(review): appears to give connection threads time to exit - confirm */
	sleep(10);
}

TEST(uZFSRebuild, TestErroredRebuild) {
replica_writes_io_t wargs = { 0 };
kthread_t *writer_thread, *reader_thread;
Expand Down Expand Up @@ -2087,11 +2166,6 @@ TEST(uZFSRebuild, TestErroredRebuild) {
#ifdef DEBUG
inject_error.inject_rebuild_error.dw_replica_rebuild_error_io = (total_ios) / 4;
#endif
uint64_t quorum = -1;
EXPECT_EQ(0, dsl_prop_get_integer(zinfo->main_zv->zv_name,
zfs_prop_to_name(ZFS_PROP_QUORUM), &quorum, NULL));
EXPECT_EQ(quorum, 0);

execute_rebuild_test_case("errored rebuild with data conn", 15,
ZVOL_REBUILDING_SNAP, ZVOL_REBUILDING_FAILED, 4, "vol3");
close(wargs.r1_fd);
Expand Down Expand Up @@ -2130,6 +2204,7 @@ TEST(uZFSRebuild, TestErroredRebuild) {
0, 0);
zk_thread_join(writer_thread->t_tid);

uint64_t quorum = 0;
EXPECT_EQ(0, dsl_prop_get_integer(zinfo->main_zv->zv_name,
zfs_prop_to_name(ZFS_PROP_QUORUM), &quorum, NULL));
EXPECT_EQ(quorum, 1);
Expand Down