Skip to content

Commit

Permalink
ddt: lookup and log stats
Browse files Browse the repository at this point in the history
Adds per-DDT stats counting lookups and where they were serviced from
(either log or backing zap), number of log entries in memory, and flow
rates.

Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
  • Loading branch information
robn committed May 15, 2024
1 parent 4479c53 commit 25db08d
Show file tree
Hide file tree
Showing 2 changed files with 159 additions and 6 deletions.
2 changes: 2 additions & 0 deletions include/sys/ddt.h
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,8 @@ typedef struct {

uint64_t ddt_flush_force_txg; /* flush hard before this txg */

kstat_t *ddt_ksp; /* kstats context */

enum zio_checksum ddt_checksum; /* checksum algorithm in use */
spa_t *ddt_spa; /* pool this ddt is on */
objset_t *ddt_os; /* ddt objset (always MOS) */
Expand Down
163 changes: 157 additions & 6 deletions module/zfs/ddt.c
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,78 @@ static const uint64_t ddt_version_flags[] = {
/* Dummy version to signal that configure is still necessary */
#define DDT_VERSION_UNCONFIGURED (UINT64_MAX)

#ifdef _KERNEL
/* per-DDT kstats */
typedef struct {
/* total lookups and whether they returned new or existing entries */
kstat_named_t dds_lookup;
kstat_named_t dds_lookup_new;
kstat_named_t dds_lookup_existing;

/* entries found on live tree, and if we had to wait for load */
kstat_named_t dds_lookup_live_hit;
kstat_named_t dds_lookup_live_wait;
kstat_named_t dds_lookup_live_miss;

/* entries found on log trees */
kstat_named_t dds_lookup_log_hit;
kstat_named_t dds_lookup_log_active_hit;
kstat_named_t dds_lookup_log_flushing_hit;
kstat_named_t dds_lookup_log_miss;

/* entries found on store objects */
kstat_named_t dds_lookup_stored_hit;
kstat_named_t dds_lookup_stored_miss;

/* number of entries on log trees */
kstat_named_t dds_log_active_entries;
kstat_named_t dds_log_flushing_entries;

/* avg updated/flushed entries per txg */
kstat_named_t dds_log_ingest_rate;
kstat_named_t dds_log_flush_rate;
kstat_named_t dds_log_flush_time_rate;
} ddt_kstats_t;

static const ddt_kstats_t ddt_kstats_template = {
{ "lookup", KSTAT_DATA_UINT64 },
{ "lookup_new", KSTAT_DATA_UINT64 },
{ "lookup_existing", KSTAT_DATA_UINT64 },
{ "lookup_live_hit", KSTAT_DATA_UINT64 },
{ "lookup_live_wait", KSTAT_DATA_UINT64 },
{ "lookup_live_miss", KSTAT_DATA_UINT64 },
{ "lookup_log_hit", KSTAT_DATA_UINT64 },
{ "lookup_log_active_hit", KSTAT_DATA_UINT64 },
{ "lookup_log_flushing_hit", KSTAT_DATA_UINT64 },
{ "lookup_log_miss", KSTAT_DATA_UINT64 },
{ "lookup_stored_hit", KSTAT_DATA_UINT64 },
{ "lookup_stored_miss", KSTAT_DATA_UINT64 },
{ "log_active_entries", KSTAT_DATA_UINT64 },
{ "log_flushing_entries", KSTAT_DATA_UINT64 },
{ "log_ingest_rate", KSTAT_DATA_UINT32 },
{ "log_flush_rate", KSTAT_DATA_UINT32 },
{ "log_flush_time_rate", KSTAT_DATA_UINT32 },
};

#define _DDT_KSTAT_STAT(ddt, stat) \
&((ddt_kstats_t *)(ddt)->ddt_ksp->ks_data)->stat.value.ui64
#define DDT_KSTAT_BUMP(ddt, stat) \
do { atomic_inc_64(_DDT_KSTAT_STAT(ddt, stat)); } while (0)
#define DDT_KSTAT_ADD(ddt, stat, val) \
do { atomic_add_64(_DDT_KSTAT_STAT(ddt, stat), val); } while (0)
#define DDT_KSTAT_SUB(ddt, stat, val) \
do { atomic_sub_64(_DDT_KSTAT_STAT(ddt, stat), val); } while (0)
#define DDT_KSTAT_SET(ddt, stat, val) \
do { atomic_store_64(_DDT_KSTAT_STAT(ddt, stat), val); } while (0)
#define DDT_KSTAT_ZERO(ddt, stat) DDT_KSTAT_SET(ddt, stat, 0)
#else
#define DDT_KSTAT_BUMP(ddt, stat) do {} while (0)
#define DDT_KSTAT_ADD(ddt, stat, val) do {} while (0)
#define DDT_KSTAT_SUB(ddt, stat, val) do {} while (0)
#define DDT_KSTAT_SET(ddt, stat, val) do {} while (0)
#define DDT_KSTAT_ZERO(ddt, stat) do {} while (0)
#endif /* _KERNEL */

static void
ddt_object_create(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
dmu_tx_t *tx)
Expand Down Expand Up @@ -874,6 +946,8 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED);
}

DDT_KSTAT_BUMP(ddt, dds_lookup);

ddt_key_fill(&search, bp);

/* Find an existing live entry */
Expand All @@ -884,11 +958,13 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
return (NULL);

/* If it's already loaded, we can just return it. */
DDT_KSTAT_BUMP(ddt, dds_lookup_live_hit);
if (dde->dde_flags & DDE_FLAG_LOADED)
return (dde);

/* Someone else is loading it, wait for it. */
dde->dde_waiters++;
DDT_KSTAT_BUMP(ddt, dds_lookup_live_wait);
while (!(dde->dde_flags & DDE_FLAG_LOADED))
cv_wait(&dde->dde_cv, &ddt->ddt_lock);
dde->dde_waiters--;
Expand All @@ -902,8 +978,10 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
return (NULL);
}

DDT_KSTAT_BUMP(ddt, dds_lookup_existing);
return (dde);
}
} else
DDT_KSTAT_BUMP(ddt, dds_lookup_live_miss);

/* Not found. */
if (!add)
Expand All @@ -916,20 +994,33 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
/* If its in the log tree, we can "load" it from there */
if (ddt->ddt_flags & DDT_FLAG_LOG) {
ddt_lightweight_entry_t ddlwe;
boolean_t found = B_FALSE;

if (ddt_log_take_key(ddt, ddt->ddt_log_active,
&search, &ddlwe) ||
ddt_log_take_key(ddt, ddt->ddt_log_flushing,
&search, &ddlwe)) {
DDT_KSTAT_BUMP(ddt, dds_lookup_log_active_hit);
found = B_TRUE;
} else if (ddt_log_take_key(ddt, ddt->ddt_log_flushing,
&search, &ddlwe)) {
DDT_KSTAT_BUMP(ddt, dds_lookup_log_flushing_hit);
found = B_TRUE;
}

if (found) {
dde->dde_flags = DDE_FLAG_LOADED | DDE_FLAG_LOGGED;

for (int p = 0; p < ddlwe.ddlwe_nphys; p++)
dde->dde_phys[p] = ddlwe.ddlwe_phys[p];
dde->dde_type = ddlwe.ddlwe_type;
dde->dde_class = ddlwe.ddlwe_class;

DDT_KSTAT_BUMP(ddt, dds_lookup_log_hit);
DDT_KSTAT_BUMP(ddt, dds_lookup_existing);

return (dde);
}

DDT_KSTAT_BUMP(ddt, dds_lookup_log_miss);
}

/*
Expand Down Expand Up @@ -973,6 +1064,9 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
/* Flag cleanup required */
dde->dde_flags |= DDE_FLAG_OVERQUOTA;
} else if (error == 0) {
DDT_KSTAT_BUMP(ddt, dds_lookup_stored_hit);
DDT_KSTAT_BUMP(ddt, dds_lookup_existing);

/*
* The histograms only track inactive (stored or logged) blocks.
* We've just put an entry onto the live list, so we need to
Expand All @@ -989,6 +1083,9 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
ddt_lightweight_entry_t ddlwe;
DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe);
ddt_histogram_sub_entry(ddt, ddh, &ddlwe);
} else {
DDT_KSTAT_BUMP(ddt, dds_lookup_stored_miss);
DDT_KSTAT_BUMP(ddt, dds_lookup_new);
}

/* Entry loaded, everyone can proceed now */
Expand Down Expand Up @@ -1217,6 +1314,30 @@ ddt_configure(ddt_t *ddt)
return (0);
}

static void
ddt_table_alloc_kstats(ddt_t *ddt)
{
#ifdef _KERNEL
char *mod = kmem_asprintf("zfs/%s", spa_name(ddt->ddt_spa));
char *name = kmem_asprintf("ddt_stats_%s",
zio_checksum_table[ddt->ddt_checksum].ci_name);

ddt->ddt_ksp = kstat_create(mod, 0, name, "misc", KSTAT_TYPE_NAMED,
sizeof (ddt_kstats_t) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
if (ddt->ddt_ksp != NULL) {
ddt_kstats_t *dds = kmem_alloc(sizeof (ddt_kstats_t), KM_SLEEP);
memcpy(dds, &ddt_kstats_template, sizeof (ddt_kstats_t));
ddt->ddt_ksp->ks_data = dds;
kstat_install(ddt->ddt_ksp);
}

kmem_strfree(name);
kmem_strfree(mod);
#else
(void) ddt;
#endif /* _KERNEL */
}

static ddt_t *
ddt_table_alloc(spa_t *spa, enum zio_checksum c)
{
Expand All @@ -1236,13 +1357,22 @@ ddt_table_alloc(spa_t *spa, enum zio_checksum c)
ddt->ddt_version = DDT_VERSION_UNCONFIGURED;

ddt_log_alloc(ddt);
ddt_table_alloc_kstats(ddt);

return (ddt);
}

static void
ddt_table_free(ddt_t *ddt)
{
#ifdef _KERNEL
if (ddt->ddt_ksp != NULL) {
kmem_free(ddt->ddt_ksp->ks_data, sizeof (ddt_kstats_t));
ddt->ddt_ksp->ks_data = NULL;
kstat_delete(ddt->ddt_ksp);
}
#endif /* _KERNEL */

ddt_log_free(ddt);
ASSERT0(avl_numnodes(&ddt->ddt_tree));
ASSERT0(avl_numnodes(&ddt->ddt_repair_tree));
Expand Down Expand Up @@ -1309,6 +1439,11 @@ ddt_load(spa_t *spa)
if (error != 0 && error != ENOENT)
return (error);

DDT_KSTAT_SET(ddt, dds_log_active_entries,
avl_numnodes(&ddt->ddt_log_active->ddl_tree));
DDT_KSTAT_SET(ddt, dds_log_flushing_entries,
avl_numnodes(&ddt->ddt_log_flushing->ddl_tree));

/*
* Seed the cached histograms.
*/
Expand Down Expand Up @@ -1753,12 +1888,15 @@ ddt_sync_flush_log_incremental(ddt_t *ddt, dmu_tx_t *tx)

if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) {
/* We emptied it, so truncate on-disk */
DDT_KSTAT_ZERO(ddt, dds_log_flushing_entries);
ddt_log_truncate(ddt, tx);
/* No more passes needed this txg */
ddt->ddt_flush_pass = 0;
} else
} else {
/* More to do next time, save checkpoint */
DDT_KSTAT_SUB(ddt, dds_log_flushing_entries, count);
ddt_log_checkpoint(ddt, &ddlwe, tx);
}

ddt_sync_update_stats(ddt, tx);

Expand Down Expand Up @@ -1787,7 +1925,11 @@ ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx)
* No more to flush, and the active list has stuff, so
* try to swap the logs for next time.
*/
(void) ddt_log_swap(ddt, tx);
if (ddt_log_swap(ddt, tx)) {
DDT_KSTAT_ZERO(ddt, dds_log_active_entries);
DDT_KSTAT_SET(ddt, dds_log_flushing_entries,
avl_numnodes(&ddt->ddt_log_flushing->ddl_tree));
}
}

if (ddt->ddt_flush_force_txg > 0 &&
Expand All @@ -1803,6 +1945,7 @@ ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx)
ddt->ddt_log_flush_rate = _ewma(
ddt->ddt_flush_count, ddt->ddt_log_flush_rate,
zfs_dedup_log_flush_flow_rate_txgs);
DDT_KSTAT_SET(ddt, dds_log_flush_rate, ddt->ddt_log_flush_rate);

/*
* Update flush time rate. This is an exponential weighted moving
Expand All @@ -1812,6 +1955,8 @@ ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx)
ddt->ddt_log_flush_time_rate,
((int32_t)(NSEC2MSEC(gethrtime() - ddt->ddt_flush_start))),
zfs_dedup_log_flush_flow_rate_txgs);
DDT_KSTAT_SET(ddt, dds_log_flush_time_rate,
ddt->ddt_log_flush_time_rate);
}

static void
Expand All @@ -1837,6 +1982,9 @@ ddt_sync_table_log(ddt_t *ddt, dmu_tx_t *tx)

ddt_log_commit(ddt, &dlu);

DDT_KSTAT_SET(ddt, dds_log_active_entries,
avl_numnodes(&ddt->ddt_log_active->ddl_tree));

/*
* Sync the stats for the store objects. Even though we haven't
* modified anything on those objects, they're no longer the
Expand All @@ -1858,7 +2006,7 @@ ddt_sync_table_log(ddt_t *ddt, dmu_tx_t *tx)
ddt->ddt_spa->spa_dedup_dsize = ~0ULL;
}

if (spa_sync_pass(ddt->ddt_spa) == 1)
if (spa_sync_pass(ddt->ddt_spa) == 1) {
/*
* Update ingest rate. This is an exponential weighted moving
* average of the number of entries changed over recent txgs.
Expand All @@ -1868,6 +2016,9 @@ ddt_sync_table_log(ddt_t *ddt, dmu_tx_t *tx)
ddt->ddt_log_ingest_rate = _ewma(
count, ddt->ddt_log_ingest_rate,
zfs_dedup_log_flush_flow_rate_txgs);
DDT_KSTAT_SET(ddt, dds_log_ingest_rate,
ddt->ddt_log_ingest_rate);
}
}

static void
Expand Down

0 comments on commit 25db08d

Please sign in to comment.