Skip to content

Commit

Permalink
Fix DML HA in multi-node
Browse files Browse the repository at this point in the history
If a datanode goes down for whatever reason then DML activity to
chunks residing on (or targeted to) that DN will start erroring out.
We now handle this by marking the target chunk as "stale" for this
DN by changing the metadata on the access node. This allows us to
continue to do DML to replicas of the same chunk data on other DNs
in the setup. This obviously will only work for chunks which have
"replication_factor" > 1. Note that chunks which do not undergo any
change will continue to carry the appropriate DN-related metadata on
the AN.

This means that such "stale" chunks will become under-replicated and
will need to be re-balanced using the copy_chunk functionality, driven
by a microservice or some similar process.

Fixes timescale#4846
  • Loading branch information
nikkhils committed Nov 11, 2022
1 parent f132148 commit 69aea98
Show file tree
Hide file tree
Showing 19 changed files with 650 additions and 272 deletions.
8 changes: 7 additions & 1 deletion src/chunk.c
Expand Up @@ -1468,7 +1468,7 @@ ts_chunk_find_for_point(const Hypertable *ht, const Point *p)
* Create a chunk through insertion of a tuple at a given point.
*/
Chunk *
ts_chunk_create_for_point(const Hypertable *ht, const Point *p, const char *schema,
ts_chunk_create_for_point(const Hypertable *ht, const Point *p, bool *found, const char *schema,
const char *prefix)
{
/*
Expand Down Expand Up @@ -1499,6 +1499,8 @@ ts_chunk_create_for_point(const Hypertable *ht, const Point *p, const char *sche
* release the lock early.
*/
UnlockRelationOid(ht->main_table_relid, ShareUpdateExclusiveLock);
if (found)
*found = true;
return chunk;
}

Expand All @@ -1510,11 +1512,15 @@ ts_chunk_create_for_point(const Hypertable *ht, const Point *p, const char *sche
chunk = chunk_resurrect(ht, chunk_id);
if (chunk != NULL)
{
if (found)
*found = true;
return chunk;
}
}

/* Create the chunk normally. */
if (found)
*found = false;
if (hypertable_is_distributed_member(ht))
ereport(ERROR,
(errcode(ERRCODE_TS_INTERNAL_ERROR),
Expand Down
4 changes: 2 additions & 2 deletions src/chunk.h
Expand Up @@ -145,8 +145,8 @@ typedef struct DisplayKeyData

extern void ts_chunk_formdata_fill(FormData_chunk *fd, const TupleInfo *ti);
extern Chunk *ts_chunk_find_for_point(const Hypertable *ht, const Point *p);
extern Chunk *ts_chunk_create_for_point(const Hypertable *ht, const Point *p, const char *schema,
const char *prefix);
extern Chunk *ts_chunk_create_for_point(const Hypertable *ht, const Point *p, bool *found,
const char *schema, const char *prefix);
List *ts_chunk_id_find_in_subspace(Hypertable *ht, List *dimension_vecs);

extern TSDLLEXPORT Chunk *ts_chunk_create_base(int32 id, int16 num_constraints, const char relkind);
Expand Down
3 changes: 2 additions & 1 deletion src/hypertable.c
Expand Up @@ -1067,12 +1067,13 @@ hypertable_chunk_store_add(const Hypertable *h, const Chunk *input_chunk)
* Create a chunk for the point, given that it does not exist yet.
*/
Chunk *
ts_hypertable_create_chunk_for_point(const Hypertable *h, const Point *point)
ts_hypertable_create_chunk_for_point(const Hypertable *h, const Point *point, bool *found)
{
Assert(ts_subspace_store_get(h->chunk_cache, point) == NULL);

Chunk *chunk = ts_chunk_create_for_point(h,
point,
found,
NameStr(h->fd.associated_schema_name),
NameStr(h->fd.associated_table_prefix));

Expand Down
2 changes: 1 addition & 1 deletion src/hypertable.h
Expand Up @@ -135,7 +135,7 @@ extern TSDLLEXPORT int32 ts_hypertable_relid_to_id(Oid relid);
extern TSDLLEXPORT Chunk *ts_hypertable_find_chunk_for_point(const Hypertable *h,
const Point *point);
extern TSDLLEXPORT Chunk *ts_hypertable_create_chunk_for_point(const Hypertable *h,
const Point *point);
const Point *point, bool *found);
extern Oid ts_hypertable_relid(RangeVar *rv);
extern TSDLLEXPORT bool ts_is_hypertable(Oid relid);
extern bool ts_hypertable_has_tablespace(const Hypertable *ht, Oid tspc_oid);
Expand Down
61 changes: 60 additions & 1 deletion src/nodes/chunk_dispatch.c
Expand Up @@ -14,9 +14,11 @@
#include "compat/compat.h"
#include "chunk_dispatch.h"
#include "chunk_insert_state.h"
#include "errors.h"
#include "subspace_store.h"
#include "dimension.h"
#include "guc.h"
#include "ts_catalog/chunk_data_node.h"

ChunkDispatch *
ts_chunk_dispatch_create(Hypertable *ht, EState *estate, int eflags)
Expand Down Expand Up @@ -144,10 +146,67 @@ ts_chunk_dispatch_get_chunk_insert_state(ChunkDispatch *dispatch, Point *point,
* locking the hypertable. This serves as a fast path for the usual case
* where the chunk already exists.
*/
bool found;
Chunk *new_chunk = ts_hypertable_find_chunk_for_point(dispatch->hypertable, point);
if (new_chunk == NULL)
{
new_chunk = ts_hypertable_create_chunk_for_point(dispatch->hypertable, point);
new_chunk = ts_hypertable_create_chunk_for_point(dispatch->hypertable, point, &found);
}
else
found = true;

/* get the filtered list of "available" DNs for this chunk */
List *chunk_data_nodes =
ts_chunk_data_node_scan_by_chunk_id_filter(new_chunk->fd.id, CurrentMemoryContext);

/*
* Check if the chunk is under-replicated, i.e. it has fewer DNs assigned to it
* than the hypertable's replication_factor value.
*/
if (found && dispatch->hypertable->fd.replication_factor > list_length(chunk_data_nodes))
{
List *serveroids = NIL;
ListCell *lc;
ChunkDataNode *cdn;

/* check that at least one data node is available for this chunk on the AN */
if (chunk_data_nodes == NIL)
ereport(ERROR,
(errcode(ERRCODE_TS_INSUFFICIENT_NUM_DATA_NODES),
(errmsg("insufficient number of data nodes"),
errhint("Increase the number of available data nodes on hypertable "
"\"%s\".",
get_rel_name(dispatch->hypertable->main_table_relid)))));

foreach (lc, chunk_data_nodes)
{
cdn = lfirst(lc);
serveroids = lappend_oid(serveroids, cdn->foreign_server_oid);
}

foreach (lc, new_chunk->data_nodes)
{
cdn = lfirst(lc);
/*
* check if this DN is a part of chunk_data_nodes. If not
* found in chunk_data_nodes, then we need to remove this
* chunk id to node name mapping and also update the primary
* foreign server if necessary. It's possible that this metadata
* might have been already cleared earlier but we have no way of
* knowing that here.
*/
if (!list_member_oid(serveroids, cdn->foreign_server_oid))
{
ts_chunk_update_foreign_server_if_needed(new_chunk,
cdn->foreign_server_oid,
false);
ts_chunk_data_node_delete_by_chunk_id_and_node_name(cdn->fd.chunk_id,
NameStr(cdn->fd.node_name));
}
}

/* update new_chunk->data_nodes to point to the list of "live" DNs */
new_chunk->data_nodes = chunk_data_nodes;
}

if (NULL == new_chunk)
Expand Down

0 comments on commit 69aea98

Please sign in to comment.