/
MergeStrategy.json
66 lines (66 loc) · 3.44 KB
/
MergeStrategy.json
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
{
"$id": "http://open-data-fabric.github.com/schemas/MergeStrategy",
"$schema": "http://json-schema.org/draft-07/schema#",
"description": "Merge strategy determines how newly ingested data should be combined with the data that already exists in the dataset.",
"$defs": {
"Append": {
"description": "Append merge strategy.\n\nUnder this strategy new data will be appended to the dataset in its entirety, without any deduplication.",
"type": "object",
"additionalProperties": false,
"required": [],
"properties": {}
},
"Ledger": {
"description": "Ledger merge strategy.\n\nThis strategy should be used for data sources containing ledgers of events. Currently this strategy will only perform deduplication of events using user-specified primary key columns. This means that the source data can contain partially overlapping sets of records, and only those records that were not previously seen will be appended.",
"type": "object",
"additionalProperties": false,
"required": [
"primaryKey"
],
"properties": {
"primaryKey": {
"type": "array",
"items": {
"type": "string"
},
"description": "Names of the columns that uniquely identify the record throughout its lifetime."
}
}
},
"Snapshot": {
"description": "Snapshot merge strategy.\n\nThis strategy can be used for data state snapshots that are taken periodically and contain only the latest state of the observed entity or system. Over time such snapshots can have new rows added, and old rows either removed or modified.\n\nThis strategy transforms snapshot data into an append-only event stream where data already added is immutable. It does so by performing Change Data Capture - essentially diffing the current state of data against the reconstructed previous state and recording differences as retractions or corrections. The Operation Type \"op\" column will contain:\n - append (`+A`) when a row appears for the first time\n - retraction (`-D`) when a row disappears\n - correction (`-C`, `+C`) when row data has changed, with the `-C` event carrying the old value of the row and the `+C` event carrying the new value.\n\nTo correctly associate rows between old and new snapshots this strategy relies on user-specified primary key columns.\n\nTo identify whether a row has changed this strategy will compare all other columns one by one. If the data contains a column that is guaranteed to change whenever any of the data columns changes (for example a last modification timestamp, an incremental version, or a data hash), then it can be specified in the `compareColumns` property to speed up the detection of modified rows.",
"type": "object",
"additionalProperties": false,
"required": [
"primaryKey"
],
"properties": {
"primaryKey": {
"type": "array",
"items": {
"type": "string"
},
"description": "Names of the columns that uniquely identify the record throughout its lifetime."
},
"compareColumns": {
"type": "array",
"items": {
"type": "string"
},
"description": "Names of the columns to compare to determine if a row has changed between two snapshots."
}
}
}
},
"oneOf": [
{
"$ref": "#/$defs/Append"
},
{
"$ref": "#/$defs/Ledger"
},
{
"$ref": "#/$defs/Snapshot"
}
]
}