/
ReadStep.json
225 lines (225 loc) · 7.85 KB
/
ReadStep.json
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
{
"$id": "http://open-data-fabric.github.com/schemas/ReadStep",
"$schema": "http://json-schema.org/draft-07/schema#",
"description": "Defines how raw data should be read into the structured form.",
"$defs": {
"Csv": {
"description": "Reader for comma-separated files.",
"type": "object",
"additionalProperties": false,
"required": [],
"properties": {
"schema": {
"type": "array",
"items": {
"type": "string"
},
"description": "A DDL-formatted schema. Schema can be used to coerce values into more appropriate data types.",
"examples": [
[
"date TIMESTAMP",
"city STRING",
"population INT"
]
]
},
"separator": {
"type": "string",
"description": "Sets a single character as a separator for each field and value.",
"default": ","
},
"encoding": {
"type": "string",
"description": "Decodes the CSV files by the given encoding type.",
"default": "utf8"
},
"quote": {
"type": "string",
"description": "Sets a single character used for escaping quoted values where the separator can be part of the value. Set an empty string to turn off quotations.",
"default": "\""
},
"escape": {
"type": "string",
"description": "Sets a single character used for escaping quotes inside an already quoted value.",
"default": "\\"
},
"header": {
"type": "boolean",
"description": "Use the first line as names of columns.",
"default": false
},
"inferSchema": {
"type": "boolean",
"description": "Infers the input schema automatically from data. It requires one extra pass over the data.",
"default": false
},
"nullValue": {
"type": "string",
"description": "Sets the string representation of a null value.",
"default": ""
},
"dateFormat": {
"type": "string",
"description": "Sets the string that indicates a date format. The `rfc3339` is the only required format, the other format strings are implementation-specific.",
"default": "rfc3339"
},
"timestampFormat": {
"type": "string",
"description": "Sets the string that indicates a timestamp format. The `rfc3339` is the only required format, the other format strings are implementation-specific.",
"default": "rfc3339"
}
}
},
"Json": {
"description": "Reader for JSON files that contain an array of objects within them.",
"type": "object",
"additionalProperties": false,
"required": [],
"properties": {
"subPath": {
"type": "string",
"description": "Path in the form of `a.b.c` to a sub-element of the root JSON object that is an array or objects. If not specified it is assumed that the root element is an array."
},
"schema": {
"type": "array",
"items": {
"type": "string"
},
"description": "A DDL-formatted schema. Schema can be used to coerce values into more appropriate data types."
},
"dateFormat": {
"type": "string",
"description": "Sets the string that indicates a date format. The `rfc3339` is the only required format, the other format strings are implementation-specific.",
"default": "rfc3339"
},
"encoding": {
"type": "string",
"description": "Allows to forcibly set one of standard basic or extended encodings.",
"default": "utf8"
},
"timestampFormat": {
"type": "string",
"description": "Sets the string that indicates a timestamp format. The `rfc3339` is the only required format, the other format strings are implementation-specific.",
"default": "rfc3339"
}
}
},
"NdJson": {
"description": "Reader for files containing multiple newline-delimited JSON objects with the same schema.",
"type": "object",
"additionalProperties": false,
"required": [],
"properties": {
"schema": {
"type": "array",
"items": {
"type": "string"
},
"description": "A DDL-formatted schema. Schema can be used to coerce values into more appropriate data types."
},
"dateFormat": {
"type": "string",
"description": "Sets the string that indicates a date format. The `rfc3339` is the only required format, the other format strings are implementation-specific.",
"default": "rfc3339"
},
"encoding": {
"type": "string",
"description": "Allows to forcibly set one of standard basic or extended encodings.",
"default": "utf8"
},
"timestampFormat": {
"type": "string",
"description": "Sets the string that indicates a timestamp format. The `rfc3339` is the only required format, the other format strings are implementation-specific.",
"default": "rfc3339"
}
}
},
"GeoJson": {
"description": "Reader for GeoJSON files. It expects one `FeatureCollection` object in the root and will create a record per each `Feature` inside it extracting the properties into individual columns and leaving the feature geometry in its own column.",
"type": "object",
"additionalProperties": false,
"required": [],
"properties": {
"schema": {
"type": "array",
"items": {
"type": "string"
},
"description": "A DDL-formatted schema. Schema can be used to coerce values into more appropriate data types."
}
}
},
"NdGeoJson": {
"description": "Reader for Newline-delimited GeoJSON files. It is similar to `GeoJson` format but instead of `FeatureCollection` object in the root it expects every individual feature object to appear on its own line.",
"type": "object",
"additionalProperties": false,
"required": [],
"properties": {
"schema": {
"type": "array",
"items": {
"type": "string"
},
"description": "A DDL-formatted schema. Schema can be used to coerce values into more appropriate data types."
}
}
},
"EsriShapefile": {
"description": "Reader for ESRI Shapefile format.",
"type": "object",
"additionalProperties": false,
"required": [],
"properties": {
"schema": {
"type": "array",
"items": {
"type": "string"
},
"description": "A DDL-formatted schema. Schema can be used to coerce values into more appropriate data types."
},
"subPath": {
"type": "string",
"description": "If the ZIP archive contains multiple shapefiles use this field to specify a sub-path to the desired `.shp` file. Can contain glob patterns to act as a filter."
}
}
},
"Parquet": {
"description": "Reader for Apache Parquet format.",
"type": "object",
"additionalProperties": false,
"required": [],
"properties": {
"schema": {
"type": "array",
"items": {
"type": "string"
},
"description": "A DDL-formatted schema. Schema can be used to coerce values into more appropriate data types."
}
}
}
},
"oneOf": [
{
"$ref": "#/$defs/Csv"
},
{
"$ref": "#/$defs/GeoJson"
},
{
"$ref": "#/$defs/EsriShapefile"
},
{
"$ref": "#/$defs/Parquet"
},
{
"$ref": "#/$defs/Json"
},
{
"$ref": "#/$defs/NdJson"
},
{
"$ref": "#/$defs/NdGeoJson"
}
]
}