schemas/fragments/ReadStep.json

{
  "$id": "http://open-data-fabric.github.com/schemas/ReadStep",
  "$schema": "http://json-schema.org/draft-07/schema#",
  "description": "Defines how raw data should be read into the structured form.",
  "$defs": {
    "Csv": {
      "description": "Reader for comma-separated files.",
      "type": "object",
      "additionalProperties": false,
      "required": [],
      "properties": {
        "schema": {
          "type": "array",
          "items": {
            "type": "string"
          },
          "description": "A DDL-formatted schema. Schema can be used to coerce values into more appropriate data types.",
          "examples": [
            [
              "date TIMESTAMP",
              "city STRING",
              "population INT"
            ]
          ]
        },
        "separator": {
          "type": "string",
          "description": "Sets a single character as a separator for each field and value.",
          "default": ","
        },
        "encoding": {
          "type": "string",
          "description": "Decodes the CSV files by the given encoding type.",
          "default": "utf8"
        },
        "quote": {
          "type": "string",
          "description": "Sets a single character used for escaping quoted values where the separator can be part of the value. Set an empty string to turn off quotations.",
          "default": "\""
        },
        "escape": {
          "type": "string",
          "description": "Sets a single character used for escaping quotes inside an already quoted value.",
          "default": "\\"
        },
        "header": {
          "type": "boolean",
          "description": "Use the first line as names of columns.",
          "default": false
        },
        "inferSchema": {
          "type": "boolean",
          "description": "Infers the input schema automatically from data. It requires one extra pass over the data.",
          "default": false
        },
        "nullValue": {
          "type": "string",
          "description": "Sets the string representation of a null value.",
          "default": ""
        },
        "dateFormat": {
          "type": "string",
          "description": "Sets the string that indicates a date format. `rfc3339` is the only required format; other format strings are implementation-specific.",
          "default": "rfc3339"
        },
        "timestampFormat": {
          "type": "string",
          "description": "Sets the string that indicates a timestamp format. `rfc3339` is the only required format; other format strings are implementation-specific.",
          "default": "rfc3339"
        }
      }
    },
    "Json": {
      "description": "Reader for JSON files that contain an array of objects within them.",
      "type": "object",
      "additionalProperties": false,
      "required": [],
      "properties": {
        "subPath": {
          "type": "string",
          "description": "Path in the form of `a.b.c` to a sub-element of the root JSON object that is an array of objects. If not specified, the root element is assumed to be an array."
        },
        "schema": {
          "type": "array",
          "items": {
            "type": "string"
          },
          "description": "A DDL-formatted schema. Schema can be used to coerce values into more appropriate data types."
        },
        "dateFormat": {
          "type": "string",
          "description": "Sets the string that indicates a date format. `rfc3339` is the only required format; other format strings are implementation-specific.",
          "default": "rfc3339"
        },
        "encoding": {
          "type": "string",
          "description": "Allows forcibly setting one of the standard basic or extended encodings.",
          "default": "utf8"
        },
        "timestampFormat": {
          "type": "string",
          "description": "Sets the string that indicates a timestamp format. `rfc3339` is the only required format; other format strings are implementation-specific.",
          "default": "rfc3339"
        }
      }
    },
    "NdJson": {
      "description": "Reader for files containing multiple newline-delimited JSON objects with the same schema.",
      "type": "object",
      "additionalProperties": false,
      "required": [],
      "properties": {
        "schema": {
          "type": "array",
          "items": {
            "type": "string"
          },
          "description": "A DDL-formatted schema. Schema can be used to coerce values into more appropriate data types."
        },
        "dateFormat": {
          "type": "string",
          "description": "Sets the string that indicates a date format. `rfc3339` is the only required format; other format strings are implementation-specific.",
          "default": "rfc3339"
        },
        "encoding": {
          "type": "string",
          "description": "Allows forcibly setting one of the standard basic or extended encodings.",
          "default": "utf8"
        },
        "timestampFormat": {
          "type": "string",
          "description": "Sets the string that indicates a timestamp format. `rfc3339` is the only required format; other format strings are implementation-specific.",
          "default": "rfc3339"
        }
      }
    },
    "GeoJson": {
      "description": "Reader for GeoJSON files. It expects one `FeatureCollection` object in the root and will create a record for each `Feature` inside it, extracting the properties into individual columns and leaving the feature geometry in its own column.",
      "type": "object",
      "additionalProperties": false,
      "required": [],
      "properties": {
        "schema": {
          "type": "array",
          "items": {
            "type": "string"
          },
          "description": "A DDL-formatted schema. Schema can be used to coerce values into more appropriate data types."
        }
      }
    },
    "NdGeoJson": {
      "description": "Reader for newline-delimited GeoJSON files. It is similar to the `GeoJson` format, but instead of a `FeatureCollection` object in the root it expects every individual feature object to appear on its own line.",
      "type": "object",
      "additionalProperties": false,
      "required": [],
      "properties": {
        "schema": {
          "type": "array",
          "items": {
            "type": "string"
          },
          "description": "A DDL-formatted schema. Schema can be used to coerce values into more appropriate data types."
        }
      }
    },
    "EsriShapefile": {
      "description": "Reader for ESRI Shapefile format.",
      "type": "object",
      "additionalProperties": false,
      "required": [],
      "properties": {
        "schema": {
          "type": "array",
          "items": {
            "type": "string"
          },
          "description": "A DDL-formatted schema. Schema can be used to coerce values into more appropriate data types."
        },
        "subPath": {
          "type": "string",
          "description": "If the ZIP archive contains multiple shapefiles use this field to specify a sub-path to the desired `.shp` file. Can contain glob patterns to act as a filter."
        }
      }
    },
    "Parquet": {
      "description": "Reader for Apache Parquet format.",
      "type": "object",
      "additionalProperties": false,
      "required": [],
      "properties": {
        "schema": {
          "type": "array",
          "items": {
            "type": "string"
          },
          "description": "A DDL-formatted schema. Schema can be used to coerce values into more appropriate data types."
        }
      }
    }
  },
  "oneOf": [
    {
      "$ref": "#/$defs/Csv"
    },
    {
      "$ref": "#/$defs/GeoJson"
    },
    {
      "$ref": "#/$defs/EsriShapefile"
    },
    {
      "$ref": "#/$defs/Parquet"
    },
    {
      "$ref": "#/$defs/Json"
    },
    {
      "$ref": "#/$defs/NdJson"
    },
    {
      "$ref": "#/$defs/NdGeoJson"
    }
  ]
}