In [0]:
"""
%fs
ls /FileStore/tables/

dbfs:/FileStore/tables/line_delimited_json.json
dbfs:/FileStore/tables/single_file_with_extra_field.json
dbfs:/FileStore/tables/corrupted_json.json
dbfs:/FileStore/tables/multiline_correct.json
dbfs:/FileStore/tables/multiline_incorrect.json
"""



"""
{"name":"Manish","age":20,"salary":20000},
{"name":"Nikita","age":25,"salary":21000},
{"name":"Pritam","age":16,"salary":22000},
{"name":"Prantosh","age":35,"salary":25000},
{"name":"Vikash","age":67,"salary":40000}
"""

line_delimited_df = spark.read.format("json")\
    .option("inferschema", "true")\
    .option("mode", "PERMISSIVE")\
    .load("dbfs:/FileStore/tables/line_delimited_json.json")

line_delimited_df.show()
"""
+---+--------+------+
|age|    name|salary|
+---+--------+------+
| 20|  Manish| 20000|
| 25|  Nikita| 21000|
| 16|  Pritam| 22000|
| 35|Prantosh| 25000|
| 67|  Vikash| 40000|
+---+--------+------+
"""


+---+--------+------+
|age|    name|salary|
+---+--------+------+
| 20|  Manish| 20000|
| 25|  Nikita| 21000|
| 16|  Pritam| 22000|
| 35|Prantosh| 25000|
| 67|  Vikash| 40000|
+---+--------+------+

Out[2]: '\n+---+--------+------+\n|age|    name|salary|\n+---+--------+------+\n| 20|  Manish| 20000|\n| 25|  Nikita| 21000|\n| 16|  Pritam| 22000|\n| 35|Prantosh| 25000|\n| 67|  Vikash| 40000|\n+---+--------+------+\n'

In [0]:
"""
{"name":"Manish","age":20,"salary":20000},
{"name":"Nikita","age":25,"salary":21000},
{"name":"Pritam","age":16,"salary":22000},
{"name":"Prantosh","age":35,"salary":25000},
{"name":"Vikash","age":67,"salary":40000,"gender":"M"}
"""

line_delimited_with_extra_col_df = spark.read.format("json")\
    .option("inferschema", "true")\
    .option("mode", "PERMISSIVE")\
    .load("dbfs:/FileStore/tables/single_file_with_extra_field.json")

line_delimited_with_extra_col_df.show()
"""
Note that there is no error, for extra field in just 1 rec
If found extra col value then populate else populate as null
+---+------+--------+------+
|age|gender|    name|salary|
+---+------+--------+------+
| 20|  null|  Manish| 20000|
| 25|  null|  Nikita| 21000|
| 16|  null|  Pritam| 22000|
| 35|  null|Prantosh| 25000|
| 67|     M|  Vikash| 40000|
+---+------+--------+------+
"""


+---+------+--------+------+
|age|gender|    name|salary|
+---+------+--------+------+
| 20|  null|  Manish| 20000|
| 25|  null|  Nikita| 21000|
| 16|  null|  Pritam| 22000|
| 35|  null|Prantosh| 25000|
| 67|     M|  Vikash| 40000|
+---+------+--------+------+



In [0]:
"""
Line delimited - {k1:v1},{k2:v2}
Line delimited can be read line by line

MultiLine - {
    k1:v1
},
{
    k2:v2
}
multiline cannot be read line by line, all content/data should go in one single object and that is read by the engine
"""

In [0]:
"""
[
{
  "name": "Manish",
  "age": 20,
  "salary": 20000
},
{
  "name": "Nikita",
  "age": 25,
  "salary": 21000
},
{
  "name": "Pritam",
  "age": 16,
  "salary": 22000
},
{
  "name": "Prantosh",
  "age": 35,
  "salary": 25000
},
{
  "name": "Vikash",
  "age": 67,
  "salary": 40000
}
]
"""
multiline_correct_df = spark.read.format("json")\
    .option("inferschema", "true")\
    .option("mode", "PERMISSIVE")\
    .option("multiline", "true")\
    .load("dbfs:/FileStore/tables/multiline_correct.json")

multiline_correct_df.show()
"""
Need to pass .option("multiline", "true") else will throw Analysis Exception
+---+--------+------+
|age|    name|salary|
+---+--------+------+
| 20|  Manish| 20000|
| 25|  Nikita| 21000|
| 16|  Pritam| 22000|
| 35|Prantosh| 25000|
| 67|  Vikash| 40000|
+---+--------+------+
"""

+---+--------+------+
|age|    name|salary|
+---+--------+------+
| 20|  Manish| 20000|
| 25|  Nikita| 21000|
| 16|  Pritam| 22000|
| 35|Prantosh| 25000|
| 67|  Vikash| 40000|
+---+--------+------+



In [0]:
"""
{
  "name": "Manish",
  "age": 20,
  "salary": 20000
},
{
  "name": "Nikita",
  "age": 25,
  "salary": 21000
},
{
  "name": "Pritam",
  "age": 16,
  "salary": 22000
},
{
  "name": "Prantosh",
  "age": 35,
  "salary": 25000
},
{
  "name": "Vikash",
  "age": 67,
  "salary": 40000
}
"""

multiline_incorrect_df = spark.read.format("json")\
    .option("inferschema", "true")\
    .option("mode", "PERMISSIVE")\
    .option("multiline", "true")\
    .load("dbfs:/FileStore/tables/multiline_incorrect.json")

multiline_incorrect_df.show()
"""
Read only 1st JSON document/dictionary. Sample json is nothing but list of documents but never enclosed inside an Array/List obj
+---+------+------+
|age|  name|salary|
+---+------+------+
| 20|Manish| 20000|
+---+------+------+
"""

+---+------+------+
|age|  name|salary|
+---+------+------+
| 20|Manish| 20000|
+---+------+------+



In [0]:
"""
{"name":"Manish","age":20,"salary":20000},
{"name":"Nikita","age":25,"salary":21000},
{"name":"Pritam","age":16,"salary":22000},
{"name":"Prantosh","age":35,"salary":25000},
{"name":"Vikash","age":67,"salary":40000
"""

corrupted_json_df = spark.read.format("json")\
    .option("inferschema", "true")\
    .option("mode", "PERMISSIVE")\
    .load("dbfs:/FileStore/tables/corrupted_json.json")

corrupted_json_df.show(truncate=False)
"""
Closing curly bracket is missing for name:Vikash
+----------------------------------------+----+--------+------+
|_corrupt_record                         |age |name    |salary|
+----------------------------------------+----+--------+------+
|null                                    |20  |Manish  |20000 |
|null                                    |25  |Nikita  |21000 |
|null                                    |16  |Pritam  |22000 |
|null                                    |35  |Prantosh|25000 |
|{"name":"Vikash","age":67,"salary":40000|null|null    |null  |
+----------------------------------------+----+--------+------+
"""



+----------------------------------------+----+--------+------+
|_corrupt_record                         |age |name    |salary|
+----------------------------------------+----+--------+------+
|null                                    |20  |Manish  |20000 |
|null                                    |25  |Nikita  |21000 |
|null                                    |16  |Pritam  |22000 |
|null                                    |35  |Prantosh|25000 |
|{"name":"Vikash","age":67,"salary":40000|null|null    |null  |
+----------------------------------------+----+--------+------+

