In [0]:
help(spark.read)

Help on DataFrameReader in module pyspark.sql.readwriter object:

class DataFrameReader(OptionUtils)
 |  DataFrameReader(spark: 'SparkSession')
 |  
 |  Interface used to load a :class:`DataFrame` from external storage systems
 |  (e.g. file systems, key-value stores, etc). Use :attr:`SparkSession.read`
 |  to access this.
 |  
 |  .. versionadded:: 1.4.0
 |  
 |  .. versionchanged:: 3.4.0
 |      Support Spark Connect.
 |  
 |  Method resolution order:
 |      DataFrameReader
 |      OptionUtils
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, spark: 'SparkSession')
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  csv(self, path: Union[str, List[str]], schema: Union[pyspark.sql.types.StructType, str, NoneType] = None, sep: Optional[str] = None, encoding: Optional[str] = None, quote: Optional[str] = None, escape: Optional[str] = None, comment: Optional[str] = None, header: Union[bool, str, NoneType] = None, inferSchema: Union

In [0]:
df = spark.read.json(path='dbfs:/FileStore/data/emps.json')
df.printSchema()
df.show()

root
 |-- gender: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: string (nullable = true)

+------+---+--------+------+
|gender| id|    name|salary|
+------+---+--------+------+
|  male|  1|John Doe|  2000|
|  male|  2|    Oggy| 20000|
+------+---+--------+------+



In [0]:
df1 = spark.read.json(path='dbfs:/FileStore/data/emps1.json')
df1.printSchema()
df1.show()

root
 |-- gender: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: long (nullable = true)

+------+---+--------+------+
|gender| id|    name|salary|
+------+---+--------+------+
|  male|  1|John Doe|  2000|
|  male|  2|    Oggy| 20000|
+------+---+--------+------+



In [0]:
df2 = spark.read.json(path='dbfs:/FileStore/data/emps2.json')
df2.printSchema()
df2.show()

root
 |-- _corrupt_record: string (nullable = true)



[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-2916291915317104>:3[0m
[1;32m      1[0m df2 [38;5;241m=[39m spark[38;5;241m.[39mread[38;5;241m.[39mjson(path[38;5;241m=[39m[38;5;124m'[39m[38;5;124mdbfs:/FileStore/data/emps2.json[39m[38;5;124m'[39m)
[1;32m      2[0m df2[38;5;241m.[39mprintSchema()
[0;32m----> 3[0m df2[38;5;241m.[39mshow()

File [0;32m/databricks/spark/python/pyspark/instrumentation_utils.py:48[0m, in [0;36m_wrap_function.<locals>.wrapper[0;34m(*args, **kwargs)[0m
[1;32m     46[0m start [38;5;241m=[39m time[38;5;241m.[39mperf_counter()
[1;32m     47[0m [38;5;28;01mtry[39;00m:
[0;32m---> 48[0m     res [38;5;241m=[39m [43mfunc[49m[43m([49m[38;5;241;43m*[39;49m[43margs[49m[43m,[49m[43m [49m[38;5;241;43m*[39;49m[38;5;241;43m*[39;49m[43mkwargs[49m[43m)[49m
[1;32m     4

## Read Multiline JSON File ⬇️

In [0]:
df2 = spark.read.json(path='dbfs:/FileStore/data/emps2.json',multiLine=True)
df2.printSchema()
df2.show()

root
 |-- gender: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: long (nullable = true)

+------+---+-----------+------+
|gender| id|       name|salary|
+------+---+-----------+------+
|  male|  1|Katana bhai|200000|
|  male|  2|       Oggy|  6000|
|  male|  3|David Putra|  8000|
|  male|  4|   Nobidora| 12000|
+------+---+-----------+------+



## Read Multiple JSON Files ⬇️

In [0]:
df3 = spark.read.json(['dbfs:/FileStore/data/emps.json','dbfs:/FileStore/data/emps1.json'])
df3.show()

+------+---+--------+------+
|gender| id|    name|salary|
+------+---+--------+------+
|  male|  1|John Doe|  2000|
|  male|  2|    Oggy| 20000|
|  male|  1|John Doe|  2000|
|  male|  2|    Oggy| 20000|
+------+---+--------+------+



In [0]:
df3 = spark.read.json(path=['dbfs:/FileStore/data/emps1.json','dbfs:/FileStore/data/emps3.json'])
df3.printSchema()
df3.show()

root
 |-- gender: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: long (nullable = true)

+------+---+--------+------+
|gender| id|    name|salary|
+------+---+--------+------+
|  male|  1|John Doe|  2000|
|  male|  2|    Oggy| 20000|
|  male|  3|   guddu| 45000|
|  male|  4|    golu| 50000|
+------+---+--------+------+



In [0]:
from pyspark.sql.types import *

**Enforcing Schema on top of dataframe**

In [0]:

schema = StructType().add(field="id",data_type=IntegerType())\
            .add(field="name",data_type=StringType())\
            .add(field="gender",data_type=StringType())\
            .add(field="salary",data_type=IntegerType())

df3 = spark.read.json(path=['dbfs:/FileStore/data/emps1.json','dbfs:/FileStore/data/emps3.json'], schema=schema)
df3.printSchema()
df3.show()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

+---+--------+------+------+
| id|    name|gender|salary|
+---+--------+------+------+
|  1|John Doe|  male|  2000|
|  2|    Oggy|  male| 20000|
|  3|   guddu|  male| 45000|
|  4|    golu|  male| 50000|
+---+--------+------+------+

