### ![Spark Logo Tiny](https://files.training.databricks.com/images/105/logo_spark_tiny.png) Trabajar con archivos JSON

In [0]:
dbutils.fs.mkdirs("/FileStore/tables/json/")

Out[52]: True

#### Leer un archivo JSON

##### Ejemplo 1

In [0]:
json_df = spark.read.option('inferschema','true').json('/FileStore/tables/json/constructor.json')

json_df.show(5, truncate=False)

+-------------+--------------+----------+-----------+------------------------------------------------------------+
|constructorId|constructorRef|name      |nationality|url                                                         |
+-------------+--------------+----------+-----------+------------------------------------------------------------+
|1            |mclaren       |McLaren   |British    |http://en.wikipedia.org/wiki/McLaren                        |
|2            |bmw_sauber    |BMW Sauber|German     |http://en.wikipedia.org/wiki/BMW_Sauber                     |
|3            |williams      |Williams  |British    |http://en.wikipedia.org/wiki/Williams_Grand_Prix_Engineering|
|4            |renault       |Renault   |French     |http://en.wikipedia.org/wiki/Renault_in_Formula_One         |
|5            |toro_rosso    |Toro Rosso|Italian    |http://en.wikipedia.org/wiki/Scuderia_Toro_Rosso            |
+-------------+--------------+----------+-----------+---------------------------

Definiendo un schema obtenemos el mismo resultado anterior

In [0]:
constructor_schema = 'constructorId INT, constructorRef STRING, name STRING, nationality STRING, url STRING'

json_df = spark.read.schema(constructor_schema).json('/FileStore/tables/json/constructor.json')

json_df.show(5, truncate=False)

+-------------+--------------+----------+-----------+------------------------------------------------------------+
|constructorId|constructorRef|name      |nationality|url                                                         |
+-------------+--------------+----------+-----------+------------------------------------------------------------+
|1            |mclaren       |McLaren   |British    |http://en.wikipedia.org/wiki/McLaren                        |
|2            |bmw_sauber    |BMW Sauber|German     |http://en.wikipedia.org/wiki/BMW_Sauber                     |
|3            |williams      |Williams  |British    |http://en.wikipedia.org/wiki/Williams_Grand_Prix_Engineering|
|4            |renault       |Renault   |French     |http://en.wikipedia.org/wiki/Renault_in_Formula_One         |
|5            |toro_rosso    |Toro Rosso|Italian    |http://en.wikipedia.org/wiki/Scuderia_Toro_Rosso            |
+-------------+--------------+----------+-----------+---------------------------

##### Ejemplo 2

In [0]:
json_df = spark.read.option('inferschema','true').json('/FileStore/tables/json/drivers.json')

json_df.show(5, truncate=False)

+----+----------+--------+----------+--------------------+-----------+------+----------------------------------------------+
|code|dob       |driverId|driverRef |name                |nationality|number|url                                           |
+----+----------+--------+----------+--------------------+-----------+------+----------------------------------------------+
|HAM |1985-01-07|1       |hamilton  |{Lewis, Hamilton}   |British    |44    |http://en.wikipedia.org/wiki/Lewis_Hamilton   |
|HEI |1977-05-10|2       |heidfeld  |{Nick, Heidfeld}    |German     |\N    |http://en.wikipedia.org/wiki/Nick_Heidfeld    |
|ROS |1985-06-27|3       |rosberg   |{Nico, Rosberg}     |German     |6     |http://en.wikipedia.org/wiki/Nico_Rosberg     |
|ALO |1981-07-29|4       |alonso    |{Fernando, Alonso}  |Spanish    |14    |http://en.wikipedia.org/wiki/Fernando_Alonso  |
|KOV |1981-10-19|5       |kovalainen|{Heikki, Kovalainen}|Finnish    |\N    |http://en.wikipedia.org/wiki/Heikki_Kovalainen|


In [0]:
# Vamos a crear el schema para leer este archivo json
from pyspark.sql.types import *
from pyspark.sql.functions import *

name_schema = StructType(fields=[StructField("forename", StringType(), True),
                                 StructField("surname", StringType(), True),
                                 ])

drivers_schema = StructType(fields=[StructField("driverId", IntegerType(), False),
                                    StructField("driverRef", StringType(), True),
                                    StructField("number", IntegerType(), True),
                                    StructField("code", StringType(), True),
                                    StructField("name", name_schema),
                                    StructField("dob", DateType(), True),
                                    StructField("nationality", StringType(), True),
                                    StructField("url", StringType(), True)
                                   ])

json_df = spark.read.format('json').schema(drivers_schema).json('/FileStore/tables/json/drivers.json')
json_df.printSchema()
json_df.show(5, truncate=False)

root
 |-- driverId: integer (nullable = true)
 |-- driverRef: string (nullable = true)
 |-- number: integer (nullable = true)
 |-- code: string (nullable = true)
 |-- name: struct (nullable = true)
 |    |-- forename: string (nullable = true)
 |    |-- surname: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- nationality: string (nullable = true)
 |-- url: string (nullable = true)

+--------+----------+------+----+--------------------+----------+-----------+----------------------------------------------+
|driverId|driverRef |number|code|name                |dob       |nationality|url                                           |
+--------+----------+------+----+--------------------+----------+-----------+----------------------------------------------+
|1       |hamilton  |44    |HAM |{Lewis, Hamilton}   |1985-01-07|British    |http://en.wikipedia.org/wiki/Lewis_Hamilton   |
|2       |heidfeld  |null  |HEI |{Nick, Heidfeld}    |1977-05-10|German     |http://en.wikipedia.org/

In [0]:
json_df_modif = json_df.withColumnRenamed('driverId','driver_Id'). \
                        withColumnRenamed('driverRef','driver_Ref'). \
                        withColumn('name',concat(col('name.forename'), lit(' '), col('name.surname')))

json_df_modif.printSchema()
json_df_modif.show(5, truncate=False)

root
 |-- driver_Id: integer (nullable = true)
 |-- driver_Ref: string (nullable = true)
 |-- number: integer (nullable = true)
 |-- code: string (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- nationality: string (nullable = true)
 |-- url: string (nullable = true)

+---------+----------+------+----+-----------------+----------+-----------+----------------------------------------------+
|driver_Id|driver_Ref|number|code|name             |dob       |nationality|url                                           |
+---------+----------+------+----+-----------------+----------+-----------+----------------------------------------------+
|1        |hamilton  |44    |HAM |Lewis Hamilton   |1985-01-07|British    |http://en.wikipedia.org/wiki/Lewis_Hamilton   |
|2        |heidfeld  |null  |HEI |Nick Heidfeld    |1977-05-10|German     |http://en.wikipedia.org/wiki/Nick_Heidfeld    |
|3        |rosberg   |6     |ROS |Nico Rosberg     |1985-06-27|German    

##### Ejemplo 3

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

pit_schema = StructType(fields=[StructField("raceId", IntegerType(), False),
                                StructField("driverId", IntegerType(), True),
                                StructField("stop", StringType(), True),
                                StructField("lap", IntegerType(), True),
                                StructField("time", StringType(), True),
                                StructField("duration", StringType(), True),
                                StructField("miliseconds", IntegerType(), True)
                                ])

json_df = spark.read.format('json').schema(pit_schema).json('/FileStore/tables/json/pit_stops.json')
json_df.printSchema()
json_df.show(5, truncate=False)

root
 |-- raceId: integer (nullable = true)
 |-- driverId: integer (nullable = true)
 |-- stop: string (nullable = true)
 |-- lap: integer (nullable = true)
 |-- time: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- miliseconds: integer (nullable = true)

+------+--------+----+----+----+--------+-----------+
|raceId|driverId|stop|lap |time|duration|miliseconds|
+------+--------+----+----+----+--------+-----------+
|null  |null    |null|null|null|null    |null       |
|null  |null    |null|null|null|null    |null       |
|null  |null    |null|null|null|null    |null       |
|null  |null    |null|null|null|null    |null       |
|null  |null    |null|null|null|null    |null       |
+------+--------+----+----+----+--------+-----------+
only showing top 5 rows



In [0]:
from pyspark.sql.types import IntegerType, StringType

pit_schema = StructType(fields=[StructField("raceId", IntegerType(), True),
                                StructField("driverId", IntegerType(), True),
                                StructField("stop", StringType(), True),
                                StructField("lap", IntegerType(), True),
                                StructField("time", StringType(), True),
                                StructField("duration", StringType(), True),
                                StructField("miliseconds", IntegerType(), True)
                                ])

json_df = spark.read.schema(pit_schema).option('multiline',True).json('/FileStore/tables/json/pit_stops.json')
json_df.printSchema()
json_df.show(5, truncate=False)

root
 |-- raceId: integer (nullable = true)
 |-- driverId: integer (nullable = true)
 |-- stop: string (nullable = true)
 |-- lap: integer (nullable = true)
 |-- time: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- miliseconds: integer (nullable = true)

+------+--------+----+---+--------+--------+-----------+
|raceId|driverId|stop|lap|time    |duration|miliseconds|
+------+--------+----+---+--------+--------+-----------+
|841   |153     |1   |1  |17:05:23|26.898  |null       |
|841   |30      |1   |1  |17:05:52|25.021  |null       |
|841   |17      |1   |11 |17:20:48|23.426  |null       |
|841   |4       |1   |12 |17:22:34|23.251  |null       |
|841   |13      |1   |13 |17:24:10|23.842  |null       |
+------+--------+----+---+--------+--------+-----------+
only showing top 5 rows



##### Ejemplo 4

In [0]:
from pyspark.sql.functions import *

json_df = spark.read.option('multiline',True).json('/FileStore/tables/json/perros.json')
json_df.printSchema()
display(json_df)

root
 |-- persons: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- age: long (nullable = true)
 |    |    |-- dogs: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- name: string (nullable = true)



persons
"List(List(30, List(Fido, Fluffy), Keith), List(46, List(Spot), Donna))"


In [0]:
json_df_explode = json_df.select(explode(json_df.persons).alias('persons'))
json_df_explode.display()

persons
"List(30, List(Fido, Fluffy), Keith)"
"List(46, List(Spot), Donna)"


In [0]:
owners = json_df_explode.select(col('persons.name').alias('owner'), col('persons.age').alias('age'),                                                                                           explode('persons.dogs').alias('dog'))

owners.printSchema()
owners.display()

root
 |-- owner: string (nullable = true)
 |-- age: long (nullable = true)
 |-- dog: string (nullable = true)



owner,age,dog
Keith,30,Fido
Keith,30,Fluffy
Donna,46,Spot


##### Ejemplo 5

In [0]:
json_df = spark.read.option('inferschema','true').json('/FileStore/tables/json/accounting.json')
json_df.printSchema()
display(json_df)

root
 |-- accounting: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- age: long (nullable = true)
 |    |    |-- first_name: string (nullable = true)
 |    |    |-- last_name: string (nullable = true)



accounting
"List(List(23, John, Doe), List(32, Mary, Smith))"


In [0]:
from pyspark.sql.functions import *

json_df = spark.read.option('multiline',True).json('/FileStore/tables/json/accounting.json')
json_df.printSchema()
display(json_df)

root
 |-- accounting: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- age: long (nullable = true)
 |    |    |-- first_name: string (nullable = true)
 |    |    |-- last_name: string (nullable = true)



accounting
"List(List(23, John, Doe), List(32, Mary, Smith))"


In [0]:
json_df_modif = json_df.select(explode(json_df.accounting).alias('account')). \
                        select('account.age','account.first_name','account.last_name')

display(json_df_modif)

age,first_name,last_name
23,John,Doe
32,Mary,Smith


#### Escribir un archivo JSON

##### Forma 1

In [0]:
json_df.coalesce(1).write.json('/FileStore/tables/json/', mode='overwrite')

##### Forma 2

In [0]:
json_df.coalesce(1).write.format('json').save('/FileStore/tables/json/', mode='overwrite')

##### Forma 3

In [0]:
json_df.write.json('/FileStore/tables/json/', mode='overwrite')

##### Forma 4

In [0]:
json_df.coalesce(1).write.format('json').save('/FileStore/tables/json/', mode='overwrite', compression='gzip')