In [70]:
from pyspark.sql.functions import col, explode, map_from_arrays, struct, arrays_zip
from pyspark.sql import SparkSession
# Create the session used by every cell below, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Spark SQL Example")
    .getOrCreate()
)

In [9]:
# Load the transactions CSV: the first row supplies the column names and
# Spark is asked to infer the column types from the data.
amazon_transactions_df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .options(sep=",")
    .load("data/input/amazon_transactions.csv")
)

# Preview the first five rows.
amazon_transactions_df.show(5)

+---+-------+-------+----------+-------+
| id|user_id|   item|created_at|revenue|
+---+-------+-------+----------+-------+
|  1|    109|   milk|2020-03-03|    123|
|  2|    139|biscuit|2020-03-18|    421|
|  3|    120|   milk|2020-03-18|    176|
|  4|    108| banana|2020-03-18|    862|
|  5|    130|   milk|2020-03-28|    333|
+---+-------+-------+----------+-------+
only showing top 5 rows


In [10]:
# Same CSV load as before, but with every reader option passed through a
# single options(...) call instead of chained option(...) calls.
amazon_transactions_df1 = (
    spark.read.format("csv")
    .options(header="true", inferSchema="true", sep=",")
    .load("data/input/amazon_transactions.csv")
)

# Preview the first five rows.
amazon_transactions_df1.show(5)

+---+-------+-------+----------+-------+
| id|user_id|   item|created_at|revenue|
+---+-------+-------+----------+-------+
|  1|    109|   milk|2020-03-03|    123|
|  2|    139|biscuit|2020-03-18|    421|
|  3|    120|   milk|2020-03-18|    176|
|  4|    108| banana|2020-03-18|    862|
|  5|    130|   milk|2020-03-28|    333|
+---+-------+-------+----------+-------+
only showing top 5 rows


In [16]:
# Read JSON.
# The multiline option lets Spark parse a JSON document that spans several
# lines rather than expecting one record per line.
sample_json_df = (
    spark.read.format("json")
    .options(multiline="true")
    .load("data/input/sample1.json")
)

sample_json_df.show(5)

+-----+-----+-----+
|color|fruit| size|
+-----+-----+-----+
|  Red|Apple|Large|
+-----+-----+-----+



In [22]:
# Read JSON with the reader's default settings (no multiline option).
# NOTE(review): presumably sample2.json holds one JSON record per line — confirm.
# The chain is parenthesized so no trailing backslash is needed; the original
# ended with a dangling "\" that only parsed because a blank line followed it.
sample_json_df2 = (
    spark.read.format("json")
    .load("data/input/sample2.json")
)

sample_json_df2.show(5)

+-----+-----+-----+
|color|fruit| size|
+-----+-----+-----+
|  Red|Apple|Large|
+-----+-----+-----+



In [26]:
# Read a multi-line JSON file of colour names and their hex values.
# The chain is parenthesized so no trailing backslash is needed; the original
# ended with a dangling "\" that would have spliced the show() call into the
# assignment if the blank line after it were ever removed.
sample_json_color_df = (
    spark.read.format("json")
    .options(multiline="true")
    .load("data/input/sample3.json")
)

sample_json_color_df.show(5)

+-------+-----+
|  color|value|
+-------+-----+
|    red| #f00|
|  green| #0f0|
|   blue| #00f|
|   cyan| #0ff|
|magenta| #f0f|
+-------+-----+
only showing top 5 rows


In [28]:
# Read a multi-line JSON file; the result has a single "people" column
# holding an array of person records.
json_df4 = (
    spark.read.format("json")
    .options(multiline="true")
    .load("data/input/sample4.json")
)

json_df4.show(5)

+--------------------+
|              people|
+--------------------+
|[{28, Joe, male, ...|
+--------------------+



In [39]:
# Selecting a field through the "people" array yields one array per field
# (all ages in one array, all last names in another).
(
    json_df4
    .select("people.age", "people.lastName")
    .show(5)
)

+------------+--------------------+
|         age|            lastName|
+------------+--------------------+
|[28, 32, 24]|[Jackson, Smith, ...|
+------------+--------------------+



In [34]:
# Explode the ages and the first names in the same select.
# Each explode is independent, so the result pairs every age with every
# first name (a cross product) rather than matching them by position.
# NOTE(review): some Spark versions reject more than one generator in a
# single select — confirm against the Spark version this notebook targets.
(
    json_df4
    .select(
        explode("people.age").alias("age"),
        explode("people.firstName").alias("firstName"),
    )
    .show(5)
)

+---+---------+
|age|firstName|
+---+---------+
| 28|      Joe|
| 28|    James|
| 28|    Emily|
| 32|      Joe|
| 32|    James|
+---+---------+
only showing top 5 rows


In [None]:
# Build a single map column keyed by first name with the age as value, so
# that each name stays attached to the age at the same array position.
json_df4_map = json_df4.select(
    map_from_arrays(col("people.firstName"), col("people.age")).alias("firstName_age_map")
)

# Exploding a map produces one row per entry; the two aliases name the
# key column and the value column.
(
    json_df4_map
    .select(explode("firstName_age_map").alias("firstName", "age"))
    .show(5, truncate=False)
)

+---------+---+
|firstName|age|
+---------+---+
|Joe      |28 |
|James    |32 |
|Emily    |24 |
+---------+---+



In [59]:
# Same name -> age pairing as the map-column approach, done in one
# expression without an intermediate DataFrame.
(
    json_df4
    .select(
        explode(
            map_from_arrays(col("people.firstName"), col("people.age"))
        ).alias("firstName", "age")
    )
    .show(5, truncate=False)
)

+---------+---+
|firstName|age|
+---------+---+
|Joe      |28 |
|James    |32 |
|Emily    |24 |
+---------+---+



In [61]:
# Show the per-field arrays side by side, untruncated.
(
    json_df4
    .select(
        col("people.firstName"),
        col("people.lastName"),
        col("people.age"),
    )
    .show(5, truncate=False)
)

+-------------------+-----------------------+------------+
|firstName          |lastName               |age         |
+-------------------+-----------------------+------------+
|[Joe, James, Emily]|[Jackson, Smith, Jones]|[28, 32, 24]|
+-------------------+-----------------------+------------+



In [67]:
# Add a "name" column mapping each first name to the last name at the same
# array position. Despite the variable name, the column is a map (built with
# map_from_arrays), not a struct.
json_df4_struct = json_df4.withColumn(
    "name",
    map_from_arrays(col("people.firstName"), col("people.lastName")),
)

# One row per map entry: key -> firstName, value -> lastName.
(
    json_df4_struct
    .select(explode("name").alias("firstName", "lastName"))
    .show(5, truncate=False)
)

+---------+--------+
|firstName|lastName|
+---------+--------+
|Joe      |Jackson |
|James    |Smith   |
|Emily    |Jones   |
+---------+--------+



In [73]:
# Display the raw "people" array without truncating the cell contents.
json_df4.show(n=5, truncate=False)

+-------------------------------------------------------------------------------------------------------------------+
|people                                                                                                             |
+-------------------------------------------------------------------------------------------------------------------+
|[{28, Joe, male, Jackson, 7349282382}, {32, James, male, Smith, 5678568567}, {24, Emily, female, Jones, 456754675}]|
+-------------------------------------------------------------------------------------------------------------------+



In [79]:
# First, the array-valued view: one array per field, all in a single row.
(
    json_df4
    .select(
        col("people.firstName"),
        col("people.age"),
        col("people.lastName"),
        col("people.number"),
    )
    .show(5, False)
)

# Zip the four arrays element-wise into an array of structs, explode it to
# one row per person, then pull the struct fields out as top-level columns.
# The DataFrame is assigned BEFORE calling show(): show() returns None, so
# ending the assignment chain with .show(...) (as the original did) left
# json_df4_zipped bound to None instead of a DataFrame.
json_df4_zipped = (
    json_df4
    .withColumn(
        "zipped",
        arrays_zip(
            col("people.firstName"),
            col("people.age"),
            col("people.lastName"),
            col("people.number"),
        ),
    )
    .select(explode("zipped").alias("z"))
    .select(col("z.firstName"), col("z.age"), col("z.lastName"), col("z.number"))
)

json_df4_zipped.show(5, False)

+-------------------+------------+-----------------------+-----------------------------------+
|firstName          |age         |lastName               |number                             |
+-------------------+------------+-----------------------+-----------------------------------+
|[Joe, James, Emily]|[28, 32, 24]|[Jackson, Smith, Jones]|[7349282382, 5678568567, 456754675]|
+-------------------+------------+-----------------------+-----------------------------------+

+---------+---+--------+----------+
|firstName|age|lastName|number    |
+---------+---+--------+----------+
|Joe      |28 |Jackson |7349282382|
|James    |32 |Smith   |5678568567|
|Emily    |24 |Jones   |456754675 |
+---------+---+--------+----------+



In [None]:
# Same zip-and-explode flattening, written as a single select chain with no
# intermediate "zipped" column.
(
    json_df4
    .select(
        explode(
            arrays_zip(
                col("people.firstName"),
                col("people.age"),
                col("people.lastName"),
                col("people.number"),
            )
        ).alias("z")
    )
    .select(col("z.firstName"), col("z.age"), col("z.lastName"), col("z.number"))
    .show(5, truncate=False)
)

+---------+---+--------+----------+
|firstName|age|lastName|number    |
+---------+---+--------+----------+
|Joe      |28 |Jackson |7349282382|
|James    |32 |Smith   |5678568567|
|Emily    |24 |Jones   |456754675 |
+---------+---+--------+----------+



In [81]:
# Read a second multi-line JSON file with the same "people" array layout.
json_df4_1 = (
    spark.read.format("json")
    .options(multiline="true")
    .load("data/input/sample4_1.json")
)

json_df4_1.show(5, truncate=False)

+--------------------------------------------------------------------------------------------------------------+
|people                                                                                                        |
+--------------------------------------------------------------------------------------------------------------+
|[{28, Joe, male, Jackson, 7349282382}, {32, James, male, Smith, 5678568567}, {24, Emily, female, Jones, NULL}]|
+--------------------------------------------------------------------------------------------------------------+



In [93]:
# Promote each field of the "people" array to its own top-level array
# column and leave the original "people" column out of the result.
(
    json_df4_1
    .select(
        col("people.number").alias("number"),
        col("people.firstName").alias("firstName"),
        col("people.lastName").alias("lastName"),
        col("people.gender").alias("gender"),
        col("people.age").alias("age"),
    )
    .show(5, truncate=False)
)

+------------------------------+-------------------+-----------------------+--------------------+--------------+
|number                        |firstName          |lastName               |gender              |age           |
+------------------------------+-------------------+-----------------------+--------------------+--------------+
|[7349282382, 5678568567, NULL]|[Joe, James, Emily]|[Jackson, Smith, Jones]|[NULL, male, female]|[28, NULL, 24]|
+------------------------------+-------------------+-----------------------+--------------------+--------------+



In [96]:
# Zip all five field arrays element-wise and explode to one row per person;
# missing values in the source stay as NULL in the matching row.
(
    json_df4_1
    .select(
        explode(
            arrays_zip(
                col("people.firstName"),
                col("people.age"),
                col("people.lastName"),
                col("people.number"),
                col("people.gender"),
            )
        ).alias("z")
    )
    .select(
        col("z.firstName"),
        col("z.age"),
        col("z.lastName"),
        col("z.number"),
        col("z.gender"),
    )
    .show(5, truncate=False)
)

+---------+----+--------+----------+------+
|firstName|age |lastName|number    |gender|
+---------+----+--------+----------+------+
|Joe      |28  |Jackson |7349282382|NULL  |
|James    |NULL|Smith   |5678568567|male  |
|Emily    |24  |Jones   |NULL      |female|
+---------+----+--------+----------+------+

