In [0]:
from pyspark.sql.types import *

# Read multiline json file 1
data_df = spark.read.option("multiline","false") \
      .json("/FileStore/tables/JSON/examples.json")
display(data_df) 

_id,calories_per_serving,cook_time,desc,directions,ingredients,likes,likes_count,prep_time,rating,rating_avg,servings,tags,title,type
List(5ee69d943260aab97ea0d58d),,,,,,,,,,,,,Pizza,
List(5ee69e393260aab97ea0d58e),,,,,,,,,,,,,Delete me,
List(5e5e9c470d33e9e8e3891b35),210.0,20.0,Classic Mexican tacos,"List(Brown beef, Add taco seasoning and water, mix, Bring to boil, Lower heat to simmer 5-10 minutes until desired consistency, Put meat in tacos)","List(List(ground beef (lean), List(1, lbs)), List(taco seasoning, List(2, oz)), List(corn hard tacos, List(12, oz)))","List(1, 415)",2.0,10.0,"List(4, 4, 3, 4, 2, 5, 2, 2, 4, 5)",3.5,4.0,"List(mexican, quick, easy, ground beef)",Tacos,Dinner


In [0]:
# directions columns contains Array lets try to flat it out...

#Using SQL col() function
from pyspark.sql.functions import col
data_df2= data_df["title","directions"].filter(col("title")=="Tacos")

display(data_df2)

from pyspark.sql.functions import explode
display(  data_df2.select(data_df2.title,explode(data_df2.directions))  ) # explode function used for flat it out...

data_df3=data_df2.select(data_df2.title,explode(data_df2.directions))

title,directions
Tacos,"List(Brown beef, Add taco seasoning and water, mix, Bring to boil, Lower heat to simmer 5-10 minutes until desired consistency, Put meat in tacos)"


title,col
Tacos,Brown beef
Tacos,"Add taco seasoning and water, mix"
Tacos,Bring to boil
Tacos,Lower heat to simmer 5-10 minutes until desired consistency
Tacos,Put meat in tacos


In [0]:
#Use array() function to create a new array column by merging the data from multiple columns.

from pyspark.sql.functions import array

display( data_df3.select(data_df3.title,array(data_df3.title,data_df3.col).alias("Dummy_array")) )

#display( data_df3.select(data_df3.title,array(data_df3.title,data_df3.col)[1].alias("Dummy_array")) )


title,Dummy_array
Tacos,"List(Tacos, Brown beef)"
Tacos,"List(Tacos, Add taco seasoning and water, mix)"
Tacos,"List(Tacos, Bring to boil)"
Tacos,"List(Tacos, Lower heat to simmer 5-10 minutes until desired consistency)"
Tacos,"List(Tacos, Put meat in tacos)"


In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, ArrayType

data_schema = StructType(
  [
    StructField("Title",StringType(),True),
    StructField("Desc",ArrayType(StringType()),True)
  ]
)



# using StructType grammar we need a list as input data  thats why below conversion needed
import numpy as np
x=(data_df2.collect()) # collect retrieves all elements in a DataFrame as an Array
data_df4 = spark.createDataFrame(data=x,schema=data_schema) # we use Array x as data input -- we cant use another dataframe as input
data_df4.printSchema()
#display(data_df4)

print(x)
print(data_df4)

root
 |-- Title: string (nullable = true)
 |-- Desc: array (nullable = true)
 |    |-- element: string (containsNull = true)

[Row(title='Tacos', directions=['Brown beef', 'Add taco seasoning and water, mix', 'Bring to boil', 'Lower heat to simmer 5-10 minutes until desired consistency', 'Put meat in tacos'])]
DataFrame[Title: string, Desc: array<string>]


In [0]:

a1= data_df3.select(col("Title")).toPandas()['Title'].tolist() # instead of using collect to convert , using toPandas and then using tolist to convert from dataframe to list

print(type(a1))

# using collect func. bcz of retrieving all data , it can cause out of memory error in the case of big dataset

<class 'list'>


In [0]:
display(data_df3)

title,col
Tacos,Brown beef
Tacos,"Add taco seasoning and water, mix"
Tacos,Bring to boil
Tacos,Lower heat to simmer 5-10 minutes until desired consistency
Tacos,Put meat in tacos


In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, ArrayType

data_schema = StructType(
  [
    StructField("Title",StringType(),True),
    StructField("Desc",StringType(),True)
  ]
)


y=data_df3.select(col("Title"),col('col')).toPandas().values.tolist() # instead of collect we use toPandas and values to covert dataframe into a list
data_df4 = spark.createDataFrame(data=y,schema=data_schema) # we use Array x as data input -- we cant use another dataframe as input
data_df4.printSchema()
#display(data_df4)

print(x)
display(data_df4)

root
 |-- Title: string (nullable = true)
 |-- Desc: string (nullable = true)

[Row(title='Tacos', directions=['Brown beef', 'Add taco seasoning and water, mix', 'Bring to boil', 'Lower heat to simmer 5-10 minutes until desired consistency', 'Put meat in tacos'])]


Title,Desc
Tacos,Brown beef
Tacos,"Add taco seasoning and water, mix"
Tacos,Bring to boil
Tacos,Lower heat to simmer 5-10 minutes until desired consistency
Tacos,Put meat in tacos


In [0]:
#using MapType  as input
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, StringType, MapType

schema = StructType([
    StructField('name', StringType(), True),
    StructField('properties', MapType(StringType(),StringType()),True)
])

spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
dataDictionary = [
        ('James',{'hair':'black','eye':'brown'}),
        ('Michael',{'hair':'brown','eye':None}),
        ('Robert',{'hair':'red','eye':'black'}),
        ('Washington',{'hair':'grey','eye':'grey'}),
        ('Jefferson',{'hair':'brown','eye':''})
        ]
df = spark.createDataFrame(data=dataDictionary, schema = schema)
df.printSchema()
df.show(truncate=False)


display( df.select(col('name') ,col('properties').eye.alias("eye") , col('properties').hair.alias("hair"))  )

root
 |-- name: string (nullable = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)

+----------+-----------------------------+
|name      |properties                   |
+----------+-----------------------------+
|James     |{eye -> brown, hair -> black}|
|Michael   |{eye -> null, hair -> brown} |
|Robert    |{eye -> black, hair -> red}  |
|Washington|{eye -> grey, hair -> grey}  |
|Jefferson |{eye -> , hair -> brown}     |
+----------+-----------------------------+



name,eye,hair
James,brown,black
Michael,,brown
Robert,black,red
Washington,grey,grey
Jefferson,,brown


In [0]:
#for loop implementation
print(type(df))
df2=df.select(col('name') ,col('properties').eye.alias("eye") , col('properties').hair.alias("hair"))
print(type(df2))
df3=df2.toPandas()
print(type(df3))

for index, row in df3.iterrows(): # itterrow only works with pandas.dataframe
    print("\n")
    print(index)
    print(row['name'], row['hair'])

<class 'pyspark.sql.dataframe.DataFrame'>
<class 'pyspark.sql.dataframe.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


0
James black


1
Michael brown


2
Robert red


3
Washington grey


4
Jefferson brown


In [0]:
x=(df2.collect())  # collect function can work with <class 'pyspark.sql.dataframe.DataFrame'>

#y=(df3.collect())  # if type is <class 'pandas.core.frame.DataFrame'> we cant use collect function

display(df2.sample(0.20) ) # to get 20% sample records

name,eye,hair
Jefferson,,brown


In [0]:
# working on Parquet File 

#Apache Parquet file is a columnar storage format available to any project in the Hadoop ecosystem
#While querying columnar storage, it skips the nonrelevant data very quickly, making faster query execution. 
#As a result aggregation queries consume less time compared to row-oriented databases.



data =[("James ","","Smith","36636","M",3000),
              ("Michael ","Rose","","40288","M",4000),
              ("Robert ","","Williams","42114","M",4000),
              ("Maria ","Anne","Jones","39192","F",4000),
              ("Jen","Mary","Brown","","F",-1)]
columns=["firstname","middlename","lastname","dob","gender","salary"]
dfp=spark.createDataFrame(data,columns)

# write into parq. file
dfp.write.mode('overwrite').parquet("/tmp/output/people.parquet")

# read from parq.
parDF=spark.read.parquet("/tmp/output/people.parquet")


display(parDF)

#execute as sql

parDF.createOrReplaceTempView("ParquetTable")
parkSQL = spark.sql("select * from ParquetTable where salary >= 4000 ")

#CREATE TABLE USING PARQ. FILE
spark.sql("CREATE or REPLACE TEMPORARY VIEW PERSON USING parquet OPTIONS (path \"/tmp/output/people.parquet\")")
spark.sql("SELECT * FROM PERSON").show()


#CREATE PARTITIONED PARQ. file
dfp.write.partitionBy("gender","salary").mode("overwrite").parquet("/tmp/output/people2.parquet")


firstname,middlename,lastname,dob,gender,salary
Robert,,Williams,42114.0,M,4000
Maria,Anne,Jones,39192.0,F,4000
Michael,Rose,,40288.0,M,4000
James,,Smith,36636.0,M,3000
Jen,Mary,Brown,,F,-1


+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|  dob|gender|salary|
+---------+----------+--------+-----+------+------+
|  Robert |          |Williams|42114|     M|  4000|
|   Maria |      Anne|   Jones|39192|     F|  4000|
| Michael |      Rose|        |40288|     M|  4000|
|   James |          |   Smith|36636|     M|  3000|
|      Jen|      Mary|   Brown|     |     F|    -1|
+---------+----------+--------+-----+------+------+



In [0]:
#to_json() function is used to convert DataFrame columns MapType or Struct type to JSON string,,


#display(df)

print( df.printSchema()  )

from pyspark.sql.functions import to_json,col

df.withColumn("properties",to_json(col("properties"))).show(truncate=False)

root
 |-- name: string (nullable = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)

None
+----------+------------------------------+
|name      |properties                    |
+----------+------------------------------+
|James     |{"eye":"brown","hair":"black"}|
|Michael   |{"eye":null,"hair":"brown"}   |
|Robert    |{"eye":"black","hair":"red"}  |
|Washington|{"eye":"grey","hair":"grey"}  |
|Jefferson |{"eye":"","hair":"brown"}     |
+----------+------------------------------+



In [0]:
#overlay() Function
#Replace column value with a string value from another column.

from pyspark.sql.functions import overlay
df = spark.createDataFrame([("ABCDE_XYZ", "FGH")], ("col1", "col2"))
df.select(overlay("col1", "col2",0).alias("overlayed")).show()
df.select(overlay("col1", "col2",1).alias("overlayed")).show()
df.select(overlay("col1", "col2",6).alias("overlayed")).show()


x=df.select(overlay("col1", "col2",6).alias("overlayed")).toPandas().values.tolist() 

print(type(x))

+----------+
| overlayed|
+----------+
|FGHCDE_XYZ|
+----------+

+---------+
|overlayed|
+---------+
|FGHDE_XYZ|
+---------+

+---------+
|overlayed|
+---------+
|ABCDEFGHZ|
+---------+

<class 'list'>


In [0]:
# read json example 2
from pyspark.sql.types import *

# Read multiline json file 1
data_df = spark.read.option("multiline","true").json("/FileStore/tables/JSON/Ex1.json")
display(data_df) 

print(data_df.printSchema() )


batters,id,name,ppu,topping,type
"List(List(List(1001, Regular), List(1002, Chocolate), List(1003, Blueberry), List(1004, Devil's Food)))",1,Cake,0.55,"List(List(5001, None), List(5002, Glazed), List(5005, Sugar), List(5007, Powdered Sugar), List(5006, Chocolate with Sprinkles), List(5003, Chocolate), List(5004, Maple))",donut
"List(List(List(1001, Regular)))",2,Raised,0.55,"List(List(5001, None), List(5002, Glazed), List(5005, Sugar), List(5003, Chocolate), List(5004, Maple))",donut
"List(List(List(1001, Regular), List(1002, Chocolate)))",3,Old Fashioned,0.55,"List(List(5001, None), List(5002, Glazed), List(5003, Chocolate), List(5004, Maple))",donut


root
 |-- batters: struct (nullable = true)
 |    |-- batter: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- id: string (nullable = true)
 |    |    |    |-- type: string (nullable = true)
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- ppu: double (nullable = true)
 |-- topping: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- type: string (nullable = true)
 |-- type: string (nullable = true)

None


In [0]:
#using MapType  as input
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, StringType, MapType,ArrayType
from pyspark.sql.functions import col
schema = StructType([
  StructField('id',StringType()),
  StructField('batters', StructType([
    
        StructField(
        'batter', ArrayType(
            StructType([
                StructField('id', StringType(), True),
                StructField('type',StringType() , True) 
               
                
            ])
        )
    )
    
    
  ])  )
])

dfx=data_df[["id","batters"]].toPandas().values.tolist()


df = spark.createDataFrame(data=dfx, schema = schema)
display(   df.select(col("id"),col("batters").batter.id,col("batters").batter.type    )   )


  Unable to convert the field batters. If this column is not necessary, you may consider dropping it or converting to primitive type before the conversion.
Direct cause: Unsupported type in conversion to Arrow: ArrayType(StructType(List(StructField(id,StringType,true),StructField(type,StringType,true))),true)
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.


id,batters.batter.id,batters.batter.type
1,"List(1001, 1002, 1003, 1004)","List(Regular, Chocolate, Blueberry, Devil's Food)"
2,List(1001),List(Regular)
3,"List(1001, 1002)","List(Regular, Chocolate)"


In [0]:

df0=df.select(col("id"),col("batters").batter.id.alias("batter_id"),col("batters").batter.type.alias("batter_type"))  
display(df0)
#from pyspark.sql import functions as F
from pyspark.sql.functions import arrays_zip, col, explode

'''
Array_zip() : zip() is a Array class method which Converts any arguments to arrays, then merges elements of self with corresponding elements from each argument.
'''

df2 = df0.withColumn("new", arrays_zip("batter_id", "batter_type"))\
       .withColumn("new", explode("new"))\
       .select( col("id"),col("new.batter_id").alias("BATTER_ID"), col("new.batter_type").alias("BATTER_TYPE"))
display(df2)

id,batter_id,batter_type
1,"List(1001, 1002, 1003, 1004)","List(Regular, Chocolate, Blueberry, Devil's Food)"
2,List(1001),List(Regular)
3,"List(1001, 1002)","List(Regular, Chocolate)"


id,BATTER_ID,BATTER_TYPE
1,1001,Regular
1,1002,Chocolate
1,1003,Blueberry
1,1004,Devil's Food
2,1001,Regular
3,1001,Regular
3,1002,Chocolate


In [0]:
print(data_df.printSchema() )

root
 |-- batters: struct (nullable = true)
 |    |-- batter: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- id: string (nullable = true)
 |    |    |    |-- type: string (nullable = true)
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- ppu: double (nullable = true)
 |-- topping: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- type: string (nullable = true)
 |-- type: string (nullable = true)

None


In [0]:
#using MapType  as input
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, StringType, MapType,ArrayType
from pyspark.sql.functions import col
schema = StructType([
  StructField('id',StringType()),
        StructField(
        'topping', ArrayType(
            StructType([
                StructField('id', StringType(), True),
                StructField('type',StringType() , True)      
            ])
        )
    )
  ] )
dfy=data_df[["id","topping"]].toPandas().values.tolist()

df = spark.createDataFrame(data=dfy, schema = schema)

df1=  df.select(col("id"),col("topping").id.alias("TOP_ID"),col("topping").type.alias("TOP_TYPE")    )   

display(df1)

df3 = df1.withColumn("new", arrays_zip("TOP_ID", "TOP_TYPE"))\
       .withColumn("new", explode("new"))\
       .select( col("id"),col("new.TOP_ID").alias("TOP_ID"), col("new.TOP_TYPE").alias("TOP_TYPE"))
display(df3)



  Unable to convert the field topping. If this column is not necessary, you may consider dropping it or converting to primitive type before the conversion.
Direct cause: Unsupported type in conversion to Arrow: ArrayType(StructType(List(StructField(id,StringType,true),StructField(type,StringType,true))),true)
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.


id,TOP_ID,TOP_TYPE
1,"List(5001, 5002, 5005, 5007, 5006, 5003, 5004)","List(None, Glazed, Sugar, Powdered Sugar, Chocolate with Sprinkles, Chocolate, Maple)"
2,"List(5001, 5002, 5005, 5003, 5004)","List(None, Glazed, Sugar, Chocolate, Maple)"
3,"List(5001, 5002, 5003, 5004)","List(None, Glazed, Chocolate, Maple)"


id,TOP_ID,TOP_TYPE
1,5001,
1,5002,Glazed
1,5005,Sugar
1,5007,Powdered Sugar
1,5006,Chocolate with Sprinkles
1,5003,Chocolate
1,5004,Maple
2,5001,
2,5002,Glazed
2,5005,Sugar


In [0]:
display(df2)

display(df3)

print(df3.toPandas().index)

id,BATTER_ID,BATTER_TYPE
1,1001,Regular
1,1002,Chocolate
1,1003,Blueberry
1,1004,Devil's Food
2,1001,Regular
3,1001,Regular
3,1002,Chocolate


id,TOP_ID,TOP_TYPE
1,5001,
1,5002,Glazed
1,5005,Sugar
1,5007,Powdered Sugar
1,5006,Chocolate with Sprinkles
1,5003,Chocolate
1,5004,Maple
2,5001,
2,5002,Glazed
2,5005,Sugar


RangeIndex(start=0, stop=16, step=1)


In [0]:
df4=df2.toPandas().merge(df3[["TOP_ID","TOP_TYPE"]].toPandas(), left_index=True, right_index=True ,how="inner").sort_index()

display(df4)

# convert pandas dataframe to spark dataframe
sparkDF2 = spark.createDataFrame(df4)

display(sparkDF2)

id,BATTER_ID,BATTER_TYPE,TOP_ID,TOP_TYPE
1,1001,Regular,5001,
1,1002,Chocolate,5002,Glazed
1,1003,Blueberry,5005,Sugar
1,1004,Devil's Food,5007,Powdered Sugar
2,1001,Regular,5006,Chocolate with Sprinkles
3,1001,Regular,5003,Chocolate
3,1002,Chocolate,5004,Maple


id,BATTER_ID,BATTER_TYPE,TOP_ID,TOP_TYPE
1,1001,Regular,5001,
1,1002,Chocolate,5002,Glazed
1,1003,Blueberry,5005,Sugar
1,1004,Devil's Food,5007,Powdered Sugar
2,1001,Regular,5006,Chocolate with Sprinkles
3,1001,Regular,5003,Chocolate
3,1002,Chocolate,5004,Maple


In [0]:
#python unpack operator

#args * sign all allow us to get elements from a list
def names_tuple(*args):
    return max(args)

print( names_tuple(1,2,23) )

#kwargs used for returning dictionaries

def names_dict(**kwargs):
    return kwargs

names_dict(Jane = 'Doe', John = 'Smith')


23
Out[21]: {'Jane': 'Doe', 'John': 'Smith'}

In [0]:
# create column from already created dataframe

display(sparkDF2)



sparkDF3=sparkDF2[["TOP_TYPE"]].withColumn("TOP_TYPE_X",col("TOP_TYPE").substr(1, 3))\
          .drop("TOP_TYPE")
    

display(sparkDF3)

sparkDF4 = spark.createDataFrame([], StructType([]))
def substr_df(a):
    b=a+"_X"
    return sparkDF2[[a]].withColumn(b,col(a).substr(1, 3))\
          .drop(a)
 

sparkDF4=substr_df("TOP_ID")

display(sparkDF4)

id,BATTER_ID,BATTER_TYPE,TOP_ID,TOP_TYPE
1,1001,Regular,5001,
1,1002,Chocolate,5002,Glazed
1,1003,Blueberry,5005,Sugar
1,1004,Devil's Food,5007,Powdered Sugar
2,1001,Regular,5006,Chocolate with Sprinkles
3,1001,Regular,5003,Chocolate
3,1002,Chocolate,5004,Maple


TOP_TYPE_X
Non
Gla
Sug
Pow
Cho
Cho
Map


TOP_ID_X
500
500
500
500
500
500
500


In [0]:
# using  group by and create new dataframe 
# Since DataFrame’s are an immutable collection, you can’t rename or update a column 
# instead when using withColumnRenamed() it creates a new DataFrame with updated column names

df7=sparkDF2.groupBy("batter_type").count().withColumnRenamed('count', 'COUNT_OF_BATTER').withColumnRenamed('batter_type', 'bt')


display(sparkDF2)
display(df7)


df4=sparkDF2.toPandas().merge(df7.toPandas(),left_on='BATTER_TYPE', right_on='bt' ,how="inner")
    

# convert pandas dataframe to spark dataframe
df4 = spark.createDataFrame(df4).drop("bt")

#df4.drop("bt")

display(df4)


df4.createOrReplaceTempView("TABLE_1")

spark.sql("select DISTINCT TOP_TYPE ,count(1)  over ( partition by BATTER_TYPE) AS COUNT_BATTER from TABLE_1  where 1=1 ") \
    .show()


id,BATTER_ID,BATTER_TYPE,TOP_ID,TOP_TYPE
1,1001,Regular,5001,
1,1002,Chocolate,5002,Glazed
1,1003,Blueberry,5005,Sugar
1,1004,Devil's Food,5007,Powdered Sugar
2,1001,Regular,5006,Chocolate with Sprinkles
3,1001,Regular,5003,Chocolate
3,1002,Chocolate,5004,Maple


bt,COUNT_OF_BATTER
Regular,3
Chocolate,2
Blueberry,1
Devil's Food,1


id,BATTER_ID,BATTER_TYPE,TOP_ID,TOP_TYPE,COUNT_OF_BATTER
1,1001,Regular,5001,,3
2,1001,Regular,5006,Chocolate with Sprinkles,3
3,1001,Regular,5003,Chocolate,3
1,1002,Chocolate,5002,Glazed,2
3,1002,Chocolate,5004,Maple,2
1,1003,Blueberry,5005,Sugar,1
1,1004,Devil's Food,5007,Powdered Sugar,1


+--------------------+------------+
|            TOP_TYPE|COUNT_BATTER|
+--------------------+------------+
|               Maple|           2|
|      Powdered Sugar|           1|
|                None|           3|
|               Sugar|           1|
|              Glazed|           2|
|Chocolate with Sp...|           3|
|           Chocolate|           3|
+--------------------+------------+



In [0]:
# all rows fro column 2,3,4
df4.toPandas().iloc[:,2:5]

# all rows for column 2,5
df4.toPandas().iloc[:,[2,5]]

Unnamed: 0,BATTER_TYPE,COUNT_OF_BATTER
0,Regular,3
1,Regular,3
2,Regular,3
3,Chocolate,2
4,Chocolate,2
5,Blueberry,1
6,Devil's Food,1


In [0]:
#df4.cache()
spark.catalog.clearCache()


In [0]:
data = [
 ("James,,Smith",["Java","Scala","C++"],["Spark","Java"],"OH","CA"),
 ("Michael,Rose,",["Spark","Java","C++"],["Spark","Java"],"NY","NJ"),
 ("Robert,,Williams",["CSharp","VB"],["Spark","Python"],"UT","NV")
]

from pyspark.sql.types import StringType, ArrayType,StructType,StructField
schema = StructType([ 
    StructField("name",StringType(),True), 
    StructField("languagesAtSchool",ArrayType(StringType()),True), 
    StructField("languagesAtWork",ArrayType(StringType()),True), 
    StructField("currentState", StringType(), True), 
    StructField("previousState", StringType(), True)
  ])

df = spark.createDataFrame(data=data,schema=schema)
df.printSchema()
df.show()



root
 |-- name: string (nullable = true)
 |-- languagesAtSchool: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- languagesAtWork: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- currentState: string (nullable = true)
 |-- previousState: string (nullable = true)

+----------------+------------------+---------------+------------+-------------+
|            name| languagesAtSchool|languagesAtWork|currentState|previousState|
+----------------+------------------+---------------+------------+-------------+
|    James,,Smith|[Java, Scala, C++]|  [Spark, Java]|          OH|           CA|
|   Michael,Rose,|[Spark, Java, C++]|  [Spark, Java]|          NY|           NJ|
|Robert,,Williams|      [CSharp, VB]|[Spark, Python]|          UT|           NV|
+----------------+------------------+---------------+------------+-------------+



In [0]:
# File location and type
file_location = "/FileStore/tables/recipesX.json"
file_type = "json"

# read json example 3
from pyspark.sql.types import *

# Read multiline json file 1
data_df = spark.read.option("multiline","true").json("/FileStore/tables/recipesX.json")
display(data_df) 

print(data_df.printSchema() )

_id,calories_per_serving,comments,cook_time,desc,directions,high_alt,ingredients,likes,likes_count,prep_time,rating,rating_avg,servings,tags,title,type
List(5e6fd805fa98021236426a24),205,,19,Mexican soft tacos,"List(Put seasoning on chicken breasts, Grill until done, Chop chicken into peices, Put in totillas)",,"List(List(null, chicken breast, List(1, lbs)), List(null, taco seasoning, List(2, oz)), List(null, small flour totillas, List(12, oz)))","List(261, 1, 415)",3.0,10,"List(4, 4, 4, 4, 2, 5, 3)",3.71,5,"List(mexican, quick, easy, chicken)",Chicken Soft Tacos,Dinner
List(5e877cba20a4f574c0aa56da),232,"List(List(I love these! They are so fluffy!, List(2000-01-03T18:42:30Z), Grace Hopper, 2))",10,Everyone's favorite pancakes,"List(Mix milk and vinegar in a medium bowl let set for 5 minutes to ""sour""., Mix flour, sugar, baking powder, baking soda, and salt in a large mixing bowl., Combine butter egg into milk., Pour dry mixture into the wet ingredients and whisk until smooth., Heat electric skillet to 435 F and coat with cooking spray., Pour 1/4 cup of batter for each pancake onto the skillet, and cook until bubbles appear on the sides., Flip and cook until browned.)",,"List(List(List(0.75, cup), milk, null), List(List(1.5, tablespoon), white vinegar, null), List(List(0.5, teaspoon), baking soda, null), List(List(0.5, teaspoon), salt, null), List(List(2.0, tablespoon), granulated sugar, null), List(List(1.0, teaspoon), baking powder, null), List(List(1.0, cup), all-purpose flour, null), List(List(1.0, null), egg, null), List(List(2.5, tablespoon), butter, null))","List(261, 415, 1, 35, 75)",4.0,10,"List(5, 4, 3, 5, 4, 5, 3, 1, 5)",3.88,4,,Pancakes,Breakfast
List(5e87856d07beb474c074c5ca),273,,60,A sweet twist on traditional meatloaf,"List(Preheat oven to 350 F and lightly grease a 5x9 inch loaf pan., Spread and press brown sugar to the bottom of loaf pan and spread ketchup over the sugar., In a mixing bowl, mix remaining ingredients, Shape and press into a loaf pan (on top of the ketchup and sugar)., Bake for 1 hour.)",,"List(List(List(0.5, cup), packed brown sugar, null), List(List(0.5, cup), ketchup, null))","List(415, 2, 75)",3.0,12,"List(4, 1, 4, 4)",3.25,7,"List(groud beef, family meal, easy)",Brown Sugar Meatloaf,Dinner
List(5e878f5220a4f574c0aa56db),286,,20,Wonderful smoked salon with maple wood,"List(Prep smoker with maple, Scale and prep fish with maple syrup, Smoke / grill until done.)",,"List(List(List(3.0, lbs), salmon, null), List(List(0.5, cup), maple syrup, null))",List(1),1.0,15,,,4,"List(fish, smoked, grill)",Maple Smoked Salmon,Dinner
List(5edf1cd43260aab97ea0d588),384,"List(List(Solid apple pie! Well I don't mean it is actually solid I mean it is really good ... it's actually both soft and crusty. I don't mean like bad crusty, I mean like the crust is ... oh nevermind., List(2020-09-07T18:42:30Z), Caderyn Jenkins, 1), List(Mine is better, but this pretty good., List(2000-02-03T18:42:30Z), Grace Hopper, 2))",45,All American 🍏 pie,"List(Preheat oven to 425 F, Put bottom crust in pan place in refrigerator, Peal, decore and chop apple into 1/2 to 1 inch pieces (or slice into thin pieces), Mix flour with sugar and spices, Mix apples with flour/spice/sugar and juice from lemon, Pour into pan with bottom crust and optionally slice a few pieces of button on top, Place top crust on top and pinch sides. Cut small slits into middle of top., Cover edges with foil around top of crust and remove durning last 20 mintues., Bake 45 mintues.)",,"List(List(List(2.0, null), pie crusts, null), List(List(6.0, null), granny smith apples, null), List(List(0.75, cup), granulated suger, null), List(List(1.0, tbsp), cinnamon, null), List(List(1.0, tps), nutmeg, null), List(List(1.0, null), lemon, null))","List(2, 1, 75)",3.0,25,"List(5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 5, 5)",4.8,8,"List(traditional, 4th of july)",Apple Pie,Dessert
List(5edf1d313260aab97ea0d589),271,,32,"Don't worry, they won't turn out green","List(Combine brownie mix with other ingredients, Bake as it says on box, Don't actually make or eat this.)","List(List(List(List(2, tbsp), flour)), 5)","List(List(List(1.0, cup), butter, null), List(List(2.0, null), egg, null), List(List(0.75, cup), plain yogurt, null), List(List(0.75, cup), shredded zucchini, null), List(List(0.75, cup), chocolate chips (semisweet), null), List(List(3.0, lbs), creamy peanut butter, null), List(List(1.0, null), brownie mix, null))",,,12,"List(1, 1, 1, 1, 5)",1.8,12,"List(sweets, easy)",Zucchini Brownies,Dessert
List(5e5e9c470d33e9e8e3891b35),210,,20,Classic Mexican tacos,"List(Brown beef, Add taco seasoning and water, mix, Bring to boil, Lower heat to simmer 5-10 minutes until desired consistency, Put meat in tacos)",,"List(List(null, ground beef (lean), List(1, lbs)), List(null, taco seasoning, List(2, oz)), List(null, corn hard tacos, List(12, oz)))","List(1, 415)",2.0,10,"List(4, 4, 3, 4, 2, 5, 2, 2, 4, 5)",3.5,4,"List(mexican, quick, easy, ground beef)",Tacos,Dinner


root
 |-- _id: struct (nullable = true)
 |    |-- $oid: string (nullable = true)
 |-- calories_per_serving: long (nullable = true)
 |-- comments: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- body: string (nullable = true)
 |    |    |-- date: struct (nullable = true)
 |    |    |    |-- $date: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- user_id: long (nullable = true)
 |-- cook_time: long (nullable = true)
 |-- desc: string (nullable = true)
 |-- directions: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- high_alt: struct (nullable = true)
 |    |-- add_ingredients: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- amount: struct (nullable = true)
 |    |    |    |    |-- quantity: string (nullable = true)
 |    |    |    |    |-- unit: string (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |-- add_tim

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, StringType, MapType,ArrayType,DoubleType,LongType
from pyspark.sql.functions import col
schema = StructType([
    StructField("title",StringType(), True),
    StructField(
        'ingredients', 
         ArrayType(
           StructType([ StructField("amount" , ( StructType([StructField('quantity', DoubleType(), True),StructField('unit', StringType(), True)]) ), True),
            StructField('name_meal', StringType(), True),
            StructField("quantity" , ( StructType([StructField('amount', LongType(), True),StructField('unit', StringType(), True)]) ), True)
                      ])
        )
    )
  ] )
df2=data_df[["title","ingredients"]].toPandas().values.tolist()

df = spark.createDataFrame(data=df2, schema = schema)
display(df)
display(df.select( col("title"), col("ingredients").name_meal, col("ingredients").amount.unit,col("ingredients").amount.quantity ) )

title,ingredients
Chicken Soft Tacos,"List(List(null, chicken breast, List(1, lbs)), List(null, taco seasoning, List(2, oz)), List(null, small flour totillas, List(12, oz)))"
Pancakes,"List(List(List(0.75, cup), milk, null), List(List(1.5, tablespoon), white vinegar, null), List(List(0.5, teaspoon), baking soda, null), List(List(0.5, teaspoon), salt, null), List(List(2.0, tablespoon), granulated sugar, null), List(List(1.0, teaspoon), baking powder, null), List(List(1.0, cup), all-purpose flour, null), List(List(1.0, null), egg, null), List(List(2.5, tablespoon), butter, null))"
Brown Sugar Meatloaf,"List(List(List(0.5, cup), packed brown sugar, null), List(List(0.5, cup), ketchup, null))"
Maple Smoked Salmon,"List(List(List(3.0, lbs), salmon, null), List(List(0.5, cup), maple syrup, null))"
Apple Pie,"List(List(List(2.0, null), pie crusts, null), List(List(6.0, null), granny smith apples, null), List(List(0.75, cup), granulated suger, null), List(List(1.0, tbsp), cinnamon, null), List(List(1.0, tps), nutmeg, null), List(List(1.0, null), lemon, null))"
Zucchini Brownies,"List(List(List(1.0, cup), butter, null), List(List(2.0, null), egg, null), List(List(0.75, cup), plain yogurt, null), List(List(0.75, cup), shredded zucchini, null), List(List(0.75, cup), chocolate chips (semisweet), null), List(List(3.0, lbs), creamy peanut butter, null), List(List(1.0, null), brownie mix, null))"
Tacos,"List(List(null, ground beef (lean), List(1, lbs)), List(null, taco seasoning, List(2, oz)), List(null, corn hard tacos, List(12, oz)))"


title,ingredients.name_meal,ingredients.amount.unit,ingredients.amount.quantity
Chicken Soft Tacos,"List(chicken breast, taco seasoning, small flour totillas)","List(null, null, null)","List(null, null, null)"
Pancakes,"List(milk, white vinegar, baking soda, salt, granulated sugar, baking powder, all-purpose flour, egg, butter)","List(cup, tablespoon, teaspoon, teaspoon, tablespoon, teaspoon, cup, null, tablespoon)","List(0.75, 1.5, 0.5, 0.5, 2.0, 1.0, 1.0, 1.0, 2.5)"
Brown Sugar Meatloaf,"List(packed brown sugar, ketchup)","List(cup, cup)","List(0.5, 0.5)"
Maple Smoked Salmon,"List(salmon, maple syrup)","List(lbs, cup)","List(3.0, 0.5)"
Apple Pie,"List(pie crusts, granny smith apples, granulated suger, cinnamon, nutmeg, lemon)","List(null, null, cup, tbsp, tps, null)","List(2.0, 6.0, 0.75, 1.0, 1.0, 1.0)"
Zucchini Brownies,"List(butter, egg, plain yogurt, shredded zucchini, chocolate chips (semisweet), creamy peanut butter, brownie mix)","List(cup, null, cup, cup, cup, lbs, null)","List(1.0, 2.0, 0.75, 0.75, 0.75, 3.0, 1.0)"
Tacos,"List(ground beef (lean), taco seasoning, corn hard tacos)","List(null, null, null)","List(null, null, null)"


In [0]:
# explode function convert array elements into rows
df2=df.select(col("title"),col("ingredients").name_meal,explode(col("ingredients").name_meal))

display(df2)

title,ingredients.name_meal,col
Chicken Soft Tacos,"List(chicken breast, taco seasoning, small flour totillas)",chicken breast
Chicken Soft Tacos,"List(chicken breast, taco seasoning, small flour totillas)",taco seasoning
Chicken Soft Tacos,"List(chicken breast, taco seasoning, small flour totillas)",small flour totillas
Pancakes,"List(milk, white vinegar, baking soda, salt, granulated sugar, baking powder, all-purpose flour, egg, butter)",milk
Pancakes,"List(milk, white vinegar, baking soda, salt, granulated sugar, baking powder, all-purpose flour, egg, butter)",white vinegar
Pancakes,"List(milk, white vinegar, baking soda, salt, granulated sugar, baking powder, all-purpose flour, egg, butter)",baking soda
Pancakes,"List(milk, white vinegar, baking soda, salt, granulated sugar, baking powder, all-purpose flour, egg, butter)",salt
Pancakes,"List(milk, white vinegar, baking soda, salt, granulated sugar, baking powder, all-purpose flour, egg, butter)",granulated sugar
Pancakes,"List(milk, white vinegar, baking soda, salt, granulated sugar, baking powder, all-purpose flour, egg, butter)",baking powder
Pancakes,"List(milk, white vinegar, baking soda, salt, granulated sugar, baking powder, all-purpose flour, egg, butter)",all-purpose flour
