In [1]:
import findspark
findspark.init()
import pyspark
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import StringType, IntegerType, ArrayType
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.functions import col
from pyspark.sql.functions import array_contains
# Obtain the notebook's Spark entry points through the documented builder API
# instead of calling the SparkSession constructor directly. getOrCreate()
# hands back an already-running session when one exists, so re-running this
# cell does not stack up extra session objects.
# "local[4]" is the same master URL this notebook has always used.
spark = SparkSession.builder.master("local[4]").getOrCreate()
# Keep exposing the underlying SparkContext under the name `sc`.
sc = spark.sparkContext

In [2]:
# Sample records, one tuple per person, laid out as:
#   ((firstname, middlename, lastname), [languages, ...], state, gender)
# An empty string stands in for a missing middle or last name.
arrayStructureData = [
    (("James", "", "Smith"), ["Java", "Scala", "C++"], "OH", "M"),
    (("Anna", "Rose", ""), ["Spark", "Java", "C++"], "NY", "F"),
    (("Julia", "", "Williams"), ["CSharp", "VB"], "OH", "F"),
    (("Maria", "Anne", "Jones"), ["CSharp", "VB"], "NY", "M"),
    (("Jen", "Mary", "Brown"), ["CSharp", "VB"], "NY", "M"),
    (("Mike", "Mary", "Williams"), ["Python", "VB"], "OH", "M"),
]

In [3]:
# Explicit schema for arrayStructureData:
#   name      -> struct of three nullable strings
#   languages -> nullable array of strings
#   state     -> nullable string
#   gender    -> nullable string
# Built by chaining StructType.add(); `name` relies on add()'s default
# nullable=True, matching the StructField default it replaces.
arrayStructureSchema = (
    StructType()
    .add(
        'name',
        StructType()
        .add('firstname', StringType(), True)
        .add('middlename', StringType(), True)
        .add('lastname', StringType(), True),
    )
    .add('languages', ArrayType(StringType()), True)
    .add('state', StringType(), True)
    .add('gender', StringType(), True)
)

In [4]:
# Turn the sample rows into a DataFrame with the explicit schema, then print
# the schema tree followed by a preview of the rows (show() with its default
# settings shortens long cell values, as the last row of the output shows).
df = spark.createDataFrame(arrayStructureData, arrayStructureSchema)
df.printSchema()
df.show()

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- languages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- state: string (nullable = true)
 |-- gender: string (nullable = true)

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|    [James, , Smith]|[Java, Scala, C++]|   OH|     M|
|      [Anna, Rose, ]|[Spark, Java, C++]|   NY|     F|
| [Julia, , Williams]|      [CSharp, VB]|   OH|     F|
|[Maria, Anne, Jones]|      [CSharp, VB]|   NY|     M|
|  [Jen, Mary, Brown]|      [CSharp, VB]|   NY|     M|
|[Mike, Mary, Will...|      [Python, VB]|   OH|     M|
+--------------------+------------------+-----+------+



In [5]:
# Four ways of writing a predicate on the `state` column, in display order.
state_conditions = [
    df.state == "OH",        # attribute access, equality
    ~(df.state == "OH"),     # negated equality
    df.state != "OH",        # inequality operator
    col("state") == "OH",    # column looked up by name via col()
]

# Run every predicate through filter() first, then through where(), printing
# each result in full (truncate=False).
for select_rows in (df.filter, df.where):
    for condition in state_conditions:
        select_rows(condition).show(truncate=False)

+----------------------+------------------+-----+------+
|name                  |languages         |state|gender|
+----------------------+------------------+-----+------+
|[James, , Smith]      |[Java, Scala, C++]|OH   |M     |
|[Julia, , Williams]   |[CSharp, VB]      |OH   |F     |
|[Mike, Mary, Williams]|[Python, VB]      |OH   |M     |
+----------------------+------------------+-----+------+

+--------------------+------------------+-----+------+
|name                |languages         |state|gender|
+--------------------+------------------+-----+------+
|[Anna, Rose, ]      |[Spark, Java, C++]|NY   |F     |
|[Maria, Anne, Jones]|[CSharp, VB]      |NY   |M     |
|[Jen, Mary, Brown]  |[CSharp, VB]      |NY   |M     |
+--------------------+------------------+-----+------+

+--------------------+------------------+-----+------+
|name                |languages         |state|gender|
+--------------------+------------------+-----+------+
|[Anna, Rose, ]      |[Spark, Java, C++]|NY   |F 

In [6]:
# The predicate can also be given as a SQL expression string; the same
# string is passed to filter() and to where().
gender_expr = "gender  == 'M'"
df.filter(gender_expr).show(truncate=False)
df.where(gender_expr).show(truncate=False)

+----------------------+------------------+-----+------+
|name                  |languages         |state|gender|
+----------------------+------------------+-----+------+
|[James, , Smith]      |[Java, Scala, C++]|OH   |M     |
|[Maria, Anne, Jones]  |[CSharp, VB]      |NY   |M     |
|[Jen, Mary, Brown]    |[CSharp, VB]      |NY   |M     |
|[Mike, Mary, Williams]|[Python, VB]      |OH   |M     |
+----------------------+------------------+-----+------+

+----------------------+------------------+-----+------+
|name                  |languages         |state|gender|
+----------------------+------------------+-----+------+
|[James, , Smith]      |[Java, Scala, C++]|OH   |M     |
|[Maria, Anne, Jones]  |[CSharp, VB]      |NY   |M     |
|[Jen, Mary, Brown]    |[CSharp, VB]      |NY   |M     |
|[Mike, Mary, Williams]|[Python, VB]      |OH   |M     |
+----------------------+------------------+-----+------+



In [7]:
# Combine two column predicates with `&`; each side is parenthesised so the
# comparison is evaluated before the `&`.
oh_and_male = (df.state == "OH") & (df.gender == "M")
df.filter(oh_and_male).show(truncate=False)
df.where(oh_and_male).show(truncate=False)

+----------------------+------------------+-----+------+
|name                  |languages         |state|gender|
+----------------------+------------------+-----+------+
|[James, , Smith]      |[Java, Scala, C++]|OH   |M     |
|[Mike, Mary, Williams]|[Python, VB]      |OH   |M     |
+----------------------+------------------+-----+------+

+----------------------+------------------+-----+------+
|name                  |languages         |state|gender|
+----------------------+------------------+-----+------+
|[James, , Smith]      |[Java, Scala, C++]|OH   |M     |
|[Mike, Mary, Williams]|[Python, VB]      |OH   |M     |
+----------------------+------------------+-----+------+



In [8]:
# `array_contains` is imported with the other pyspark imports in the first
# cell, so the notebook's dependencies are declared in one place.

# Keep rows whose `languages` array contains the element "Java".
knows_java = array_contains(df.languages, "Java")
df.filter(knows_java).show(truncate=False)

# Keep rows whose nested struct field name.lastname equals "Williams".
is_williams = df.name.lastname == "Williams"
df.filter(is_williams).show(truncate=False)

+----------------+------------------+-----+------+
|name            |languages         |state|gender|
+----------------+------------------+-----+------+
|[James, , Smith]|[Java, Scala, C++]|OH   |M     |
|[Anna, Rose, ]  |[Spark, Java, C++]|NY   |F     |
+----------------+------------------+-----+------+

+----------------------+------------+-----+------+
|name                  |languages   |state|gender|
+----------------------+------------+-----+------+
|[Julia, , Williams]   |[CSharp, VB]|OH   |F     |
|[Mike, Mary, Williams]|[Python, VB]|OH   |M     |
+----------------------+------------+-----+------+



In [9]:
# Print the physical plan Spark builds for the nested-field filter.
williams_rows = df.filter(df.name.lastname == "Williams")
williams_rows.explain()

== Physical Plan ==
*(1) Filter (isnotnull(name#0) AND (name#0.lastname = Williams))
+- *(1) Scan ExistingRDD[name#0,languages#1,state#2,gender#3]




In [10]:
# Print the physical plan for the array-membership filter.
java_rows = df.filter(array_contains(df.languages, "Java"))
java_rows.explain()

== Physical Plan ==
*(1) Filter array_contains(languages#1, Java)
+- *(1) Scan ExistingRDD[name#0,languages#1,state#2,gender#3]




In [11]:
# Print the physical plan for the two-column (state AND gender) filter.
oh_male_rows = df.filter((df.state == "OH") & (df.gender == "M"))
oh_male_rows.explain()

== Physical Plan ==
*(1) Filter (((isnotnull(state#2) AND isnotnull(gender#3)) AND (state#2 = OH)) AND (gender#3 = M))
+- *(1) Scan ExistingRDD[name#0,languages#1,state#2,gender#3]


