# Filter data from a DataFrame

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Obtain the Spark session used by every later cell; getOrCreate() returns an
# already-running session if one exists instead of starting a second one.
session_builder = SparkSession.builder.appName('SparkByExamples.com')
spark = session_builder.getOrCreate()

In [3]:
# Sample people, written column by column and zipped into row tuples of the form
# ((first, middle, last), [languages], state, gender).
person_names = [
    ("James", "", "Smith"),
    ("Anna", "Rose", ""),
    ("Julia", "", "Williams"),
    ("Maria", "Anne", "Jones"),
    ("Jen", "Mary", "Brown"),
    ("Mike", "Mary", "Williams"),
]
known_languages = [
    ["Java", "Scala", "C++"],
    ["Spark", "Java", "C++"],
    ["CSharp", "VB"],
    ["CSharp", "VB"],
    ["CSharp", "VB"],
    ["Python", "VB"],
]
home_states = ["OH", "NY", "OH", "NY", "NY", "OH"]
genders = ["M", "F", "F", "M", "M", "M"]

# zip() yields one 4-tuple per person; list() materialises them as a list of rows.
arrayStructureData = list(zip(person_names, known_languages, home_states, genders))

In [9]:
from pyspark.sql.types import StructType,StructField,StringType,ArrayType

# Nested struct holding the three parts of a person's name.
nameSchema = StructType([
    StructField('firstname', StringType(), True),
    StructField('middlename', StringType(), True),
    StructField('lastname', StringType(), True),
])

# Top-level schema: a name struct, an array of language strings, and two plain
# string columns, in the same order as the elements of each row tuple.
arrayStructureSchema = StructType([
    StructField('name', nameSchema),
    StructField('languages', ArrayType(StringType()), True),
    StructField('state', StringType(), True),
    StructField('gender', StringType(), True),
])

In [17]:
# Build the DataFrame from the sample rows with the explicit schema,
# then print the resulting schema tree.
df = spark.createDataFrame(
    data=arrayStructureData,
    schema=arrayStructureSchema,
)
df.printSchema()

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- languages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- state: string (nullable = true)
 |-- gender: string (nullable = true)



In [19]:
# Display the rows with show()'s default settings spelled out:
# at most 20 rows, and long cell values truncated.
df.show(n=20, truncate=True)

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|    [James, , Smith]|[Java, Scala, C++]|   OH|     M|
|      [Anna, Rose, ]|[Spark, Java, C++]|   NY|     F|
| [Julia, , Williams]|      [CSharp, VB]|   OH|     F|
|[Maria, Anne, Jones]|      [CSharp, VB]|   NY|     M|
|  [Jen, Mary, Brown]|      [CSharp, VB]|   NY|     M|
|[Mike, Mary, Will...|      [Python, VB]|   OH|     M|
+--------------------+------------------+-----+------+



TYPE 1: filter using a DataFrame column attribute (`df.gender`)

In [20]:
# Build the predicate from the DataFrame's own column attribute, then filter on it.
is_female = df.gender == "F"
df.filter(is_female).show(truncate=False)

+-------------------+------------------+-----+------+
|name               |languages         |state|gender|
+-------------------+------------------+-----+------+
|[Anna, Rose, ]     |[Spark, Java, C++]|NY   |F     |
|[Julia, , Williams]|[CSharp, VB]      |OH   |F     |
+-------------------+------------------+-----+------+



TYPE 2: filter using the `col()` function

In [22]:
from pyspark.sql.functions import col

# Refer to the column by name through col() rather than through the DataFrame object.
in_ohio = col("state") == "OH"
df.filter(in_ohio).show(truncate=False)

+----------------------+------------------+-----+------+
|name                  |languages         |state|gender|
+----------------------+------------------+-----+------+
|[James, , Smith]      |[Java, Scala, C++]|OH   |M     |
|[Julia, , Williams]   |[CSharp, VB]      |OH   |F     |
|[Mike, Mary, Williams]|[Python, VB]      |OH   |M     |
+----------------------+------------------+-----+------+



TYPE 3: filter using a SQL expression string

In [23]:
# Pass the condition to filter() as a SQL expression string instead of a Column.
gender_condition = "gender == 'F'"
df.filter(gender_condition).show(truncate=False)

+-------------------+------------------+-----+------+
|name               |languages         |state|gender|
+-------------------+------------------+-----+------+
|[Anna, Rose, ]     |[Spark, Java, C++]|NY   |F     |
|[Julia, , Williams]|[CSharp, VB]      |OH   |F     |
+-------------------+------------------+-----+------+



MULTIPLE CONDITIONS

In [26]:
# Evaluate each comparison on its own first: Python's & binds more tightly than ==,
# so the two predicates must be complete before they are combined.
from_ohio = df.state == "OH"
is_male = df.gender == "M"
df.filter(from_ohio & is_male).show(truncate=False)

+----------------------+------------------+-----+------+
|name                  |languages         |state|gender|
+----------------------+------------------+-----+------+
|[James, , Smith]      |[Java, Scala, C++]|OH   |M     |
|[Mike, Mary, Williams]|[Python, VB]      |OH   |M     |
+----------------------+------------------+-----+------+



FILTER ON ARRAY

In [28]:
from pyspark.sql.functions import array_contains

# Keep only the rows whose languages array contains the value "Python".
knows_python = array_contains(df.languages, "Python")
df.filter(knows_python).show(truncate=False)

+----------------------+------------+-----+------+
|name                  |languages   |state|gender|
+----------------------+------------+-----+------+
|[Mike, Mary, Williams]|[Python, VB]|OH   |M     |
+----------------------+------------+-----+------+



In [30]:
# Filter on a field nested inside the name struct column.
first_name_is_julia = df.name.firstname == "Julia"
df.filter(first_name_is_julia).show(truncate=False)

+-------------------+------------+-----+------+
|name               |languages   |state|gender|
+-------------------+------------+-----+------+
|[Julia, , Williams]|[CSharp, VB]|OH   |F     |
+-------------------+------------+-----+------+



In [None]:
# SORT
# TODO: add the sorting examples for this section. The heading is kept as a
# comment because a bare SORT in a code cell is an undefined name and raises
# NameError when the notebook is run top to bottom.