In [1]:
from pyspark.sql import SparkSession

In [2]:
# Start (or reuse) a Spark session registered under the application name "data_operations".
spark_session = (
    SparkSession.builder
    .appName("data_operations")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/08 09:07:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load the CSV into a DataFrame: the first row supplies the column names and
# Spark infers each column's type from the data.
df = spark_session.read.csv(
    "./data/pii_data.csv",
    header=True,
    inferSchema=True,
)

In [4]:
# Print the column names, inferred types and nullability of the DataFrame.
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Contact: string (nullable = true)
 |-- Salary: integer (nullable = true)



In [5]:
# Preview the first 2 rows of the DataFrame.
df.show(2)

+----------+---+------+-----------------+----------------+------+
|      Name|Age|   Sex|          Address|         Contact|Salary|
+----------+---+------+-----------------+----------------+------+
|  John Doe| 30|  Male|123 Main St, City|john@example.com| 50000|
|Jane Smith| 25|Female| 456 Elm St, Town|jane@example.com| 45000|
+----------+---+------+-----------------+----------------+------+
only showing top 2 rows



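### Filter Operation
- Comparison operators: `>`, `<`, `<>`, `=` (in Column expressions, use `==` for equality and `!=` for inequality)
- Logical operators for combining Column conditions: `&` (and), `|` (or), `~` (not)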

In [6]:
# Keep only the employees whose salary is greater than 30000,
# expressing the condition as a SQL string.
(
    df
    .filter("Salary>30000")
    .show()
)

+-----------------+---+------+--------------------+--------------------+------+
|             Name|Age|   Sex|             Address|             Contact|Salary|
+-----------------+---+------+--------------------+--------------------+------+
|         John Doe| 30|  Male|   123 Main St, City|    john@example.com| 50000|
|       Jane Smith| 25|Female|    456 Elm St, Town|    jane@example.com| 45000|
|   Robert Johnson| 35|  Male| 789 Oak St, Village|  robert@example.com| 60000|
|      Emily Davis| 28|Female| 101 Pine St, Hamlet|   emily@example.com| 55000|
|    Michael Brown| 40|  NULL|222 Cedar St, County| michael@example.com| 70000|
|     James Miller| 29|  Male|444 Birch St, Tow...|   james@example.com| 52000|
|      Lisa Garcia| 27|  NULL|555 Oakwood St, Town|    lisa@example.com| 48000|
|    Olivia Taylor| 31|Female|777 Elmwood St, C...|  olivia@example.com| 57000|
|    Sophia Harris| 26|Female|999 Cedarwood St,...|  sophia@example.com| 47000|
|Christopher Davis| 34|  Male|111 Oaksid

In [7]:
# Alternative method: the same filter written as a Column expression
# instead of a SQL string.
(
    df
    .filter(df["Salary"] > 30000)
    .show()
)

+-----------------+---+------+--------------------+--------------------+------+
|             Name|Age|   Sex|             Address|             Contact|Salary|
+-----------------+---+------+--------------------+--------------------+------+
|         John Doe| 30|  Male|   123 Main St, City|    john@example.com| 50000|
|       Jane Smith| 25|Female|    456 Elm St, Town|    jane@example.com| 45000|
|   Robert Johnson| 35|  Male| 789 Oak St, Village|  robert@example.com| 60000|
|      Emily Davis| 28|Female| 101 Pine St, Hamlet|   emily@example.com| 55000|
|    Michael Brown| 40|  NULL|222 Cedar St, County| michael@example.com| 70000|
|     James Miller| 29|  Male|444 Birch St, Tow...|   james@example.com| 52000|
|      Lisa Garcia| 27|  NULL|555 Oakwood St, Town|    lisa@example.com| 48000|
|    Olivia Taylor| 31|Female|777 Elmwood St, C...|  olivia@example.com| 57000|
|    Sophia Harris| 26|Female|999 Cedarwood St,...|  sophia@example.com| 47000|
|Christopher Davis| 34|  Male|111 Oaksid

In [8]:
# Show only the Name and Salary columns of employees whose salary is greater than 30000.
(
    df
    .filter("Salary>30000")
    .select(["Name", "Salary"])
    .show()
)

+-----------------+------+
|             Name|Salary|
+-----------------+------+
|         John Doe| 50000|
|       Jane Smith| 45000|
|   Robert Johnson| 60000|
|      Emily Davis| 55000|
|    Michael Brown| 70000|
|     James Miller| 52000|
|      Lisa Garcia| 48000|
|    Olivia Taylor| 57000|
|    Sophia Harris| 47000|
|Christopher Davis| 61000|
|    Emma Anderson| 44000|
|     Joseph Smith| 63000|
|      Mia Johnson| 49000|
|     David Wilson| 64000|
|        Ava Brown| 46000|
|  Benjamin Taylor| 65000|
|    Charlotte Lee| 50000|
+-----------------+------+



In [9]:
# Employees whose salary is greater than 30000 AND less than 50000.
# Each comparison needs its own parentheses because `&` binds tighter than `>` / `<`.
(
    df
    .filter(
        (df["Salary"] > 30000)
        & (df["Salary"] < 50000)
    )
    .show()
)

+-------------+---+------+--------------------+------------------+------+
|         Name|Age|   Sex|             Address|           Contact|Salary|
+-------------+---+------+--------------------+------------------+------+
|   Jane Smith| 25|Female|    456 Elm St, Town|  jane@example.com| 45000|
|  Lisa Garcia| 27|  NULL|555 Oakwood St, Town|  lisa@example.com| 48000|
|Sophia Harris| 26|Female|999 Cedarwood St,...|sophia@example.com| 47000|
|Emma Anderson| 24|Female|222 Elmridge St, ...|  emma@example.com| 44000|
|  Mia Johnson| 30|Female|444 Maplewood St,...|   mia@example.com| 49000|
|    Ava Brown| 28|Female|666 Oakridge St, ...|   ava@example.com| 46000|
+-------------+---+------+--------------------+------------------+------+



In [10]:
# Employees with (salary greater than 30000 AND less than 50000) OR age greater than 30.
# `&` binds tighter than `|`; the extra parentheses only make that grouping explicit.
(
    df
    .filter(
        ((df["Salary"] > 30000) & (df["Salary"] < 50000))
        | (df["Age"] > 30)
    )
    .show()
)

+-----------------+---+------+--------------------+--------------------+------+
|             Name|Age|   Sex|             Address|             Contact|Salary|
+-----------------+---+------+--------------------+--------------------+------+
|       Jane Smith| 25|Female|    456 Elm St, Town|    jane@example.com| 45000|
|   Robert Johnson| 35|  Male| 789 Oak St, Village|  robert@example.com| 60000|
|    Michael Brown| 40|  NULL|222 Cedar St, County| michael@example.com| 70000|
|     Sarah Wilson| 33|Female|333 Maple St, Suburb|   sarah@example.com|  NULL|
|      Lisa Garcia| 27|  NULL|555 Oakwood St, Town|    lisa@example.com| 48000|
|      William Lee| 38|  NULL|666 Pinecrest St,...| william@example.com|  NULL|
|    Olivia Taylor| 31|Female|777 Elmwood St, C...|  olivia@example.com| 57000|
|    Sophia Harris| 26|Female|999 Cedarwood St,...|  sophia@example.com| 47000|
|Christopher Davis| 34|  Male|111 Oakside St, V...|   chris@example.com| 61000|
|    Emma Anderson| 24|Female|222 Elmrid

In [11]:
# NOT operation. `~` binds tighter than `&`, which binds tighter than `|`, so
# only the first comparison is negated. Rows are kept when
# (NOT Salary > 30000 AND Salary < 50000) OR Age > 30.
# To negate a whole compound condition, wrap that condition in parentheses
# before applying `~`.
(
    df
    .filter(
        (~(df["Salary"] > 30000) & (df["Salary"] < 50000))
        | (df["Age"] > 30)
    )
    .show()
)

+-----------------+---+------+--------------------+--------------------+------+
|             Name|Age|   Sex|             Address|             Contact|Salary|
+-----------------+---+------+--------------------+--------------------+------+
|   Robert Johnson| 35|  Male| 789 Oak St, Village|  robert@example.com| 60000|
|    Michael Brown| 40|  NULL|222 Cedar St, County| michael@example.com| 70000|
|     Sarah Wilson| 33|Female|333 Maple St, Suburb|   sarah@example.com|  NULL|
|      William Lee| 38|  NULL|666 Pinecrest St,...| william@example.com|  NULL|
|    Olivia Taylor| 31|Female|777 Elmwood St, C...|  olivia@example.com| 57000|
|Christopher Davis| 34|  Male|111 Oakside St, V...|   chris@example.com| 61000|
|     Joseph Smith| 37|  Male|333 Birchwood St,...|  joseph@example.com| 63000|
|     David Wilson| 36|  Male|555 Pinehurst St,...|   david@example.com| 64000|
|  Benjamin Taylor| 39|  Male|777 Cedarhurst St...|benjamin@example.com| 65000|
+-----------------+---+------+----------