In [3]:
#Horizontal Partitioning
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("HorizontalPartitioningExample") \
    .getOrCreate()

# Load dataset
df = spark.read.csv("Employee.csv", header=True)

# Horizontal partitioning by a  city column
partitioned_df = df.repartition("city")


partitioned_df.show()

+---------+-----------+---------+-----------+---+------+-----------+-------------------------+----------+
|Education|JoiningYear|     City|PaymentTier|Age|Gender|EverBenched|ExperienceInCurrentDomain|LeaveOrNot|
+---------+-----------+---------+-----------+---+------+-----------+-------------------------+----------+
|Bachelors|       2017|Bangalore|          3| 34|  Male|         No|                        0|         0|
|  Masters|       2016|Bangalore|          3| 27|  Male|         No|                        5|         1|
|Bachelors|       2016|Bangalore|          3| 22|  Male|         No|                        0|         0|
|Bachelors|       2016|Bangalore|          3| 34|Female|         No|                        2|         1|
|  Masters|       2012|Bangalore|          3| 27|  Male|         No|                        5|         1|
|Bachelors|       2016|Bangalore|          3| 39|  Male|         No|                        2|         0|
|Bachelors|       2012|Bangalore|          3| 

In [4]:
#Vertical Partitioning:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("VerticalPartitioningExample") \
    .getOrCreate()

# Load dataset
df = spark.read.csv("Employee.csv", header=True)

partitioned_df = df.select("ExperienceInCurrentDomain", "Education")

partitioned_df.show()

24/05/06 18:21:55 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


+-------------------------+---------+
|ExperienceInCurrentDomain|Education|
+-------------------------+---------+
|                        0|Bachelors|
|                        3|Bachelors|
|                        2|Bachelors|
|                        5|  Masters|
|                        2|  Masters|
|                        0|Bachelors|
|                        0|Bachelors|
|                        2|Bachelors|
|                        1|Bachelors|
|                        2|  Masters|
|                        5|  Masters|
|                        3|Bachelors|
|                        5|Bachelors|
|                        2|Bachelors|
|                        4|Bachelors|
|                        3|Bachelors|
|                        2|Bachelors|
|                        4|Bachelors|
|                        0|Bachelors|
|                        0|Bachelors|
+-------------------------+---------+
only showing top 20 rows



In [5]:
#Key-based Partitioning:

from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("KeyBasedPartitioningExample") \
    .getOrCreate()

# Load dataset
df = spark.read.csv("Employee.csv", header=True)

# Key-based partitioning by a specific column
partitioned_df = df.repartition("ExperienceInCurrentDomain")

partitioned_df.show()

# Stop the SparkSession
spark.stop()

24/05/06 18:25:03 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


+---------+-----------+---------+-----------+---+------+-----------+-------------------------+----------+
|Education|JoiningYear|     City|PaymentTier|Age|Gender|EverBenched|ExperienceInCurrentDomain|LeaveOrNot|
+---------+-----------+---------+-----------+---+------+-----------+-------------------------+----------+
|Bachelors|       2016|Bangalore|          3| 39|  Male|         No|                        7|         0|
|Bachelors|       2016|     Pune|          3| 38|Female|         No|                        7|         0|
|  Masters|       2014|Bangalore|          3| 40|Female|         No|                        7|         1|
|Bachelors|       2014|Bangalore|          3| 39|  Male|         No|                        7|         1|
|Bachelors|       2012|Bangalore|          3| 38|  Male|         No|                        7|         1|
|Bachelors|       2012|Bangalore|          1| 35|  Male|         No|                        7|         0|
|Bachelors|       2015|     Pune|          3| 

In [8]:
#Range-based Partitioning:

from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("RangeBasedPartitioningExample") \
    .getOrCreate()

# Load dataset
df = spark.read.csv("Employee.csv", header=True)
partitioned_df = df.repartitionByRange(2012, "JoiningYear")

partitioned_df.show()

# Stop the SparkSession
spark.stop()

+---------+-----------+---------+-----------+---+------+-----------+-------------------------+----------+
|Education|JoiningYear|     City|PaymentTier|Age|Gender|EverBenched|ExperienceInCurrentDomain|LeaveOrNot|
+---------+-----------+---------+-----------+---+------+-----------+-------------------------+----------+
|  Masters|       2012|Bangalore|          3| 27|  Male|         No|                        5|         1|
|Bachelors|       2012|Bangalore|          3| 37|  Male|         No|                        4|         0|
|Bachelors|       2012|Bangalore|          3| 37|  Male|         No|                        0|         0|
|Bachelors|       2012|New Delhi|          3| 29|  Male|         No|                        3|         0|
|      PHD|       2012|New Delhi|          3| 27|  Male|         No|                        5|         0|
|Bachelors|       2012|Bangalore|          3| 39|  Male|         No|                        1|         0|
|  Masters|       2012|New Delhi|          3| 

In [9]:
#Hash-based Partitioning:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("HashBasedPartitioningExample") \
    .getOrCreate()

# Load dataset
df = spark.read.csv("Employee.csv", header=True)
partitioned_df = df.repartition("Education")
partitioned_df.show()
spark.stop()

+---------+-----------+---------+-----------+---+------+-----------+-------------------------+----------+
|Education|JoiningYear|     City|PaymentTier|Age|Gender|EverBenched|ExperienceInCurrentDomain|LeaveOrNot|
+---------+-----------+---------+-----------+---+------+-----------+-------------------------+----------+
|  Masters|       2016|Bangalore|          3| 27|  Male|         No|                        5|         1|
|  Masters|       2017|     Pune|          3| 24|  Male|        Yes|                        2|         1|
|  Masters|       2017|New Delhi|          2| 37|  Male|         No|                        2|         0|
|  Masters|       2012|Bangalore|          3| 27|  Male|         No|                        5|         1|
|  Masters|       2017|New Delhi|          2| 28|  Male|         No|                        4|         0|
|  Masters|       2017|New Delhi|          2| 30|Female|         No|                        2|         0|
|  Masters|       2017|New Delhi|          2| 

In [11]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("RoundRobinPartitioningExample") \
    .getOrCreate()

# Load dataset
df = spark.read.csv("Employee.csv", header=True)

# Round-robin partitioning
partitioned_df = df.repartition(3)
partitioned_df.show()

# Stop the SparkSession
spark.stop()

+---------+-----------+---------+-----------+---+------+-----------+-------------------------+----------+
|Education|JoiningYear|     City|PaymentTier|Age|Gender|EverBenched|ExperienceInCurrentDomain|LeaveOrNot|
+---------+-----------+---------+-----------+---+------+-----------+-------------------------+----------+
|Bachelors|       2015|     Pune|          2| 29|Female|         No|                        1|         1|
|  Masters|       2015|     Pune|          2| 30|Female|         No|                        1|         0|
|Bachelors|       2017|     Pune|          2| 33|Female|         No|                        2|         1|
|Bachelors|       2018|Bangalore|          3| 28|Female|         No|                        3|         1|
|  Masters|       2017|New Delhi|          3| 41|  Male|         No|                        3|         1|
|Bachelors|       2014|     Pune|          3| 27|  Male|         No|                        5|         1|
|      PHD|       2013|Bangalore|          3| 