# Partitioning practice to see the impact on execution time

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("partitioning_session").getOrCreate()

24/09/02 08:10:34 WARN Utils: Your hostname, manu-pc resolves to a loopback address: 127.0.1.1; using 192.168.34.41 instead (on interface wlp58s0)
24/09/02 08:10:34 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/02 08:10:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
data_file_path = r"../data/employees.csv"

df = spark.read.csv(data_file_path,inferSchema = True, header = True)
df.show()

+-----------+----------+---------+--------+------------+---------+----------+------+--------------+----------+-------------+
|employee_id|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE|    JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+----------+---------+--------+------------+---------+----------+------+--------------+----------+-------------+
|        198|    Donald| OConnell|DOCONNEL|650.507.9833|21-JUN-07|  SH_CLERK|  2600|          NULL|       124|           50|
|        199|   Douglas|    Grant|  DGRANT|650.507.9844|13-JAN-08|  SH_CLERK|  2600|          NULL|       124|           50|
|        200|  Jennifer|   Whalen| JWHALEN|515.123.4444|17-SEP-03|   AD_ASST|  4400|          NULL|       101|           10|
|        201|   Michael|Hartstein|MHARTSTE|515.123.5555|17-FEB-04|    MK_MAN| 13000|          NULL|       100|           20|
|        202|       Pat|      Fay|    PFAY|603.123.6666|17-AUG-05|    MK_REP|  6000|          NULL|       201|           20|


## aggregation without partiotioning execution time

In [14]:
from time import sleep, time
from pyspark.sql.functions import sum,col

start_time = time()
print(start_time)

df.groupBy('DEPARTMENT_ID').agg(sum(col('SALARY'))).show()

end_time = time()

print(end_time)
print("exec_time: ", end_time-start_time)

1725245591.3734844
+-------------+-----------+
|DEPARTMENT_ID|sum(SALARY)|
+-------------+-----------+
|           20|      19000|
|           40|       6500|
|          100|      51608|
|           10|       4400|
|           50|      85600|
|           70|      10000|
|           90|      58000|
|           60|      28800|
|          110|      20308|
|           30|      24900|
+-------------+-----------+

1725245592.3519967
exec_time:  0.9785122871398926


## aggregation with partition execution time 


In [25]:
from time import sleep, time
from pyspark.sql.functions import sum,col

df = df.repartition("DEPARTMENT_ID")

start_time = time()
print(start_time)

# df.groupBy('DEPARTMENT_ID').agg(sum(col('SALARY'))).show()
df.groupBy("DEPARTMENT_ID").agg(sum('SALARY'),avg('SALARY')).show() # WORKING


end_time = time()

print(end_time)
print("exec_time: ", end_time-start_time)

1725246162.8037558
+-------------+-----------+------------------+
|DEPARTMENT_ID|sum(SALARY)|       avg(SALARY)|
+-------------+-----------+------------------+
|           20|      19000|            9500.0|
|           40|       6500|            6500.0|
|          100|      51608| 8601.333333333334|
|           10|       4400|            4400.0|
|           50|      85600|3721.7391304347825|
|           70|      10000|           10000.0|
|           90|      58000|19333.333333333332|
|           60|      28800|            5760.0|
|          110|      20308|           10154.0|
|           30|      24900|            4150.0|
+-------------+-----------+------------------+

1725246163.0189307
exec_time:  0.21517491340637207


In [24]:
## aggregation trial
from pyspark.sql.functions import avg


# df.groupBy("DEPARTMENT_ID").sum('SALARY').show() # working
# df.groupBy("DEPARTMENT_ID").agg({'SALARY':'sum'}).show() #  WORKING
# df.groupBy("DEPARTMENT_ID").agg({'SALARY':'sum'}).show() #  WORKING
df.groupBy("DEPARTMENT_ID").agg(sum('SALARY'),avg('SALARY')).show() # WORKING

#

# df.groupBy("DEPARTMENT_ID").sum('SALARY').avg('SALARY').show() # NOT WORKING
# df.groupBy("DEPARTMENT_ID").agg('SALARY',sum('SALARY')).show() # NOT WORKING








+-------------+-----------+------------------+
|DEPARTMENT_ID|sum(SALARY)|       avg(SALARY)|
+-------------+-----------+------------------+
|           20|      19000|            9500.0|
|           40|       6500|            6500.0|
|          100|      51608| 8601.333333333334|
|           10|       4400|            4400.0|
|           50|      85600|3721.7391304347825|
|           70|      10000|           10000.0|
|           90|      58000|19333.333333333332|
|           60|      28800|            5760.0|
|          110|      20308|           10154.0|
|           30|      24900|            4150.0|
+-------------+-----------+------------------+



## join without partition

In [26]:
# Create a large DataFrame of transactions
transactions_data = [(1, 100, '2023-01-01'),
                     (2, 150, '2023-01-02'),
                     (1, 200, '2023-01-03'),
                     (3, 250, '2023-01-04'),
                     (2, 300, '2023-01-05'),
                     (4, 350, '2023-01-06'),
                     (5, 400, '2023-01-07'),
                     (3, 450, '2023-01-08'),
                     (6, 500, '2023-01-09'),
                     (7, 550, '2023-01-10')]

transactions_columns = ["user_id", "amount", "transaction_date"]
transactions_df = spark.createDataFrame(transactions_data, transactions_columns)

# Create a small DataFrame of user information
users_data = [(1, 'Alice'),
              (2, 'Bob'),
              (3, 'Charlie'),
              (4, 'David'),
              (5, 'Eve')]

users_columns = ["user_id", "name"]
users_df = spark.createDataFrame(users_data, users_columns)


In [33]:
from time import sleep, time
from pyspark.sql.functions import sum,col

start_time = time()
print(start_time)

users_df.join(transactions_df, 'user_id').show()

end_time = time()

print(end_time)
print("exec_time: ", end_time-start_time)

1725246803.070835
+-------+-------+------+----------------+
|user_id|   name|amount|transaction_date|
+-------+-------+------+----------------+
|      1|  Alice|   100|      2023-01-01|
|      1|  Alice|   200|      2023-01-03|
|      2|    Bob|   150|      2023-01-02|
|      2|    Bob|   300|      2023-01-05|
|      3|Charlie|   250|      2023-01-04|
|      3|Charlie|   450|      2023-01-08|
|      4|  David|   350|      2023-01-06|
|      5|    Eve|   400|      2023-01-07|
+-------+-------+------+----------------+

1725246803.9494195
exec_time:  0.8785843849182129


                                                                                

## join with partition

In [38]:
from time import sleep, time
from pyspark.sql.functions import sum,col

users_df = users_df.repartition("user_id")
transactions_df = transactions_df.repartition("user_id")


start_time = time()
print(start_time)

users_df.join(transactions_df, 'user_id').show()



end_time = time()

print(end_time)
print("exec_time: ", end_time-start_time)

1725246927.7953131
+-------+-------+------+----------------+
|user_id|   name|amount|transaction_date|
+-------+-------+------+----------------+
|      1|  Alice|   100|      2023-01-01|
|      1|  Alice|   200|      2023-01-03|
|      2|    Bob|   150|      2023-01-02|
|      2|    Bob|   300|      2023-01-05|
|      3|Charlie|   250|      2023-01-04|
|      3|Charlie|   450|      2023-01-08|
|      4|  David|   350|      2023-01-06|
|      5|    Eve|   400|      2023-01-07|
+-------+-------+------+----------------+

1725246928.5160182
exec_time:  0.7207050323486328


In [39]:
# Trying on Big data set

In [41]:
employee_file_path = r"/media/manu/sec_storage/data_set/employee_data/employee_data.csv"
recruitment_file_path = r"/media/manu/sec_storage/data_set/employee_data/recruitment_data.csv"

employee_df = spark.read.csv(employee_file_path, inferSchema = True, header=True)
recruitment_df = spark.read.csv(recruitment_file_path, inferSchema = True, header=True)



In [57]:
employee_df.count()

3000

In [58]:
recruitment_df.count()

3000

In [42]:
employee_df.show()

24/09/02 09:42:55 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+-----+-----------+--------+---------+---------+--------------------+-----------------+--------------------+------------+--------------+------------+-------+--------------------------+---------------+----------------------+-----------------+--------------------+----------+-----+----------------------+----------+------------+--------+-----------+-----------------+-----------------------+
|EmpID|  FirstName|LastName|StartDate| ExitDate|               Title|       Supervisor|             ADEmail|BusinessUnit|EmployeeStatus|EmployeeType|PayZone|EmployeeClassificationType|TerminationType|TerminationDescription|   DepartmentType|            Division|       DOB|State|JobFunctionDescription|GenderCode|LocationCode|RaceDesc|MaritalDesc|Performance Score|Current Employee Rating|
+-----+-----------+--------+---------+---------+--------------------+-----------------+--------------------+------------+--------------+------------+-------+--------------------------+---------------+--------------------

In [43]:
recruitment_df.show()

+------------+----------------+-----------+----------+------+-------------+--------------------+--------------------+--------------------+-------------------+-----+--------+--------------------+-----------------+-------------------+--------------+--------------------+------------+
|Applicant ID|Application Date| First Name| Last Name|Gender|Date of Birth|        Phone Number|               Email|             Address|               City|State|Zip Code|             Country|  Education Level|Years of Experience|Desired Salary|           Job Title|      Status|
+------------+----------------+-----------+----------+------+-------------+--------------------+--------------------+--------------------+-------------------+-----+--------+--------------------+-----------------+-------------------+--------------+--------------------+------------+
|        1001|       03-Jun-23|      Scott|  Sheppard|  Male|   31-08-1992|  421-429-7655x39421|perezjanet@exampl...|     597 Smith Point|        Hollandf

In [75]:
# state wise partition join and execution time find

from time import sleep, time
from pyspark.sql.functions import sum,col

start_time = time()
print(start_time)

employee_df.join(recruitment_df, employee_df['FirstName']==recruitment_df['First Name'])

end_time = time()

print(end_time)
print("without partition exec_time: ", end_time-start_time)

# applying reprtitioning

employee_df2 = employee_df.repartition('State')
recruitment_df2 = recruitment_df.repartition('state')

start_time2 = time()
print(start_time2)

employee_df2.join(recruitment_df2, employee_df2['FirstName']==recruitment_df2['First Name'])

end_time2 = time()
print(end_time2)
print("without partition exec_time: ", end_time2-start_time2)


1725256378.6074026
1725256378.62379
without partition exec_time:  0.016387462615966797
1725256378.6384308
1725256378.6503756
without partition exec_time:  0.011944770812988281
