<a href="https://colab.research.google.com/github/mayureshpawashe/ad_spark/blob/main/ad_spark_day4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
import urllib.request
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Ad Spark day3 ').getOrCreate()
url = "https://raw.githubusercontent.com/prasertcbs/basic-dataset/refs/heads/master/Employee%20data.csv"
file_path = "/tmp/EMP_data.csv"
urllib.request.urlretrieve(url, file_path)
df = spark.read.csv(file_path, header=True, inferSchema=True)
df.show(10)

+----+------+----------+----+--------+-------+--------+-------+-------+--------+
|  id|gender|     bdate|educ|  jobcat| salary|salbegin|jobtime|prevexp|minority|
+----+------+----------+----+--------+-------+--------+-------+-------+--------+
| 1.0|  Male|1952-02-03|  15| Manager|57000.0| 27000.0|   98.0|  144.0|      No|
| 2.0|  Male|1958-05-23|  16|Clerical|40200.0| 18750.0|   98.0|   36.0|      No|
| 3.0|Female|1929-07-26|  12|Clerical|21450.0| 12000.0|   98.0|  381.0|      No|
| 4.0|Female|1947-04-15|   8|Clerical|21900.0| 13200.0|   98.0|  190.0|      No|
| 5.0|  Male|1955-02-09|  15|Clerical|45000.0| 21000.0|   98.0|  138.0|      No|
| 6.0|  Male|1958-08-22|  15|Clerical|32100.0| 13500.0|   98.0|   67.0|      No|
| 7.0|  Male|1956-04-26|  15|Clerical|36000.0| 18750.0|   98.0|  114.0|      No|
| 8.0|Female|1966-05-06|  12|Clerical|21900.0|  9750.0|   98.0|missing|      No|
| 9.0|Female|1946-01-23|  15|Clerical|27900.0| 12750.0|   98.0|  115.0|      No|
|10.0|Female|1946-02-13|  12

##Using cache()

In [3]:
# Cache the DataFrame
df.cache()

DataFrame[id: double, gender: string, bdate: date, educ: int, jobcat: string, salary: double, salbegin: double, jobtime: double, prevexp: string, minority: string]

In [4]:
# Perform an action before and after caching
import time
start_time = time.time()
df.count()  # Action to trigger computation
print("Time before caching:", time.time() - start_time)

start_time = time.time()
df.count()  # This time, data is read from cache
print("Time after caching:", time.time() - start_time)

Time before caching: 1.4852211475372314
Time after caching: 0.45963573455810547


##Using persist()

In [5]:
from pyspark import StorageLevel

# Persist DataFrame in memory and disk
df.persist(StorageLevel.MEMORY_AND_DISK)

# Measure time before persisting
start_time = time.time()
df.count()  # Trigger an action
print("Time before persisting:", time.time() - start_time)

# Measure time after persisting
start_time = time.time()
df.count()  # Data is now read from persisted storage
print("Time after persisting:", time.time() - start_time)


Time before persisting: 0.2648754119873047
Time after persisting: 0.2920198440551758


#🚀 persist() can be faster than cache()

##Repartitioning and Coalescing

In [7]:
# Check the current number of partitions
print("Initial Partitions:", df.rdd.getNumPartitions())

# Increase the number of partitions
df_repartitioned = df.repartition(5)
print("Partitions after repartitioning:", df_repartitioned.rdd.getNumPartitions())

# Reduce the number of partitions using coalesce (efficient for downsizing)
df_coalesced = df_repartitioned.coalesce(2)
print("Partitions after coalescing:", df_coalesced.rdd.getNumPartitions())

Initial Partitions: 1
Partitions after repartitioning: 5
Partitions after coalescing: 2


In [11]:
data = [("James", "Sales", 3000),
        ("Michael", "Sales", 4600),
        ("Robert", "Sales", 4100),
        ("Maria", "Finance", 3000),
        ("James", "Sales", 3000),
        ("Scott", "Finance", 3300),
        ("Jen", "Finance", 3900),
        ("Jeff", "Marketing", 3000),
        ("Kumar", "Marketing", 2000),
        ("Saif", "Sales", 4100)]
columns = ["Employee_Name", "Department", "Salary"]
df1 = spark.createDataFrame(data=data, schema=columns)

# Original number of partitions
original_partitions = df.rdd.getNumPartitions()
print(f"Original number of partitions: {original_partitions}")

# Repartition to increase the number of partitions
repartitioned_df = df.repartition(6)
new_partitions = repartitioned_df.rdd.getNumPartitions()
print(f"New number of partitions after repartition: {new_partitions}")

Original number of partitions: 1
New number of partitions after repartition: 6


##Common Optimization Techniques for PySpark DataFrames

###Use select() Instead of * (Column Pruning)

In [10]:
#df_all = df.select("*").show()  # Avoid selecting unnecessary columns

df_selected = df.select("id", "salary", "jobcat").show()

+----+--------+--------+
|  id|  salary|  jobcat|
+----+--------+--------+
| 1.0| 57000.0| Manager|
| 2.0| 40200.0|Clerical|
| 3.0| 21450.0|Clerical|
| 4.0| 21900.0|Clerical|
| 5.0| 45000.0|Clerical|
| 6.0| 32100.0|Clerical|
| 7.0| 36000.0|Clerical|
| 8.0| 21900.0|Clerical|
| 9.0| 27900.0|Clerical|
|10.0| 24000.0|Clerical|
|11.0| 30300.0|Clerical|
|12.0| 28350.0|Clerical|
|13.0| 27750.0|Clerical|
|14.0| 35100.0|Clerical|
|15.0| 27300.0|Clerical|
|16.0| 40800.0|Clerical|
|17.0| 46000.0|Clerical|
|18.0|103750.0| Manager|
|19.0| 42300.0|Clerical|
|20.0| 26250.0|Clerical|
+----+--------+--------+
only showing top 20 rows



##Filter Data Early (Predicate Pushdown)

In [14]:
#df_filtered = df.select("*").filter(df.salary > 50000)  # Delayed filtering

df_filtered = df.filter(df.salary > 50000).show(5)  # Push filter early

+----+------+----------+----+-------+--------+--------+-------+-------+--------+
|  id|gender|     bdate|educ| jobcat|  salary|salbegin|jobtime|prevexp|minority|
+----+------+----------+----+-------+--------+--------+-------+-------+--------+
| 1.0|  Male|1952-02-03|  15|Manager| 57000.0| 27000.0|   98.0|  144.0|      No|
|18.0|  Male|1956-03-20|  16|Manager|103750.0| 27510.0|   97.0|   70.0|      No|
|27.0|  Male|1954-03-19|  19|Manager| 60375.0| 27480.0|   96.0|   96.0|      No|
|29.0|  Male|1944-01-28|  19|Manager|135000.0| 79980.0|   96.0|  199.0|      No|
|32.0|  Male|1954-01-28|  19|Manager|110625.0| 45000.0|   96.0|  120.0|      No|
+----+------+----------+----+-------+--------+--------+-------+-------+--------+
only showing top 5 rows



##Use cache() or persist() for Reused DataFrames

In [15]:
#df.count()  # Without persist, every action recomputes the DataFrame
#df.show()
df.persist()  # Default is MEMORY_AND_DISK
df.count()  # Triggers execution and stores result

474

##Avoid UDFs (Use Built-in Functions Instead)

In [16]:
from pyspark.sql.functions import when, col

df = df.withColumn("high_salary", when(col("salary") > 50000, "Yes").otherwise("No")).show()


+----+------+----------+----+--------+--------+--------+-------+-------+--------+-----------+
|  id|gender|     bdate|educ|  jobcat|  salary|salbegin|jobtime|prevexp|minority|high_salary|
+----+------+----------+----+--------+--------+--------+-------+-------+--------+-----------+
| 1.0|  Male|1952-02-03|  15| Manager| 57000.0| 27000.0|   98.0|  144.0|      No|        Yes|
| 2.0|  Male|1958-05-23|  16|Clerical| 40200.0| 18750.0|   98.0|   36.0|      No|         No|
| 3.0|Female|1929-07-26|  12|Clerical| 21450.0| 12000.0|   98.0|  381.0|      No|         No|
| 4.0|Female|1947-04-15|   8|Clerical| 21900.0| 13200.0|   98.0|  190.0|      No|         No|
| 5.0|  Male|1955-02-09|  15|Clerical| 45000.0| 21000.0|   98.0|  138.0|      No|         No|
| 6.0|  Male|1958-08-22|  15|Clerical| 32100.0| 13500.0|   98.0|   67.0|      No|         No|
| 7.0|  Male|1956-04-26|  15|Clerical| 36000.0| 18750.0|   98.0|  114.0|      No|         No|
| 8.0|Female|1966-05-06|  12|Clerical| 21900.0|  9750.0|   9

##Use groupBy().agg() Instead of groupBy().count()

In [23]:
from pyspark.sql.functions import count

df_grouped = df.groupBy("jobcat").agg(count("*").alias("total_count")).show()

+---------+-----------+
|   jobcat|total_count|
+---------+-----------+
|Custodial|         27|
| Clerical|        363|
|  Manager|         84|
+---------+-----------+

