# Here are some of my PySpark code examples

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col,
    explode,
    split,
    count,
    sum,
    avg,
    when,
    desc,
    broadcast,
)
import os
import random
import string
import json

In [1]:
from pyspark.sql import SparkSession

# Configure the session: point at the standalone master and name the app.
session_builder = SparkSession.builder.master("spark://spark-master:7077")
session_builder = session_builder.appName("dataframPractices")

# Reuse an already-running session if there is one, otherwise start a new one.
spark = session_builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/20 04:57:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/03/20 04:57:44 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


# Partitioning

In [None]:
"""Partitioning experiment: inspect and change how the sales data is split."""
print("\n=== EXPERIMENT 1: PARTITIONING ===")

# Load the sales CSV, taking the column names from its header row
sales = spark.read.csv("/data/practice/sales.csv", header=True)

# Partition count Spark chose for the freshly read data
n_default = sales.rdd.getNumPartitions()
print(f"Default partitions: {n_default}")

# Ask for an explicit number of partitions
sales_10p = sales.repartition(10)
n_fixed = sales_10p.rdd.getNumPartitions()
print(f"After repartition(10): {n_fixed}")

# Repartition on a column instead of a count
sales_by_date = sales.repartition("timestamp")
n_by_key = sales_by_date.rdd.getNumPartitions()
print(f"After repartition by timestamp: {n_by_key}")

# Shrink the ten partitions down to four with coalesce
# (narrow transformation - doesn't shuffle all data)
sales_coalesceed = sales_10p.coalesce(4)
n_coalesced = sales_coalesceed.rdd.getNumPartitions()
print(f"After coalesce(4): {n_coalesced}")

# Report how many records landed in each of the ten partitions:
# glom() turns every partition into one list, len() measures it.
print("Partition distribution:")
rows_per_partition = sales_10p.rdd.glom()
part_sizes = rows_per_partition.map(len).collect()
for idx, n_rows in enumerate(part_sizes):
    print(f"Partition {idx}: {n_rows} records")


25/03/20 05:58:33 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
25/03/20 05:58:48 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
25/03/20 05:59:03 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
25/03/20 05:59:18 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
25/03/20 05:59:33 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
25/03/20 05:59:48 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources

# Narrow Transformations

In [None]:
"""Narrow transformations experiment: operations that need no shuffle."""
print("\n=== EXPERIMENT 2: NARROW TRANSFORMATIONS ===")

# Load the sales CSV, taking the column names from its header row
sales = spark.read.csv("/data/practice/sales.csv", header=True)

print("Executing narrow transformations...")

# Cast expressions reused by the steps below
price_as_double = col("price").cast("double")
quantity_as_int = col("quantity").cast("int")

# 1. Filter: keep only rows priced above 500 (no shuffle needed)
high_value_sales = sales.where(price_as_double > 500)
high_value_count = high_value_sales.count()
print(f"High value sales count: {high_value_count}")

# 2. Map: derive a per-row total from quantity and price (no shuffle needed)
sales_with_total = sales.withColumn("total_value", quantity_as_int * price_as_double)
sales_with_total.show(5)

# 3. Sample: take roughly 10% of the rows, seeded for repeatability
#    (no shuffle needed)
sample_sales = sales.sample(fraction=0.1, seed=42)
sample_count = sample_sales.count()
print(f"Sample size: {sample_count}")

# Wide Transformations

In [None]:
"""Demonstrate wide transformations (shuffling required)"""
print("\n=== EXPERIMENT 3: WIDE TRANSFORMATIONS ===")

# Read sales data and cast the numeric columns so aggregation and sorting
# operate on numbers rather than on the raw CSV text.
sales = spark.read.option("header", "true").csv("/data/practice/sales.csv")
sales = sales.withColumn("price", col("price").cast("double"))
sales = sales.withColumn("quantity", col("quantity").cast("int"))
print("\033[31mSchema:\033[0m")
# printSchema() prints the schema itself and returns None, so interpolating
# its result into an f-string produced an uncoloured schema followed by a red
# "None". Emit the ANSI colour codes around the call instead.
print("\033[31m", end="")
sales.printSchema()
print("\033[0m", end="")

# 1. groupBy (wide transformation - requires shuffle)
print("Executing groupBy (wide transformation)...")
sales_by_product = sales.groupBy("product_id").agg(
    count("*").alias("order_count"),
    sum("quantity").alias("total_quantity"),  # pyspark.sql.functions.sum, not the builtin
    avg("price").alias("avg_price"),
)
print("Sales aggregated by product_id:")
sales_by_product.show(5)

# 2. join (wide transformation - requires shuffle)
print("Executing join (wide transformation)...")
users = spark.read.option("header", "true").csv("/data/practice/users.csv")

# Join sales with users on the shared user_id column - this is a wide transformation
sales_with_user = sales.join(users, "user_id")
print("Sales joined with users:")
sales_with_user.select("order_id", "user_id", "name", "country", "price").show(5)

# 3. orderBy/sort (wide transformation - requires shuffle)
print("Executing orderBy (wide transformation)...")
sorted_sales = sales.orderBy(desc("price"))
print("Sales sorted by price:")
sorted_sales.show(5)

# Optimizations

In [None]:
"""Demonstrate various optimizations for distributed processing"""
print("\n=== EXPERIMENT 4: OPTIMIZATIONS ===")

sales = spark.read.option("header", "true").csv("/data/practice/sales.csv")
# NOTE: users is loaded here but not referenced anywhere else in this cell.
users = spark.read.option("header", "true").csv("/data/practice/users.csv")
products = spark.read.option("header", "true").csv("/data/practice/products.csv")

# Convert columns to appropriate types
sales = sales.withColumn("price", col("price").cast("double"))
sales = sales.withColumn("quantity", col("quantity").cast("int"))

# 1. Broadcast join (optimization for joining with small tables)
print("Demonstrating broadcast join...")
# Products table is small, so we can broadcast it
sales_with_product = sales.join(broadcast(products), "product_id")
# DataFrames are lazy: building the join alone runs nothing. Print the physical
# plan so the chosen join strategy is visible, then trigger an action so the
# "Executed" message below is actually true.
sales_with_product.explain()
sales_with_product.show(5)
print("Executed broadcast join - joining sales with products")

# 2. Caching/persisting
print("Demonstrating caching...")
# Cache a DataFrame we'll use multiple times
cached_sales = sales.cache()

# Use the cached DataFrame: the first action populates the cache, the second
# one can be served from it.
print(f"Total sales count (from cache): {cached_sales.count()}")
print(
    f"Average price (from cache): {cached_sales.agg({'price': 'avg'}).collect()[0][0]}"
)

# 3. Partition pruning
print("Demonstrating partition pruning...")
# Partition data by timestamp (simulating a common pattern).
# Takes the second "-"-separated field of the timestamp as the month;
# assumes timestamps look like "YYYY-MM-..." - TODO confirm against the data.
sales_by_month = sales.withColumn("month", split(col("timestamp"), "-")[1])
sales_by_month.write.partitionBy("month").mode("overwrite").parquet(
    "/data/practice/sales_partitioned"
)

# Read back with partition filtering.
# NOTE(review): Spark's partition discovery may infer `month` as an integer
# when reading the directory names back, in which case the comparison with
# "01" relies on an implicit cast - confirm it matches the intended rows.
january_sales = spark.read.parquet("/data/practice/sales_partitioned").filter(
    col("month") == "01"
)
print(f"January sales count (using partition pruning): {january_sales.count()}")

# Release the cached data now that every consumer in this cell has run, so it
# does not keep occupying executor memory for the rest of the session.
cached_sales.unpersist()
