In [0]:
from pyspark.sql import SparkSession, functions as F
# Reuse the active Spark session if one exists; otherwise start one
# registered under the application name "Spark DataFrames".
spark = (
    SparkSession.builder
    .appName("Spark DataFrames")
    .getOrCreate()
)

# Source table for every exercise below.
delta = spark.read.table("chocolate_sales")

## 🔹 LEVEL 1 — Warm-up (Basics)

1. Show only the following columns:

   * `Shipdate`
   * `Boxes`
   * `Amount`

2. Display only **rows where Boxes is greater than 10**.

3. Find how many rows exist in the table.

In [0]:
# Task 1: project only the three requested columns.
delta.select(["Shipdate", "Boxes", "Amount"]).show()

# Task 2: keep the rows whose Boxes value exceeds 10.
delta.where(delta["Boxes"] > 10).show()

# Task 3: total number of rows in the table (displayed as the cell result).
delta.count()

## 🔹 LEVEL 2 — Column Handling

4. Convert `Shipdate` from **string → date** and keep all other columns.

5. Create a new column called `Amount_per_Box`.

   * If Boxes is 0 or NULL, result should be 0.

6. Rename column `Order_Status` to `Status`.

In [0]:
# Task 4: parse Shipdate (string -> date) in place; withColumn on an existing
# name replaces that column and leaves every other column untouched.
new_delta = delta.withColumn(
    "Shipdate",
    F.to_date(F.col("Shipdate"), "dd-MMM-yy"),
)
new_delta.printSchema()

# Task 5: Amount_per_Box = Amount / Boxes, falling back to 0 when Boxes is
# NULL or 0 so the division is never evaluated for those rows.
new_delta = new_delta.withColumn(
    "Amount_per_Box",
    F.when(F.col("Boxes").isNull() | (F.col("Boxes") == 0), F.lit(0))
     .otherwise(F.col("Amount") / F.col("Boxes")),
)

# Task 6: expose Order_Status under the shorter name Status.
new_delta = new_delta.withColumnRenamed("Order_Status", "Status")
new_delta.show(5)
                     

## 🔹 LEVEL 3 — Filtering Logic

7. Keep only rows where:

   * `Status` = "Completed"
   * `Amount` > 100

8. Find how many such rows exist.

In [0]:
# Tasks 7-8: count the rows that satisfy both conditions
# (Status matches and Amount is above 100).
# NOTE(review): the exercise text asks for Status = "Completed", but this
# filter matches 'Delivered' — confirm which label the data actually uses.
(
    new_delta
    .where(F.col("Status") == "Delivered")
    .where(F.col("Amount") > 100)
    .count()
)

## 🔹 LEVEL 4 — Aggregations (Core Spark Thinking)

9. Find **total Amount and total Boxes** across the entire table.

10. Find **total Amount and total Boxes per Shipdate**.

11. Find **number of orders per Shipdate**.

In [0]:
# Task 9: grand totals of Amount and Boxes over the whole table.
new_delta.agg(
    F.sum("Amount").alias("total_amount"),
    F.sum("Boxes").alias("total_boxes"),
).show()

# Task 10: the same two totals, broken down by Shipdate.
new_delta.groupBy("Shipdate").agg(
    F.sum("Amount").alias("total_amount_per_shipdate"),
    F.sum("Boxes").alias("total_boxes_per_shipdate"),
).show()

# Task 11: number of orders per Shipdate.
# The task places no condition on Status, so rows of every status are counted;
# the earlier Status == 'Delivered' pre-filter undercounted and was
# inconsistent with the unfiltered totals above.
# Assumes one row represents one order — TODO confirm against the table schema.
group_by_orders = new_delta.groupBy("Shipdate").agg(
    F.count(F.lit(1)).alias("number of orders")
)
group_by_orders.show()



## 🔹 LEVEL 5 — Slightly Tricky

12. Find Shipdates where **total Boxes sold > 100**.

13. Sort Shipdates by **total Amount (descending)** and show top 5.

In [0]:
# Task 12: Shipdates whose summed Boxes exceed 100.
# The threshold applies to the per-date total, so aggregate first and
# filter on the aggregate afterwards.
boxes_by_shipdate = new_delta.groupBy("Shipdate").agg(
    F.sum("Boxes").alias("total_box_sold")
)
boxes_by_shipdate.where(F.col("total_box_sold") > 100).show(5)

# Task 13: the five Shipdates with the largest summed Amount, highest first.
amount_by_shipdate = new_delta.groupBy("Shipdate").agg(
    F.sum("Amount").alias("total_amount")
)
amount_by_shipdate.orderBy(F.col("total_amount").desc()).limit(5).show()


## 🔹 LEVEL 6 — Delta & Validation

14. Write the **daily aggregation result** into a new Delta table.

15. Verify:

* Table format is Delta
* Column datatypes are correct

In [0]:
# Quick look at the first five transformed rows before aggregating.
new_delta.show(n=5)

In [0]:
# Task 14: build the per-Shipdate aggregate and persist it as a Delta table.
daily_chocolate_sales = (
    new_delta
    .groupBy("Shipdate")
    .agg(
        F.sum("Amount").alias("total_amount"),
        F.sum("Boxes").alias("total_boxes"),
        F.avg("Amount_per_Box").alias("average_amount_per_box"),
    )
)

(
    daily_chocolate_sales.write
    .format("delta")
    .mode("overwrite")
    # The table is rebuilt from scratch on every run, so let the overwrite
    # replace the stored schema too; otherwise a re-run after changing the
    # aggregate columns fails on a schema mismatch.
    .option("overwriteSchema", "true")
    .saveAsTable("daily_chocolate_sales")
)

# Task 15a: verify the saved table is stored in Delta format.
table_format = (
    spark.sql("DESCRIBE DETAIL daily_chocolate_sales")
    .select("format")
    .first()[0]
)
assert table_format == "delta", f"expected a Delta table, found format {table_format!r}"

# Task 15b: verify the column datatypes by reading the table back.
saved_daily_sales = spark.read.table("daily_chocolate_sales")
saved_daily_sales.printSchema()

# Shipdate was produced by F.to_date upstream, so it must be stored as a date.
# Look the column up case-insensitively in case the catalog normalizes names.
saved_dtypes = {name.lower(): dtype for name, dtype in saved_daily_sales.dtypes}
assert saved_dtypes["shipdate"] == "date", (
    f"Shipdate should be stored as date, found {saved_dtypes['shipdate']!r}"
)

