# DE1 — Final Project Notebook
> Author : Badr TAJINI - Data Engineering I - ESIEE 2025-2026
---

This notebook is the primary executable artifact. Fill in the config, run the baseline pipeline, then run the optimized pipeline, and record the evidence for each run.

## 0. Load config

In [None]:
import yaml, pathlib, datetime
from pyspark.sql import SparkSession, functions as F, types as T

# Load the project configuration. The file is decoded explicitly as UTF-8 so
# the parsed values do not depend on the platform's default locale encoding.
with open("project/de1_project_config.yml", encoding="utf-8") as f:
    CFG = yaml.safe_load(f)

# Get (or create) the Spark session named "de1-project" used by this notebook.
spark = SparkSession.builder.appName("de1-project").getOrCreate()
CFG  # bare expression: lets the notebook cell display the loaded config


## 1. Bronze — landing raw data

In [None]:
# Locations come from the project config. `proof` is only read here; the
# cells that save plan files use it later.
raw_glob = CFG["paths"]["raw_csv_glob"]
bronze = CFG["paths"]["bronze"]
proof = CFG["paths"]["proof"]

# Read the raw CSV files, treating the first line of each file as the header.
df_raw = spark.read.option("header", "true").csv(raw_glob)

# Keep the raw data as a CSV copy. The header has to be requested explicitly:
# Spark's CSV writer omits it by default, which would drop the column names
# from the bronze files and make them unreadable with header=true.
df_raw.write.mode("overwrite").option("header", "true").csv(bronze)
print("Bronze written:", bronze)


## 2. Silver — cleaning and typing

In [None]:
from pyspark.sql import functions as F, types as T

silver = CFG["paths"]["silver"]

# Example typing step; adapt the column names and target types to the dataset.
_metric_as_double = F.col("metric").cast("double")
_date_parsed = F.to_date("date")

_typed = df_raw.withColumn("metric", _metric_as_double)
_typed = _typed.withColumn("date", _date_parsed)

# Keep only rows where both typed columns are non-null.
df_silver = _typed.dropna(subset=["metric", "date"])

df_silver.write.mode("overwrite").parquet(silver)
print("Silver written:", silver)


## 3. Gold — analytics tables

In [None]:
gold = CFG["paths"]["gold"]
partition_by = CFG["layout"]["partition_by"]

# Example gold table Q1: sum of `metric` for each `date`.
_sum_metric = F.sum("metric").alias("sum_metric")
gold_q1 = df_silver.groupBy("date").agg(_sum_metric)

# Overwrite the Q1 table as Parquet, partitioned by the configured columns.
_q1_writer = gold_q1.write.mode("overwrite").partitionBy(*partition_by)
_q1_writer.parquet(f"{gold}/q1_daily")

print("Gold written:", gold)


## 4. Baseline plans and metrics

In [None]:
import os
import datetime as _dt
import pathlib

# Make sure the proof directory exists before any evidence file is written.
pathlib.Path(proof).mkdir(parents=True, exist_ok=True)

# Example baseline evidence: the executed plan of gold_q1 rendered as text.
plan = gold_q1._jdf.queryExecution().executedPlan().toString()

# File layout: a timestamp on the first line, followed by the plan.
with open(f"{proof}/baseline_q1_plan.txt", "w") as f:
    f.writelines([str(_dt.datetime.now()) + "\n", plan])
print("Saved baseline plan. Record Spark UI metrics now.")


## 5. Optimization — layout and joins

In [None]:
# Example optimization: project only the columns the query needs, then
# aggregate before writing.
_sum_metric_opt = F.sum("metric").alias("sum_metric")
df_silver_min = df_silver.select("date", "metric")
gold_q1_opt = df_silver_min.groupBy("date").agg(_sum_metric_opt)

(gold_q1_opt.write
    .mode("overwrite")
    .partitionBy(*partition_by)
    .parquet(f"{gold}/q1_daily_opt"))

# Save the executed plan of the optimized query: timestamp line, then the plan.
plan_opt = gold_q1_opt._jdf.queryExecution().executedPlan().toString()
with open(f"{proof}/optimized_q1_plan.txt", "w") as f:
    f.writelines([str(_dt.datetime.now()) + "\n", plan_opt])
print("Saved optimized plan. Record Spark UI metrics now.")


## 6. Cleanup

In [None]:
# Stop the Spark session once all cells above have run; the message is printed
# only after stop() has returned.
spark.stop()
print("Spark session stopped.")
