VALIDATED DIM DATE

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Hadoop s3a connector settings: filesystem implementation, credentials taken
# from the AWS default provider chain, and path-style bucket addressing.
s3a_settings = {
    "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    "spark.hadoop.fs.s3a.aws.credentials.provider": "com.amazonaws.auth.DefaultAWSCredentialsProviderChain",
    "spark.hadoop.fs.s3a.path.style.access": "true",
}

session_builder = SparkSession.builder.appName("S3Test")
for setting_name, setting_value in s3a_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

# Reuses an already-running session if the kernel has one.
spark = session_builder.getOrCreate()



In [2]:
# Location of the gold-layer date dimension (Parquet files on S3).
gold_dim_date_path = "s3a://pedro-datalake-project/gold/dim_date/"

# Lazy read: nothing is scanned until an action runs in a later cell.
df_dim_date = spark.read.format("parquet").load(gold_dim_date_path)


In [3]:
# Total number of rows in the date dimension; the key-uniqueness cell below
# displays this next to the distinct date_key count.
total_rows = df_dim_date.count()
total_rows


11323

In [4]:
# Primary-key check: every row of the dimension must have its own date_key,
# so the distinct key count has to equal the total row count.
distinct_dates = df_dim_date.select("date_key").distinct().count()

# Fail loudly on duplicates instead of relying on someone comparing the two
# numbers by eye; this keeps "Restart & Run All" meaningful as a validation.
assert distinct_dates == total_rows, (
    f"date_key is not unique: {total_rows} rows but {distinct_dates} distinct keys"
)

total_rows, distinct_dates


(11323, 11323)

In [5]:
# Calendar coverage: earliest and latest date present in the dimension.
earliest_date = F.min("date").alias("min_date")
latest_date = F.max("date").alias("max_date")

df_dim_date.agg(earliest_date, latest_date).show()


+----------+----------+
|  min_date|  max_date|
+----------+----------+
|2000-01-01|2030-12-31|
+----------+----------+



In [6]:
def _null_count(column_name):
    """Build an aggregate that counts the rows where `column_name` is NULL.

    `F.when` yields a non-null literal only for NULL rows, and `F.count`
    ignores the NULLs produced for every other row.
    """
    return F.count(F.when(F.col(column_name).isNull(), column_name)).alias(column_name)


# Columns checked for missing values.
# NOTE(review): the sample output further down also shows weekday_name,
# month_name, quarter, year_month, year_week and is_weekend, which this check
# does not cover -- confirm whether that is intentional.
checked_columns = ["date_key", "date", "year", "month", "day", "day_of_week"]

df_dim_date.select([_null_count(column_name) for column_name in checked_columns]).show()


+--------+----+----+-----+---+-----------+
|date_key|date|year|month|day|day_of_week|
+--------+----+----+-----+---+-----------+
|       0|   0|   0|    0|  0|          0|
+--------+----+----+-----+---+-----------+



In [7]:
# How many dates are flagged as weekend vs. weekday.
weekend_breakdown = df_dim_date.groupBy("is_weekend").count()
weekend_breakdown.show()

# Spot-check a few Sundays to eyeball the derived calendar attributes.
# NOTE(review): in the saved output 2000-01-02 carries year_week "2000-52" and
# later weeks are unpadded ("2000-1"); this looks like a calendar year combined
# with a week number from the previous week-year -- confirm with the job that
# builds dim_date.
sunday_rows = df_dim_date.where(F.col("weekday_name") == "Sunday")
sunday_rows.show(5)


+----------+-----+
|is_weekend|count|
+----------+-----+
|      true| 3236|
|     false| 8087|
+----------+-----+

+----------+--------+----+-----+---+-----------+------------+----------+-------+----------+---------+----------+
|      date|date_key|year|month|day|day_of_week|weekday_name|month_name|quarter|year_month|year_week|is_weekend|
+----------+--------+----+-----+---+-----------+------------+----------+-------+----------+---------+----------+
|2000-01-02|20000102|2000|    1|  2|          7|      Sunday|   January|      1|   2000-01|  2000-52|      true|
|2000-01-09|20000109|2000|    1|  9|          7|      Sunday|   January|      1|   2000-01|   2000-1|      true|
|2000-01-16|20000116|2000|    1| 16|          7|      Sunday|   January|      1|   2000-01|   2000-2|      true|
|2000-01-23|20000123|2000|    1| 23|          7|      Sunday|   January|      1|   2000-01|   2000-3|      true|
|2000-01-30|20000130|2000|    1| 30|          7|      Sunday|   January|      1|   2000-01|   