CREATED DIM DATE

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = (
    SparkSession.builder
    .appName("S3Test")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "com.amazonaws.auth.DefaultAWSCredentialsProviderChain")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .getOrCreate()
)



In [2]:
start_date = "2000-01-01"
end_date = "2030-12-31"
df_dates = spark.sql(f"""
    SELECT explode(
        sequence(
            to_date('{start_date}'),
            to_date('{end_date}'),
            interval 1 day
        )
    ) AS date
""")


In [3]:
df_dim_date = (
    df_dates
    .withColumn("date_key", F.date_format("date", "yyyyMMdd").cast("int"))
    .withColumn("year", F.year("date"))
    .withColumn("month", F.month("date"))
    .withColumn("day", F.dayofmonth("date"))

    .withColumn("spark_dayofweek", F.dayofweek("date"))

    .withColumn(
        "day_of_week",
        F.when(F.col("spark_dayofweek") == 1, F.lit(7))
         .otherwise(F.col("spark_dayofweek") - 1)
    )

    .withColumn(
        "weekday_name",
        F.when(F.col("day_of_week") == 1, F.lit("Monday"))
         .when(F.col("day_of_week") == 2, F.lit("Tuesday"))
         .when(F.col("day_of_week") == 3, F.lit("Wednesday"))
         .when(F.col("day_of_week") == 4, F.lit("Thursday"))
         .when(F.col("day_of_week") == 5, F.lit("Friday"))
         .when(F.col("day_of_week") == 6, F.lit("Saturday"))
         .otherwise(F.lit("Sunday"))
    )

    .withColumn(
        "month_name",
        F.when(F.col("month") == 1, F.lit("January"))
         .when(F.col("month") == 2, F.lit("February"))
         .when(F.col("month") == 3, F.lit("March"))
         .when(F.col("month") == 4, F.lit("April"))
         .when(F.col("month") == 5, F.lit("May"))
         .when(F.col("month") == 6, F.lit("June"))
         .when(F.col("month") == 7, F.lit("July"))
         .when(F.col("month") == 8, F.lit("August"))
         .when(F.col("month") == 9, F.lit("September"))
         .when(F.col("month") == 10, F.lit("October"))
         .when(F.col("month") == 11, F.lit("November"))
         .otherwise(F.lit("December"))
    )

    .withColumn("quarter", F.quarter("date"))
    .withColumn("year_month", F.date_format("date", "yyyy-MM"))
    .withColumn("year_week", F.concat_ws("-", F.year("date"), F.weekofyear("date")))
    .withColumn("is_weekend", F.col("day_of_week").isin([6, 7]))

    .drop("spark_dayofweek")
)


In [4]:
df_dim_date.printSchema()
df_dim_date.show(10)
df_dim_date.count()
df_dim_date.select("date_key").distinct().count()


root
 |-- date: date (nullable = false)
 |-- date_key: integer (nullable = true)
 |-- year: integer (nullable = false)
 |-- month: integer (nullable = false)
 |-- day: integer (nullable = false)
 |-- day_of_week: integer (nullable = false)
 |-- weekday_name: string (nullable = false)
 |-- month_name: string (nullable = false)
 |-- quarter: integer (nullable = false)
 |-- year_month: string (nullable = false)
 |-- year_week: string (nullable = false)
 |-- is_weekend: boolean (nullable = false)

+----------+--------+----+-----+---+-----------+------------+----------+-------+----------+---------+----------+
|      date|date_key|year|month|day|day_of_week|weekday_name|month_name|quarter|year_month|year_week|is_weekend|
+----------+--------+----+-----+---+-----------+------------+----------+-------+----------+---------+----------+
|2000-01-01|20000101|2000|    1|  1|          6|    Saturday|   January|      1|   2000-01|  2000-52|      true|
|2000-01-02|20000102|2000|    1|  2|          7| 

11323