In [1]:
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip
import ConnectionConfig as cc
# Run the project-local ConnectionConfig setup before any Spark session is created.
# NOTE(review): what setupEnvironment() configures is defined in ConnectionConfig,
# not in this notebook — confirm there.
cc.setupEnvironment()

In [2]:
# Obtain the Spark session used by every following cell from the project's
# ConnectionConfig helper, passing "DIM_DATE" and 4.
# NOTE(review): presumably "DIM_DATE" is the application name and 4 a
# core/partition count — confirm against cc.startLocalCluster.
spark = cc.startLocalCluster("DIM_DATE",4)
# Last expression of the cell, so the notebook displays the active session.
spark.getActiveSession()

In [7]:
from pyspark.sql.functions import *

# Inclusive bounds of the calendar covered by the date dimension.
beginDate = '2015-09-22'
endDate = '2025-12-31'

# Generate one row per day between beginDate and endDate (inclusive) together
# with a surrogate key.
# posexplode() returns the 0-based position of every date inside the generated
# sequence, so pos + 1 yields keys 1..N that are consecutive and always follow
# date order. monotonically_increasing_id() only guarantees increasing, unique
# values (not consecutive ones), so it leaves gaps as soon as the rows span more
# than one partition.
# The cast keeps dateSK a bigint, the type monotonically_increasing_id() produced.
df_SQL = spark.sql(f"""
    select calendarDate,
           cast(pos + 1 as bigint) as dateSK
    from (
        select posexplode(
                   sequence(to_date('{beginDate}'), to_date('{endDate}'), interval 1 day)
               ) as (pos, calendarDate)
    ) as generatedDates
""")


# Register the generated dates as a temp view so later cells can query them by name.
df_SQL.createOrReplaceTempView("neededDates")

spark.sql("select * from neededDates").show()

+------------+------+
|calendarDate|dateSK|
+------------+------+
|  2015-09-22|     1|
|  2015-09-23|     2|
|  2015-09-24|     3|
|  2015-09-25|     4|
|  2015-09-26|     5|
|  2015-09-27|     6|
|  2015-09-28|     7|
|  2015-09-29|     8|
|  2015-09-30|     9|
|  2015-10-01|    10|
|  2015-10-02|    11|
|  2015-10-03|    12|
|  2015-10-04|    13|
|  2015-10-05|    14|
|  2015-10-06|    15|
|  2015-10-07|    16|
|  2015-10-08|    17|
|  2015-10-09|    18|
|  2015-10-10|    19|
|  2015-10-11|    20|
+------------+------+
only showing top 20 rows



In [8]:
# Build the date dimension from the neededDates temp view.
# Each entry below is one select expression; they are joined back into a single
# SQL statement before execution.
# day_nr comes from dayofweek(), which numbers Sunday as 1 and Saturday as 7,
# whereas IsWeekDay relies on weekday(), where values below 5 are Monday-Friday.
select_expressions = [
    "dateSK",
    "calendarDate as date",
    "year(calendarDate) as year",
    "quarter(calendarDate) as QuarterOfYear",
    "month(calendarDate) as month_nr",
    "date_format(calendarDate,'MMMM') as month_name",
    "date_format(calendarDate, 'EEEE') as day_name",
    "dayofweek(calendarDate) AS day_nr",
    "case when weekday(calendarDate) < 5 then 'Y' else 'N' end as IsWeekDay",
]
dim_date_query = "select " + ", ".join(select_expressions) + "  from neededDates"
dimDate = spark.sql(dim_date_query)


dimDate.show()

+------+----------+----+-------------+--------+----------+---------+------+---------+
|dateSK|      date|year|QuarterOfYear|month_nr|month_name| day_name|day_nr|IsWeekDay|
+------+----------+----+-------------+--------+----------+---------+------+---------+
|     1|2015-09-22|2015|            3|       9| September|  Tuesday|     3|        Y|
|     2|2015-09-23|2015|            3|       9| September|Wednesday|     4|        Y|
|     3|2015-09-24|2015|            3|       9| September| Thursday|     5|        Y|
|     4|2015-09-25|2015|            3|       9| September|   Friday|     6|        Y|
|     5|2015-09-26|2015|            3|       9| September| Saturday|     7|        N|
|     6|2015-09-27|2015|            3|       9| September|   Sunday|     1|        N|
|     7|2015-09-28|2015|            3|       9| September|   Monday|     2|        Y|
|     8|2015-09-29|2015|            3|       9| September|  Tuesday|     3|        Y|
|     9|2015-09-30|2015|            3|       9| Septem

In [9]:
# DataFrame-API variant of the attribute derivation, built on df_SQL.
# withColumn() names the resulting column itself, so the CASE expression carries
# no SQL alias: the former "as IsLastDayOfMonth" was discarded by withColumn and
# contradicted the real column name, lastDayOfMonth.
df_SparkSQL = (
    df_SQL
    # date_format() returns text, so "year" is a string column here.
    .withColumn("year", date_format("calendarDate", 'yyyy'))
    .withColumn("month", date_format("calendarDate", 'MMMM'))
    # 'Y' when the date equals the last day of its month, otherwise 'N'.
    .withColumn(
        "lastDayOfMonth",
        expr("case when calendarDate = last_day(calendarDate) then 'Y' else 'N' end"),
    )
)

df_SparkSQL.show()

+------------+------+----+---------+--------------+
|calendarDate|dateSK|year|    month|lastDayOfMonth|
+------------+------+----+---------+--------------+
|  2015-09-22|     1|2015|September|             N|
|  2015-09-23|     2|2015|September|             N|
|  2015-09-24|     3|2015|September|             N|
|  2015-09-25|     4|2015|September|             N|
|  2015-09-26|     5|2015|September|             N|
|  2015-09-27|     6|2015|September|             N|
|  2015-09-28|     7|2015|September|             N|
|  2015-09-29|     8|2015|September|             N|
|  2015-09-30|     9|2015|September|             Y|
|  2015-10-01|    10|2015|  October|             N|
|  2015-10-02|    11|2015|  October|             N|
|  2015-10-03|    12|2015|  October|             N|
|  2015-10-04|    13|2015|  October|             N|
|  2015-10-05|    14|2015|  October|             N|
|  2015-10-06|    15|2015|  October|             N|
|  2015-10-07|    16|2015|  October|             N|
|  2015-10-0

In [10]:
# Persist the dimension as the Delta table "dimDate", overwriting existing contents.
delta_writer = dimDate.write.format("delta").mode("overwrite")
delta_writer.saveAsTable("dimDate")

# Store a second copy as the Parquet table "dimDate_pq"; repartition(1) puts all
# rows into a single partition before the write.
single_partition_dates = dimDate.repartition(1)
parquet_writer = single_partition_dates.write.format("parquet").mode("overwrite")
parquet_writer.saveAsTable("dimDate_pq")

In [11]:
# Shut down the Spark session now that both tables have been written;
# the session cannot be used by any cell after this point.
spark.stop()