In [1]:

from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip
import ConnectionConfig as cc
# Run the project's environment setup from ConnectionConfig before any Spark work.
# NOTE(review): presumably prepares environment variables/paths Spark needs — confirm in ConnectionConfig.py.
cc.setupEnvironment()

## Start the cluster
See the `getActiveSession()` method in `ConnectionConfig.py`: it returns the active session, adds the Delta package to it, and adds the extra JARs that are needed to connect to the SQL Server database.

In [2]:
# Start a local Spark cluster through the project's ConnectionConfig helper.
# NOTE(review): "DIM_DATE" is presumably the application name and 4 the number of
# local cores/partitions — confirm against ConnectionConfig.startLocalCluster.
spark = cc.startLocalCluster("DIM_DATE",4)
# Evaluated as the cell's last expression, so the notebook displays whatever it returns.
spark.getActiveSession()

# Creating Date dimension


## Step 1: Generate rows for a sequence of dates

In [3]:
# EXTRACT
# Inclusive bounds of the calendar range covered by the date dimension.
beginDate = '2019-01-01'
endDate = '2024-12-31'

# Generate one row per calendar day and attach a surrogate key.
#
# The key is derived with row_number() over the date order rather than
# monotonically_increasing_id(): the latter is only guaranteed to be unique and
# increasing, NOT consecutive, and its values depend on how the rows happen to
# be partitioned. row_number() yields a dense, deterministic sequence.
# The "- 1" and the bigint cast keep the key values (0, 1, 2, ...) and the
# column type the same as before. The un-partitioned window pulls all rows into
# a single partition, which is fine for a few thousand dates.
df_SQL = spark.sql(f"""
    select
        calendarDate,
        cast(row_number() over (order by calendarDate) - 1 as bigint) as date_SK
    from (
        select explode(sequence(to_date('{beginDate}'), to_date('{endDate}'), interval 1 day)) as calendarDate
    ) as generatedDates
""")

# Register the generated dates as a temp view so later SQL can query them by name.
df_SQL.createOrReplaceTempView('neededDates')

# Quick visual check of the first rows.
spark.sql("select * from neededDates").show(n=20)

+------------+-------+
|calendarDate|date_SK|
+------------+-------+
|  2019-01-01|      0|
|  2019-01-02|      1|
|  2019-01-03|      2|
|  2019-01-04|      3|
|  2019-01-05|      4|
|  2019-01-06|      5|
|  2019-01-07|      6|
|  2019-01-08|      7|
|  2019-01-09|      8|
|  2019-01-10|      9|
|  2019-01-11|     10|
|  2019-01-12|     11|
|  2019-01-13|     12|
|  2019-01-14|     13|
|  2019-01-15|     14|
|  2019-01-16|     15|
|  2019-01-17|     16|
|  2019-01-18|     17|
|  2019-01-19|     18|
|  2019-01-20|     19|
+------------+-------+
only showing top 20 rows



In [4]:
# TRANSFORM
# Derive the descriptive calendar attributes of the date dimension from the
# 'neededDates' view (one row per calendar day).
#
# Column conventions (Spark SQL built-ins):
#   dateInt              - the date as an integer in yyyymmdd form, e.g. 20190101
#   DayOfWeek            - dayofweek(): 1 = Sunday ... 7 = Saturday
#   DayOfWeekStartMonday - weekday() + 1: 1 = Monday ... 7 = Sunday
#   IsWeekDay            - 'Y' for Monday-Friday (weekday() 0-4), otherwise 'N'
#   IsLastDayOfMonth     - 'Y' when the date equals last_day() of its month
#   WeekOfYearIso        - weekofyear(): ISO 8601 week number
#
# The query is a triple-quoted string instead of backslash-continued lines:
# a stray space after a trailing backslash would otherwise break the cell.
dimDate = spark.sql("""
    select
        date_SK,
        year(calendarDate) * 10000 + month(calendarDate) * 100 + day(calendarDate) as dateInt,
        CalendarDate,
        year(calendarDate) as CalendarYear,
        date_format(calendarDate, 'MMMM') as CalendarMonth,
        month(calendarDate) as MonthOfYear,
        date_format(calendarDate, 'EEEE') as CalendarDay,
        dayofweek(calendarDate) as DayOfWeek,
        weekday(calendarDate) + 1 as DayOfWeekStartMonday,
        case
            when weekday(calendarDate) < 5 then 'Y'
            else 'N'
        end as IsWeekDay,
        dayofmonth(calendarDate) as DayOfMonth,
        case
            when calendarDate = last_day(calendarDate) then 'Y'
            else 'N'
        end as IsLastDayOfMonth,
        dayofyear(calendarDate) as DayOfYear,
        weekofyear(calendarDate) as WeekOfYearIso,
        quarter(calendarDate) as QuarterOfYear
    from
        neededDates
    order by
        calendarDate
""")

# Preview the first rows of the transformed dimension.
dimDate.show()

+-------+--------+------------+------------+-------------+-----------+-----------+---------+--------------------+---------+----------+----------------+---------+-------------+-------------+
|date_SK| dateInt|CalendarDate|CalendarYear|CalendarMonth|MonthOfYear|CalendarDay|DayOfWeek|DayOfWeekStartMonday|IsWeekDay|DayOfMonth|IsLastDayOfMonth|DayOfYear|WeekOfYearIso|QuarterOfYear|
+-------+--------+------------+------------+-------------+-----------+-----------+---------+--------------------+---------+----------+----------------+---------+-------------+-------------+
|      0|20190101|  2019-01-01|        2019|      January|          1|    Tuesday|        3|                   2|        Y|         1|               N|        1|            1|            1|
|      1|20190102|  2019-01-02|        2019|      January|          1|  Wednesday|        4|                   3|        Y|         2|               N|        2|            1|            1|
|      2|20190103|  2019-01-03|        2019|      

In [5]:
# LOAD
# Persist the dimension as the Delta table "dim_date", using overwrite mode.
dim_date_writer = dimDate.write.format("delta").mode("overwrite")
dim_date_writer.saveAsTable("dim_date")


In [6]:
# Stop the Spark session now that the notebook's work is done.
spark.stop()