#### Deliverable 4: Date Dimension Implementation

In [1]:
import pandas as pd
from datetime import datetime, timedelta
import sqlite3 as lite

In [2]:
conn = lite.connect("store1.db")
cur = conn.cursor()

output_file_path = "./output/"

In [3]:
current_date = datetime(2024, 1, 1)
end_date = datetime(2024, 12, 31)

dates = []

while current_date <= end_date:
    dates.append(current_date)
    current_date += timedelta(days=1)

date_df = pd.DataFrame(dates, columns=['Date'])
date_df.head()

Unnamed: 0,Date
0,2024-01-01
1,2024-01-02
2,2024-01-03
3,2024-01-04
4,2024-01-05


In [4]:
date_df.tail()

Unnamed: 0,Date
361,2024-12-27
362,2024-12-28
363,2024-12-29
364,2024-12-30
365,2024-12-31


In [5]:
# adding other columns
date_df['DateKey'] = date_df.index + 1   # 1 to 365
date_df['DayNumberInMonth'] = date_df['Date'].dt.day
date_df['DayNumberInYear'] = date_df['Date'].dt.dayofyear
date_df['WeekNumberInYear'] = date_df['Date'].dt.isocalendar().week
date_df['MonthNum'] = date_df['Date'].dt.month
date_df['MonthTxt'] = date_df['Date'].dt.strftime("%B")
date_df['Quarter'] = date_df['Date'].dt.quarter
date_df['Year'] = date_df['Date'].dt.year


# adding Fiscal year (ends in july)
date_df['Fiscal Year'] = date_df.apply(
    lambda x: x['Year'] if x['MonthNum'] >= 8 else x['Year'] - 1, axis=1)

date_df['isWeekend'] = date_df['Date'].dt.dayofweek >= 5

In [6]:
date_df.sample(5)

Unnamed: 0,Date,DateKey,DayNumberInMonth,DayNumberInYear,WeekNumberInYear,MonthNum,MonthTxt,Quarter,Year,Fiscal Year,isWeekend
2,2024-01-03,3,3,3,1,1,January,1,2024,2023,False
123,2024-05-03,124,3,124,18,5,May,2,2024,2023,False
311,2024-11-07,312,7,312,45,11,November,4,2024,2024,False
274,2024-10-01,275,1,275,40,10,October,4,2024,2024,False
281,2024-10-08,282,8,282,41,10,October,4,2024,2024,False


In [7]:
def get_season(date):
    year = date.year
    seasons = [
        ('Winter', datetime(year, 12, 21)),
        ('Spring', datetime(year, 3, 20)),
        ('Summer', datetime(year, 6, 21)),
        ('Fall', datetime(year, 9, 22))
    ]

    if date.month == 12 and date.day >= 21:
        return 'Winter'

    # other months, find the most recent season change
    for i in range(len(seasons) - 1, -1, -1):
        if date >= seasons[i][1]:
            return seasons[i][0]

    return 'Winter'


date_df['Season'] = date_df['Date'].apply(get_season)

In [8]:
date_df.sample(5)

Unnamed: 0,Date,DateKey,DayNumberInMonth,DayNumberInYear,WeekNumberInYear,MonthNum,MonthTxt,Quarter,Year,Fiscal Year,isWeekend,Season
87,2024-03-28,88,28,88,13,3,March,1,2024,2023,False,Spring
357,2024-12-23,358,23,358,52,12,December,4,2024,2024,False,Winter
118,2024-04-28,119,28,119,17,4,April,2,2024,2023,True,Spring
226,2024-08-14,227,14,227,33,8,August,3,2024,2024,False,Summer
145,2024-05-25,146,25,146,21,5,May,2,2024,2023,True,Spring


In [9]:
def is_holiday(date):
    year = date.year

    holidays = [
        datetime(year, 1, 1),    # New Year's Day
        datetime(year, 7, 4),    # Independence Day
        datetime(year, 12, 25)   # Christmas Day
    ]

    def get_monday_holiday(month, day):
        d = datetime(year, month, day)
        return d + timedelta(days=(7 - d.weekday()) % 7)

    holidays.extend([
        get_monday_holiday(1, 15),   # MLK Day (3rd Monday)
        get_monday_holiday(2, 15),   # Presidents Day (3rd Monday)
        get_monday_holiday(5, 25),   # Memorial Day (last Monday)
        get_monday_holiday(9, 1),    # Labor Day (1st Monday)
        get_monday_holiday(10, 8),   # Columbus Day (2nd Monday)
        get_monday_holiday(11, 11),  # Veterans Day
        get_monday_holiday(11, 22)   # Thanksgiving (4th Thursday)
    ])

    return date.date() in [h.date() for h in holidays]


date_df['isHoliday'] = date_df['Date'].apply(is_holiday)
date_df.sample(5)

Unnamed: 0,Date,DateKey,DayNumberInMonth,DayNumberInYear,WeekNumberInYear,MonthNum,MonthTxt,Quarter,Year,Fiscal Year,isWeekend,Season,isHoliday
65,2024-03-06,66,6,66,10,3,March,1,2024,2023,False,Winter,False
255,2024-09-12,256,12,256,37,9,September,3,2024,2024,False,Summer,False
163,2024-06-12,164,12,164,24,6,June,2,2024,2023,False,Spring,False
336,2024-12-02,337,2,337,49,12,December,4,2024,2024,False,Fall,False
257,2024-09-14,258,14,258,37,9,September,3,2024,2024,True,Summer,False


In [10]:
# Reordering the columns
column_order = [
    'DateKey', 'Date', 'DayNumberInMonth', 'DayNumberInYear',
    'WeekNumberInYear', 'MonthNum', 'MonthTxt', 'Quarter', 'Year',
    'Fiscal Year', 'isHoliday', 'isWeekend', 'Season'
]

date_df = date_df[column_order]
date_df.sample(5)

Unnamed: 0,DateKey,Date,DayNumberInMonth,DayNumberInYear,WeekNumberInYear,MonthNum,MonthTxt,Quarter,Year,Fiscal Year,isHoliday,isWeekend,Season
140,141,2024-05-20,20,141,21,5,May,2,2024,2023,False,False,Spring
246,247,2024-09-03,3,247,36,9,September,3,2024,2024,False,False,Summer
307,308,2024-11-03,3,308,44,11,November,4,2024,2024,False,True,Fall
327,328,2024-11-23,23,328,47,11,November,4,2024,2024,False,True,Fall
191,192,2024-07-10,10,192,28,7,July,3,2024,2023,False,False,Summer


In [11]:
date_df[date_df['Date'] == "2024-01-15"]

Unnamed: 0,DateKey,Date,DayNumberInMonth,DayNumberInYear,WeekNumberInYear,MonthNum,MonthTxt,Quarter,Year,Fiscal Year,isHoliday,isWeekend,Season
14,15,2024-01-15,15,15,3,1,January,1,2024,2023,True,False,Winter


In [12]:
date_df.to_csv(f"{output_file_path}date_dimension.csv", index=False)

In [13]:
conn.execute("Drop table if exists DateDimension")

# Create the Date Dimension table
conn.execute("""
    CREATE TABLE DateDimension (
        DateKey INT PRIMARY KEY,
        Date Date,
        DayNumberInMonth INT,
        DayNumberInYear INT,
        WeekNumberInYear INT,
        MonthNum INT,
        MonthTxt VARCHAR(20),
        Quarter INT,
        Year INT,
        FiscalYear INT,
        isHoliday BOOLEAN,
        isWeekend BOOLEAN,
        Season VARCHAR(20)
    );
""")

<sqlite3.Cursor at 0x23dfe425740>

In [14]:
date_df['Date'] = pd.to_datetime(
    date_df['Date']).dt.strftime('%Y-%m-%d')

In [15]:
date_df.to_sql("DateDimension", conn, if_exists='replace', index=False)

366

In [16]:
cur.execute("select * from DateDimension")

res = cur.fetchall()

date_dimension = pd.DataFrame(res, columns=column_order)

date_dimension.tail(5)

Unnamed: 0,DateKey,Date,DayNumberInMonth,DayNumberInYear,WeekNumberInYear,MonthNum,MonthTxt,Quarter,Year,Fiscal Year,isHoliday,isWeekend,Season
361,362,2024-12-27,27,362,52,12,December,4,2024,2024,0,0,Winter
362,363,2024-12-28,28,363,52,12,December,4,2024,2024,0,1,Winter
363,364,2024-12-29,29,364,52,12,December,4,2024,2024,0,1,Winter
364,365,2024-12-30,30,365,1,12,December,4,2024,2024,0,0,Winter
365,366,2024-12-31,31,366,1,12,December,4,2024,2024,0,0,Winter


In [17]:
cur.close()
conn.close()