#### Deliverable 4: Date Dimension Implementation

In [21]:
import os
import sqlite3 as lite
from datetime import datetime, timedelta

import pandas as pd

In [22]:
# Prefix (including the trailing slash) under which exported files are written.
output_file_path = "./output/"

# Open the SQLite database file and keep a cursor for the queries further down.
conn = lite.connect("store1.db")
cur = conn.cursor()

In [23]:
# Calendar range covered by the date dimension; both endpoints are inclusive.
start_date = datetime(2024, 1, 1)
end_date = datetime(2024, 12, 31)

# pd.date_range yields every calendar day in the range as a DatetimeIndex,
# so the frame gets exactly one row per day.
dates = pd.date_range(start=start_date, end=end_date, freq='D')

date_df = pd.DataFrame({'Date': dates})
date_df.head()

Unnamed: 0,Date
0,2024-01-01
1,2024-01-02
2,2024-01-03
3,2024-01-04
4,2024-01-05


In [24]:
# Show the last five rows to confirm the range ends on the expected day.
date_df.tail(n=5)

Unnamed: 0,Date
361,2024-12-27
362,2024-12-28
363,2024-12-29
364,2024-12-30
365,2024-12-31


In [25]:
# Surrogate key: row position + 1 (relies on the default 0-based index),
# so keys run from 1 to the number of rows (366 for leap-year 2024).
date_df['DateKey'] = date_df.index + 1

# Calendar attributes derived directly from the Date column.
date_df['DayNumberInMonth'] = date_df['Date'].dt.day
date_df['DayNumberInYear'] = date_df['Date'].dt.dayofyear
# ISO-8601 week number. NOTE(review): days at the very start or end of a
# calendar year can belong to a week of the adjacent ISO year (for example
# 2024-12-30 falls in ISO week 1) - confirm this is the intended definition.
date_df['WeekNumberInYear'] = date_df['Date'].dt.isocalendar().week
date_df['MonthNum'] = date_df['Date'].dt.month
date_df['MonthTxt'] = date_df['Date'].dt.strftime("%B")
date_df['Quarter'] = date_df['Date'].dt.quarter
date_df['Year'] = date_df['Date'].dt.year

# Fiscal year ends in July: August-December keep the calendar year as their
# label, January-July are labelled with the previous calendar year (i.e. the
# label is the calendar year in which the fiscal year starts).
date_df['Fiscal Year'] = date_df['Year'].where(
    date_df['MonthNum'] >= 8, date_df['Year'] - 1)

# dayofweek is Monday=0 ... Sunday=6, so 5 and 6 are Saturday and Sunday.
date_df['isWeekend'] = date_df['Date'].dt.dayofweek >= 5

In [26]:
# Preview five random rows to eyeball the newly derived columns.
date_df.sample(n=5)

Unnamed: 0,Date,DateKey,DayNumberInMonth,DayNumberInYear,WeekNumberInYear,MonthNum,MonthTxt,Quarter,Year,Fiscal Year,isWeekend
357,2024-12-23,358,23,358,52,12,December,4,2024,2024,False
292,2024-10-19,293,19,293,42,10,October,4,2024,2024,True
255,2024-09-12,256,12,256,37,9,September,3,2024,2024,False
113,2024-04-23,114,23,114,17,4,April,2,2024,2023,False
115,2024-04-25,116,25,116,17,4,April,2,2024,2023,False


In [27]:
def get_season(date):
    """Return the season name for ``date``.

    Seasons change on fixed days of the date's own year: Spring on March 20,
    Summer on June 21, Fall on September 22 and Winter on December 21.
    Anything before March 20 or on/after December 21 is 'Winter'.

    Parameters
    ----------
    date : datetime-like
        Must expose ``year``, ``month`` and ``day`` and be comparable with
        ``datetime`` objects.

    Returns
    -------
    str
        One of 'Winter', 'Spring', 'Summer', 'Fall'.
    """
    # Late December already belongs to the winter that runs into next year.
    if date.month == 12 and date.day >= 21:
        return 'Winter'

    year = date.year

    # Season starts ordered latest-first so the first match is the most
    # recent season change on or before the date.
    season_starts = (
        (datetime(year, 9, 22), 'Fall'),
        (datetime(year, 6, 21), 'Summer'),
        (datetime(year, 3, 20), 'Spring'),
    )
    for starts_on, name in season_starts:
        if date >= starts_on:
            return name

    # Before the spring boundary: still the winter that began last December.
    return 'Winter'


# Label every calendar day with its season via get_season.
date_df['Season'] = date_df['Date'].map(get_season)

In [28]:
# Preview five random rows, now including the Season column.
date_df.sample(n=5)

Unnamed: 0,Date,DateKey,DayNumberInMonth,DayNumberInYear,WeekNumberInYear,MonthNum,MonthTxt,Quarter,Year,Fiscal Year,isWeekend,Season
142,2024-05-22,143,22,143,21,5,May,2,2024,2023,False,Spring
95,2024-04-05,96,5,96,14,4,April,2,2024,2023,False,Spring
320,2024-11-16,321,16,321,46,11,November,4,2024,2024,True,Fall
344,2024-12-10,345,10,345,50,12,December,4,2024,2024,False,Fall
331,2024-11-27,332,27,332,48,11,November,4,2024,2024,False,Fall


In [29]:
def is_holiday(date):
    """Return True if ``date`` falls on one of the recognised US holidays.

    Recognised holidays for the date's own year:
    New Year's Day, MLK Day, Presidents Day, Memorial Day, Independence Day,
    Labor Day, Columbus Day, Veterans Day, Thanksgiving and Christmas Day.
    Only the calendar date is compared; any time-of-day component is ignored.
    Observed-day shifts (a holiday landing on a weekend) are not applied.

    Parameters
    ----------
    date : datetime-like
        Must expose ``year`` and ``date()``.

    Returns
    -------
    bool
    """
    year = date.year
    monday, thursday = 0, 3  # datetime.weekday() numbering: Monday=0 ... Sunday=6

    def first_weekday_on_or_after(month, day, weekday):
        """First date on or after (month, day) of ``year`` that falls on ``weekday``."""
        anchor = datetime(year, month, day)
        return anchor + timedelta(days=(weekday - anchor.weekday()) % 7)

    # For "n-th weekday" holidays the anchor day is the earliest the holiday
    # can fall: 1 + 7*(n-1); "last Monday of May" can start no earlier than 25.
    holidays = [
        datetime(year, 1, 1),                           # New Year's Day
        first_weekday_on_or_after(1, 15, monday),       # MLK Day (3rd Monday)
        first_weekday_on_or_after(2, 15, monday),       # Presidents Day (3rd Monday)
        first_weekday_on_or_after(5, 25, monday),       # Memorial Day (last Monday)
        datetime(year, 7, 4),                           # Independence Day
        first_weekday_on_or_after(9, 1, monday),        # Labor Day (1st Monday)
        first_weekday_on_or_after(10, 8, monday),       # Columbus Day (2nd Monday)
        datetime(year, 11, 11),                         # Veterans Day (fixed date)
        first_weekday_on_or_after(11, 22, thursday),    # Thanksgiving (4th Thursday)
        datetime(year, 12, 25),                         # Christmas Day
    ]

    return date.date() in {h.date() for h in holidays}


# Flag every calendar day that is_holiday recognises, then preview five
# random rows of the result.
date_df['isHoliday'] = date_df['Date'].map(is_holiday)
date_df.sample(n=5)

Unnamed: 0,Date,DateKey,DayNumberInMonth,DayNumberInYear,WeekNumberInYear,MonthNum,MonthTxt,Quarter,Year,Fiscal Year,isWeekend,Season,isHoliday
257,2024-09-14,258,14,258,37,9,September,3,2024,2024,True,Summer,False
21,2024-01-22,22,22,22,4,1,January,1,2024,2023,False,Winter,False
34,2024-02-04,35,4,35,5,2,February,1,2024,2023,True,Winter,False
272,2024-09-29,273,29,273,39,9,September,3,2024,2024,True,Fall,False
290,2024-10-17,291,17,291,42,10,October,4,2024,2024,False,Fall,False


In [30]:
# Final column layout of the date dimension: key first, then the date and
# its calendar attributes, then the flags and the season label.
column_order = [
    'DateKey',
    'Date',
    'DayNumberInMonth',
    'DayNumberInYear',
    'WeekNumberInYear',
    'MonthNum',
    'MonthTxt',
    'Quarter',
    'Year',
    'Fiscal Year',
    'isHoliday',
    'isWeekend',
    'Season',
]

# Select the columns in that order (a missing name raises KeyError).
date_df = date_df.loc[:, column_order]
date_df.sample(n=5)

Unnamed: 0,DateKey,Date,DayNumberInMonth,DayNumberInYear,WeekNumberInYear,MonthNum,MonthTxt,Quarter,Year,Fiscal Year,isHoliday,isWeekend,Season
16,17,2024-01-17,17,17,3,1,January,1,2024,2023,False,False,Winter
115,116,2024-04-25,25,116,17,4,April,2,2024,2023,False,False,Spring
13,14,2024-01-14,14,14,2,1,January,1,2024,2023,False,True,Winter
286,287,2024-10-13,13,287,41,10,October,4,2024,2024,False,True,Fall
218,219,2024-08-06,6,219,32,8,August,3,2024,2024,False,False,Summer


In [31]:
# Spot-check a single day: show the row for 2024-01-15.
date_df.loc[date_df['Date'] == "2024-01-15"]

Unnamed: 0,DateKey,Date,DayNumberInMonth,DayNumberInYear,WeekNumberInYear,MonthNum,MonthTxt,Quarter,Year,Fiscal Year,isHoliday,isWeekend,Season
14,15,2024-01-15,15,15,3,1,January,1,2024,2023,True,False,Winter


In [32]:
# Make sure the export directory exists so the write cannot fail on a fresh
# checkout; exist_ok=True makes this a no-op when it is already there.
os.makedirs(output_file_path, exist_ok=True)

# Write the dimension without the DataFrame index (DateKey is the key column).
date_df.to_csv(f"{output_file_path}date_dimension.csv", index=False)

In [None]:
# DDL for the date dimension: DateKey is the primary key and the remaining
# columns mirror the calendar attributes, flags and season label.
create_date_dimension_sql = """
    CREATE TABLE DateDimension (
        DateKey INT PRIMARY KEY,
        Date Date,
        DayNumberInMonth INT,
        DayNumberInYear INT,
        WeekNumberInYear INT,
        MonthNum INT,
        MonthTxt VARCHAR(20),
        Quarter INT,
        Year INT,
        FiscalYear INT,
        isHoliday BOOLEAN,
        isWeekend BOOLEAN,
        Season VARCHAR(20)
    );
"""

# Start from a clean slate so the cell can be re-run, then create the table.
conn.execute("Drop table if exists DateDimension")
conn.execute(create_date_dimension_sql)

<sqlite3.Cursor at 0x220bc1e3b40>

In [34]:
# Store dates as 'YYYY-MM-DD' text. Parsing with to_datetime first keeps the
# cell re-runnable when the column has already been converted to strings.
date_df['Date'] = pd.to_datetime(date_df['Date']).dt.strftime(date_format='%Y-%m-%d')

In [35]:
# Load into the existing DateDimension table rather than replacing it:
# if_exists='append' keeps the hand-written schema (DateKey primary key and
# declared column types), whereas 'replace' would drop and recreate the table
# from the DataFrame's own dtypes. The frame's 'Fiscal Year' column is renamed
# so it matches the table's FiscalYear column.
# Existing rows are cleared first so re-running this cell does not violate
# the primary key; the DELETE is committed together with the insert.
conn.execute("DELETE FROM DateDimension")
date_df.rename(columns={'Fiscal Year': 'FiscalYear'}).to_sql(
    "DateDimension", conn, if_exists='append', index=False)

366

In [36]:
# Read the whole table back to verify the load; execute() returns the cursor,
# so the fetch can be chained onto it.
res = cur.execute("select * from DateDimension").fetchall()

# Rows come back as plain tuples; label them positionally with column_order.
date_dimension = pd.DataFrame(data=res, columns=column_order)

date_dimension.sample(n=5)

Unnamed: 0,DateKey,Date,DayNumberInMonth,DayNumberInYear,WeekNumberInYear,MonthNum,MonthTxt,Quarter,Year,Fiscal Year,isHoliday,isWeekend,Season
126,127,2024-05-06,6,127,19,5,May,2,2024,2023,0,0,Spring
34,35,2024-02-04,4,35,5,2,February,1,2024,2023,0,1,Winter
353,354,2024-12-19,19,354,51,12,December,4,2024,2024,0,0,Fall
238,239,2024-08-26,26,239,35,8,August,3,2024,2024,0,0,Summer
8,9,2024-01-09,9,9,2,1,January,1,2024,2023,0,0,Winter


In [37]:
# Release the cursor first, then the connection it was created from.
for db_handle in (cur, conn):
    db_handle.close()