#### Deliverable 5: Sales Fact Table Implementation

In [37]:
import pandas as pd
import numpy as np
import sqlite3 as lite
from datetime import datetime

In [38]:
store_databases = {
    "Store 1": {
        "database_name": "store1.db",
        "transactions_table_name": "sales_transactions",
    },
    "Store 2": {
        "database_name": "store2.db",
        "transactions_table_name": "sales_transactions",
    },
    "Store 3": {
        "database_name": "store3.db",
        "transactions_table_name": "sales_transactions",
    },
    "Store 4": {
        "database_name": "store4.db",
        "transactions_table_name":  "sales_transactions"
    }
}

In [39]:
def standardize_date(date_str, store_name):
    """Convert various date formats to YYYYMMDD standard"""
    if pd.isna(date_str):
        return None

    if store_name == 'Store 1':
        formats = ['%Y-%m-%d']
    else:
        formats = ['%Y-%m-%d', '%y/%m/%d']

    for fmt in formats:
        try:
            dt = pd.to_datetime(date_str, format=fmt)
            return int(dt.strftime('%Y%m%d'))
        except:
            continue

    print(f"Warning: Could not parse date from {store_name}: {date_str}")
    return None

In [None]:
# Function to load December data from a store's database
def load_december_data(store):
    conn = lite.connect(store['database_name'])

    table_name = store['transactions_table_name']
    _date = 'date'
    
    # Load December transactions (filter for month = 12)
    december_transactions_query = """
        SELECT * 
        FROM """ + table_name + """ 
        WHERE """ + _date + """ LIKE '%/12/%' OR """ + _date + """ LIKE '%-12-%' OR """ + _date + """ LIKE '%12%'
    """
    
    print(december_transactions_query)
    december_transactions_df = pd.read_sql_query(
        december_transactions_query, conn)

    conn.close()

    return december_transactions_df

In [41]:
import sqlite3

conn = sqlite3.connect("store1.db")
date_dim = pd.read_sql("select DateKey, Date from DateDimension", conn)


In [42]:
# Main ETL Process
all_transactions = []

for store_name, config in store_databases.items():
    print(f"Processing {store_name}...")

    transactions = load_december_data(config)
    transactions['StoreKey'] = store_name.split(" ")[1]
    transactions['temp_date'] = pd.to_datetime(transactions['date'])

    transactions = transactions.merge(
        date_dim,
        left_on='temp_date',
        right_on=pd.to_datetime(date_dim['Date']),
        how='left'
    ).drop(columns=['temp_date', 'Date'])

    all_transactions.append(transactions)

Processing Store 1...

        SELECT * 
        FROM sales_transactions
Processing Store 2...

        SELECT * 
        FROM sales_transactions
Processing Store 3...

        SELECT * 
        FROM sales_transactions
Processing Store 4...

        SELECT * 
        FROM sales_transactions


KeyboardInterrupt: 

In [None]:
combine_transactions = pd.concat(all_transactions)

In [None]:
combine_transactions.head()

Unnamed: 0,date,customer_number,sku,salesPrice,items_left,cases_ordered,StoreKey,DateKey
0,2024-01-12,1,42358001,2.08,200,122,1,12
1,2024-01-12,1,42525001,4.39,36,12,1,12
2,2024-01-12,1,43223001,1.61,73,24,1,12
3,2024-01-12,1,42172001,3.3,67,21,1,12
4,2024-01-12,1,42641001,4.39,65,21,1,12


In [None]:
len(combine_transactions)

7664721

In [None]:
products = pd.read_sql("select ProductKey, SKU from ProductDimension", conn)
stores = pd.read_sql("select StoreKey from StoreDimension", conn)

#### Creating Sales Fact table (Transaction Level)

In [None]:
# Ensure StoreKey columns have the same data type
combine_transactions['StoreKey'] = combine_transactions['StoreKey'].astype(int)

fact_data = (
    combine_transactions
        .merge(products, left_on='sku', right_on="SKU", how='left')
        .merge(stores, left_on='StoreKey', right_on="StoreKey", how='left')
        .dropna(subset=['ProductKey', 'StoreKey', 'DateKey'])
)

In [None]:
fact_data.head()

Unnamed: 0,date,customer_number,sku,salesPrice,items_left,cases_ordered,StoreKey,DateKey,ProductKey,SKU
0,2024-01-12,1,42358001,2.08,200,122,1,12,278,42358001
1,2024-01-12,1,42525001,4.39,36,12,1,12,445,42525001
2,2024-01-12,1,43223001,1.61,73,24,1,12,1138,43223001
3,2024-01-12,1,42172001,3.3,67,21,1,12,92,42172001
4,2024-01-12,1,42641001,4.39,65,21,1,12,561,42641001


### Calculate metrics


In [None]:
fact_data['QuantitySold'] = fact_data['cases_ordered'] * 12   #12 items per case
fact_data.head()

Unnamed: 0,date,customer_number,sku,salesPrice,items_left,cases_ordered,StoreKey,DateKey,ProductKey,SKU,QuantitySold
0,2024-01-12,1,42358001,2.08,200,122,1,12,278,42358001,1464
1,2024-01-12,1,42525001,4.39,36,12,1,12,445,42525001,144
2,2024-01-12,1,43223001,1.61,73,24,1,12,1138,43223001,288
3,2024-01-12,1,42172001,3.3,67,21,1,12,92,42172001,252
4,2024-01-12,1,42641001,4.39,65,21,1,12,561,42641001,252


In [None]:
fact_data['TotalDollarSales'] = fact_data['salesPrice'] * fact_data['QuantitySold']
fact_data.head()

Unnamed: 0,date,customer_number,sku,salesPrice,items_left,cases_ordered,StoreKey,DateKey,ProductKey,SKU,QuantitySold,TotalDollarSales
0,2024-01-12,1,42358001,2.08,200,122,1,12,278,42358001,1464,3045.12
1,2024-01-12,1,42525001,4.39,36,12,1,12,445,42525001,144,632.16
2,2024-01-12,1,43223001,1.61,73,24,1,12,1138,43223001,288,463.68
3,2024-01-12,1,42172001,3.3,67,21,1,12,92,42172001,252,831.6
4,2024-01-12,1,42641001,4.39,65,21,1,12,561,42641001,252,1106.28


In [None]:
fact_data['TotalCostToStore'] = fact_data['salesPrice'] * 0.7 * fact_data['QuantitySold']
fact_data.head()

Unnamed: 0,date,customer_number,sku,salesPrice,items_left,cases_ordered,StoreKey,DateKey,ProductKey,SKU,QuantitySold,TotalDollarSales,TotalCostToStore
0,2024-01-12,1,42358001,2.08,200,122,1,12,278,42358001,1464,3045.12,2131.584
1,2024-01-12,1,42525001,4.39,36,12,1,12,445,42525001,144,632.16,442.512
2,2024-01-12,1,43223001,1.61,73,24,1,12,1138,43223001,288,463.68,324.576
3,2024-01-12,1,42172001,3.3,67,21,1,12,92,42172001,252,831.6,582.12
4,2024-01-12,1,42641001,4.39,65,21,1,12,561,42641001,252,1106.28,774.396


In [None]:
fact_data['GrossProfit'] = fact_data['TotalDollarSales'] - fact_data['TotalCostToStore']
fact_data.head()

Unnamed: 0,date,customer_number,sku,salesPrice,items_left,cases_ordered,StoreKey,DateKey,ProductKey,SKU,QuantitySold,TotalDollarSales,TotalCostToStore,GrossProfit
0,2024-01-12,1,42358001,2.08,200,122,1,12,278,42358001,1464,3045.12,2131.584,913.536
1,2024-01-12,1,42525001,4.39,36,12,1,12,445,42525001,144,632.16,442.512,189.648
2,2024-01-12,1,43223001,1.61,73,24,1,12,1138,43223001,288,463.68,324.576,139.104
3,2024-01-12,1,42172001,3.3,67,21,1,12,92,42172001,252,831.6,582.12,249.48
4,2024-01-12,1,42641001,4.39,65,21,1,12,561,42641001,252,1106.28,774.396,331.884


#### Creating final fact table

In [None]:
sales_fact = fact_data[[
    'DateKey', 'customer_number', 'ProductKey', 'StoreKey',
    'QuantitySold', 'TotalDollarSales', 'TotalCostToStore', 'GrossProfit'
]].rename(columns={
    'customer_number': 'DailyCust#'
})

In [None]:
sales_fact.head()

Unnamed: 0,DateKey,DailyCust#,ProductKey,StoreKey,QuantitySold,TotalDollarSales,TotalCostToStore,GrossProfit
0,12,1,278,1,1464,3045.12,2131.584,913.536
1,12,1,445,1,144,632.16,442.512,189.648
2,12,1,1138,1,288,463.68,324.576,139.104
3,12,1,92,1,252,831.6,582.12,249.48
4,12,1,561,1,252,1106.28,774.396,331.884


In [None]:
curr = conn.cursor()

curr.execute("DROP TABLE IF EXISTS SalesFact_TransactionLevel ")
curr.execute("""
    CREATE TABLE SalesFact_TransactionLevel (
        DateKey INT NOT NULL,
        "DailyCust#" INT NOT NULL,
        ProductKey INT NOT NULL,
        StoreKey INT NOT NULL,
        QuantitySold INT NOT NULL,
        TotalDollarSales REAL NOT NULL,
        TotalCostToStore REAL NOT NULL,
        GrossProfit REAL NOT NULL,
        PRIMARY KEY (DateKey, "DailyCust#", ProductKey),
        FOREIGN KEY (DateKey) REFERENCES DateDimension(DateKey),
        FOREIGN KEY (ProductKey) REFERENCES ProductDimension(ProductKey),
        FOREIGN KEY (StoreKey) REFERENCES StoreDimension(StoreKey)
    );
""")

<sqlite3.Cursor at 0x1ed449f5540>

In [None]:
sales_fact.to_sql("SalesFact_TransactionLevel", conn, if_exists='replace', index=False)

7664721

In [None]:
curr.close()
conn.close()