#### Deliverable 5: Sales Fact Table Implementation

In [1]:
import pandas as pd
import numpy as np
import sqlite3 as lite

In [2]:
output_file_path = "./output/"

In [3]:
store_databases = {
    "Store 1": {
        "database_name": "store1.db",
        "transactions_table_name": "sales_transactions",
    },
    "Store 2": {
        "database_name": "store2.db",
        "transactions_table_name": "sales_transactions",
    },
    "Store 3": {
        "database_name": "store3.db",
        "transactions_table_name": "sales_transactions",
    },
    "Store 4": {
        "database_name": "store4.db",
        "transactions_table_name":  "sales_transactions"
    }
}

In [4]:
def standardize_date(date_str, store_name):
    """Convert various date formats to YYYYMMDD standard"""
    if pd.isna(date_str):
        return None

    if store_name == 'Store 1':
        formats = ['%Y-%m-%d']
    else:
        formats = ['%Y-%m-%d', '%y/%m/%d']

    for fmt in formats:
        try:
            dt = pd.to_datetime(date_str, format=fmt)
            return int(dt.strftime('%Y%m%d'))
        except:
            continue

    print(f"Warning: Could not parse date from {store_name}: {date_str}")
    return None

In [5]:
# Function to load December data from a store's database
def load_december_data(store):
    conn = lite.connect(store['database_name'])

    table_name = store['transactions_table_name']
    _date = 'date'
    
    # Load December transactions (filter for month = 12)
    december_transactions_query = """
        SELECT * 
        FROM """ + table_name + """ 
        WHERE """ + _date + """ LIKE '%/12/%' OR """ + _date + """ LIKE '%-12-%' OR """ + _date + """ LIKE '%12%'
    """

    print(december_transactions_query)
    december_transactions_df = pd.read_sql_query(
        december_transactions_query, conn)

    conn.close()

    return december_transactions_df

In [6]:
import sqlite3

conn = sqlite3.connect("store1.db")
date_dim = pd.read_sql("select DateKey, Date from DateDimension", conn)


In [7]:
# Main ETL Process
all_transactions = []

for store_name, config in store_databases.items():
    print(f"Processing {store_name}...")

    transactions = load_december_data(config)
    transactions['StoreKey'] = store_name.split(" ")[1]
    transactions['temp_date'] = pd.to_datetime(transactions['date'])

    transactions = transactions.merge(
        date_dim,
        left_on='temp_date',
        right_on=pd.to_datetime(date_dim['Date']),
        how='left'
    ).drop(columns=['temp_date', 'Date'])

    all_transactions.append(transactions)

Processing Store 1...

        SELECT * 
        FROM sales_transactions 
        WHERE date LIKE '%/12/%' OR date LIKE '%-12-%' OR date LIKE '%12%'
    
Processing Store 2...

        SELECT * 
        FROM sales_transactions 
        WHERE date LIKE '%/12/%' OR date LIKE '%-12-%' OR date LIKE '%12%'
    
Processing Store 3...

        SELECT * 
        FROM sales_transactions 
        WHERE date LIKE '%/12/%' OR date LIKE '%-12-%' OR date LIKE '%12%'
    
Processing Store 4...

        SELECT * 
        FROM sales_transactions 
        WHERE date LIKE '%/12/%' OR date LIKE '%-12-%' OR date LIKE '%12%'
    


In [8]:
combine_transactions = pd.concat(all_transactions)

In [9]:
combine_transactions.head()

Unnamed: 0,date,customer_number,sku,salesPrice,items_left,cases_ordered,StoreKey,DateKey
0,2024-01-12,1,42671001,4.39,30,12,1,12
1,2024-01-12,1,42484001,1.42,72,21,1,12
2,2024-01-12,1,44140001,3.04,68,21,1,12
3,2024-01-12,1,42215001,0.64,75,22,1,12
4,2024-01-12,1,42535001,4.39,76,21,1,12


In [10]:
len(combine_transactions)

950516

In [11]:
products = pd.read_sql("select ProductKey, SKU from ProductDimension", conn)
stores = pd.read_sql("select StoreKey from StoreDimension", conn)

In [12]:
products.head()

Unnamed: 0,ProductKey,sku
0,1,42081001
1,2,42082001
2,3,42083001
3,4,42084001
4,5,42085001


#### Creating Sales Fact table (Transaction Level)

In [13]:
# Ensure StoreKey columns have the same data type
combine_transactions['StoreKey'] = combine_transactions['StoreKey'].astype(int)

fact_data = (
    combine_transactions
        .merge(products, left_on='sku', right_on="sku", how='left')
        .merge(stores, left_on='StoreKey', right_on="StoreKey", how='left')
        .dropna(subset=['ProductKey', 'StoreKey', 'DateKey'])
)

In [14]:
fact_data.head()

Unnamed: 0,date,customer_number,sku,salesPrice,items_left,cases_ordered,StoreKey,DateKey,ProductKey
0,2024-01-12,1,42671001,4.39,30,12,1,12,591
1,2024-01-12,1,42484001,1.42,72,21,1,12,404
2,2024-01-12,1,44140001,3.04,68,21,1,12,2055
3,2024-01-12,1,42215001,0.64,75,22,1,12,135
4,2024-01-12,1,42535001,4.39,76,21,1,12,455


### Calculate metrics


In [15]:
fact_data['QuantitySold'] = fact_data['cases_ordered'] * 12   #12 items per case
fact_data.head()

Unnamed: 0,date,customer_number,sku,salesPrice,items_left,cases_ordered,StoreKey,DateKey,ProductKey,QuantitySold
0,2024-01-12,1,42671001,4.39,30,12,1,12,591,144
1,2024-01-12,1,42484001,1.42,72,21,1,12,404,252
2,2024-01-12,1,44140001,3.04,68,21,1,12,2055,252
3,2024-01-12,1,42215001,0.64,75,22,1,12,135,264
4,2024-01-12,1,42535001,4.39,76,21,1,12,455,252


In [17]:
fact_data['TotalDollarSales'] = (fact_data['salesPrice'] * fact_data['QuantitySold']).round(2)
fact_data.head()

Unnamed: 0,date,customer_number,sku,salesPrice,items_left,cases_ordered,StoreKey,DateKey,ProductKey,QuantitySold,TotalDollarSales
0,2024-01-12,1,42671001,4.39,30,12,1,12,591,144,632.16
1,2024-01-12,1,42484001,1.42,72,21,1,12,404,252,357.84
2,2024-01-12,1,44140001,3.04,68,21,1,12,2055,252,766.08
3,2024-01-12,1,42215001,0.64,75,22,1,12,135,264,168.96
4,2024-01-12,1,42535001,4.39,76,21,1,12,455,252,1106.28


In [18]:
fact_data['TotalCostToStore'] = (
    fact_data['salesPrice'] * 0.7 * fact_data['QuantitySold']).round(2)
fact_data.head()

Unnamed: 0,date,customer_number,sku,salesPrice,items_left,cases_ordered,StoreKey,DateKey,ProductKey,QuantitySold,TotalDollarSales,TotalCostToStore
0,2024-01-12,1,42671001,4.39,30,12,1,12,591,144,632.16,442.51
1,2024-01-12,1,42484001,1.42,72,21,1,12,404,252,357.84,250.49
2,2024-01-12,1,44140001,3.04,68,21,1,12,2055,252,766.08,536.26
3,2024-01-12,1,42215001,0.64,75,22,1,12,135,264,168.96,118.27
4,2024-01-12,1,42535001,4.39,76,21,1,12,455,252,1106.28,774.4


In [19]:
fact_data['GrossProfit'] = (fact_data['TotalDollarSales'] - fact_data['TotalCostToStore']).round(2)
fact_data.head()

Unnamed: 0,date,customer_number,sku,salesPrice,items_left,cases_ordered,StoreKey,DateKey,ProductKey,QuantitySold,TotalDollarSales,TotalCostToStore,GrossProfit
0,2024-01-12,1,42671001,4.39,30,12,1,12,591,144,632.16,442.51,189.65
1,2024-01-12,1,42484001,1.42,72,21,1,12,404,252,357.84,250.49,107.35
2,2024-01-12,1,44140001,3.04,68,21,1,12,2055,252,766.08,536.26,229.82
3,2024-01-12,1,42215001,0.64,75,22,1,12,135,264,168.96,118.27,50.69
4,2024-01-12,1,42535001,4.39,76,21,1,12,455,252,1106.28,774.4,331.88


#### Creating final fact table

In [20]:
sales_fact = fact_data[[
    'DateKey', 'customer_number', 'ProductKey', 'StoreKey',
    'QuantitySold', 'TotalDollarSales', 'TotalCostToStore', 'GrossProfit'
]].rename(columns={
    'customer_number': 'DailyCust#'
})

In [21]:
sales_fact.head()

Unnamed: 0,DateKey,DailyCust#,ProductKey,StoreKey,QuantitySold,TotalDollarSales,TotalCostToStore,GrossProfit
0,12,1,591,1,144,632.16,442.51,189.65
1,12,1,404,1,252,357.84,250.49,107.35
2,12,1,2055,1,252,766.08,536.26,229.82
3,12,1,135,1,264,168.96,118.27,50.69
4,12,1,455,1,252,1106.28,774.4,331.88


In [22]:
curr = conn.cursor()

curr.execute("DROP TABLE IF EXISTS SalesFact_TransactionLevel ")
curr.execute("""
    CREATE TABLE SalesFact_TransactionLevel (
        DateKey INT NOT NULL,
        "DailyCust#" INT NOT NULL,
        ProductKey INT NOT NULL,
        StoreKey INT NOT NULL,
        QuantitySold INT NOT NULL,
        TotalDollarSales REAL NOT NULL,
        TotalCostToStore REAL NOT NULL,
        GrossProfit REAL NOT NULL,
        PRIMARY KEY (DateKey, "DailyCust#", ProductKey),
        FOREIGN KEY (DateKey) REFERENCES DateDimension(DateKey),
        FOREIGN KEY (ProductKey) REFERENCES ProductDimension(ProductKey),
        FOREIGN KEY (StoreKey) REFERENCES StoreDimension(StoreKey)
    );
""")

<sqlite3.Cursor at 0x2031afed640>

In [23]:
sales_fact.to_sql("SalesFact_TransactionLevel", conn, if_exists='replace', index=False)

950516

In [24]:
curr.close()
conn.close()