#### Deliverable 6: Sales Fact Table (Daily Level) Implementation

In [34]:
import pandas as pd
import sqlite3 as lite
from datetime import datetime

In [35]:
conn = lite.connect("store1.db")

In [36]:
# load dimension tables
date_dim = pd.read_sql("select DateKey, Date from DateDimension", conn)
products = pd.read_sql("select ProductKey, SKU from ProductDimension", conn)
stores = pd.read_sql("select StoreKey from StoreDimension", conn)

In [37]:
store_databases = {
    "Store 1": {
        "database_name": "store1.db",
        "transactions_table_name": "user_transactions",
    },
    "Store 2": {
        "database_name": "store2.db",
        "transactions_table_name": "user_transactions",
    },
    "Store 3": {
        "database_name": "store3.db",
        "transactions_table_name": "user_transactions",
    },
    "Store 4": {
        "database_name": "store4.db",
        "transactions_table_name": "sales_transactions"
    }
}

In [38]:
def load_all_transactions(store):
    conn = lite.connect(store['database_name'])
    table_name = store['transactions_table_name']

    transactions_query = f"select * from {table_name}"
    transaction_df = pd.read_sql(transactions_query, conn)

    conn.close()
    return transaction_df

In [39]:
all_daily_transactions = []

for store_name, config in store_databases.items():
    print(f"Processing {store_name}...")

    # Load transactions
    transactions = load_all_transactions(config)
    transactions['StoreKey'] = store_name.split(" ")[1]

    # Convert date to datetime and merge with date dimension
    transactions['temp_date'] = pd.to_datetime(transactions['date'])
    transactions = transactions.merge(
        date_dim,
        left_on="temp_date",
        right_on=pd.to_datetime(date_dim['Date']),
        how='left'
    ).drop(columns=['temp_date', 'Date'])

    # Ensure 'sku' is the same type as 'SKU'
    transactions['sku'] = pd.to_numeric(
        # Handles NaN if needed
        transactions['sku'], errors='coerce').astype('Int64')

    # Merge with product dimension
    transactions = transactions.merge(
        products,
        left_on='sku',
        right_on='SKU',
        how="left"
    )

    # Calculate metrics
    transactions['QuantitySold'] = transactions['cases_ordered'] * 12
    transactions['TotalDollarSales'] = transactions['sales_price'] * \
        transactions['QuantitySold']

    # Assume the store buys the product at 70% of the retail price
    transactions['TotalCostToStore'] = transactions['sales_price'] * \
        0.7 * transactions['QuantitySold']

    transactions['GrossProfit'] = transactions['TotalDollarSales'] - \
        transactions['TotalCostToStore']

    # Group by DateKey, ProductKey, StoreKey to get daily aggregates
    daily_agg = transactions.groupby(['DateKey', "ProductKey", "StoreKey"]).agg({
        'QuantitySold': 'sum',
        'TotalDollarSales': 'sum',
        'TotalCostToStore': 'sum',
        'GrossProfit': 'sum',
        'customer_number': 'nunique'   # Count unique customers per day
    }).reset_index()

    # Rename columns to match the required schema
    daily_agg = daily_agg.rename(columns={
        'QuantitySold': '#SoldToday',
        'TotalDollarSales': 'SalesTotal',
        'TotalCostToStore': 'CostOfItemsSold',
        'customer_number': 'DailyCustomerCount'
    })

    all_daily_transactions.append(daily_agg)

Processing Store 1...
Processing Store 2...
Processing Store 3...
Processing Store 4...


In [40]:
# combine all stores data
daily_sales_fact = pd.concat(all_daily_transactions)

In [41]:
daily_sales_fact.sample(5)

Unnamed: 0,DateKey,ProductKey,StoreKey,#SoldToday,SalesTotal,CostOfItemsSold,GrossProfit,DailyCustomerCount
237452,104.0,1399,4,156,182.52,127.764,54.756,1
766562,336.0,309,1,57360,125618.4,87932.88,37685.52,10
140212,62.0,630,1,672,7754.88,5428.416,2326.464,4
303568,133.0,1497,2,4200,16716.0,11701.2,5014.8,7
629277,275.0,1221,4,136488,421747.92,295223.544,126524.376,22


In [42]:
daily_sales_fact.shape

(3348702, 8)

In [43]:
daily_sales_fact['StoreKey'].value_counts()

StoreKey
4    839003
2    837784
1    837207
3    834708
Name: count, dtype: int64

##### Filter out any rows with missing keys

In [44]:
daily_sales_fact = daily_sales_fact.dropna(subset=['DateKey', "ProductKey", "StoreKey"])
daily_sales_fact.sample(5)

Unnamed: 0,DateKey,ProductKey,StoreKey,#SoldToday,SalesTotal,CostOfItemsSold,GrossProfit,DailyCustomerCount
654992,286.0,1766,4,142968,1160900.16,812630.112,348270.048,23
428803,189.0,960,3,8844,26532.0,18572.4,7959.6,11
453669,198.0,2202,4,600,828.0,579.6,248.4,2
200177,89.0,1145,3,78012,350273.88,245191.716,105082.164,33
769241,336.0,1307,4,133704,419830.56,293881.392,125949.168,18


In [45]:
daily_sales_fact.shape

(3348702, 8)

##### convert keys to integers

In [46]:
daily_sales_fact.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3348702 entries, 0 to 839002
Data columns (total 8 columns):
 #   Column              Dtype  
---  ------              -----  
 0   DateKey             float64
 1   ProductKey          int64  
 2   StoreKey            object 
 3   #SoldToday          int64  
 4   SalesTotal          float64
 5   CostOfItemsSold     float64
 6   GrossProfit         float64
 7   DailyCustomerCount  int64  
dtypes: float64(4), int64(3), object(1)
memory usage: 229.9+ MB


In [47]:
daily_sales_fact['DateKey'] = daily_sales_fact['DateKey'].astype(int)
daily_sales_fact['ProductKey'] = daily_sales_fact['ProductKey'].astype(int)
daily_sales_fact['StoreKey'] = daily_sales_fact['StoreKey'].astype(int)

In [48]:
curr = conn.cursor()

curr.execute("Drop table if exists SalesFact_DailyLevel")
curr.execute("""
    CREATE TABLE SalesFact_DailyLevel (
        DateKey INT NOT NULL,
        ProductKey INT NOT NULL,
        StoreKey INT NOT NULL,
        "#SoldToday" INT NOT NULL,
        CostOfItemsSold REAL NOT NULL,
        SalesTotal REAL NOT NULL,
        GrossProfit REAL NOT NULL,
        DailyCustomerCount INT NOT NULL,
        PRIMARY KEY (DateKey, ProductKey, StoreKey),
        FOREIGN KEY (DateKey) REFERENCES DateDimension(DateKey),
        FOREIGN KEY (ProductKey) REFERENCES ProductDimension(ProductKey),
        FOREIGN KEY (StoreKey) REFERENCES StoreDimension(StoreKey)
    );
""")

<sqlite3.Cursor at 0x1850b1e3540>

In [49]:
daily_sales_fact.to_sql("SalesFact_DailyLevel", conn, if_exists='replace', index=False)

3348702

##### Test

In [50]:
sales_fact = pd.read_sql("select * from SalesFact_DailyLevel limit 10", conn)
sales_fact

Unnamed: 0,DateKey,ProductKey,StoreKey,#SoldToday,SalesTotal,CostOfItemsSold,GrossProfit,DailyCustomerCount
0,1,1,1,1440,3945.6,2761.92,1183.68,20
1,1,2,1,1008,1985.76,1390.032,595.728,14
2,1,3,1,1296,5689.44,3982.608,1706.832,18
3,1,4,1,1296,5689.44,3982.608,1706.832,18
4,1,5,1,1296,5689.44,3982.608,1706.832,18
5,1,6,1,1800,5922.0,4145.4,1776.6,25
6,1,7,1,936,2059.2,1441.44,617.76,13
7,1,8,1,936,11934.0,8353.8,3580.2,13
8,1,9,1,1080,5929.2,4150.44,1778.76,15
9,1,10,1,1296,5119.2,3583.44,1535.76,18


In [51]:
curr.close()
conn.close()

In [52]:
daily_sales_fact.to_csv("SalesFact_DailyLevel.csv", index=False)