#### Deliverable 6: Sales Fact Table (Daily Level) Implementation

In [1]:
import pandas as pd
import sqlite3 as lite

In [2]:
conn = lite.connect("store1.db")
output_file_path = "./output/"

In [3]:
# load dimension tables
date_dim = pd.read_sql("select DateKey, Date from DateDimension", conn)
products = pd.read_sql("select ProductKey, SKU from ProductDimension", conn)
stores = pd.read_sql("select StoreKey from StoreDimension", conn)

In [4]:
store_databases = {
    "Store 1": {
        "database_name": "store1.db",
        "transactions_table_name": "sales_transactions",
    },
    "Store 2": {
        "database_name": "store2.db",
        "transactions_table_name": "sales_transactions",
    },
    "Store 3": {
        "database_name": "store3.db",
        "transactions_table_name": "sales_transactions",
    },
    "Store 4": {
        "database_name": "store4.db",
        "transactions_table_name": "sales_transactions"
    }
}

In [5]:
def load_all_transactions(store):
    conn = lite.connect(store['database_name'])
    table_name = store['transactions_table_name']

    transactions_query = f"select * from {table_name}"
    transaction_df = pd.read_sql(transactions_query, conn)

    conn.close()
    return transaction_df

In [7]:
all_daily_transactions = []

def new_func(transactions):
    daily_agg = transactions.groupby(['DateKey', "ProductKey", "StoreKey"]).agg({
        'QuantitySold': 'sum',
        'TotalDollarSales': 'sum',
        'TotalCostToStore': 'sum',
        'GrossProfit': 'sum',
        'customer_number': 'nunique'   # Count unique customers per day
    }).reset_index()
    
    # Format TotalDollarSales to 2 decimal points
    daily_agg['TotalDollarSales'] = daily_agg['TotalDollarSales'].round(2)
    daily_agg['TotalCostToStore'] = daily_agg['TotalCostToStore'].round(2)
    daily_agg['GrossProfit'] = daily_agg['GrossProfit'].round(2)
    
    return daily_agg

for store_name, config in store_databases.items():
    print(f"Processing {store_name}...")

    # Load transactions
    transactions = load_all_transactions(config)
    transactions['StoreKey'] = store_name.split(" ")[1]

    # Convert date to datetime and merge with date dimension
    transactions['temp_date'] = pd.to_datetime(transactions['date'])
    transactions = transactions.merge(
        date_dim,
        left_on="temp_date",
        right_on=pd.to_datetime(date_dim['Date']),
        how='left'
    ).drop(columns=['temp_date', 'Date'])

    # Ensure 'sku' is the same type as 'SKU'
    transactions['sku'] = pd.to_numeric(
        # Handles NaN if needed
        transactions['sku'], errors='coerce').astype('Int64')

    # Merge with product dimension
    transactions = transactions.merge(
        products,
        left_on='sku',
        right_on='sku',
        how="left"
    )

    # Calculate metrics
    transactions['QuantitySold'] = transactions['cases_ordered'] * 12
    transactions['TotalDollarSales'] = transactions['salesPrice'] * \
        transactions['QuantitySold']

    # Assume the store buys the product at 70% of the retail price
    transactions['TotalCostToStore'] = transactions['salesPrice'] * \
        0.7 * transactions['QuantitySold']

    transactions['GrossProfit'] = transactions['TotalDollarSales'] - \
        transactions['TotalCostToStore']

    # Group by DateKey, ProductKey, StoreKey to get daily aggregates
    daily_agg = new_func(transactions)

    # Rename columns to match the required schema
    daily_agg = daily_agg.rename(columns={
        'QuantitySold': '#SoldToday',
        'TotalDollarSales': 'SalesTotal',
        'TotalCostToStore': 'CostOfItemsSold',
        'customer_number': 'DailyCustomerCount'
    })

    all_daily_transactions.append(daily_agg)

Processing Store 1...
Processing Store 2...
Processing Store 3...
Processing Store 4...


In [8]:
# combine all stores data
daily_sales_fact = pd.concat(all_daily_transactions)

In [9]:
daily_sales_fact.sample(5)

Unnamed: 0,DateKey,ProductKey,StoreKey,#SoldToday,SalesTotal,CostOfItemsSold,GrossProfit,DailyCustomerCount
321101,160,714,1,71700,157023.0,109916.1,47106.9,24
130483,65,1300,4,576,2338.56,1636.99,701.57,2
177883,89,166,4,44988,78279.12,54795.38,23483.74,23
26917,14,740,1,4800,14784.0,10348.8,4435.2,16
61458,31,925,3,25536,51838.08,36286.66,15551.42,28


In [10]:
daily_sales_fact.shape

(1469153, 8)

In [11]:
daily_sales_fact['StoreKey'].value_counts()

StoreKey
4    367590
3    367468
2    367330
1    366765
Name: count, dtype: int64

##### Filter out any rows with missing keys

In [12]:
daily_sales_fact = daily_sales_fact.dropna(subset=['DateKey', "ProductKey", "StoreKey"])
daily_sales_fact.sample(5)

Unnamed: 0,DateKey,ProductKey,StoreKey,#SoldToday,SalesTotal,CostOfItemsSold,GrossProfit,DailyCustomerCount
230000,114,1918,3,87480,181958.4,127370.88,54587.52,27
136127,68,848,4,32004,122895.36,86026.75,36868.61,21
180731,90,1093,3,62700,187473.0,131231.1,56241.9,25
347515,173,911,1,48960,30844.8,21591.36,9253.44,16
59832,30,1236,4,120,165.6,115.92,49.68,2


In [13]:
daily_sales_fact.shape

(1469153, 8)

##### convert keys to integers

In [14]:
daily_sales_fact.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1469153 entries, 0 to 367589
Data columns (total 8 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   DateKey             1469153 non-null  int64  
 1   ProductKey          1469153 non-null  int64  
 2   StoreKey            1469153 non-null  object 
 3   #SoldToday          1469153 non-null  int64  
 4   SalesTotal          1469153 non-null  float64
 5   CostOfItemsSold     1469153 non-null  float64
 6   GrossProfit         1469153 non-null  float64
 7   DailyCustomerCount  1469153 non-null  int64  
dtypes: float64(3), int64(4), object(1)
memory usage: 100.9+ MB


In [15]:
daily_sales_fact['DateKey'] = daily_sales_fact['DateKey'].astype(int)
daily_sales_fact['ProductKey'] = daily_sales_fact['ProductKey'].astype(int)
daily_sales_fact['StoreKey'] = daily_sales_fact['StoreKey'].astype(int)

In [16]:
curr = conn.cursor()

curr.execute("Drop table if exists SalesFact_DailyLevel")
curr.execute("""
    CREATE TABLE SalesFact_DailyLevel (
        DateKey INT NOT NULL,
        ProductKey INT NOT NULL,
        StoreKey INT NOT NULL,
        "#SoldToday" INT NOT NULL,
        CostOfItemsSold REAL NOT NULL,
        SalesTotal REAL NOT NULL,
        GrossProfit REAL NOT NULL,
        DailyCustomerCount INT NOT NULL,
        PRIMARY KEY (DateKey, ProductKey, StoreKey),
        FOREIGN KEY (DateKey) REFERENCES DateDimension(DateKey),
        FOREIGN KEY (ProductKey) REFERENCES ProductDimension(ProductKey),
        FOREIGN KEY (StoreKey) REFERENCES StoreDimension(StoreKey)
    );
""")

<sqlite3.Cursor at 0x16c2f96a4c0>

In [17]:
daily_sales_fact.to_sql("SalesFact_DailyLevel", conn, if_exists='replace', index=False)

1469153

##### Test

In [18]:
sales_fact = pd.read_sql("select * from SalesFact_DailyLevel limit 10", conn)
sales_fact

Unnamed: 0,DateKey,ProductKey,StoreKey,#SoldToday,SalesTotal,CostOfItemsSold,GrossProfit,DailyCustomerCount
0,1,1,1,1008,2761.92,1933.34,828.58,12
1,1,2,1,1512,2978.64,2085.05,893.59,18
2,1,3,1,924,4056.36,2839.45,1216.91,11
3,1,4,1,924,4056.36,2839.45,1216.91,11
4,1,5,1,1092,3592.68,2514.88,1077.8,13
5,1,6,1,1428,3141.6,2199.12,942.48,17
6,1,7,1,1512,19278.0,13494.6,5783.4,18
7,1,8,1,1512,8300.88,5810.62,2490.26,18
8,1,9,1,1176,4645.2,3251.64,1393.56,14
9,1,10,1,1260,6640.2,4648.14,1992.06,15


In [19]:
curr.close()
conn.close()

In [None]:
daily_sales_fact.to_csv(f"{output_file_path}SalesFact_DailyLevel.csv", index=False)