#### Deliverable 6: Sales Fact Table (Daily Level) Implementation

In [1]:
import pandas as pd
import sqlite3 as lite

In [2]:
conn = lite.connect("store1.db")
output_file_path = "./output/"

In [3]:
# load dimension tables
date_dim = pd.read_sql("select DateKey, Date from DateDimension", conn)
products = pd.read_sql("select ProductKey, SKU from ProductDimension", conn)
stores = pd.read_sql("select StoreKey from StoreDimension", conn)

In [4]:
store_databases = {
    "Store 1": {
        "database_name": "store1.db",
        "transactions_table_name": "user_transactions",
    },
    "Store 2": {
        "database_name": "store2.db",
        "transactions_table_name": "user_transactions",
    },
    "Store 3": {
        "database_name": "store3.db",
        "transactions_table_name": "user_transactions",
    },
    "Store 4": {
        "database_name": "store4.db",
        "transactions_table_name": "sales_transactions"
    }
}

In [5]:
def load_all_transactions(store):
    conn = lite.connect(store['database_name'])
    table_name = store['transactions_table_name']

    transactions_query = f"select * from {table_name}"
    transaction_df = pd.read_sql(transactions_query, conn)

    conn.close()
    return transaction_df

In [6]:
all_daily_transactions = []

def new_func(transactions):
    daily_agg = transactions.groupby(['DateKey', "ProductKey", "StoreKey"]).agg({
        'QuantitySold': 'sum',
        'TotalDollarSales': 'sum',
        'TotalCostToStore': 'sum',
        'GrossProfit': 'sum',
        'customer_number': 'nunique'   # Count unique customers per day
    }).reset_index()
    
    # Format TotalDollarSales to 2 decimal points
    daily_agg['TotalDollarSales'] = daily_agg['TotalDollarSales'].round(2)
    daily_agg['TotalCostToStore'] = daily_agg['TotalCostToStore'].round(2)
    daily_agg['GrossProfit'] = daily_agg['GrossProfit'].round(2)
    
    return daily_agg

for store_name, config in store_databases.items():
    print(f"Processing {store_name}...")

    # Load transactions
    transactions = load_all_transactions(config)
    transactions['StoreKey'] = store_name.split(" ")[1]

    # Convert date to datetime and merge with date dimension
    transactions['temp_date'] = pd.to_datetime(transactions['date'])
    transactions = transactions.merge(
        date_dim,
        left_on="temp_date",
        right_on=pd.to_datetime(date_dim['Date']),
        how='left'
    ).drop(columns=['temp_date', 'Date'])

    # Ensure 'sku' is the same type as 'SKU'
    transactions['sku'] = pd.to_numeric(
        # Handles NaN if needed
        transactions['sku'], errors='coerce').astype('Int64')

    # Merge with product dimension
    transactions = transactions.merge(
        products,
        left_on='sku',
        right_on='sku',
        how="left"
    )

    # Calculate metrics
    transactions['QuantitySold'] = transactions['cases_ordered'] * 12
    transactions['TotalDollarSales'] = transactions['sales_price'] * \
        transactions['QuantitySold']

    # Assume the store buys the product at 70% of the retail price
    transactions['TotalCostToStore'] = transactions['sales_price'] * \
        0.7 * transactions['QuantitySold']

    transactions['GrossProfit'] = transactions['TotalDollarSales'] - \
        transactions['TotalCostToStore']

    # Group by DateKey, ProductKey, StoreKey to get daily aggregates
    daily_agg = new_func(transactions)

    # Rename columns to match the required schema
    daily_agg = daily_agg.rename(columns={
        'QuantitySold': '#SoldToday',
        'TotalDollarSales': 'SalesTotal',
        'TotalCostToStore': 'CostOfItemsSold',
        'customer_number': 'DailyCustomerCount'
    })

    all_daily_transactions.append(daily_agg)

Processing Store 1...
Processing Store 2...
Processing Store 3...
Processing Store 4...


In [7]:
# combine all stores data
daily_sales_fact = pd.concat(all_daily_transactions)

In [8]:
daily_sales_fact.sample(5)

Unnamed: 0,DateKey,ProductKey,StoreKey,#SoldToday,SalesTotal,CostOfItemsSold,GrossProfit,DailyCustomerCount
188689,94.0,1109,2,71640,50864.4,35605.08,15259.32,30
24789,13.0,580,3,12672,30286.08,21200.26,9085.82,31
250506,125.0,587,1,40176,309355.2,216548.64,92806.56,18
389207,195.0,25,3,171864,539652.96,377757.07,161895.89,33
9332,5.0,1302,4,240,1128.0,789.6,338.4,5


In [9]:
daily_sales_fact.shape

(2950945, 8)

In [10]:
daily_sales_fact['StoreKey'].value_counts()

StoreKey
4    739451
2    738235
1    737655
3    735604
Name: count, dtype: int64

##### Filter out any rows with missing keys

In [11]:
daily_sales_fact = daily_sales_fact.dropna(subset=['DateKey', "ProductKey", "StoreKey"])
daily_sales_fact.sample(5)

Unnamed: 0,DateKey,ProductKey,StoreKey,#SoldToday,SalesTotal,CostOfItemsSold,GrossProfit,DailyCustomerCount
465701,231.0,1750,2,161124,380252.64,266176.85,114075.79,28
54216,27.0,1823,3,16896,80762.88,56534.02,24228.86,21
257291,128.0,1359,1,31752,93985.92,65790.14,28195.78,14
300237,149.0,1342,4,84900,316677.0,221673.9,95003.1,24
502634,249.0,1677,4,74760,275116.8,192581.76,82535.04,14


In [12]:
daily_sales_fact.shape

(2950945, 8)

##### convert keys to integers

In [13]:
daily_sales_fact.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2950945 entries, 0 to 739450
Data columns (total 8 columns):
 #   Column              Dtype  
---  ------              -----  
 0   DateKey             float64
 1   ProductKey          int64  
 2   StoreKey            object 
 3   #SoldToday          int64  
 4   SalesTotal          float64
 5   CostOfItemsSold     float64
 6   GrossProfit         float64
 7   DailyCustomerCount  int64  
dtypes: float64(4), int64(3), object(1)
memory usage: 202.6+ MB


In [14]:
daily_sales_fact['DateKey'] = daily_sales_fact['DateKey'].astype(int)
daily_sales_fact['ProductKey'] = daily_sales_fact['ProductKey'].astype(int)
daily_sales_fact['StoreKey'] = daily_sales_fact['StoreKey'].astype(int)

In [15]:
curr = conn.cursor()

curr.execute("Drop table if exists SalesFact_DailyLevel")
curr.execute("""
    CREATE TABLE SalesFact_DailyLevel (
        DateKey INT NOT NULL,
        ProductKey INT NOT NULL,
        StoreKey INT NOT NULL,
        "#SoldToday" INT NOT NULL,
        CostOfItemsSold REAL NOT NULL,
        SalesTotal REAL NOT NULL,
        GrossProfit REAL NOT NULL,
        DailyCustomerCount INT NOT NULL,
        PRIMARY KEY (DateKey, ProductKey, StoreKey),
        FOREIGN KEY (DateKey) REFERENCES DateDimension(DateKey),
        FOREIGN KEY (ProductKey) REFERENCES ProductDimension(ProductKey),
        FOREIGN KEY (StoreKey) REFERENCES StoreDimension(StoreKey)
    );
""")

<sqlite3.Cursor at 0x1caac2295c0>

In [16]:
daily_sales_fact.to_sql("SalesFact_DailyLevel", conn, if_exists='replace', index=False)

2950945

##### Test

In [17]:
sales_fact = pd.read_sql("select * from SalesFact_DailyLevel limit 10", conn)
sales_fact

Unnamed: 0,DateKey,ProductKey,StoreKey,#SoldToday,SalesTotal,CostOfItemsSold,GrossProfit,DailyCustomerCount
0,1,1,1,1440,3945.6,2761.92,1183.68,20
1,1,2,1,1008,1985.76,1390.03,595.73,14
2,1,3,1,1296,5689.44,3982.61,1706.83,18
3,1,4,1,1296,5689.44,3982.61,1706.83,18
4,1,5,1,1800,5922.0,4145.4,1776.6,25
5,1,6,1,936,2059.2,1441.44,617.76,13
6,1,7,1,936,11934.0,8353.8,3580.2,13
7,1,8,1,1080,5929.2,4150.44,1778.76,15
8,1,9,1,1296,5119.2,3583.44,1535.76,18
9,1,10,1,1296,6829.92,4780.94,2048.98,17


In [18]:
curr.close()
conn.close()

In [None]:
daily_sales_fact.to_csv(f"{output_file_path}SalesFact_DailyLevel.csv", index=False)