# ETL

## Modules

In [1]:
import pandas as pd
from datetime import datetime, timedelta

## Data Extraction

### DataFrame Logging

In [2]:
def row_logger(df, stage_name):
    print(f"Stage: {stage_name} | Rows: {len(df)}")

### Data Loading

In [3]:
def load_xlsx_data(file):
    df = pd.read_excel(file)

    return df

### Data Cleaning

In [14]:
def check_null_values(df):
    null_counts = df.isnull().sum()
    null_counts = null_counts[null_counts > 0]
    
    if null_counts.empty:
        print("No null values found in the dataframe.")
        return False
    else:
        print("Columns with null values:")
        for col, count in null_counts.items():
            print(f"{col}: {count}")
        return True

In [5]:
def check_duplicates(df):
    duplicates = df.duplicated()
    
    if duplicates.any():
        print("Duplicates found!")
        return True
    else:
        print("No duplicates found.")
        return False

In [6]:
def drop_duplicates(df):
    row_logger(df, "Before dropping duplicates")
    
    df_cleaned = df.drop_duplicates()
    
    row_logger(df_cleaned, "After dropping duplicates")
    
    return df_cleaned

In [7]:
def remove_outliers(df):
    row_logger(df, "Before removing outliers")
    
    outliers = df.copy()
    
    outliers = outliers[(outliers["Quantity"] >= 0) & (outliers["UnitPrice"] > 0)]

    row_logger(outliers, "After removing outliers")

    return outliers

In [20]:
def datatype_conversion(df):
    row_logger(df, "Before datatype conversion")
    
    converted = df.copy()

    print(f"These are the columns to convert:\n")
    print(converted.columns.to_list())

    converted["InvoiceNo"] = converted["InvoiceNo"].astype(str)
    converted["StockCode"] = converted["StockCode"].astype(str)
    converted["Description"] = converted["Description"].astype(str)
    converted["Quantity"] = pd.to_numeric(converted["Quantity"], errors="coerce").astype("Int64")  # allows NaN
    converted["InvoiceDate"] = pd.to_datetime(converted["InvoiceDate"], errors="coerce")
    converted["UnitPrice"] = pd.to_numeric(converted["UnitPrice"], errors="coerce").astype(float)
    converted["CustomerID"] = converted["CustomerID"].astype(str)
    converted["Country"] = converted["Country"].astype(str)

    print(converted.dtypes)

    row_logger(converted, "After datatype conversion")

    return converted

### Data Enrichment

In [9]:
def create_total_sales(df):
    row_logger(df, "Before creating TotalSales column")
    
    total_sales = df.copy()
    total_sales['TotalSales'] = total_sales['Quantity'] * total_sales['UnitPrice']

    row_logger(total_sales, "After creating TotalSales column")

    return total_sales

In [17]:
def filter_last_year(df):
    row_logger(df, "Before filtering to the last year sales")
    
    latest_date = df['InvoiceDate'].max()
    one_year_ago = latest_date - timedelta(days=365)
    
    filtered = df[df["InvoiceDate"] >= one_year_ago]

    row_logger(filtered, "After filtering to the last year sales")
    
    return filtered

In [22]:
def aggregate_customer_summary(df):
    if 'CustomerID' not in df or 'TotalSales' not in df or 'Quantity' not in df or 'InvoiceDate' not in df:
        raise ValueError("The DataFrame must contain 'CustomerID', 'TotalSales', 'Quantity', and 'InvoiceDate' columns.")

    customer_summary = df.groupby('CustomerID').agg(
        total_sales=('TotalSales', 'sum'),
        total_quantity=('Quantity', 'sum'),
        avg_sales_per_transaction=('TotalSales', 'mean'),
        num_transactions=('InvoiceNo', 'nunique'),  # Count distinct transactions
        first_purchase_date=('InvoiceDate', 'min'),
        last_purchase_date=('InvoiceDate', 'max')
    ).reset_index()
    
    return customer_summary

### Data Warehousing

In [None]:
def create_customer_dimension(df):
    """
    Creates a customer dimension table.
    This includes CustomerID and other customer-related fields.
    """
    # Grouping by CustomerID to get unique customer data
    customer_dim = df[['CustomerID', 'Country']].drop_duplicates()

    # Optionally, you can add more fields like customer name, address if available
    # For now, we're keeping it simple with just CustomerID and Country
    
    return customer_dim

In [None]:
def create_date_dimension(df):
    """
    Creates a date dimension table.
    This table includes detailed date information such as day, month, quarter, and year.
    """
    # Create a Date Dimension table from the InvoiceDate column
    date_dim = pd.DataFrame()

    # Extract date parts
    date_dim['Date'] = df['InvoiceDate'].dt.date
    date_dim['Year'] = df['InvoiceDate'].dt.year
    date_dim['Month'] = df['InvoiceDate'].dt.month
    date_dim['Day'] = df['InvoiceDate'].dt.day
    date_dim['Weekday'] = df['InvoiceDate'].dt.weekday
    date_dim['Quarter'] = df['InvoiceDate'].dt.quarter

    # Drop duplicates to make sure each date is unique
    date_dim = date_dim.drop_duplicates()

    return date_dim

In [None]:
def create_product_dimension(df):
    """
    Creates a product dimension table.
    This table includes StockCode, Description, and possibly other product-related details.
    """
    product_dim = df[['StockCode', 'Description']].drop_duplicates()
    
    return product_dim

In [None]:
def create_sales_fact(df):
    """
    Creates a sales fact table.
    This table includes the InvoiceNo, CustomerID, Quantity, UnitPrice, and InvoiceDate.
    """
    # Calculate the Total Sales per transaction
    df['TotalSales'] = df['Quantity'] * df['UnitPrice']
    
    # Create the Sales Fact table with relevant columns
    sales_fact = df[['InvoiceNo', 'CustomerID', 'Quantity', 'UnitPrice', 'TotalSales', 'InvoiceDate']]
    
    return sales_fact

## ETL Pipeline

In [26]:
def etl_pipeline():
    # Load the Excel file into the raw dataframe
    raw_df = load_xlsx_data("../../data/raw/Online Retail.xlsx")

    # Perform datatype conversion for all the columns in the dataframe
    df_converted = datatype_conversion(raw_df)

    # Original Intent: Filter sales data to the last year assuming that the day is August 12 2025
    # This is done immediately after datatype conversion since
    # 1. Only this data will be saved to the database
    # 2. It will minimise computational complexity
    # 3. The last year is the time range of interest
    # The functionality of the filter_last_year() function has been modified
    # It filters for the last year starting at the last InvoiceDate
    # This is done because there is no data from August 12 2024 to August 12 2025
    # Actual Intent: Filter sales data for the last year starting at the last InvoiceDate
    df_filtered = filter_last_year(df_converted)

    # Check for missing values to cross validate the UCI ML statistics
    # There should be no missing values
    check_null_values(df_filtered)
    
    # Handle duplicates by dropping them if they exist
    if check_duplicates(df_filtered):
        df_no_duplicates = drop_duplicates(df_filtered)
    else:
        df_no_duplicates = df_filtered

    # Handle outliers in the Quantity and UnitPrice columns
    df_no_outliers = remove_outliers(df_no_duplicates)

    # Compute the TotalSales Column
    df_revenue = create_total_sales(df_no_outliers)

    # Group data by the CustomerID
    df_customer = aggregate_customer_summary(df_revenue)

    return raw_df, df_customer

In [27]:
original_df, customer_df = etl_pipeline()

Stage: Before datatype conversion | Rows: 541909
These are the columns to convert:

['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate', 'UnitPrice', 'CustomerID', 'Country']
InvoiceNo              object
StockCode              object
Description            object
Quantity                Int64
InvoiceDate    datetime64[ns]
UnitPrice             float64
CustomerID             object
Country                object
dtype: object
Stage: After datatype conversion | Rows: 541909
Stage: Before filtering to the last year sales | Rows: 541909
Stage: After filtering to the last year sales | Rows: 521669
No null values found in the dataframe.
Duplicates found!
Stage: Before dropping duplicates | Rows: 521669
Stage: After dropping duplicates | Rows: 516717
Stage: Before removing outliers | Rows: 516717
Stage: After removing outliers | Rows: 505352
Stage: Before creating TotalSales column | Rows: 505352
Stage: After creating TotalSales column | Rows: 505352


In [28]:
original_df.head(5)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [29]:
customer_df.head(5)

Unnamed: 0,CustomerID,total_sales,total_quantity,avg_sales_per_transaction,num_transactions,first_purchase_date,last_purchase_date
0,12346.0,77183.6,74215,77183.6,1,2011-01-18 10:01:00,2011-01-18 10:01:00
1,12347.0,3598.21,2139,23.829205,6,2011-01-26 14:30:00,2011-12-07 15:52:00
2,12348.0,1797.24,2341,57.975484,4,2010-12-16 19:09:00,2011-09-25 13:13:00
3,12349.0,1757.55,631,24.076027,1,2011-11-21 09:51:00,2011-11-21 09:51:00
4,12350.0,334.4,197,19.670588,1,2011-02-02 16:01:00,2011-02-02 16:01:00
