# ETL

## Modules

In [17]:
import pandas as pd
from datetime import datetime, timedelta

## Data Extraction

### Data Loading

In [14]:
def load_xlsx_data(file):
    """Read an Excel workbook into a DataFrame.

    Parameters
    ----------
    file
        Anything ``pandas.read_excel`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pd.read_excel(file)`` returns with its default options.
    """
    return pd.read_excel(file)

### Data Cleaning

In [4]:
def check_null_values(df):
    """Print a per-column report of missing values in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    None
        The report is written to stdout only.
    """
    # Count nulls per column, then keep only the columns that have any.
    null_counts = df.isnull().sum()
    null_counts = null_counts[null_counts > 0]

    if null_counts.empty:
        print("✅ No null values found in the dataframe.")
    else:
        print("⚠️ Columns with null values:")
        for col, count in null_counts.items():
            print(f"{col}: {count}")

In [None]:
def check_duplicates(df):
    """Print how many fully duplicated rows ``df`` contains.

    A row counts as a duplicate when every column matches an earlier row
    (the default behaviour of ``DataFrame.duplicated``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    None
        The report is written to stdout only.
    """
    duplicate_count = int(df.duplicated().sum())

    if duplicate_count == 0:
        print("✅ No duplicate rows found in the dataframe.")
    else:
        print(f"⚠️ Duplicate rows found: {duplicate_count}")

In [5]:
def remove_outliers(df):
    """Return the rows of ``df`` with a non-negative quantity and a positive price.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Quantity`` and ``UnitPrice`` columns. Not modified.

    Returns
    -------
    pandas.DataFrame
        Rows where ``Quantity >= 0`` and ``UnitPrice > 0``, with the
        original index labels preserved.
    """
    frame = df.copy()

    non_negative_quantity = frame["Quantity"] >= 0
    positive_price = frame["UnitPrice"] > 0

    return frame[non_negative_quantity & positive_price]

In [6]:
def filter_last_year(df, current_date=datetime(2025, 8, 12)):
    """Keep only the rows invoiced within 365 days before ``current_date``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime-like ``InvoiceDate`` column. Not modified.
    current_date : datetime.datetime, optional
        Reference date marking the end of the window. Defaults to the
        date previously hard-coded in this function (2025-08-12).

    Returns
    -------
    pandas.DataFrame
        A new frame with the rows whose ``InvoiceDate`` is on or after
        ``current_date - 365 days``.
    """
    one_year_ago = current_date - timedelta(days=365)

    # Return the filtered frame; the earlier version computed it but fell
    # off the end of the function, so callers received None.
    return df[df["InvoiceDate"] >= one_year_ago].copy()

In [None]:
# TODO: in the main ETL function, explicitly convert each column to its
# intended data type.

### Data Enrichment

In [7]:
def create_total_sales(df):
    """Return a copy of ``df`` with a ``TotalSales`` column added.

    ``TotalSales`` is the element-wise product of ``Quantity`` and
    ``UnitPrice``. The input frame is left untouched.
    """
    line_totals = df["Quantity"] * df["UnitPrice"]
    return df.assign(TotalSales=line_totals)

In [8]:
def group_by_customer(df):
    """Aggregate ``df`` to one row per customer.

    Numeric columns are summed within each ``CustomerID``; non-numeric
    columns are dropped from the result.

    NOTE(review): the earlier version called ``reset_index()`` directly on
    the GroupBy object without any aggregation, so the intended aggregation
    is not known. Summing is an assumption and should be confirmed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``CustomerID`` column. Not modified.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``CustomerID`` (as a regular column) with the
        per-customer sums of the numeric columns.
    """
    return df.groupby("CustomerID").sum(numeric_only=True).reset_index()

## ETL Pipeline

In [24]:
def etl_pipeline(file="../../data/raw/Online Retail.xlsx"):
    """Run the ETL pipeline and return the resulting DataFrame.

    At present the pipeline consists of the extraction step only: the
    workbook is loaded with ``load_xlsx_data`` and returned as-is.

    Parameters
    ----------
    file : optional
        Location of the source workbook, passed straight to
        ``load_xlsx_data``. Defaults to the path that was previously
        hard-coded here, so ``etl_pipeline()`` behaves as before.

    Returns
    -------
    pandas.DataFrame
        The raw data as loaded from ``file``.
    """
    raw_df = load_xlsx_data(file)

    return raw_df

In [25]:
# Run the pipeline and keep the loaded frame for the cells that follow.
etl_df = etl_pipeline()

In [27]:
# Identifier and free-text columns are stored as Python strings.
# NOTE(review): astype(str) turns missing values into the literal string
# "nan", so null checks run after this cell will not see them.
for text_col in ("InvoiceNo", "StockCode", "Description", "CustomerID", "Country"):
    etl_df[text_col] = etl_df[text_col].astype(str)

# Unparseable values are coerced to missing instead of raising; the nullable
# Int64 dtype lets the quantity stay integer-typed while holding <NA>.
etl_df["Quantity"] = pd.to_numeric(etl_df["Quantity"], errors="coerce").astype("Int64")
etl_df["UnitPrice"] = pd.to_numeric(etl_df["UnitPrice"], errors="coerce").astype(float)

# Unparseable timestamps become NaT.
etl_df["InvoiceDate"] = pd.to_datetime(etl_df["InvoiceDate"], errors="coerce")

In [29]:
# Add the TotalSales column; create_total_sales works on a copy, so etl_df
# itself is not modified by this call.
sales_df = create_total_sales(etl_df)

In [30]:
# Inspect the dtype of every column in the enriched frame.
sales_df.dtypes

InvoiceNo              object
StockCode              object
Description            object
Quantity                Int64
InvoiceDate    datetime64[ns]
UnitPrice             float64
CustomerID             object
Country                object
TotalSales            Float64
dtype: object

In [26]:
# Preview the first five rows of etl_df.
etl_df.head(5)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
