In [33]:
import pandas as pd
from dataclasses import dataclass
from typing import Optional
import re
from datetime import datetime
from dataclasses import dataclass
from collections import defaultdict
from dataclasses import field


Load raw CSV, display first 3 rows

Here I have used a Synthetic data in which the csv file contains 550 rows of mock jewelry_dataset which contains feilds such as 'date', 'customer_id', 'product', 'price', 'quantity', 'coupon_code', 'shipping_city'.

In [34]:
# Load the CSV using pandas
df = pd.read_csv("data/jewelry_dataset.csv")
print("Header:", df.columns.tolist())
print(df.head(3))

Header: ['date', 'customer_id', 'product', 'price', 'quantity', 'coupon_code', 'shipping_city']
         date                           customer_id   product    price  \
0  2024-10-08  e79f819f-ed09-472d-981b-279a33c78006      Ring  4724.35   
1  2024-07-16  1934841e-524e-48cc-85d8-0203d03f20aa  Earrings  4743.77   
2  2025-03-12  02c36dba-7500-4c75-8162-b9c551225a11      Ring   464.57   

   quantity coupon_code shipping_city  
0         5        NONE      San Jose  
1         4    FREESHIP        Dallas  
2         2        NONE   Los Angeles  


Pick the Right Container

Among the choices I think for storing each row, we had dictionaries, namedtuples and classes.
I have choose a class because to cover logic like cleaning, changing and calculating totals.
As a result, transaction objects become more flexible and can be used again.


Transaction Class

# Import Packages

# Data Class

In [35]:
@dataclass
class Transaction:
    date: str
    customer_id: str
    product: str
    price: Optional[float]
    quantity: int
    coupon_code: Optional[str]
    shipping_city: str
    percentage_of_discount: float = field(default=0.0)  # Derived field, set later
    days_since_purchase: Optional[int] = field(default=None)  # Derived field, set later


    def price_data_clean(self):
        try:
            self.price = float(self.price)
            if self.price < 0:
                self.price = 0.00
        except:
            self.price = 0.00

    def price_total_value(self):
        return self.price * self.quantity if self.price else 0

# Bulk Loader

In [36]:
# Function to load and return cleaned transactions
def load_data_transactions() -> list[Transaction]:
    df = pd.read_csv('data/jewelry_dataset.csv')
    transactions_data = [
        Transaction(
            date=row["date"],
            customer_id=row["customer_id"],
            product=row["product"],
            price=row["price"],
            quantity=row["quantity"],
            coupon_code=row["coupon_code"],
            shipping_city=row["shipping_city"]
        ) for _, row in df.iterrows()
    ]
    print(transactions_data)
    return transactions_data

# Statistic Profiling: Minimum, Mean, Maximum price and unique city count (set)

In [37]:

def transactions_data_statstic_profile(list_of_transactions: list) -> None:
    valid_prices = [t.price for t in list_of_transactions if isinstance(t.price, (int, float)) and t.price is not None]
    unique_cities = len(set(t.shipping_city for t in list_of_transactions))
    print("Transaction Price Statstic:")
    print(f"Minimum: {min(valid_prices):.2f}")
    print(f"Maximum: {max(valid_prices):.2f}")
    print(f"Mean: {sum(valid_prices)/len(valid_prices):.2f}")
    print(f"Shipping Cities Unique Values: {unique_cities}\n")

#  Insert Grime data and Spot the Grime

In [38]:
def insert_dirty_values(list_of_transactions: list) -> None:
    if len(list_of_transactions) >= 30:
        list_of_transactions[10].price = "N/A"
        list_of_transactions[21].price = -130
        list_of_transactions[12].price = "?adv"
        list_of_transactions[28].price = -90
    print("Inserted 3 dirty values for demo.\n")

# Cleaning dirty values

In [39]:
def clean_transaction_dataframe(list_of_transactions: list) -> None:
    before = sum(1 for t in list_of_transactions if not isinstance(t.price, float))
    for t in list_of_transactions:
        t.price_data_clean()
    after = sum(1 for t in list_of_transactions if not isinstance(t.price, float))
    print(f"Cleaned transactions Count — Dirt Cell Count (before): {before}, Dirt Cell Count (after): {after}\n")

# Transformations

In [40]:
def transform_transactions(list_of_transactions: list) -> None:
    for t in list_of_transactions:
        if t.coupon_code and re.match(r"SAVE(\d+)", t.coupon_code):
            t.percentage_of_discount = int(re.findall(r"\d+", t.coupon_code)[0]) / 100
        else:
            t.percentage_of_discount = 0.0
    print("Transformation applied on coupon_code → percentage_of_discount.\n")

# Feature Engineering

In [41]:
def feature_engineer(list_of_transactions: list) -> None:
    today = datetime.today()
    for t in list_of_transactions:
        try:
            purchase_date = pd.to_datetime(t.date)
            t.days_since_purchase = (today - purchase_date).days
        except Exception:
            t.days_since_purchase = None  # fallback if date is invalid
    print("Added 'days_since_purchase' to each transaction.\n")

# Mini-Aggregation

In [None]:
def aggregate_revenue_by_city(list_of_transactions: list) -> dict:
    revenue_by_city = defaultdict(float)
    for t in list_of_transactions:
        if isinstance(t.price, float):
            revenue_by_city[t.shipping_city] += t.price_total_value()
    print("Revenue per city calculated.\n")
    return dict(revenue_by_city)


In [43]:


def run_pipeline():
    print(" Running Transaction Data Pipeline\n")
    transactionsDataFrame = load_data_transactions()
    
    transactions_data_statstic_profile(transactionsDataFrame)
    
    insert_dirty_values(transactionsDataFrame)
    
    clean_transaction_dataframe(transactionsDataFrame)

    transform_transactions(transactionsDataFrame)

    feature_engineer(transactionsDataFrame)

    revenue_by_city = aggregate_revenue_by_city(transactionsDataFrame)
    print(f"revenue_by_city: {revenue_by_city}\n")
    
  


In [44]:
run_pipeline()

 Running Transaction Data Pipeline

[Transaction(date='2024-10-08', customer_id='e79f819f-ed09-472d-981b-279a33c78006', product='Ring', price=4724.35, quantity=5, coupon_code='NONE', shipping_city='San Jose', percentage_of_discount=0.0, days_since_purchase=None), Transaction(date='2024-07-16', customer_id='1934841e-524e-48cc-85d8-0203d03f20aa', product='Earrings', price=4743.77, quantity=4, coupon_code='FREESHIP', shipping_city='Dallas', percentage_of_discount=0.0, days_since_purchase=None), Transaction(date='2025-03-12', customer_id='02c36dba-7500-4c75-8162-b9c551225a11', product='Ring', price=464.57, quantity=2, coupon_code='NONE', shipping_city='Los Angeles', percentage_of_discount=0.0, days_since_purchase=None), Transaction(date='2024-11-13', customer_id='681cbfd9-23bb-4ed1-83d0-2b619746279f', product='Earrings', price=3956.67, quantity=2, coupon_code='SAVE10', shipping_city='San Diego', percentage_of_discount=0.0, days_since_purchase=None), Transaction(date='2024-06-22', customer_

AttributeError: 'Transaction' object has no attribute 'total'

Data Source 



