In [None]:
import pandas as pd

# Load cleaned CSVs - comma-separated
raw_data = pd.read_csv('data/raw_data.csv')
incremental_data = pd.read_csv('data/incremental_data.csv')


In [16]:
def apply_transformations(df):
    df = df.copy()

    # Remove duplicates
    df = df.drop_duplicates()

    # Fill missing numeric values with median
    df.loc[:, 'quantity'] = pd.to_numeric(df['quantity'], errors='coerce')
    df.loc[:, 'unit_price'] = pd.to_numeric(df['unit_price'], errors='coerce')
    df.loc[:, 'quantity'] = df['quantity'].fillna(df['quantity'].median())
    df.loc[:, 'unit_price'] = df['unit_price'].fillna(df['unit_price'].median())

    # Add total_price column
    df.loc[:, 'total_price'] = df['quantity'] * df['unit_price']

    # Convert order_date safely
    df.loc[:, 'order_date'] = pd.to_datetime(df['order_date'].astype(str), errors='coerce', dayfirst=True)

    # Print debug line to confirm dtype
    print("Date dtype:", df['order_date'].dtype)

    # Only compute order_month if order_date is datetime64[ns]
    if pd.api.types.is_datetime64_any_dtype(df['order_date']):
        df.loc[:, 'order_month'] = df['order_date'].dt.month
    else:
        df.loc[:, 'order_month'] = None  # fallback if conversion fails

    # Fill missing regions
    df.loc[:, 'region'] = df['region'].fillna('Unknown')

    return df


In [17]:
transformed_full = apply_transformations(raw_data)
transformed_incremental = apply_transformations(incremental_data)

Date dtype: object
Date dtype: object
