In [1]:
from pathlib import Path

import pandas as pd

# Read every listed column as pandas' 'string' dtype so values arrive as raw
# text; columns not listed here keep whatever dtype read_csv infers.
dtype_map = dict.fromkeys(
    [
        'order_id',
        'customer_id',
        'order_date',
        'status',
        'payment_method',
        'shipping_address',
        'billing_address',
        'discount_amount',
        'tax_amount',
        'shipping_cost',
        'total_amount',
        'currency',
        'created_at',
        'updated_at',
        'subtotal',
    ],
    'string',
)

df_orders = pd.read_csv('../data/raw/orders.csv', dtype=dtype_map)
df_orders.head()

Unnamed: 0.1,Unnamed: 0,order_id,customer_id,order_date,status,payment_method,shipping_address,billing_address,discount_amount,tax_amount,shipping_cost,total_amount,currency,created_at,updated_at,subtotal
0,0,1,447917,2024-07-26T03:04:05.462241,completed,Digital Wallet,"49599 Wesley Burg Richardview, AZ 30649",USNV Morrison FPO AP 90901,0.0,,,25.96,USD,2024-07-26T03:04:05.462241,2024-07-27T03:04:05.462241,0
1,1,2,441379,2025-03-22T22:45:14.166543,Completed,Cash on Delivery,"3123 Alan Extension Port Andrea, MA 26926","95271 Russell Spurs Apt. 613 West Marcus, ND 5...",0.0,0.0,43.36,43.36,USD,2025-03-22T22:45:14.166543,,0
2,2,3,420585,,Refunded,PayPal,"97369 Brown Cliff Apt. 811 Travisland, ME 61083",9341 VAZQUEZ COMMON SUITE 320 NEW ANTHONYMOUTH...,,0.0,48.59,48.59,USD,2025-02-10T00:33:27.544335,,0
3,3,4,456731,2023-11-25T22:45:39.945645,Cancelled,Cash on Delivery,"61010 Adams Ridges Stacymouth, CT 64105","830 Weiss River Suite 861 Lake Nicolemouth, DC...",18.83,0.0,1.39,1.39,USD,,2023-12-02T22:45:39.945645,0
4,4,5,468123,2025-02-15T01:58:58.969479,Shipped,Digital Wallet,"495 Fisher Forest Apt. 665 New Scott, AZ 18199","0635 teresa streets apt. 953 robertton, fm 83052",25.38,,,25.25,USD,2025-02-15T01:58:58.969479,2025-02-19T01:58:58.969479,0


Descartamos la columna `Unnamed: 0`, que no es necesaria para nuestro análisis.

In [2]:
# Remove the 'Unnamed: 0' column (presumably a row index saved by an earlier
# export -- confirm against the raw CSV). Reassigning instead of inplace=True,
# together with errors='ignore', keeps this cell safe to re-run: a second
# execution no longer raises KeyError once the column is already gone.
df_orders = df_orders.drop(columns=['Unnamed: 0'], errors='ignore')

In [3]:
def _trimmed(frame, column):
    """Return `column` of `frame` with leading/trailing whitespace stripped."""
    return frame[column].str.strip()


# Work on a new frame keyed by order_id.
df_orders_clean = df_orders.set_index('order_id')

# DATES: trim, then parse; anything unparseable becomes NaT.
for date_col in ['order_date', 'created_at', 'updated_at']:
    trimmed_dates = _trimmed(df_orders_clean, date_col)
    df_orders_clean[date_col] = pd.to_datetime(trimmed_dates, errors='coerce')

# INTEGERS: trim, parse as numbers (invalid -> missing), store as nullable Int64.
for int_col in ['customer_id']:
    parsed_ints = pd.to_numeric(_trimmed(df_orders_clean, int_col), errors='coerce')
    df_orders_clean[int_col] = parsed_ints.astype('Int64')

# FLOATS: trim, parse as numbers; invalid entries become missing.
for float_col in ['discount_amount', 'tax_amount', 'shipping_cost', 'total_amount', 'subtotal']:
    df_orders_clean[float_col] = pd.to_numeric(
        _trimmed(df_orders_clean, float_col),
        errors='coerce',
    )

# STRINGS: only whitespace trimming, casing is left untouched.
for text_col in ['shipping_address', 'billing_address']:
    df_orders_clean[text_col] = _trimmed(df_orders_clean, text_col)

# CATEGORIES: allowed values per column, compared after trimming and upper-casing.
status_vals = ['CANCELLED', 'COMPLETED', 'PROCESSING', 'REFUNDED', 'RETURNED', 'SHIPPED']
payment_method_vals = [
    'BANK TRANSFER',
    'CASH ON DELIVERY',
    'CREDIT CARD',
    'DEBIT CARD',
    'DIGITAL WALLET',
    'PAYPAL',
]
currency_vals = ['USD', 'EUR', 'GBP', 'CAD']

cols = ['status', 'payment_method', 'currency']
valid_sets = [status_vals, payment_method_vals, currency_vals]

for cat_col, allowed in zip(cols, valid_sets):
    normalized = _trimmed(df_orders_clean, cat_col).str.upper()
    # Values outside the allowed list are blanked out before the category cast.
    is_allowed = normalized.isin(allowed)
    df_orders_clean[cat_col] = normalized.where(is_allowed).astype('category')

df_orders_clean.head()

Unnamed: 0_level_0,customer_id,order_date,status,payment_method,shipping_address,billing_address,discount_amount,tax_amount,shipping_cost,total_amount,currency,created_at,updated_at,subtotal
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,447917,2024-07-26 03:04:05.462241,COMPLETED,DIGITAL WALLET,"49599 Wesley Burg Richardview, AZ 30649",USNV Morrison FPO AP 90901,0.0,,,25.96,USD,2024-07-26 03:04:05.462241,2024-07-27 03:04:05.462241,0
2,441379,2025-03-22 22:45:14.166543,COMPLETED,CASH ON DELIVERY,"3123 Alan Extension Port Andrea, MA 26926","95271 Russell Spurs Apt. 613 West Marcus, ND 5...",0.0,0.0,43.36,43.36,USD,2025-03-22 22:45:14.166543,NaT,0
3,420585,NaT,REFUNDED,PAYPAL,"97369 Brown Cliff Apt. 811 Travisland, ME 61083",9341 VAZQUEZ COMMON SUITE 320 NEW ANTHONYMOUTH...,,0.0,48.59,48.59,USD,2025-02-10 00:33:27.544335,NaT,0
4,456731,2023-11-25 22:45:39.945645,CANCELLED,CASH ON DELIVERY,"61010 Adams Ridges Stacymouth, CT 64105","830 Weiss River Suite 861 Lake Nicolemouth, DC...",18.83,0.0,1.39,1.39,USD,NaT,2023-12-02 22:45:39.945645,0
5,468123,2025-02-15 01:58:58.969479,SHIPPED,DIGITAL WALLET,"495 Fisher Forest Apt. 665 New Scott, AZ 18199","0635 teresa streets apt. 953 robertton, fm 83052",25.38,,,25.25,USD,2025-02-15 01:58:58.969479,2025-02-19 01:58:58.969479,0


Exportamos el dataset limpio para reutilizarlo.

In [4]:
# Persist the cleaned frame so later work can reuse it without re-running
# the cleaning. to_pickle does not create missing folders, so make sure the
# target directory exists first (e.g. on a fresh checkout).
orders_clean_path = Path('../data/clean/orders.pkl')
orders_clean_path.parent.mkdir(parents=True, exist_ok=True)
df_orders_clean.to_pickle(orders_clean_path)