# Stats Compass Core - Column Operations

This notebook demonstrates the column manipulation tools:
- `drop_columns` - Remove columns from a DataFrame
- `rename_columns` - Rename columns using a mapping
- `add_column` - Add new columns with expressions or constant values

In [1]:
# Setup
import numpy as np
import pandas as pd

from stats_compass_core import DataFrameState, registry

# Initialize state
# `state` is the shared DataFrameState passed to every tool call in this
# notebook; DataFrames are stored in it and looked up from it by name.
state = DataFrameState()
# NOTE(review): presumably scans for and registers the available tools —
# confirm against the stats_compass_core documentation.
registry.auto_discover()
print("Stats Compass Core ready!")

Stats Compass Core ready!


In [2]:
# Create sample e-commerce order data (synthetic, n_orders rows)
# Seed the global NumPy RNG so the random columns are reproducible. The
# np.random calls inside the dict literal run top to bottom, so reordering
# or inserting random columns changes every value generated after that point.
np.random.seed(42)
n_orders = 100

orders = pd.DataFrame({
    # Sequential ids starting at 1001
    'order_id': range(1001, 1001 + n_orders),
    # randint's upper bound is exclusive: customer ids fall in 1..20
    'customer_id': np.random.randint(1, 21, n_orders),
    'product_name': np.random.choice(['Widget', 'Gadget', 'Gizmo', 'Doohickey'], n_orders),
    'unit_price': np.random.choice([29.99, 49.99, 19.99, 99.99], n_orders),
    # Quantities fall in 1..4 (upper bound exclusive)
    'quantity': np.random.randint(1, 5, n_orders),
    # Discount expressed in whole percent
    'discount_pct': np.random.choice([0, 5, 10, 15, 20], n_orders),
    # One timestamp every 6 hours starting 2024-01-01
    'order_date': pd.date_range('2024-01-01', periods=n_orders, freq='6h'),
    # The next two columns exist only so the drop_columns demo has something to remove
    'internal_code': np.random.randint(10000, 99999, n_orders),
    # A scalar is broadcast to every row
    'legacy_flag': 'N',
})

# Register the frame under the name 'orders' so the tools can find it via dataframe_name
state.set_dataframe(orders, name='orders', operation='create')
print(f"Created orders DataFrame: {orders.shape}")
orders.head()

Created orders DataFrame: (100, 9)


Unnamed: 0,order_id,customer_id,product_name,unit_price,quantity,discount_pct,order_date,internal_code,legacy_flag
0,1001,7,Gizmo,99.99,2,10,2024-01-01 00:00:00,26896,N
1,1002,20,Doohickey,99.99,4,15,2024-01-01 06:00:00,56175,N
2,1003,15,Doohickey,29.99,1,10,2024-01-01 12:00:00,17805,N
3,1004,11,Gadget,99.99,4,0,2024-01-01 18:00:00,15237,N
4,1005,8,Gizmo,29.99,4,0,2024-01-02 00:00:00,30056,N


---
## 1. Drop Columns

Remove unwanted columns from a DataFrame.

In [3]:
from stats_compass_core.data.drop_columns import DropColumnsInput, drop_columns

In [4]:
# Remove the two bookkeeping columns and store the result under a new name;
# the printout below shows that 'orders' itself still has all nine columns.
drop_request = DropColumnsInput(
    dataframe_name='orders',
    columns=['internal_code', 'legacy_flag'],
    save_as='orders_clean',
)
result = drop_columns(state, drop_request)

print(result.message)
columns_before = list(state.get_dataframe('orders').columns)
print(f"\nColumns before: {columns_before}")
columns_after = list(state.get_dataframe('orders_clean').columns)
print(f"Columns after:  {columns_after}")

Dropped 2 column(s): ['internal_code', 'legacy_flag']

Columns before: ['order_id', 'customer_id', 'product_name', 'unit_price', 'quantity', 'discount_pct', 'order_date', 'internal_code', 'legacy_flag']
Columns after:  ['order_id', 'customer_id', 'product_name', 'unit_price', 'quantity', 'discount_pct', 'order_date']


In [5]:
# With errors='ignore', names that are not columns of the frame are skipped
# rather than raising, so this call ends up dropping nothing.
absent_columns = ['nonexistent_column', 'another_missing']
result = drop_columns(
    state,
    DropColumnsInput(
        dataframe_name='orders_clean',
        columns=absent_columns,
        errors='ignore',
    ),
)

print(result.message)

Dropped 0 column(s): []


---
## 2. Rename Columns

Rename columns to more descriptive or standardized names.

In [6]:
from stats_compass_core.data.rename_columns import RenameColumnsInput, rename_columns

In [7]:
# Shorten 'unit_price' and spell out 'discount_pct' (the names are already
# snake_case); the renamed table is stored as a new frame via save_as.
price_discount_renames = {
    'unit_price': 'price',
    'discount_pct': 'discount_percent',
}
result = rename_columns(
    state,
    RenameColumnsInput(
        dataframe_name='orders_clean',
        mapping=price_discount_renames,
        save_as='orders_renamed',
    ),
)

print(result.message)
state.get_dataframe('orders_renamed').head()

Renamed 2 column(s): 'unit_price' → 'price', 'discount_pct' → 'discount_percent'


Unnamed: 0,order_id,customer_id,product_name,price,quantity,discount_percent,order_date
0,1001,7,Gizmo,99.99,2,10,2024-01-01 00:00:00
1,1002,20,Doohickey,99.99,4,15,2024-01-01 06:00:00
2,1003,15,Doohickey,29.99,1,10,2024-01-01 12:00:00
3,1004,11,Gadget,99.99,4,0,2024-01-01 18:00:00
4,1005,8,Gizmo,29.99,4,0,2024-01-02 00:00:00


In [8]:
# errors='ignore' skips mapping entries whose source column is absent, so
# only 'price' is renamed. No save_as is given, and the column list printed
# below shows that 'orders_renamed' itself now carries 'item_price'.
rename_request = RenameColumnsInput(
    dataframe_name='orders_renamed',
    mapping={
        'price': 'item_price',
        'missing_col': 'new_name',  # not a column of orders_renamed
    },
    errors='ignore',
)
result = rename_columns(state, rename_request)

print(result.message)
print(f"Columns: {list(state.get_dataframe('orders_renamed').columns)}")

Renamed 1 column(s): 'price' → 'item_price'
Columns: ['order_id', 'customer_id', 'product_name', 'item_price', 'quantity', 'discount_percent', 'order_date']


---
## 3. Add Column

Add new columns using expressions (computed from existing columns) or constant values.

In [9]:
from stats_compass_core.data.add_column import AddColumnInput, add_column

In [10]:
# Derive the per-order subtotal from two existing columns
subtotal_spec = AddColumnInput(
    dataframe_name='orders_renamed',
    column_name='subtotal',
    expression='item_price * quantity',
)
result = add_column(state, subtotal_spec)

print(result.message)
orders_view = state.get_dataframe('orders_renamed')
orders_view[['item_price', 'quantity', 'subtotal']].head()

Added new column 'subtotal' = item_price * quantity


Unnamed: 0,item_price,quantity,subtotal
0,99.99,2,199.98
1,99.99,4,399.96
2,29.99,1,29.99
3,99.99,4,399.96
4,29.99,4,119.96


In [11]:
# discount_percent holds whole percents, hence the division by 100
discount_spec = AddColumnInput(
    dataframe_name='orders_renamed',
    column_name='discount_amount',
    expression='subtotal * discount_percent / 100',
)
result = add_column(state, discount_spec)

print(result.message)

Added new column 'discount_amount' = subtotal * discount_percent / 100


In [12]:
# Net order value: subtotal minus the discount computed in the previous step
total_spec = AddColumnInput(
    dataframe_name='orders_renamed',
    column_name='total',
    expression='subtotal - discount_amount',
)
result = add_column(state, total_spec)

print(result.message)
pricing_columns = ['subtotal', 'discount_percent', 'discount_amount', 'total']
state.get_dataframe('orders_renamed')[pricing_columns].head()

Added new column 'total' = subtotal - discount_amount


Unnamed: 0,subtotal,discount_percent,discount_amount,total
0,199.98,10,19.998,179.982
1,399.96,15,59.994,339.966
2,29.99,10,2.999,26.991
3,399.96,0,0.0,399.96
4,119.96,0,0.0,119.96


In [13]:
# Passing `value` instead of `expression` fills the column with one constant
currency_spec = AddColumnInput(
    dataframe_name='orders_renamed',
    column_name='currency',
    value='USD',
)
result = add_column(state, currency_spec)

print(result.message)

Added new column 'currency' = 'USD'


In [14]:
# Constants may be numeric as well: store an 8% tax rate on every row
tax_rate_spec = AddColumnInput(
    dataframe_name='orders_renamed',
    column_name='tax_rate',
    value=0.08,
)
result = add_column(state, tax_rate_spec)

print(result.message)

Added new column 'tax_rate' = 0.08


In [15]:
# Expressions can reference columns added earlier, including constant ones
tax_amount_spec = AddColumnInput(
    dataframe_name='orders_renamed',
    column_name='tax_amount',
    expression='total * tax_rate',
)
result = add_column(state, tax_amount_spec)

print(result.message)

Added new column 'tax_amount' = total * tax_rate


In [16]:
# Amount payable: discounted total plus tax
grand_total_spec = AddColumnInput(
    dataframe_name='orders_renamed',
    column_name='grand_total',
    expression='total + tax_amount',
)
result = add_column(state, grand_total_spec)

print(result.message)
tax_columns = ['total', 'tax_rate', 'tax_amount', 'grand_total', 'currency']
state.get_dataframe('orders_renamed')[tax_columns].head()

Added new column 'grand_total' = total + tax_amount


Unnamed: 0,total,tax_rate,tax_amount,grand_total,currency
0,179.982,0.08,14.39856,194.38056,USD
1,339.966,0.08,27.19728,367.16328,USD
2,26.991,0.08,2.15928,29.15028,USD
3,399.96,0.08,31.9968,431.9568,USD
4,119.96,0.08,9.5968,129.5568,USD


In [17]:
# Reusing an existing column name overwrites it: tax_rate goes from 8% to 10%.
# NOTE(review): 'tax_amount' and 'grand_total' were computed earlier with
# tax_rate = 0.08 and this cell does not recompute them — re-run those
# add_column calls if they should reflect the new rate.
updated_rate_spec = AddColumnInput(
    dataframe_name='orders_renamed',
    column_name='tax_rate',
    value=0.10,
)
result = add_column(state, updated_rate_spec)

# The message reports an update rather than an addition
print(result.message)

Updated existing column 'tax_rate' = 0.1


---
## 4. Combined Workflow

Use all three tools together to prepare data for analysis.

In [18]:
# Build a small, separate employee table for the combined workflow.
# The two temp_* columns are scalar-filled placeholders that step 1 removes.
employee_columns = {
    'emp_id': [1, 2, 3, 4, 5],
    'emp_name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
    'dept_code': ['ENG', 'MKT', 'ENG', 'HR', 'MKT'],
    'base_salary': [75000, 65000, 80000, 55000, 70000],
    'years_exp': [5, 3, 7, 2, 4],
    'temp_col_1': 'delete_me',
    'temp_col_2': 'also_delete',
}
raw_data = pd.DataFrame(employee_columns)

# Register it by name so the tools can reference it via dataframe_name
state.set_dataframe(raw_data, name='employees_raw', operation='create')
print("Raw data:")
raw_data

Raw data:


Unnamed: 0,emp_id,emp_name,dept_code,base_salary,years_exp,temp_col_1,temp_col_2
0,1,Alice,ENG,75000,5,delete_me,also_delete
1,2,Bob,MKT,65000,3,delete_me,also_delete
2,3,Charlie,ENG,80000,7,delete_me,also_delete
3,4,Diana,HR,55000,2,delete_me,also_delete
4,5,Eve,MKT,70000,4,delete_me,also_delete


In [19]:
# Step 1: strip the placeholder columns, saving the result as a new frame
step1_request = DropColumnsInput(
    dataframe_name='employees_raw',
    columns=['temp_col_1', 'temp_col_2'],
    save_as='employees_step1',
)
drop_columns(state, step1_request)
print("Step 1: Dropped temp columns")

Step 1: Dropped temp columns


In [20]:
# Step 2: expand the abbreviated column names ('base_salary' is kept as is)
descriptive_names = {
    'emp_id': 'employee_id',
    'emp_name': 'name',
    'dept_code': 'department',
    'years_exp': 'experience_years',
}
step2_request = RenameColumnsInput(
    dataframe_name='employees_step1',
    mapping=descriptive_names,
    save_as='employees_step2',
)
rename_columns(state, step2_request)
print("Step 2: Renamed columns")

Step 2: Renamed columns


In [21]:
# Step 3: Add computed columns
# The bonus call has no save_as, so 'experience_bonus' is added to
# 'employees_step2' itself; the second call then reads that column and
# saves the finished table under a new name.
bonus_spec = AddColumnInput(
    dataframe_name='employees_step2',
    column_name='experience_bonus',
    expression='experience_years * 2000',  # $2000 per year of experience
)
compensation_spec = AddColumnInput(
    dataframe_name='employees_step2',
    column_name='total_compensation',
    expression='base_salary + experience_bonus',
    save_as='employees_final',
)

# Order matters: total_compensation depends on experience_bonus
for column_spec in (bonus_spec, compensation_spec):
    add_column(state, column_spec)

print("Step 3: Added computed columns")

Step 3: Added computed columns


In [22]:
# View final result
# 'employees_final' is the name passed as save_as in the last add_column call of step 3.
print("\nFinal employee data:")
state.get_dataframe('employees_final')


Final employee data:


Unnamed: 0,employee_id,name,department,base_salary,experience_years,experience_bonus,total_compensation
0,1,Alice,ENG,75000,5,10000,85000
1,2,Bob,MKT,65000,3,6000,71000
2,3,Charlie,ENG,80000,7,14000,94000
3,4,Diana,HR,55000,2,4000,59000
4,5,Eve,MKT,70000,4,8000,78000


---
## Summary

Column operation tools provide essential DataFrame manipulation:

| Tool | Purpose | Key Parameters |
|------|---------|----------------|
| `drop_columns` | Remove columns | `columns` (list), `errors` |
| `rename_columns` | Rename columns | `mapping` (dict), `errors` |
| `add_column` | Add/transform columns | `column_name`, `expression` OR `value` |

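All tools support `save_as` to create new DataFrames without modifying the original. When `save_as` is omitted (as in several of the `rename_columns` and `add_column` examples above), the result is written back to the DataFrame named in `dataframe_name`.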

In [23]:
# List every DataFrame registered in the shared state with its dimensions
summary = state.get_state_summary()
registered_frames = summary['dataframes']
print(f"DataFrames in state: {len(registered_frames)}")
for frame_info in registered_frames:
    n_rows = frame_info['shape'][0]
    n_cols = frame_info['shape'][1]
    print(f"  - {frame_info['name']}: {n_rows} rows × {n_cols} cols")

DataFrames in state: 7
  - orders: 100 rows × 9 cols
  - orders_clean: 100 rows × 7 cols
  - orders_renamed: 100 rows × 14 cols
  - employees_raw: 5 rows × 7 cols
  - employees_step1: 5 rows × 5 cols
  - employees_step2: 5 rows × 6 cols
  - employees_final: 5 rows × 7 cols
