# Transaction DP System - Production Test Notebook

This notebook provides comprehensive testing of the differential privacy pipeline for transaction data.

**Test Coverage:**
1. ✅ Data generation and loading
2. ✅ Privacy configuration and budget allocation
3. ✅ User-level DP parameters (D_max, K, sensitivities)
4. ✅ Pipeline execution with top-down algorithm
5. ✅ Privacy guarantee verification
6. ✅ Utility evaluation metrics

**Key Privacy Concepts:**
- **zCDP (ρ-zCDP)**: Privacy budget measured in rho (ρ); converts to (ε,δ)-DP
- **User-level DP**: Protects entire card's transaction history (not just single transactions)
- **Global Sensitivity**: sqrt(M × D_max) × K, where M = max cells per card, D_max = max distinct days per card, and K = per-cell contribution bound
- **Sequential Composition**: Budget accumulates across days within a month


---
## 1. Setup & Environment Configuration

Configure logging, imports, and verify environment.


In [None]:
import sys
import os
import logging
import math
from datetime import datetime
from fractions import Fraction

# Configure logging to print to stdout (Jupyter/terminal) so log records show
# up inline with the cell output. Timestamps are time-of-day only.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(name)s - %(message)s',
    datefmt='%H:%M:%S',
    handlers=[logging.StreamHandler(sys.stdout)],
    force=True  # Override any existing config (keeps the cell safe to re-run)
)

# Per-logger levels: the 'transaction_dp' logger stays at INFO, while py4j
# (the Python<->JVM bridge used by PySpark) is raised to WARNING.
logging.getLogger('transaction_dp').setLevel(logging.INFO)
logging.getLogger('py4j').setLevel(logging.WARNING)  # Reduce Spark noise

# Logger for messages emitted by this notebook itself.
logger = logging.getLogger('demo_notebook')

# Print environment info
print("="*70)
print("ENVIRONMENT INFORMATION")
print("="*70)
print(f"Python Version: {sys.version}")
print(f"Working Directory: {os.getcwd()}")
print(f"Timestamp: {datetime.now().isoformat()}")

# Check required files exist.
# The paths are relative, so they are resolved against the working directory
# printed above; run the notebook from the directory that contains them.
required_files = [
    'data/city_province.csv',
    'core/config.py',
    'core/pipeline.py',
    'core/sensitivity.py',
    'engine/topdown.py'
]
print(f"\nRequired Files Check:")
for f in required_files:
    exists = os.path.exists(f)
    status = "‚úÖ" if exists else "‚ùå"
    print(f"  {status} {f}")
    # Fail fast: the status line for the missing file has already been
    # printed, but files later in the list are not checked.
    if not exists:
        raise FileNotFoundError(f"Required file missing: {f}")

print("\n‚úÖ Environment setup complete!")


---
## 2. Spark Configuration & Initialization


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Spark configuration - adjust based on your machine.
# These constants are reused later for data generation and for the DP
# pipeline's own Spark settings.
SPARK_MASTER = "local[*]"  # Use all available cores
SPARK_APP_NAME = "TransactionDP-Test"
SPARK_EXECUTOR_MEMORY = "8g"  # Adjust based on available RAM
SPARK_DRIVER_MEMORY = "8g"

print("="*70)
print("SPARK CONFIGURATION")
print("="*70)
print(f"  Master: {SPARK_MASTER}")
print(f"  App Name: {SPARK_APP_NAME}")
print(f"  Executor Memory: {SPARK_EXECUTOR_MEMORY}")
print(f"  Driver Memory: {SPARK_DRIVER_MEMORY}")

# Stop any existing Spark session so that getOrCreate() below builds a new
# one instead of returning the active session.
existing_session = SparkSession.getActiveSession()
if existing_session:
    print("\nStopping existing Spark session...")
    existing_session.stop()
    import time
    # Short pause after stop() before creating the replacement session.
    time.sleep(0.5)

# Create Spark session with adaptive query execution enabled (partition
# coalescing and skew-join handling) and 200 shuffle partitions.
# NOTE(review): spark.driver.memory is normally fixed when the driver JVM
# starts; if a JVM from a previous session is still alive this setting may not
# take effect - confirm for your environment.
spark = SparkSession.builder \
    .appName(SPARK_APP_NAME) \
    .master(SPARK_MASTER) \
    .config("spark.executor.memory", SPARK_EXECUTOR_MEMORY) \
    .config("spark.driver.memory", SPARK_DRIVER_MEMORY) \
    .config("spark.sql.shuffle.partitions", "200") \
    .config("spark.sql.adaptive.enabled", "true") \
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
    .config("spark.sql.adaptive.skewJoin.enabled", "true") \
    .getOrCreate()

# Verify Spark session by reading back what the running context reports,
# rather than echoing the requested values.
actual_master = spark.sparkContext.master
actual_parallelism = spark.sparkContext.defaultParallelism

print(f"\n‚úÖ Spark session initialized!")
print(f"  Actual Master: {actual_master}")
print(f"  Default Parallelism: {actual_parallelism}")
print(f"  Spark Version: {spark.version}")

# Helper functions
def show_df(df, n=20, truncate=True):
    """Print the leading rows of a Spark DataFrame to the cell output.

    Args:
        df: Object exposing ``show(n=..., truncate=...)``, i.e. a Spark
            DataFrame.
        n: Number of rows to print.
        truncate: Forwarded unchanged to ``df.show``.
    """
    display_kwargs = {"n": n, "truncate": truncate}
    df.show(**display_kwargs)
    
def to_pandas_safe(df, max_rows=100000):
    """Materialise a Spark DataFrame as pandas, refusing oversized inputs.

    The row count is taken first (``df.count()``); the conversion via
    ``df.toPandas()`` only happens when that count does not exceed the limit.

    Args:
        df: Object exposing ``count()`` and ``toPandas()``, i.e. a Spark
            DataFrame.
        max_rows: Largest row count that will still be converted.

    Returns:
        Whatever ``df.toPandas()`` returns.

    Raises:
        ValueError: If the DataFrame has more than ``max_rows`` rows.
    """
    row_total = df.count()
    if row_total <= max_rows:
        return df.toPandas()
    raise ValueError(f"DataFrame too large ({row_total:,} rows). Use Spark operations.")

# Announce the two helper functions defined in this cell.
print("\nüìù Helper functions available: show_df(), to_pandas_safe()")


---
## 3. Generate Test Data

Generate synthetic transaction data for testing.


In [None]:
from examples.generate_sample_data import generate_sample_data

# Test data configuration
# For production testing, increase NUM_RECORDS to 100K-1M
NUM_RECORDS = 50000       # Number of transactions
NUM_DAYS = 30             # Time span in days (1 month)
NUM_CARDS = 2000          # Number of unique cards
NUM_ACCEPTORS = 500       # Number of unique merchants
SEED = 42                 # Random seed for reproducibility

# Paths (relative to the working directory). Both are reused later when the
# data is loaded and when the DP pipeline is configured.
CITY_PROVINCE_PATH = 'data/city_province.csv'
DATA_OUTPUT_PATH = 'data/demo_transactions.parquet'

print("="*70)
print("DATA GENERATION CONFIGURATION")
print("="*70)
print(f"  Records: {NUM_RECORDS:,}")
print(f"  Days: {NUM_DAYS}")
print(f"  Unique Cards: {NUM_CARDS:,}")
print(f"  Unique Acceptors: {NUM_ACCEPTORS:,}")
print(f"  Random Seed: {SEED}")
print(f"  Output: {DATA_OUTPUT_PATH}")

# Generate data.
# NOTE(review): the helper receives spark_master, which suggests it manages
# its own Spark session - confirm it does not stop or replace the `spark`
# session created earlier in this notebook.
print(f"\nGenerating {NUM_RECORDS:,} transactions...")
generate_sample_data(
    num_records=NUM_RECORDS,
    output_path=DATA_OUTPUT_PATH,
    city_province_path=CITY_PROVINCE_PATH,
    num_days=NUM_DAYS,
    num_cards=NUM_CARDS,
    num_acceptors=NUM_ACCEPTORS,
    seed=SEED,
    spark_master=SPARK_MASTER,
    output_format='parquet'
)

print(f"\n‚úÖ Data generated: {DATA_OUTPUT_PATH}")


---
## 4. Load and Analyze Raw Data

Understand data characteristics for privacy parameter tuning.


In [None]:
# Load data written by the generation step.
print("Loading data...")
df_spark = spark.read.parquet(DATA_OUTPUT_PATH)

# Basic statistics
total_count = df_spark.count()

print("="*70)
print("RAW DATA ANALYSIS")
print("="*70)
print(f"\nTotal records: {total_count:,}")
print(f"\nSchema:")
df_spark.printSchema()

# Unique counts. Each line is a separate distinct().count() action, i.e. a
# separate pass over the data.
unique_cards = df_spark.select('card_number').distinct().count()
unique_acceptors = df_spark.select('acceptor_id').distinct().count()
unique_cities = df_spark.select('acceptor_city').distinct().count()
unique_mccs = df_spark.select('mcc').distinct().count()

print(f"\nüìä Unique Counts:")
print(f"  Cards: {unique_cards:,}")
print(f"  Acceptors: {unique_acceptors:,}")
print(f"  Cities: {unique_cities:,}")
print(f"  MCCs: {unique_mccs:,}")

# Date and amount ranges, computed in a single aggregation and collected as
# one Row. The 99th percentile is approximate (percentile_approx).
stats = df_spark.agg(
    F.min('transaction_date').alias('min_date'),
    F.max('transaction_date').alias('max_date'),
    F.min('amount').alias('min_amount'),
    F.max('amount').alias('max_amount'),
    F.avg('amount').alias('avg_amount'),
    F.stddev('amount').alias('std_amount'),
    F.percentile_approx('amount', 0.99).alias('p99_amount')
).collect()[0]

# NOTE(review): the numeric format specs below require non-None values, so
# this section presumably assumes a non-empty dataset - confirm.
print(f"\nüìÖ Date Range: {stats['min_date']} to {stats['max_date']}")
print(f"\nüí∞ Amount Statistics:")
print(f"  Min: {stats['min_amount']:,.0f}")
print(f"  Max: {stats['max_amount']:,.0f}")
print(f"  Mean: {stats['avg_amount']:,.0f}")
print(f"  Std Dev: {stats['std_amount']:,.0f}")
print(f"  99th Percentile: {stats['p99_amount']:,.0f}")

# Sample data
print(f"\nüìù Sample rows:")
show_df(df_spark, n=5)


### 4.1 User-Level DP Parameters Analysis

Compute critical parameters for user-level differential privacy:
- **M**: Max cells (city×MCC×day combinations) a single card appears in
- **D_max**: Max distinct days a single card makes transactions
- **K**: Per-cell contribution bound


In [None]:
print("="*70)
print("USER-LEVEL DP PARAMETER ANALYSIS")
print("="*70)

# Compute M: Max cells per card
# A cell is (city, mcc, day) combination.
# First collapse to one row per (card, city, mcc, day), then count those rows
# per card to get the number of distinct cells each card touches.
cells_per_card = df_spark.groupBy('card_number', 'acceptor_city', 'mcc', 'transaction_date') \
    .count() \
    .groupBy('card_number') \
    .agg(F.count('*').alias('num_cells'))

# Distribution of cells-per-card across all cards (percentiles are approximate).
M_stats = cells_per_card.agg(
    F.max('num_cells').alias('max_M'),
    F.avg('num_cells').alias('avg_M'),
    F.percentile_approx('num_cells', 0.99).alias('p99_M'),
    F.percentile_approx('num_cells', 0.95).alias('p95_M')
).collect()[0]

print(f"\nüìä M (Max Cells per Card):")
print(f"  Max: {M_stats['max_M']}")
print(f"  99th Percentile: {M_stats['p99_M']}")
print(f"  95th Percentile: {M_stats['p95_M']}")
print(f"  Mean: {M_stats['avg_M']:.2f}")

# Compute D_max: Max distinct days per card
days_per_card = df_spark.groupBy('card_number') \
    .agg(F.countDistinct('transaction_date').alias('num_days'))

D_stats = days_per_card.agg(
    F.max('num_days').alias('max_D'),
    F.avg('num_days').alias('avg_D'),
    F.percentile_approx('num_days', 0.99).alias('p99_D')
).collect()[0]

print(f"\nüìÖ D_max (Max Distinct Days per Card):")
print(f"  Max: {D_stats['max_D']}")
print(f"  99th Percentile: {D_stats['p99_D']}")
print(f"  Mean: {D_stats['avg_D']:.2f}")

# Compute K: Transactions per cell.
# Same (card, city, mcc, day) grouping as used for M above, but keeping the
# per-cell transaction count instead of counting the cells.
txns_per_cell = df_spark.groupBy('card_number', 'acceptor_city', 'mcc', 'transaction_date') \
    .agg(F.count('*').alias('txns_in_cell'))

K_stats = txns_per_cell.agg(
    F.max('txns_in_cell').alias('max_K'),
    F.avg('txns_in_cell').alias('avg_K'),
    F.percentile_approx('txns_in_cell', 0.99).alias('p99_K'),
    F.percentile_approx('txns_in_cell', 0.75).alias('p75_K')
).collect()[0]

print(f"\nüî¢ K (Transactions per Card per Cell):")
print(f"  Max: {K_stats['max_K']}")
print(f"  99th Percentile: {K_stats['p99_K']}")
print(f"  75th Percentile: {K_stats['p75_K']}")
print(f"  Mean: {K_stats['avg_K']:.2f}")

# Store computed values for later use (the privacy verification step reads
# them). M and D_max take the observed maximum; K deliberately takes the 99th
# percentile instead of the maximum.
# NOTE(review): these are measured on the raw data itself - confirm that is
# acceptable for how the values are used downstream.
COMPUTED_M = int(M_stats['max_M'])
COMPUTED_D_MAX = int(D_stats['max_D'])
COMPUTED_K = int(K_stats['p99_K'])  # Use 99th percentile for bounded contribution

print(f"\n" + "="*70)
print(f"COMPUTED PARAMETERS FOR DP:")
print(f"  M (max cells per card): {COMPUTED_M}")
print(f"  D_max (max days per card): {COMPUTED_D_MAX}")
print(f"  K (contribution bound): {COMPUTED_K}")
print(f"  sqrt(M √ó D_max) √ó K = {math.sqrt(COMPUTED_M * COMPUTED_D_MAX) * COMPUTED_K:.2f}")
print("="*70)


---
## 5. Configure DP Pipeline

Set up differential privacy configuration with all parameters.


In [None]:
from core.config import Config

# Create configuration with the project's defaults, then override the fields
# this test run cares about.
config = Config()

# === DATA SETTINGS ===
# Input is the parquet dataset generated earlier in this notebook.
config.data.input_path = DATA_OUTPUT_PATH
config.data.output_path = 'output/demo_dp_results'
config.data.city_province_path = CITY_PROVINCE_PATH
config.data.input_format = 'parquet'
config.data.num_days = NUM_DAYS
config.data.winsorize_percentile = 99.0  # Cap amounts at 99th percentile

# === PRIVACY SETTINGS ===
# Total privacy budget (rho for zCDP)
# Rule of thumb: rho=1 gives strong utility, rho=0.25 gives strong privacy
# Stored as an exact Fraction; converted to float only for display/arithmetic
# further down.
config.privacy.total_rho = Fraction(1, 2)  # rho = 0.5
config.privacy.delta = 1e-10

# Geographic budget split (Province vs City level); weights sum to 1.0.
config.privacy.geographic_split = {
    'province': 0.2,  # 20% for province-level aggregates
    'city': 0.8       # 80% for city-level aggregates
}

# Query budget split - allocate more to primary queries; weights sum to 1.0.
config.privacy.query_split = {
    'transaction_count': 0.20,
    'unique_cards': 0.30,       # Higher weight for primary query
    'unique_acceptors': 0.30,   # Higher weight for primary query
    'total_amount': 0.20
}

# Bounded contribution settings
config.privacy.contribution_bound_method = 'percentile'
config.privacy.contribution_bound_percentile = 99.0

# Suppression settings
# NOTE(review): presumably cells below this threshold are suppressed from the
# output - confirm against the pipeline implementation.
config.privacy.suppression_threshold = 5

# Sensitivity method
config.privacy.sensitivity_method = 'global'

# MCC grouping for stratified sensitivity
config.privacy.mcc_grouping_enabled = True
config.privacy.mcc_num_groups = 5

# Confidence intervals
config.privacy.confidence_levels = [0.90, 0.95]

# === SPARK SETTINGS ===
# Mirror the notebook's own Spark settings.
config.spark.app_name = SPARK_APP_NAME
config.spark.master = SPARK_MASTER
config.spark.executor_memory = SPARK_EXECUTOR_MEMORY
config.spark.driver_memory = SPARK_DRIVER_MEMORY

# Validate configuration before anything is printed or run.
config.validate()

print("="*70)
print("DP CONFIGURATION SUMMARY")
print("="*70)
print(f"\nüìä Privacy Budget:")
print(f"  Total œÅ (rho): {config.privacy.total_rho} = {float(config.privacy.total_rho):.4f}")
print(f"  Œ¥ (delta): {config.privacy.delta}")

# Convert zCDP to (epsilon, delta)-DP for reference, using
#   epsilon = rho + 2 * sqrt(rho * ln(1 / delta)).
# `rho`, `delta` and `epsilon` are reused by the privacy verification step.
rho = float(config.privacy.total_rho)
delta = config.privacy.delta
epsilon = rho + 2 * math.sqrt(rho * math.log(1/delta))
print(f"  Equivalent (Œµ,Œ¥)-DP: Œµ ‚âà {epsilon:.2f}, Œ¥ = {delta}")

# Show each geographic level's share of rho.
print(f"\nüó∫Ô∏è Geographic Budget Split:")
for level, weight in config.privacy.geographic_split.items():
    level_rho = rho * weight
    print(f"  {level.capitalize()}: {weight:.0%} ‚Üí œÅ = {level_rho:.4f}")

# Show each query's share, expressed here as a fraction of the *total* rho.
# NOTE(review): whether the pipeline applies the query split to the total or
# within each geographic level cannot be seen from this notebook - confirm.
print(f"\nüìã Query Budget Split:")
for query, weight in config.privacy.query_split.items():
    query_rho = rho * weight
    print(f"  {query}: {weight:.0%} ‚Üí œÅ = {query_rho:.4f}")

print(f"\nüîß Other Settings:")
print(f"  Contribution Bound Method: {config.privacy.contribution_bound_method}")
print(f"  Suppression Threshold: {config.privacy.suppression_threshold}")
print(f"  Sensitivity Method: {config.privacy.sensitivity_method}")
print(f"  MCC Grouping: {'Enabled' if config.privacy.mcc_grouping_enabled else 'Disabled'}")

print(f"\n‚úÖ Configuration validated!")


---
## 6. Run DP Pipeline

Execute the differential privacy pipeline with Top-Down Algorithm.


In [None]:
from core.pipeline import DPPipeline

print("="*70)
print("EXECUTING DP PIPELINE")
print("="*70)

# Wall-clock timing around the whole run; `duration` is reused by the
# production-readiness checklist later in the notebook.
start_time = datetime.now()

# Create and run pipeline. `result` is read as a dict below ('success',
# 'total_records', 'budget_used', 'output_path', 'errors').
pipeline = DPPipeline(config)
result = pipeline.run()

end_time = datetime.now()
duration = (end_time - start_time).total_seconds()

print("\n" + "="*70)
print("PIPELINE RESULTS")
print("="*70)

if result['success']:
    print(f"\n‚úÖ SUCCESS!")
else:
    print(f"\n‚ùå FAILED!")

# The ',' thousands-separator format spec is only valid for numbers, so it
# must not be applied to the 'N/A' placeholder (doing so raises ValueError,
# which would hide the pipeline's own error report below). Format first, then
# fall back to the placeholder when the count is missing or not numeric.
records_processed = result.get('total_records')
if isinstance(records_processed, (int, float)):
    records_text = f"{records_processed:,}"
else:
    records_text = 'N/A'

print(f"\nüìä Execution Summary:")
print(f"  Records Processed: {records_text}")
print(f"  Privacy Budget Used: œÅ = {result.get('budget_used', 'N/A')}")
print(f"  Duration: {duration:.2f} seconds")
print(f"  Output Path: {result.get('output_path', 'N/A')}")

# Surface any errors the pipeline reported, one per line.
if result.get('errors'):
    print(f"\n‚ö†Ô∏è Errors:")
    for error in result['errors']:
        print(f"    - {error}")


---
## 7. Privacy Verification

Verify that privacy guarantees are correctly implemented.


In [None]:
print("="*70)
print("PRIVACY GUARANTEE VERIFICATION")
print("="*70)

# Everything below needs the pipeline's output, so it is skipped on failure.
if not result['success']:
    print("‚ö†Ô∏è Pipeline failed - skipping privacy verification")
else:
    import json
    
    # Load metadata written next to the pipeline output, if present.
    output_path = config.data.output_path
    metadata_path = os.path.join(output_path, 'metadata.json')
    
    if os.path.exists(metadata_path):
        with open(metadata_path, 'r') as f:
            metadata = json.load(f)
        print(f"\nüìã Pipeline Metadata:")
        print(json.dumps(metadata, indent=2))
    
    # Verify budget composition. These checks are arithmetic on the configured
    # split weights only; they do not inspect what the pipeline actually spent.
    print(f"\nüîê Budget Composition Verification:")
    total_rho = float(config.privacy.total_rho)
    print(f"  Total Budget: œÅ = {total_rho}")
    
    # Geographic composition: per-level rho shares must add back up to the total.
    geo_rho_sum = sum(total_rho * w for w in config.privacy.geographic_split.values())
    print(f"  Geographic Split Sum: {geo_rho_sum:.4f} (should = {total_rho})")
    geo_check = abs(geo_rho_sum - total_rho) < 1e-6
    print(f"  Geographic Composition: {'‚úÖ VALID' if geo_check else '‚ùå INVALID'}")
    
    # Query composition: query weights must sum to 1.
    query_sum = sum(config.privacy.query_split.values())
    print(f"  Query Split Sum: {query_sum:.4f} (should = 1.0)")
    query_check = abs(query_sum - 1.0) < 1e-6
    print(f"  Query Composition: {'‚úÖ VALID' if query_check else '‚ùå INVALID'}")
    
    # Sensitivity verification.
    # Prefer the values stored on the config; fall back to the notebook's own
    # estimates when those attributes are falsy (None or 0).
    # NOTE(review): presumably the pipeline fills computed_d_max and
    # computed_contribution_bound during run() - confirm.
    print(f"\nüéØ Sensitivity Verification:")
    d_max = config.privacy.computed_d_max or COMPUTED_D_MAX
    k_bound = config.privacy.computed_contribution_bound or COMPUTED_K
    
    print(f"  D_max (max days): {d_max}")
    print(f"  K (contribution bound): {k_bound}")
    print(f"  M (max cells): {COMPUTED_M}")
    
    # Expected L2 sensitivities are recomputed here and printed for reference
    # only; they are not compared against values used by the pipeline.
    sqrt_md = math.sqrt(COMPUTED_M * d_max)
    sens_count = sqrt_md * k_bound
    sens_unique = sqrt_md * 1
    
    print(f"\n  Expected Sensitivities (L2):")
    print(f"    transaction_count: ‚àö(M√óD_max)√óK = {sens_count:.2f}")
    print(f"    unique_cards: ‚àö(M√óD_max)√ó1 = {sens_unique:.2f}")
    print(f"    unique_acceptors: ‚àö(M√óD_max)√ó1 = {sens_unique:.2f}")
    
    # Privacy guarantee summary. `epsilon` and `delta` are the values computed
    # in the configuration cell.
    print(f"\nüìú PRIVACY GUARANTEE SUMMARY:")
    print(f"  Mechanism: Discrete Gaussian (zCDP)")
    print(f"  Privacy Unit: (card_number, month)")
    print(f"  Composition: Sequential across days, Parallel across cells")
    print(f"  Total Budget: œÅ = {total_rho} zCDP")
    print(f"  Equivalent (Œµ,Œ¥)-DP: Œµ ‚âà {epsilon:.2f}, Œ¥ = {delta}")
    
    # The overall verdict depends only on the two split-sum checks above.
    if geo_check and query_check:
        print(f"\n‚úÖ Privacy verification PASSED!")
    else:
        print(f"\n‚ùå Privacy verification FAILED!")


---
## 8. View Results

Load and examine the DP-protected output.


In [None]:
import json

print("="*70)
print("DP-PROTECTED OUTPUT")
print("="*70)

# `output_path` is assigned unconditionally here; the utility evaluation and
# readiness checklist cells later in the notebook read it.
output_path = config.data.output_path

if os.path.exists(output_path):
    # List the top level of the output directory: files with their size,
    # sub-directories with a trailing slash.
    print(f"\nüìÅ Output directory: {output_path}")
    print(f"\nContents:")
    for item in os.listdir(output_path):
        item_path = os.path.join(output_path, item)
        if os.path.isfile(item_path):
            size = os.path.getsize(item_path)
            print(f"  - {item} ({size:,} bytes)")
        else:
            print(f"  - {item}/")
    
    # Load metadata (optional: silently skipped when the file is absent).
    metadata_path = os.path.join(output_path, 'metadata.json')
    if os.path.exists(metadata_path):
        print("\nüìã Metadata:")
        with open(metadata_path, 'r') as f:
            metadata = json.load(f)
        print(json.dumps(metadata, indent=2))
    
    # Load protected data (optional: silently skipped when the directory is
    # absent) and show a small sample.
    protected_data_path = os.path.join(output_path, "protected_data")
    if os.path.exists(protected_data_path):
        print(f"\nüìä Loading protected data...")
        dp_df = spark.read.parquet(protected_data_path)
        dp_count = dp_df.count()
        print(f"  Protected cells: {dp_count:,}")
        print(f"\n  Sample:")
        show_df(dp_df, n=10)
else:
    print(f"‚ùå Output directory not found: {output_path}")


---
## 9. Utility Evaluation

Compare original vs DP-protected data to measure utility loss.


In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
from pyspark.sql.functions import col, count, countDistinct, sum as spark_sum

print("="*70)
print("UTILITY EVALUATION")
print("="*70)

# Resolve the protected-data location from the config so this cell does not
# depend on an `output_path` variable left behind by an earlier cell.
protected_data_path = os.path.join(config.data.output_path, "protected_data")

if not result['success']:
    print("‚ö†Ô∏è Pipeline failed - skipping utility evaluation")
elif not os.path.exists(protected_data_path):
    # Same existence guard the results-viewing cell applies before reading;
    # without it spark.read.parquet() fails with an opaque analysis error.
    print(f"Protected data not found: {protected_data_path} - skipping utility evaluation")
else:
    # Aggregate original data to same granularity as the DP output:
    # one row per (city, MCC, day) cell with the four published metrics.
    print("\nüìä Aggregating original data...")
    original_agg = df_spark.groupBy('acceptor_city', 'mcc', 'transaction_date').agg(
        count('transaction_id').alias('transaction_count'),
        countDistinct('card_number').alias('unique_cards'),
        countDistinct('acceptor_id').alias('unique_acceptors'),
        spark_sum('amount').alias('total_amount')
    )

    orig_count = original_agg.count()
    print(f"  Original cells: {orig_count:,}")

    # Load DP data
    dp_agg = spark.read.parquet(protected_data_path)
    dp_count = dp_agg.count()
    print(f"  DP-protected cells: {dp_count:,}")

    # Compare totals using Spark
    print(f"\n" + "="*70)
    print("AGGREGATE LEVEL COMPARISON")
    print("="*70)

    NUMERIC_COLS = ['transaction_count', 'unique_cards', 'unique_acceptors', 'total_amount']

    # Sum every metric in a single aggregation per DataFrame (two Spark jobs
    # in total) instead of one job per metric per DataFrame.
    total_exprs = [spark_sum(name).alias(name) for name in NUMERIC_COLS]
    orig_totals = original_agg.agg(*total_exprs).collect()[0]
    dp_totals = dp_agg.agg(*total_exprs).collect()[0]

    for col_name in NUMERIC_COLS:
        # A sum over no rows / all-null values comes back as None; treat as 0.
        orig_total = orig_totals[col_name] or 0
        dp_total = dp_totals[col_name] or 0

        if orig_total > 0:
            # Relative error of the DP grand total against the true total.
            # Thresholds: <5% OK, <15% warning, otherwise failure marker.
            error_pct = abs(dp_total - orig_total) / orig_total * 100
            status = "‚úÖ" if error_pct < 5 else ("‚ö†Ô∏è" if error_pct < 15 else "‚ùå")
        else:
            # Relative error is undefined for a zero baseline; flag as warning.
            error_pct = 0
            status = "‚ö†Ô∏è"

        print(f"\n{col_name}:")
        print(f"  Original Total: {orig_total:,.0f}")
        print(f"  DP Total: {dp_total:,.0f}")
        print(f"  Error: {error_pct:.2f}% {status}")


---
## 10. Production Readiness Checklist

Verify the system is ready for production deployment.


In [None]:
print("="*70)
print("PRODUCTION READINESS CHECKLIST")
print("="*70)

# Each entry is a (label, passed) tuple; the summary at the bottom counts them.
checks = []

# 1. Pipeline Success
check_1 = result['success']
checks.append(('Pipeline Execution', check_1))
print(f"\n{'‚úÖ' if check_1 else '‚ùå'} Pipeline Execution: {'PASSED' if check_1 else 'FAILED'}")

# 2. Output Files Exist
# NOTE(review): `output_path` is not assigned in this cell; it is expected to
# exist from an earlier cell (e.g. the results-viewing cell).
output_exists = os.path.exists(os.path.join(output_path, 'protected_data'))
checks.append(('Output Files', output_exists))
print(f"{'‚úÖ' if output_exists else '‚ùå'} Output Files: {'EXIST' if output_exists else 'MISSING'}")

# 3. Metadata Present
metadata_exists = os.path.exists(os.path.join(output_path, 'metadata.json'))
checks.append(('Metadata', metadata_exists))
print(f"{'‚úÖ' if metadata_exists else '‚ùå'} Metadata: {'PRESENT' if metadata_exists else 'MISSING'}")

# 4. Budget Composition Valid: both split dictionaries must sum to 1.0.
# This validates the configured weights, not the budget actually consumed.
budget_valid = abs(sum(config.privacy.geographic_split.values()) - 1.0) < 1e-6
budget_valid = budget_valid and abs(sum(config.privacy.query_split.values()) - 1.0) < 1e-6
checks.append(('Budget Composition', budget_valid))
print(f"{'‚úÖ' if budget_valid else '‚ùå'} Budget Composition: {'VALID' if budget_valid else 'INVALID'}")

# 5. No Negative Counts (sanity check)
# Only 'transaction_count' is inspected. The check is added to `checks` only
# when the protected data exists, so the total number of checks is 5 or 6.
if output_exists:
    dp_df = spark.read.parquet(os.path.join(output_path, 'protected_data'))
    neg_counts = dp_df.filter(F.col('transaction_count') < 0).count()
    no_negative = neg_counts == 0
    checks.append(('No Negative Counts', no_negative))
    print(f"{'‚úÖ' if no_negative else '‚ö†Ô∏è'} No Negative Counts: {'PASSED' if no_negative else f'{neg_counts} negative values'}")

# 6. Reasonable Processing Time
# `duration` is the wall-clock time measured around pipeline.run().
reasonable_time = duration < 300  # 5 minutes for test data
checks.append(('Processing Time', reasonable_time))
print(f"{'‚úÖ' if reasonable_time else '‚ö†Ô∏è'} Processing Time: {duration:.1f}s {'(OK)' if reasonable_time else '(SLOW)'}")

# Summary. Checks displayed with a warning marker above (negative counts,
# processing time) still count as failures here.
all_passed = all(c[1] for c in checks)
passed_count = sum(1 for c in checks if c[1])

print(f"\n" + "="*70)
print(f"SUMMARY: {passed_count}/{len(checks)} checks passed")
print("="*70)

if all_passed:
    print(f"\nüéâ PRODUCTION READY!")
    print(f"   The DP system has passed all checks and is ready for deployment.")
else:
    print(f"\n‚ö†Ô∏è NOT READY FOR PRODUCTION")
    print(f"   Please address the failed checks before deployment.")
    failed = [c[0] for c in checks if not c[1]]
    print(f"   Failed: {', '.join(failed)}")


---
## 11. Cleanup & Summary


In [None]:
# Uncomment to clean up generated files
# import shutil
# 
# if os.path.exists(DATA_OUTPUT_PATH):
#     if os.path.isdir(DATA_OUTPUT_PATH):
#         shutil.rmtree(DATA_OUTPUT_PATH)
#     else:
#         os.remove(DATA_OUTPUT_PATH)
#     print(f"Removed: {DATA_OUTPUT_PATH}")
# 
# if os.path.exists(config.data.output_path):
#     shutil.rmtree(config.data.output_path)
#     print(f"Removed: {config.data.output_path}")

print("="*70)
print("NOTEBOOK COMPLETE")
print("="*70)
print(f"\nTimestamp: {datetime.now().isoformat()}")

# The ',' thousands-separator format spec is only valid for numbers; applying
# it to the 'N/A' placeholder raises ValueError. Format the count only when it
# is numeric and fall back to the placeholder otherwise.
summary_records = result.get('total_records')
if isinstance(summary_records, (int, float)):
    summary_records_text = f"{summary_records:,}"
else:
    summary_records_text = 'N/A'

# Final recap: record count, configured budget, pipeline status and the
# verdict of the production-readiness checklist (`all_passed`).
print(f"\nüìã Summary:")
print(f"  - Records processed: {summary_records_text}")
print(f"  - Privacy budget: œÅ = {config.privacy.total_rho}")
print(f"  - Pipeline status: {'‚úÖ SUCCESS' if result['success'] else '‚ùå FAILED'}")
print(f"  - Production ready: {'‚úÖ YES' if all_passed else '‚ùå NO'}")
