# Papermill Demo Notebook

This notebook demonstrates how to use Papermill for parameterized notebook execution.

## Parameters
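- `input_data`: Path to input data (only echoed in this demo; the data itself is generated synthetically)
- `output_path`: Path for output results (reported in the output, but nothing is written to it in this demo)
- `processing_date`: Date for data processing, in `YYYY-MM-DD` form; used as the start date of the generated daily series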

In [None]:
# Parameters - these will be injected by Papermill
# The values below are the defaults used when no override is supplied.
input_data = 'default_input.csv'  # path to the input data; the visible cells only echo it
output_path = 'default_output.csv'  # destination reported for the results; the visible cells never write to it
processing_date = '2024-01-01'  # start date of the generated 100-day date series

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import os

# Echo the run configuration so it is visible in the executed notebook
print(
    f"Processing data for: {processing_date}",
    f"Input data: {input_data}",
    f"Output path: {output_path}",
    sep="\n",
)

In [None]:
# Build a reproducible synthetic dataset
print("Creating sample data...")

# Fixed seed so every run produces the same rows. The keyword arguments are
# evaluated left to right, so the values are drawn before the categories.
np.random.seed(42)
data = dict(
    id=range(1, 101),
    value=np.random.normal(loc=100, scale=15, size=100),
    category=np.random.choice(['A', 'B', 'C'], size=100),
    date=pd.date_range(start=processing_date, periods=100, freq='D'),
)

df = pd.DataFrame(data)
print(f"Generated {len(df)} rows of data")
print(df.head())

In [None]:
# Aggregate the generated data
print("\nPerforming analysis...")

# Per-category mean, standard deviation and row count of `value`
summary_stats = df.groupby('category')['value'].agg(['mean', 'std', 'count'])
print("\nSummary by category:", summary_stats, sep="\n")

# Dataset-wide metrics
total_records = len(df)
avg_value, max_value, min_value = (
    df['value'].mean(),
    df['value'].max(),
    df['value'].min(),
)

print(
    "\nOverall metrics:",
    f"Total records: {total_records}",
    f"Average value: {avg_value:.2f}",
    f"Max value: {max_value:.2f}",
    f"Min value: {min_value:.2f}",
    sep="\n",
)

In [None]:
# Report the (simulated) save step
print(f"\nSaving results to: {output_path}")

# Demo only: nothing is written to output_path here. A real pipeline would
# persist its results at this point before reporting success.
print("Results saved successfully!")

# Gather the headline values of this run in a single dictionary
results = dict(
    total_records=total_records,
    avg_value=avg_value,
    processing_date=processing_date,
    output_file=output_path,
)

print("\nProcessing completed!")
print(f"Results: {results}")

## Conclusion

This notebook demonstrates:
1. **Parameter injection** using the `parameters` tag
2. **Data processing** with pandas
3. **Results generation** for downstream consumption

To run this notebook with Papermill:

```bash
papermill notebooks/papermill_demo.ipynb output.ipynb \
    -p input_data "my_data.csv" \
    -p output_path "results.csv" \
    -p processing_date "2024-01-15"
```

# Papermill Demo - Data Analysis Pipeline

This notebook demonstrates Papermill integration with Airflow for automated data processing.
It can be executed from Airflow DAGs using the PapermillOperator.

In [None]:
# Parameters - these can be overridden by Papermill
# The values below are the defaults used when no override is supplied.
input_data_path = "/home/jovyan/work/data/sales_data.csv"  # CSV to analyse; sample data is generated at this path when the file is missing
output_path = "/home/jovyan/work/output/"  # directory that receives the dashboard PNG and the CSV reports
analysis_date = "2025-07-24"  # label used in the figure title and in the output file names
region_filter = "all"  # "all" keeps every region; any other value keeps only rows whose region equals it
create_visualizations = True  # when False the dashboard cell does nothing

In [None]:
# Import required libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import os
import numpy as np

# Echo the run configuration so it is visible in the executed notebook
print(
    f"📊 Starting data analysis for {analysis_date}",
    f"📂 Input path: {input_data_path}",
    f"📁 Output path: {output_path}",
    f"🌍 Region filter: {region_filter}",
    sep="\n",
)

In [None]:
# Create sample data if it doesn't exist
if not os.path.exists(input_data_path):
    print("🔧 Creating sample sales data...")

    # Ensure the parent directory exists. A bare file name such as
    # "sales_data.csv" has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so the directory is only created when there is one.
    input_dir = os.path.dirname(input_data_path)
    if input_dir:
        os.makedirs(input_dir, exist_ok=True)

    # Generate sample data. The seed is fixed and the draws below happen in a
    # fixed order, so the generated file is the same on every run.
    np.random.seed(42)
    dates = pd.date_range('2025-01-01', '2025-07-24', freq='D')
    regions = ['North', 'South', 'East', 'West']
    products = ['Product_A', 'Product_B', 'Product_C', 'Product_D', 'Product_E']

    # Collect plain dicts and build the DataFrame once at the end instead of
    # growing a frame row by row.
    data = []
    for date in dates:
        # Between 10 and 49 orders per day (randint's upper bound is exclusive)
        for _ in range(np.random.randint(10, 50)):
            data.append({
                'date': date,
                'region': np.random.choice(regions),
                'product': np.random.choice(products),
                'sales_amount': np.random.uniform(100, 1000),
                'quantity': np.random.randint(1, 10),
                'customer_id': f"CUST_{np.random.randint(1000, 9999)}"
            })

    df_sample = pd.DataFrame(data)
    df_sample.to_csv(input_data_path, index=False)
    print(f"✅ Sample data created with {len(df_sample)} records")

In [None]:
# Read the sales CSV and convert the date column to datetimes
print("📖 Loading data...")
df = pd.read_csv(input_data_path).assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

print(
    f"📈 Loaded {len(df)} records",
    f"📅 Date range: {df['date'].min()} to {df['date'].max()}",
    f"🌍 Regions: {df['region'].unique()}",
    f"📦 Products: {df['product'].unique()}",
    sep="\n",
)

# Keep a single region unless the filter is the "all" sentinel
if not region_filter == "all":
    df = df.loc[df['region'] == region_filter]
    print(f"🔍 Filtered to region: {region_filter} ({len(df)} records)")

# Last expression of the cell: show the first rows
df.head()

In [None]:
# Compute headline metrics and the grouped summary tables
print("🔬 Performing analysis...")

# Headline figures over every row currently in df
total_sales, avg_order_value = df['sales_amount'].sum(), df['sales_amount'].mean()
total_quantity = df['quantity'].sum()
unique_customers = df['customer_id'].nunique()

# Per-region totals, averages, order counts and distinct customers
regional_summary = (
    df.groupby('region')
    .agg({
        'sales_amount': ['sum', 'mean', 'count'],
        'quantity': 'sum',
        'customer_id': 'nunique',
    })
    .round(2)
)

# Per-product sales totals/averages and quantity sold
product_summary = (
    df.groupby('product')
    .agg({
        'sales_amount': ['sum', 'mean'],
        'quantity': 'sum',
    })
    .round(2)
)

# Total sales per calendar date, as a two-column frame
daily_sales = (
    df.groupby('date')['sales_amount']
    .sum()
    .reset_index()
)

print(
    f"💰 Total Sales: ${total_sales:,.2f}",
    f"📦 Total Quantity: {total_quantity:,}",
    f"👥 Unique Customers: {unique_customers:,}",
    f"💵 Average Order Value: ${avg_order_value:.2f}",
    sep="\n",
)

In [None]:
# Create visualizations if requested
if create_visualizations:
    print("📊 Creating visualizations...")

    # Set up the plotting style. The seaborn style sheet is named
    # 'seaborn-v0_8' in newer matplotlib releases and 'seaborn' in older ones;
    # plt.style.use raises OSError for an unknown name, so pick the first one
    # this installation provides and fall back to matplotlib's default style.
    dashboard_style = next(
        (name for name in ('seaborn-v0_8', 'seaborn') if name in plt.style.available),
        'default',
    )
    plt.style.use(dashboard_style)
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle(f'Sales Analysis Dashboard - {analysis_date}', fontsize=16, fontweight='bold')

    # Top-left: total sales per region, smallest first so the largest bar is on top
    regional_sales = df.groupby('region')['sales_amount'].sum().sort_values(ascending=True)
    regional_sales.plot(kind='barh', ax=axes[0,0], color='skyblue')
    axes[0,0].set_title('Sales by Region')
    axes[0,0].set_xlabel('Sales Amount ($)')

    # Top-right: total sales per product, largest first
    product_sales = df.groupby('product')['sales_amount'].sum().sort_values(ascending=False)
    product_sales.plot(kind='bar', ax=axes[0,1], color='lightgreen')
    axes[0,1].set_title('Sales by Product')
    axes[0,1].set_ylabel('Sales Amount ($)')
    axes[0,1].tick_params(axis='x', rotation=45)

    # Bottom-left: daily sales trend over the last 30 rows of daily_sales
    recent_sales = daily_sales.tail(30)
    axes[1,0].plot(recent_sales['date'], recent_sales['sales_amount'], marker='o', linewidth=2)
    axes[1,0].set_title('Daily Sales Trend (Last 30 Days)')
    axes[1,0].set_ylabel('Sales Amount ($)')
    axes[1,0].tick_params(axis='x', rotation=45)

    # Bottom-right: distribution of individual sale amounts
    axes[1,1].hist(df['sales_amount'], bins=30, alpha=0.7, color='orange')
    axes[1,1].set_title('Sales Amount Distribution')
    axes[1,1].set_xlabel('Sales Amount ($)')
    axes[1,1].set_ylabel('Frequency')

    plt.tight_layout()

    # Save the dashboard next to the other outputs, creating the directory if needed
    os.makedirs(output_path, exist_ok=True)
    viz_path = os.path.join(output_path, f'sales_dashboard_{analysis_date}.png')
    plt.savefig(viz_path, dpi=300, bbox_inches='tight')
    print(f"📊 Visualization saved to: {viz_path}")

    plt.show()

In [None]:
# Persist the run summary and the grouped tables as CSV files
print("💾 Saving analysis results...")

# Make sure the destination directory is present
os.makedirs(output_path, exist_ok=True)

# One-row summary of this run
summary_data = dict(
    analysis_date=analysis_date,
    region_filter=region_filter,
    total_sales=total_sales,
    total_quantity=total_quantity,
    unique_customers=unique_customers,
    avg_order_value=avg_order_value,
    records_processed=len(df),
)
summary_df = pd.DataFrame([summary_data])

# Output locations; every file name carries the analysis date
summary_path = os.path.join(output_path, f'analysis_summary_{analysis_date}.csv')
regional_path = os.path.join(output_path, f'regional_analysis_{analysis_date}.csv')
product_path = os.path.join(output_path, f'product_analysis_{analysis_date}.csv')

# Write the summary without its index; the grouped tables keep theirs
summary_df.to_csv(summary_path, index=False)
regional_summary.to_csv(regional_path)
product_summary.to_csv(product_path)

print(
    f"✅ Summary saved to: {summary_path}",
    f"✅ Regional analysis saved to: {regional_path}",
    f"✅ Product analysis saved to: {product_path}",
    sep="\n",
)

In [None]:
# Closing report for Airflow/Papermill runs, followed by a usage example.
# Everything goes through one print call, one argument per output line.
# The operator example uses plain (non-f) strings so the Airflow template
# braces are printed literally.
print(
    "\n" + "="*50,
    "📋 ANALYSIS COMPLETE",
    "="*50,
    f"📅 Analysis Date: {analysis_date}",
    f"🌍 Region Filter: {region_filter}",
    f"📊 Records Processed: {len(df):,}",
    f"💰 Total Sales: ${total_sales:,.2f}",
    f"👥 Unique Customers: {unique_customers:,}",
    f"📁 Output Path: {output_path}",
    "\n✅ This notebook can be executed from Airflow using PapermillOperator!",
    "\n🔧 Example Airflow task:",
    "```python",
    "from airflow.providers.papermill.operators.papermill import PapermillOperator",
    "",
    "papermill_task = PapermillOperator(",
    "    task_id='run_data_analysis',",
    "    input_nb='/path/to/papermill_demo.ipynb',",
    "    output_nb='/path/to/executed_notebook.ipynb',",
    "    parameters={",
    "        'analysis_date': '{{ ds }}',",
    "        'region_filter': 'North',",
    "        'create_visualizations': True",
    "    }",
    ")",
    "```",
    "="*50,
    sep="\n",
)