# Simple Analysis Notebook

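A minimal example notebook demonstrating Papermill parameterization: it generates a random sample dataset, computes per-category statistics, plots the results, and prints summary metrics.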

## Parameters
- `execution_date`: Date of analysis, given as a string (default: `'2024-01-01'`)
- `sample_size`: Number of samples to generate, given as an integer (default: `1000`)

In [None]:
# Parameters - these will be injected by Papermill
# The literals below are the values in effect when the notebook is run as-is,
# without any injected overrides.
# NOTE(review): Papermill is expected to override these only when this cell
# carries the `parameters` tag in the notebook metadata - confirm the tag is set.
execution_date = '2024-01-01'  # date of analysis (string)
sample_size = 1000  # number of samples to generate

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Echo the run parameters and the current wall-clock time so they are
# recorded in the executed notebook's output.
run_banner = [
    f"Analysis Date: {execution_date}",
    f"Sample Size: {sample_size}",
    f"Generated at: {datetime.now()}",
]
print("\n".join(run_banner))

In [None]:
# Build the sample dataset
# Seed NumPy's global generator so repeated runs draw the same sample.
np.random.seed(42)

# Draw each column separately, in the same order the columns appear in the
# frame (category, value, score), so the seeded stream is consumed identically.
category_draws = np.random.choice(['A', 'B', 'C', 'D'], sample_size)
value_draws = np.random.normal(100, 15, sample_size)
score_draws = np.random.uniform(0, 1, sample_size)

data = pd.DataFrame({
    'category': category_draws,
    'value': value_draws,
    'score': score_draws,
})

# Report the row count, a preview of the first rows, and descriptive statistics.
print(f"Generated {len(data)} samples")
print(data.head(), "\nData Summary:", data.describe(), sep="\n")

In [None]:
# Per-category summary statistics
# Map each numeric column to the aggregations computed for it.
aggregations = {
    'value': ['mean', 'std', 'count'],
    'score': ['mean', 'std'],
}
category_stats = data.groupby('category').agg(aggregations)

# Heading and table are emitted on separate lines.
print("Category Statistics:", category_stats, sep="\n")

In [None]:
# Create visualizations: a 2x2 grid of panels summarising `data`.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
(ax_hist, ax_box), (ax_scatter, ax_counts) = axes

# Top-left: histogram of values
ax_hist.hist(data['value'], bins=30, alpha=0.7)
ax_hist.set(title='Distribution of Values', xlabel='Value', ylabel='Frequency')

# Top-right: box plot of values by category
data.boxplot(column='value', by='category', ax=ax_box)
# A grouped pandas boxplot writes its own "Boxplot grouped by category"
# suptitle onto the whole figure; clear it so it does not sit above all
# four panels.
fig.suptitle('')
# Label this panel explicitly, like the others, instead of relying on the
# labels pandas generates from the column names.
ax_box.set(title='Value Distribution by Category', xlabel='Category', ylabel='Value')

# Bottom-left: scatter plot of value against score
ax_scatter.scatter(data['value'], data['score'], alpha=0.6)
ax_scatter.set(title='Value vs Score', xlabel='Value', ylabel='Score')

# Bottom-right: number of rows per category
data['category'].value_counts().plot(kind='bar', ax=ax_counts)
ax_counts.set(title='Category Counts', xlabel='Category', ylabel='Count')

plt.tight_layout()
plt.show()

In [None]:
# Collect the headline figures for this run in a single dictionary.
value_series = data['value']
score_series = data['score']

summary_metrics = dict(
    execution_date=execution_date,
    sample_size=sample_size,
    total_records=len(data),
    average_value=value_series.mean(),
    median_value=value_series.median(),
    std_value=value_series.std(),
    average_score=score_series.mean(),
    categories=data['category'].nunique(),
    generated_at=datetime.now().isoformat(),  # timestamp taken when the dict is built
)

# Print one indented "key: value" line per metric under a heading.
print("\nSummary Metrics:")
print("\n".join(f"  {key}: {value}" for key, value in summary_metrics.items()))

## Analysis Complete

This simple analysis demonstrated:
1. Parameter injection via Papermill
2. Data generation and basic statistics
3. Visualization creation
4. Summary metrics generation

This notebook can be parameterized and run automatically as part of an Airflow pipeline.