# Data Exploration Notebook

This notebook demonstrates exploratory data analysis for our Python Data Science example.

When you modify this notebook, devloop will automatically restart Jupyter Lab to reflect your changes.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

# Add src to path
sys.path.insert(0, os.path.join('..', 'src'))

# Set up plotting
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
%matplotlib inline

## Load Data

Let's load our processed data and examine its structure. If the processed files are not available, synthetic sample data is generated instead.

In [None]:
# Load the processed customer and transaction tables. If either CSV is
# missing, fall back to synthetic data so the rest of the notebook still runs.
try:
    customers = pd.read_csv('../data/processed/enriched_customers.csv')
    transactions = pd.read_csv('../data/processed/transactions.csv')
    print("Loaded processed data successfully!")
except FileNotFoundError:
    print("Processed data not found. Run the preprocessing pipeline first.")
    print("Or generate sample data for exploration...")

    # Generate sample data for demonstration.
    # Imported lazily because it is only needed on this fallback path.
    # NOTE(review): `train` is presumably found via the ../src entry added to
    # sys.path in the setup cell - confirm.
    from train import generate_synthetic_data
    sample_data = generate_synthetic_data()
    print(f"Generated sample data: {sample_data.shape}")
    customers = sample_data

    # Bind `transactions` on this path as well. Otherwise it would be
    # undefined on a fresh kernel, or silently keep a stale value from an
    # earlier run. No synthetic transactions are generated, hence None.
    transactions = None

## Data Overview

In [None]:
# Structural overview of the dataset: dimensions, column names and dtypes
dataset_shape = customers.shape
column_names = customers.columns.tolist()

print("Dataset Shape:", dataset_shape)
print("\nColumn Names:")
print(column_names)
print("\nData Types:")
print(customers.dtypes)
print("\nFirst few rows:")
# Left as the trailing expression so the notebook renders it as a table
customers.head()

In [None]:
# Summary statistics from DataFrame.describe() with its default settings;
# the trailing expression is rendered as the cell output.
summary_stats = customers.describe()
summary_stats

## Data Quality Check

In [None]:
# Count nulls per column, list only the columns that have any,
# then report the overall total.
missing_data = customers.isnull().sum()
total_missing = missing_data.sum()
columns_with_gaps = missing_data[missing_data > 0]

print("Missing values per column:")
print(columns_with_gaps)

if total_missing != 0:
    print(f"⚠️  Total missing values: {total_missing}")
else:
    print("✅ No missing values found!")

In [None]:
# Count rows flagged by DataFrame.duplicated() (default settings)
duplicates = customers.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

# One-line verdict: all clear, or a warning carrying the count
print(
    "✅ No duplicates found!"
    if duplicates == 0
    else f"⚠️  Found {duplicates} duplicate rows"
)

## Exploratory Data Analysis

In [None]:
# Histogram grid for up to the first four numeric columns (2x2 layout)
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Data Distribution Analysis', fontsize=16)

# `numeric_cols` is also read by the correlation and pairplot cells below,
# so it keeps its name and holds every numeric column, not just the plotted ones.
numeric_cols = customers.select_dtypes(include=[np.number]).columns

panels = axes.ravel()  # row-major: top-left, top-right, bottom-left, bottom-right
plot_cols = numeric_cols[:len(panels)]

for ax, col in zip(panels, plot_cols):
    # Drop NaNs explicitly so binning only ever sees real values
    ax.hist(customers[col].dropna(), bins=30, alpha=0.7, edgecolor='black')
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

# With fewer numeric columns than panels, hide the leftovers instead of
# showing empty framed axes.
for ax in panels[len(plot_cols):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Annotated heatmap of pairwise correlations between the numeric columns.
# Skipped unless there are at least two numeric columns to compare.
if len(numeric_cols) >= 2:
    heatmap_options = dict(annot=True, cmap='coolwarm', center=0, fmt='.2f')

    plt.figure(figsize=(10, 8))
    correlation_matrix = customers[numeric_cols].corr()
    sns.heatmap(correlation_matrix, **heatmap_options)
    plt.title('Feature Correlation Matrix')
    plt.tight_layout()
    plt.show()

## Target Variable Analysis

In [None]:
# Class distribution of the 'target' column, when the dataset has one
if 'target' not in customers.columns:
    print("No target variable found in the dataset.")
else:
    target_counts = customers['target'].value_counts()
    # Ratio of the rarest class count to the most common class count
    balance_ratio = target_counts.min() / target_counts.max()

    print("Target variable distribution:")
    print(target_counts)
    print(f"\nClass balance: {balance_ratio:.3f}")

    # Bar chart of the same per-class counts
    plt.figure(figsize=(8, 6))
    target_counts.plot(kind='bar')
    plt.title('Target Variable Distribution')
    plt.xlabel('Target Class')
    plt.ylabel('Count')
    plt.xticks(rotation=0)
    plt.show()

## Feature Relationships

In [None]:
# Pairplot of up to three numeric features, coloured by 'target' when present.
# Skipped when the dataset has fewer than three numeric columns.
if len(numeric_cols) >= 3:
    has_target = 'target' in customers.columns

    # Keep 'target' out of the feature list: it is appended separately as the
    # hue column, and selecting it twice would yield duplicate column labels.
    feature_cols = [c for c in numeric_cols if c != 'target'][:3]
    sample_cols = feature_cols + (['target'] if has_target else [])

    # Sample data if too large; the fixed seed keeps the subsample reproducible.
    # A dedicated name avoids overwriting `sample_data` from the load cell.
    sample_size = min(1000, len(customers))
    pairplot_sample = customers[sample_cols].sample(n=sample_size, random_state=42)

    # pairplot() builds its own figure, so no plt.figure() call beforehand
    # (that would only leave an empty extra figure in the output). It also has
    # no `alpha` parameter; transparency for the off-diagonal plots is passed
    # through `plot_kws`.
    grid = sns.pairplot(
        pairplot_sample,
        hue='target' if has_target else None,
        plot_kws={'alpha': 0.7},
    )
    grid.figure.suptitle('Feature Relationships', y=1.02)
    plt.show()

## Summary

This notebook provides an overview of our dataset. It covers the following analyses:

1. **Data Quality**: Checked for missing values and duplicates
2. **Distributions**: Analyzed feature distributions
3. **Correlations**: Examined relationships between features
4. **Target Analysis**: Analyzed target variable distribution

Next steps:
- Feature engineering based on insights
- Model selection and training
- Performance evaluation

In [None]:
# Collect the headline dataset metrics in one dict and print them.
# Nothing is written to disk here.
analysis_summary = dict(
    total_samples=len(customers),
    total_features=len(customers.columns),
    missing_values=customers.isnull().sum().sum(),
    duplicate_rows=customers.duplicated().sum(),
)

print("Analysis Summary:")
print("\n".join(f"  {key}: {value}" for key, value in analysis_summary.items()))