# 01 - Data Exploration

## Objective
Explore the customer churn dataset to understand:
- Feature distributions
- Class balance of target variable
- Data quality issues
- Feature correlations

## Dataset
This notebook uses sample data generated by `create_sample_data()` for development.
In production, replace it with the actual customer data CSV.

## 1. Setup and Data Loading

In [None]:
# Install required packages (run once per environment).
# `--quiet` keeps pip's output to a minimum.
%pip install pandas numpy matplotlib seaborn scikit-learn --quiet

In [1]:
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Make the project's `src` package importable no matter which directory the
# kernel was started from. A bare relative '..' only works when the working
# directory sits exactly one level below the project root; anywhere else the
# imports below fail with "ModuleNotFoundError: No module named 'src'".
# Search the working directory and its ancestors for a `src/` folder and fall
# back to the parent directory (the previous behaviour) when none is found.
_cwd = Path.cwd().resolve()
_project_root = next(
    (candidate for candidate in (_cwd, *_cwd.parents) if (candidate / 'src').is_dir()),
    _cwd.parent,
)
if str(_project_root) not in sys.path:
    sys.path.insert(0, str(_project_root))

from src.data_loader import create_sample_data, get_data_summary, validate_data
from src.preprocessing import identify_column_types

# Set style
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

# Display settings: never truncate columns or wrap wide frames
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

print('Libraries loaded successfully')

ModuleNotFoundError: No module named 'src'

In [None]:
# Build the synthetic customer dataset explored in the rest of the notebook.
# In production, replace with: df = load_data('data/customers.csv')
RANDOM_STATE = 42
df = create_sample_data(
    n_samples=5000,
    random_state=RANDOM_STATE,
    churn_rate=0.2,
)

# Report the frame's dimensions and column names
for line in (f'Dataset shape: {df.shape}', f'Columns: {list(df.columns)}'):
    print(line)

## 2. Data Overview

In [None]:
# Display the first 10 rows; as the cell's last expression, the result is
# rendered as the cell output.
df.head(10)

In [None]:
# Data types and general info about the DataFrame (printed by DataFrame.info())
df.info()

In [None]:
# Descriptive statistics for numerical columns; as the cell's last expression,
# the resulting table is rendered as the cell output.
df.describe()

In [None]:
# Summarize the dataset with the project helper and list every entry it returns
summary = get_data_summary(df)
print('Data Summary:')
for name, stat in summary.items():
    print(f'  {name}: {stat}')

## 3. Target Variable Analysis (Churn)

In [None]:
# Check class balance.
# value_counts() orders its result by frequency and leaves out classes that do
# not occur, so pin both series to a fixed [retained, churned] layout: the
# label lookups below can no longer raise KeyError on a single-class dataset,
# and any positional use of these series always sees "retained" first.
class_order = [0, 1]  # 0 = retained, 1 = churned
churn_counts = df['churn'].value_counts().reindex(class_order, fill_value=0)
churn_pct = (df['churn'].value_counts(normalize=True) * 100).reindex(
    class_order, fill_value=0.0
)

print('Churn Distribution:')
print(f'  Retained (0): {churn_counts[0]:,} ({churn_pct[0]:.1f}%)')
print(f'  Churned (1):  {churn_counts[1]:,} ({churn_pct[1]:.1f}%)')
print(f'\nChurn Rate: {churn_pct[1]:.2f}%')

In [None]:
# Visualize class balance
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# The labels and colors below are matched to the counts by position, but
# value_counts() sorts by frequency. Reindex to [retained (0), churned (1)] so
# the pairing stays correct even when churners are the majority class or one
# class is absent.
class_labels = ['Retained', 'Churned']
class_counts = churn_counts.reindex([0, 1], fill_value=0).values

# Bar chart
colors = ['#2ecc71', '#e74c3c']  # green = retained, red = churned
axes[0].bar(class_labels, class_counts, color=colors)
axes[0].set_ylabel('Count')
axes[0].set_title('Churn Distribution (Count)')
# Annotate each bar with its count, slightly above the bar top
for i, v in enumerate(class_counts):
    axes[0].text(i, v + 50, str(v), ha='center', fontweight='bold')

# Pie chart
axes[1].pie(class_counts, labels=class_labels,
            autopct='%1.1f%%', colors=colors, startangle=90)
axes[1].set_title('Churn Distribution (Percentage)')

plt.tight_layout()
plt.savefig('../models/churn_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

## 4. Feature Distributions

In [None]:
# Split the columns into numerical and categorical groups via the project helper
col_types = identify_column_types(df)
for line in (
    f'Numerical columns: {col_types["numerical"]}',
    f'Categorical columns: {col_types["categorical"]}',
):
    print(line)

In [None]:
# Distribution of numerical features, split by churn status
numerical_cols = col_types['numerical']

# Display names for the hue levels, in the order they should appear
churn_labels = {0: 'Retained', 1: 'Churned'}

# squeeze=False always returns a 2-D array of axes, so the single-column case
# needs no special handling.
fig, axes = plt.subplots(
    1, len(numerical_cols), figsize=(5 * len(numerical_cols), 4), squeeze=False
)

for ax, col in zip(axes[0], numerical_cols):
    # Plot histogram with KDE; hue_order pins the legend/palette order
    sns.histplot(data=df, x=col, hue='churn', hue_order=list(churn_labels),
                 kde=True, ax=ax, palette=colors)
    ax.set_title(f'{col} Distribution by Churn')

    # Rename the entries of the legend seaborn already built. Calling
    # ax.legend([...]) instead would rebuild the legend from the axes' artists
    # in draw order, which does not follow hue order, so the labels could end
    # up attached to the wrong class.
    legend = ax.get_legend()
    if legend is not None:
        for text, level in zip(legend.get_texts(), churn_labels):
            text.set_text(churn_labels[level])

plt.tight_layout()
plt.savefig('../models/numerical_distributions.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Churn rate per category: one bar chart for each categorical feature
categorical_cols = col_types['categorical']

n_panels = len(categorical_cols)
fig, axes = plt.subplots(1, n_panels, figsize=(6*n_panels, 4))
if n_panels == 1:
    # A single subplot comes back as a bare Axes; wrap it so zip() below works
    axes = [axes]

for ax, col in zip(axes, categorical_cols):
    # Churned customers and total customers per category, then the rate in %
    churn_by_cat = df.groupby(col)['churn'].agg(['sum', 'count'])
    churn_by_cat['rate'] = churn_by_cat['sum'] / churn_by_cat['count'] * 100

    # Draw the bars and dress the axes
    bars = ax.bar(churn_by_cat.index, churn_by_cat['rate'])
    ax.set_ylabel('Churn Rate (%)')
    ax.set_title(f'Churn Rate by {col}')
    ax.tick_params(axis='x', rotation=45)

    # Write each rate just above the centre of its bar
    for bar, rate in zip(bars, churn_by_cat['rate']):
        label_x = bar.get_x() + bar.get_width()/2
        label_y = bar.get_height() + 0.5
        ax.text(label_x, label_y, f'{rate:.1f}%', ha='center', fontsize=9)

plt.tight_layout()
plt.savefig('../models/categorical_churn_rates.png', dpi=150, bbox_inches='tight')
plt.show()

## 5. Data Quality Assessment

In [None]:
# Count missing values per column and express them as a share of all rows
missing = df.isnull().sum()
missing_pct = (missing / len(df) * 100).round(2)

missing_df = pd.DataFrame({'Missing Count': missing, 'Missing %': missing_pct})

print('Missing Values Analysis:')
if missing.sum() > 0:
    # Only list the columns that actually have gaps
    print(missing_df[missing_df['Missing Count'] > 0])
else:
    print('No missing values found!')

In [None]:
# Count fully duplicated rows and repeated customer IDs
customer_ids = df['customer_id']
duplicate_count = df.duplicated().sum()
duplicate_id_count = customer_ids.duplicated().sum()

print(
    f'Duplicate rows: {duplicate_count}',
    f'Duplicate customer IDs: {duplicate_id_count}',
    sep='\n',
)

In [None]:
# Validate the frame with the project helper and print the fields of its report
validation_report = validate_data(df)
report_lines = [
    'Validation Report:',
    f'  Rows: {validation_report["rows"]:,}',
    f'  Columns: {validation_report["columns"]}',
    f'  Passed: {validation_report["passed"]}',
]
print('\n'.join(report_lines))

In [None]:
# Flag values lying more than three standard deviations from the column mean
print('Outlier Analysis (values beyond 3 standard deviations):')
n_rows = len(df)
for col in numerical_cols:
    values = df[col]
    center = values.mean()
    spread = values.std()
    lower = center - 3 * spread
    upper = center + 3 * spread
    is_outlier = (values < lower) | (values > upper)
    outliers = df[is_outlier]
    n_outliers = len(outliers)
    print(f'  {col}: {n_outliers} outliers ({n_outliers/n_rows*100:.2f}%)')

## 6. Correlation Analysis

In [None]:
# Pairwise correlations between the numerical features and the churn target
correlation_cols = numerical_cols + ['churn']
corr_matrix = df[correlation_cols].corr()

# Rank the features by the absolute strength of their correlation with churn
print('Correlation with Churn:')
feature_vs_churn = corr_matrix['churn'].drop('churn')
target_corr = feature_vs_churn.sort_values(key=abs, ascending=False)
print(target_corr)

In [None]:
# Heatmap of the correlation matrix, lower triangle only
plt.figure(figsize=(10, 8))

# True marks the cells to hide: the diagonal and everything above it
hidden_cells = np.triu(np.ones_like(corr_matrix, dtype=bool))

heatmap_options = {
    'mask': hidden_cells,
    'annot': True,
    'fmt': '.2f',
    'cmap': 'RdBu_r',
    'center': 0,
    'vmin': -1,
    'vmax': 1,
    'square': True,
    'linewidths': 0.5,
}
sns.heatmap(corr_matrix, **heatmap_options)

plt.title('Feature Correlation Heatmap')
plt.tight_layout()
plt.savefig('../models/correlation_heatmap.png', dpi=150, bbox_inches='tight')
plt.show()

## 7. Key Findings Summary

### Dataset Overview
- **Samples**: 5,000 customers
- **Features**: 6 (3 numerical, 2 categorical, 1 ID)
- **Target**: Binary churn indicator

### Class Balance
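- **Churn Rate**: ~20% of customers churned, so the classes are imbalanced (roughly 4:1 retained to churned)
- Consider stratified sampling for train/test splits so both splits keep this ratio
- Model training may need class weighting or oversampling (e.g. SMOTE)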

### Data Quality
- No missing values detected
- No duplicate records
- All required columns present

### Feature Insights
- Numerical features show different distributions for churned versus retained customers
- Categorical features may have predictive power, judging by how much the churn rate varies across their categories

### Next Steps
1. Proceed with preprocessing pipeline
2. Train baseline logistic regression model
3. Evaluate with proper cross-validation

In [None]:
# Closing recap of the dataset's size, feature counts and churn rate
closing_lines = [
    'Data exploration complete!',
    '\nDataset is ready for modeling:',
    f'  - {len(df):,} samples',
    f'  - {len(numerical_cols)} numerical features',
    f'  - {len(categorical_cols)} categorical features',
    f'  - {df["churn"].mean()*100:.1f}% churn rate',
]
print('\n'.join(closing_lines))