# Data Exploration and Exploratory Data Analysis (EDA)

This notebook demonstrates best practices for initial data exploration and EDA in data science projects.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Shared look for every figure in this notebook: seaborn grid theme first,
# then the default canvas size applied through a single rcParams update.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print('Libraries imported successfully!')

## 1. Load Data

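For this demonstration the cell below generates a synthetic customer dataset instead of reading a file. In a real project, load the data from the raw data directory following the Cookiecutter structure.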

In [None]:
# Build a reproducible synthetic customer table for the demonstration
np.random.seed(42)
n_samples = 1000

# Allowed values for each categorical column
contract_options = ['Month-to-month', 'One year', 'Two year']
payment_options = ['Electronic check', 'Mailed check', 'Bank transfer', 'Credit card']
internet_options = ['DSL', 'Fiber optic', 'No']
addon_options = ['Yes', 'No', 'No internet service']

# Columns are drawn one at a time in a fixed order: every draw advances the
# global NumPy random state, so reordering these lines would change the values.
columns = {}
columns['customer_id'] = range(1, n_samples + 1)
columns['age'] = np.random.randint(18, 70, n_samples)
columns['tenure_months'] = np.random.randint(1, 72, n_samples)
columns['monthly_charges'] = np.random.uniform(20, 120, n_samples)
columns['total_charges'] = np.random.uniform(100, 8000, n_samples)
columns['contract_type'] = np.random.choice(contract_options, n_samples)
columns['payment_method'] = np.random.choice(payment_options, n_samples)
columns['internet_service'] = np.random.choice(internet_options, n_samples)
columns['online_security'] = np.random.choice(addon_options, n_samples)
columns['tech_support'] = np.random.choice(addon_options, n_samples)
# Target: 1 = churned, drawn with a 27% probability
columns['churn'] = np.random.choice([0, 1], n_samples, p=[0.73, 0.27])

data = pd.DataFrame(columns)

print(f'Dataset shape: {data.shape}')
data.head()

## 2. Data Quality Assessment

Check for missing values, duplicates, and data types.

In [None]:
# Structural overview: column names, dtypes and non-null counts
print('=== Data Info ===')
data.info()

# Per-column count of missing cells; list only the affected columns
print('\n=== Missing Values ===')
missing = data.isna().sum()
if missing.sum() > 0:
    print(missing[missing > 0])
else:
    print('No missing values')

# Rows that are exact copies of an earlier row
print('\n=== Duplicates ===')
duplicates = data.duplicated().sum()
print(f'Number of duplicate rows: {duplicates}')

## 3. Descriptive Statistics

Analyze numerical and categorical features.

In [None]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns
print('=== Numerical Features ===')
data.select_dtypes(include='number').describe()

In [None]:
# Categorical features summary
print('=== Categorical Features ===')

# Select categorical columns by exclusion rather than by `object` dtype.
# Text columns are not always stored as `object` (pandas may infer a dedicated
# string dtype), in which case an include=['object'] filter would silently
# select nothing and this cell would print no summaries.
categorical_cols = data.select_dtypes(exclude=['number', 'datetime', 'timedelta']).columns

# For each categorical column show the frequency of every level and the
# number of distinct levels.
for col in categorical_cols:
    print(f'\n{col}:')
    print(data[col].value_counts())
    print(f'Unique values: {data[col].nunique()}')

## 4. Target Variable Analysis

Analyze the distribution of the target variable (churn).

In [None]:
# Churn distribution
# Class code -> display label. Bars, pie slices, labels and colours are all
# laid out in this order so they stay aligned with each other.
churn_labels = {0: 'No Churn', 1: 'Churn'}
churn_colors = ['#2ecc71', '#e74c3c']
class_order = list(churn_labels)
class_names = [churn_labels[code] for code in class_order]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Count plot
# value_counts() sorts by frequency, not by class code, so reindex explicitly;
# otherwise the hard-coded labels would be swapped whenever churners are the
# majority class. fill_value=0 keeps a class that never occurs on the chart.
churn_counts = data['churn'].value_counts().reindex(class_order, fill_value=0)
axes[0].bar(class_names, churn_counts.values, color=churn_colors)
axes[0].set_title('Churn Distribution', fontsize=14, fontweight='bold')
axes[0].set_ylabel('Count')
axes[0].grid(axis='y', alpha=0.3)

# Percentage
churn_pct = (
    data['churn'].value_counts(normalize=True).reindex(class_order, fill_value=0) * 100
)
axes[1].pie(churn_pct.values, labels=class_names, autopct='%1.1f%%',
            colors=churn_colors, startangle=90)
axes[1].set_title('Churn Percentage', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

# .loc makes the lookup explicitly label-based (class code 1 = churned)
print(f'\nChurn Rate: {churn_pct.loc[1]:.2f}%')

## 5. Feature Distributions

Visualize the distribution of key numerical features.

In [None]:
# Histogram of each key numerical feature, one panel per feature
numerical_features = ['age', 'tenure_months', 'monthly_charges', 'total_charges']

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.ravel()  # flatten the 2x2 grid so panels pair up with features

hist_style = dict(bins=30, color='steelblue', edgecolor='black', alpha=0.7)

for ax, col in zip(axes, numerical_features):
    ax.hist(data[col], **hist_style)
    ax.set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## 6. Correlation Analysis

Examine pairwise correlations among the numerical features and between each feature and the churn target.

In [None]:
# Pairwise correlations of the numeric columns, target included
corr_columns = ['age', 'tenure_months', 'monthly_charges', 'total_charges', 'churn']
numerical_data = data[corr_columns]
correlation_matrix = numerical_data.corr()

# Annotated heatmap; the diverging palette is centred on zero correlation
heatmap_options = dict(
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={'shrink': 0.8},
)

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## 7. Feature Relationships with Target

Analyze how features relate to churn.

In [None]:
# Box plot of every numerical feature, split by churn status
# (uses `numerical_features` from the feature-distribution cell)
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.ravel()

for ax, col in zip(axes, numerical_features):
    data.boxplot(column=col, by='churn', ax=ax)
    ax.set_title(f'{col} by Churn Status')
    ax.set_xlabel('Churn (0=No, 1=Yes)')
    ax.set_ylabel(col)

# Clear the figure-level title before laying out the panels
plt.suptitle('')  # Remove default title
plt.tight_layout()
plt.show()

## 8. Categorical Features Analysis

Examine churn rates across categorical features.

In [None]:
# Bar chart of the churn rate (%) within each level of selected categoricals
categorical_features = ['contract_type', 'payment_method', 'internet_service']

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for ax, col in zip(axes, categorical_features):
    # churn is coded 0/1, so the group mean is the churn fraction
    churn_by_cat = data.groupby(col)['churn'].mean().mul(100)
    churn_by_cat.plot(kind='bar', ax=ax, color='coral', edgecolor='black')
    ax.set_title(f'Churn Rate by {col}', fontsize=12, fontweight='bold')
    ax.set_ylabel('Churn Rate (%)')
    ax.set_xlabel(col)
    ax.tick_params(axis='x', rotation=45)
    ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## 9. Key Insights Summary

Document the main findings from EDA.

In [None]:
# Key insights report.
# Every figure in the report is computed from `data` in this cell, so the
# summary cannot state a finding (e.g. "no duplicates", "negative
# correlation") that the data does not actually support.

# Minority-class share (in %) below which the target is reported as imbalanced
IMBALANCE_THRESHOLD_PCT = 40

insights = """
KEY INSIGHTS FROM EDA:
========================

1. Dataset Overview:
   - Total samples: {n_rows}
   - Features: {n_cols}
   - Target variable: Churn (binary)

2. Data Quality:
   - Missing values: {missing_cells}
   - Duplicate records: {duplicate_rows}
   - Column dtypes: {dtype_summary}

3. Target Variable:
   - Overall churn rate: {churn_rate:.2f}%
   - {balance_note}

4. Feature Observations:
   - Tenure vs. churn correlation: {tenure_corr:+.3f}
   - Monthly charges vs. churn correlation: {monthly_corr:+.3f}
   - Churn rate by contract type ranges from {contract_min:.2f}% to {contract_max:.2f}%
   - Churn rate by payment method ranges from {payment_min:.2f}% to {payment_max:.2f}%

5. Next Steps:
   - Feature engineering (interaction features, temporal features)
   - Handle class imbalance
   - Feature selection and importance analysis
   - Model development and evaluation
"""

# Data quality figures
missing_cells = int(data.isnull().sum().sum())
duplicate_rows = int(data.duplicated().sum())
dtype_summary = ', '.join(
    f'{dtype} x{count}'
    for dtype, count in data.dtypes.astype(str).value_counts().items()
)

# Target balance: churn is coded 0/1, so its mean is the churn fraction
churn_rate = data['churn'].mean() * 100
minority_share = min(churn_rate, 100 - churn_rate)
if minority_share < IMBALANCE_THRESHOLD_PCT:
    balance_note = 'Class imbalance present (consider resampling techniques)'
else:
    balance_note = 'Classes are roughly balanced'

# Feature/target relationships
tenure_corr = data['tenure_months'].corr(data['churn'])
monthly_corr = data['monthly_charges'].corr(data['churn'])
contract_rates = data.groupby('contract_type')['churn'].mean() * 100
payment_rates = data.groupby('payment_method')['churn'].mean() * 100

print(insights.format(
    n_rows=len(data),
    n_cols=len(data.columns),
    missing_cells=missing_cells,
    duplicate_rows=duplicate_rows,
    dtype_summary=dtype_summary,
    churn_rate=churn_rate,
    balance_note=balance_note,
    tenure_corr=tenure_corr,
    monthly_corr=monthly_corr,
    contract_min=contract_rates.min(),
    contract_max=contract_rates.max(),
    payment_min=payment_rates.min(),
    payment_max=payment_rates.max(),
))

## 10. Save Processed Data

Save the explored data to the interim directory for further processing.

In [None]:
# Destination inside the interim data folder; create the folder (and any
# missing parents) on first use
interim_dir = Path('../data/interim')
interim_dir.mkdir(parents=True, exist_ok=True)

# Write the frame as CSV without the positional index column
output_path = interim_dir.joinpath('customer_data_explored.csv')
data.to_csv(output_path, index=False)
print(f'Data saved to: {output_path}')