# Exploratory Data Analysis: Bank Customer Churn

**Objective:** Analyze the bank customer churn dataset to understand patterns, distributions, and relationships that will inform our prediction model.

**Dataset:** 10,000 bank customers with 14 columns: 3 identifiers (`RowNumber`, `CustomerId`, `Surname`), 10 candidate features, and the target

**Target Variable:** `Exited` (1 = churned, 0 = retained)


## 1. Setup and Data Loading


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Plot appearance: seaborn grid theme plus a wider default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# pandas rendering: never truncate columns, show up to 100 rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

print("Libraries imported successfully!")


In [None]:
# Load the churn dataset into `df`.
# Option 1: read the registered Azure ML data asset (intended for runs on an
#           Azure ML compute instance).
# Option 2: fall back to the local CSV when anything in Option 1 fails.
try:
    # Imported inside the try so a missing Azure SDK also triggers the fallback
    from azure.ai.ml import MLClient
    from azure.identity import DefaultAzureCredential

    # NOTE(review): subscription, resource group and workspace are hard-coded,
    # so this branch only works against this one workspace.
    ml_client = MLClient(
        DefaultAzureCredential(),
        subscription_id="a23fa87c-802c-4fdf-9e59-e3d7969bcf31",
        resource_group_name="rg-churn-ml-project",
        workspace_name="churn-ml-workspace"
    )

    # Fetch version "1" of the "churn-data" asset and read its path as CSV.
    # NOTE(review): presumably data_asset.path is a location pandas can open
    # directly in this environment - confirm.
    data_asset = ml_client.data.get("churn-data", version="1")
    df = pd.read_csv(data_asset.path)
    print("✓ Data loaded from Azure ML data asset")
except Exception as e:
    # Catches any Exception raised above (import, client construction, asset
    # lookup or CSV read), reports it, then loads the local development copy.
    print(f"Could not load from Azure ML: {e}")
    print("Loading from local file...")
    df = pd.read_csv('../data/churn.csv')
    print("✓ Data loaded from local file")

# Runs for either source: report (rows, columns) of the loaded frame
print(f"Dataset shape: {df.shape}")


## 2. Initial Data Inspection


In [None]:
# Preview the first five records
display(df.head(5))

# Column names, dtypes and non-null counts
df.info()

# Descriptive statistics (pandas' default column selection)
display(df.describe())


In [None]:
# Data-quality check: per-column null counts and fully duplicated rows
print("Missing Values:")
missing = df.isna().sum()
if missing.any():
    # Show only the columns that actually contain nulls
    display(missing[missing > 0])
else:
    print("✓ No missing values found!")

duplicates = df.duplicated().sum()
print(f"\nDuplicate rows: {duplicates}")
if not duplicates:
    print("✓ No duplicate rows found!")


## 3. Target Variable Analysis (Churn Rate)


In [None]:
# Churn rate analysis: class counts, overall churn rate, and two views of the
# class split (bar chart and pie chart).
#
# value_counts() orders its result by frequency, not by label. The colours and
# the 'Retained'/'Churned' labels used below are positional, so the index is
# pinned to [0, 1] (0 = retained, 1 = churned). Without this the labels would
# be silently swapped if churned customers were ever the majority, and would
# not line up at all if one class were absent.
churn_counts = df['Exited'].value_counts().reindex([0, 1], fill_value=0)
churn_rate = df['Exited'].mean()

print("Churn Distribution:")
print(churn_counts)
print(f"\nChurn Rate: {churn_rate:.2%}")
print(f"Retained: {(1-churn_rate):.2%}")

# Visualization
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Bar plot (position 0 = retained, position 1 = churned)
churn_counts.plot(kind='bar', ax=axes[0], color=['green', 'red'])
axes[0].set_title('Customer Distribution', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Exited (0=No, 1=Yes)')
axes[0].set_ylabel('Count')
axes[0].set_xticklabels(['Retained', 'Churned'], rotation=0)

# Pie chart (same positional order as the bar plot)
axes[1].pie(churn_counts, labels=['Retained', 'Churned'], autopct='%1.1f%%',
            colors=['green', 'red'], startangle=90)
axes[1].set_title('Churn Rate', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

print(f"\n⚠️ Class Imbalance: {churn_rate:.2%} churned vs {(1-churn_rate):.2%} retained")


## 4. Feature Analysis


In [None]:
# Geography: customer counts per country next to per-country churn rate
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))

# Left panel: how many customers each country contributes
df['Geography'].value_counts().plot.bar(ax=axes[0], color='skyblue')
axes[0].set_title('Customer Distribution by Geography', fontsize=12, fontweight='bold')
axes[0].set(xlabel='Country', ylabel='Count')

# Right panel: mean of Exited per country (ascending), with the overall
# churn rate drawn as a dashed reference line
geo_churn = df.groupby('Geography')['Exited'].mean().sort_values()
geo_churn.plot.bar(ax=axes[1], color='coral')
axes[1].set_title('Churn Rate by Geography', fontsize=12, fontweight='bold')
axes[1].set(xlabel='Country', ylabel='Churn Rate')
axes[1].axhline(y=churn_rate, color='red', linestyle='--', label='Overall Churn Rate')
axes[1].legend()

plt.tight_layout()
plt.show()

print("Churn Rate by Geography:")
display(geo_churn)


In [None]:
# Gender and age versus churn
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))

# Left panel: churn rate per gender against the overall rate
gender_churn = df.groupby('Gender')['Exited'].mean()
gender_churn.plot.bar(ax=axes[0], color=['salmon', 'lightblue'])
axes[0].set_title('Churn Rate by Gender', fontsize=12, fontweight='bold')
axes[0].set(xlabel='Gender', ylabel='Churn Rate')
axes[0].axhline(y=churn_rate, color='red', linestyle='--', label='Overall')
axes[0].legend()
axes[0].set_xticklabels(axes[0].get_xticklabels(), rotation=0)

# Right panel: overlaid age histograms, one per churn status
age_retained = df.loc[df['Exited'] == 0, 'Age']
age_churned = df.loc[df['Exited'] == 1, 'Age']
age_retained.hist(bins=30, alpha=0.5, label='Retained', color='green', ax=axes[1])
age_churned.hist(bins=30, alpha=0.5, label='Churned', color='red', ax=axes[1])
axes[1].set_title('Age Distribution by Churn Status', fontweight='bold')
axes[1].set(xlabel='Age', ylabel='Frequency')
axes[1].legend()

plt.tight_layout()
plt.show()

# Numeric summary of what the two panels show
print("\nKey insights:")
print(f"- Female churn rate: {gender_churn['Female']:.2%}")
print(f"- Male churn rate: {gender_churn['Male']:.2%}")
print(f"- Mean age (churned): {age_churned.mean():.1f}")
print(f"- Mean age (retained): {age_retained.mean():.1f}")


In [None]:
# Product count and activity status versus churn
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))

# Left panel: churn rate for each value of NumOfProducts
product_churn = df.groupby('NumOfProducts')['Exited'].mean()
product_churn.plot.bar(ax=axes[0], color='purple')
axes[0].set_title('Churn Rate by Number of Products', fontweight='bold')
axes[0].set(xlabel='Number of Products', ylabel='Churn Rate')
axes[0].axhline(y=churn_rate, color='red', linestyle='--', label='Overall')
axes[0].legend()

# Right panel: churn rate for inactive (0) and active (1) members
active_churn = df.groupby('IsActiveMember')['Exited'].mean()
active_churn.plot.bar(ax=axes[1], color=['red', 'green'])
axes[1].set_title('Churn Rate by Active Membership', fontweight='bold')
axes[1].set(xlabel='Is Active Member', ylabel='Churn Rate')
axes[1].set_xticklabels(['No', 'Yes'], rotation=0)
axes[1].axhline(y=churn_rate, color='blue', linestyle='--', label='Overall')
axes[1].legend()

plt.tight_layout()
plt.show()

# Numeric summary of both panels
print("\nKey insights:")
print("Number of Products Churn Rate:")
display(product_churn)
print(f"\nActive members: {active_churn[1]:.2%} churn")
print(f"Inactive members: {active_churn[0]:.2%} churn")


## 5. Correlation Analysis


In [None]:
# Pairwise correlations between the numeric/binary columns and the target
numerical_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
binary_features = ['HasCrCard', 'IsActiveMember', 'Exited']
correlation_features = [*numerical_features, *binary_features]

corr_matrix = df.loc[:, correlation_features].corr()

# Annotated heatmap, colour scale centred on zero
plt.figure(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=True, fmt='.2f',
    cmap='coolwarm', center=0,
    square=True, linewidths=1,
    cbar_kws={"shrink": 0.8},
)
plt.title('Correlation Matrix', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# The Exited column of the matrix, strongest positive correlation first
target_corr = corr_matrix['Exited'].sort_values(ascending=False)
print("\nCorrelation with Churn (Exited):")
display(target_corr)


## 6. Key Findings Summary


In [None]:
# Key findings summary.
# Prints the headline results of the analysis. Numbers come from `df` and from
# values computed in earlier cells (churn_rate, target_corr, geo_churn,
# gender_churn, active_churn), so those cells must have been run first.

# Data-quality figures are recomputed here so the summary reports what the
# data actually contains rather than asserting a fixed result.
total_missing = int(df.isnull().sum().sum())
total_duplicates = int(df.duplicated().sum())
missing_status = "✓" if total_missing == 0 else f"✗ ({total_missing:,} missing cells)"
duplicate_status = "✓" if total_duplicates == 0 else f"✗ ({total_duplicates:,} duplicate rows)"

print("=" * 80)
print("KEY FINDINGS FROM EXPLORATORY DATA ANALYSIS")
print("=" * 80)

print("\n1. OVERALL CHURN RATE:")
print(f"   - Churn rate: {churn_rate:.2%}")
# Class split is taken from churn_rate instead of a fixed "20% / 80%"
print(f"   - Class imbalance present ({churn_rate:.0%} churned, {1 - churn_rate:.0%} retained)")

print("\n2. DATA QUALITY:")
print(f"   - Total records: {len(df):,}")
print(f"   - No missing values: {missing_status}")
print(f"   - No duplicates: {duplicate_status}")

print("\n3. STRONGEST PREDICTORS:")
# Rank by absolute correlation with the target; the target's own row is dropped
top_corr = target_corr.drop('Exited').abs().sort_values(ascending=False).head(5)
for feature, corr in top_corr.items():
    print(f"   - {feature}: {corr:.3f}")

print("\n4. GEOGRAPHIC INSIGHTS:")
for country, rate in geo_churn.items():
    print(f"   - {country}: {rate:.2%} churn rate")

print("\n5. DEMOGRAPHIC INSIGHTS:")
print(f"   - Female customers: {gender_churn['Female']:.2%} churn rate")
print(f"   - Male customers: {gender_churn['Male']:.2%} churn rate")
# NOTE(review): fixed text, not recomputed here - re-check against the age
# histogram if the data changes
print("   - Older customers show higher churn rates")

print("\n6. BEHAVIORAL INSIGHTS:")
print(f"   - Active members: {active_churn[1]:.2%} churn")
print(f"   - Inactive members: {active_churn[0]:.2%} churn")
# NOTE(review): fixed text, not recomputed here - re-check against the
# per-product churn rates if the data changes
print("   - Customers with 3-4 products have highest churn")

print("\n7. RECOMMENDATIONS FOR MODELING:")
print("   - Handle class imbalance (use SMOTE or class weights)")
print("   - Remove uninformative features: RowNumber, CustomerId, Surname")
print("   - Consider feature engineering: Age groups, Balance categories")
print("   - Scale numerical features")
print("   - Encode categorical variables (Geography, Gender)")

print("\n" + "=" * 80)


## 7. Next Steps

Based on this EDA, the following steps are recommended:

1. **Feature Engineering:**
   - Create age groups
   - Create balance categories
   - Create interaction features (e.g., Age × NumOfProducts)

2. **Data Preparation:**
   - Remove RowNumber, CustomerId, Surname
   - Encode categorical variables (Geography, Gender)
   - Scale numerical features
   - Handle class imbalance with SMOTE or class weights

3. **Model Selection:**
   - Start with Logistic Regression (baseline)
   - Try tree-based models (Random Forest, XGBoost)
   - Ensemble methods

4. **Evaluation:**
   - Use stratified cross-validation
   - Focus on recall for churned customers
   - Use F1-score and AUC-ROC for overall performance
