# Customer Churn - Exploratory Data Analysis

This notebook performs comprehensive exploratory data analysis on the Telco Customer Churn dataset.

## Objectives
1. Understand data structure and quality
2. Analyze churn patterns across different customer segments
3. Identify key features correlated with churn
4. Generate insights for feature engineering
5. Create publication-quality visualizations

In [None]:
# Import libraries
import sys
import warnings
from pathlib import Path

sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

warnings.filterwarnings('ignore')

# Import project modules
import config
from download_data import download_telco_churn_data

# Plot styling: seaborn style from the project config plus matplotlib defaults
sns.set_style(config.PLOT_STYLE)
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

# Make sure the figure output directory exists before any plot is saved
config.FIGURES_DIR.mkdir(exist_ok=True, parents=True)

print("✓ Libraries imported successfully")

## 1. Data Loading and Initial Inspection

In [None]:
# Fetch the dataset through the project helper.
# A failed download is tolerated only when the raw data file read by the
# next cell already exists on disk; otherwise the error is re-raised so the
# notebook stops here instead of failing later with a less helpful message.
try:
    data_path = download_telco_churn_data()
    print(f"✓ Data available at: {data_path}")
except Exception as e:
    print(f"Error downloading data: {e}")
    if not Path(config.RAW_DATA_FILE).exists():
        raise

In [None]:
# Read the raw CSV into the working DataFrame
df = pd.read_csv(config.RAW_DATA_FILE)

# Rows are customers, columns are features
n_rows, n_cols = df.shape
print(f"Dataset Shape: {df.shape}")
print(f"Number of Customers: {n_rows:,}")
print(f"Number of Features: {n_cols}")

In [None]:
# Preview the first 10 rows; as the cell's last expression the frame is
# rendered as the cell output
df.head(10)

In [None]:
# Print the DataFrame summary (column dtypes and non-null counts)
df.info()

In [None]:
# Summary statistics for every column (include='all' covers non-numeric
# columns as well), transposed so each original column becomes one row
df.describe(include='all').T

## 2. Data Quality Assessment

In [None]:
# Tabulate null counts and percentages per column, largest counts first
null_counts = df.isnull().sum()
null_share = (null_counts / len(df) * 100).round(2)

missing_data = pd.DataFrame({
    'Column': df.columns,
    'Missing_Count': null_counts,
    'Missing_Percentage': null_share,
}).sort_values(by='Missing_Count', ascending=False)

# Only columns that actually contain nulls are printed
has_missing = missing_data['Missing_Count'] > 0

print("Missing Values Summary:")
print(missing_data[has_missing])

if missing_data['Missing_Count'].sum() == 0:
    print("\n✓ No missing values found!")

In [None]:
# Rows that repeat an earlier row in every column
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

# Rows whose customerID repeats an earlier customerID
duplicate_ids = df.duplicated(subset=['customerID']).sum()
print(f"Number of duplicate customer IDs: {duplicate_ids}")

In [None]:
# Parse TotalCharges as numeric; values that cannot be parsed become NaN
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Rows with zero tenure, and how many of them have NaN TotalCharges
zero_tenure = df.loc[df['tenure'].eq(0)]
n_missing_total = zero_tenure['TotalCharges'].isna().sum()

print(f"Customers with zero tenure: {len(zero_tenure)}")
print(f"These customers have missing TotalCharges: {n_missing_total}")

## 3. Target Variable Analysis

### Overall Churn Rate

In [None]:
# Count customers per churn label and derive the percentage labelled 'Yes'
churn_counts = df['Churn'].value_counts()
churn_rate = churn_counts['Yes'] / len(df) * 100
retention_rate = 100 - churn_rate

print("\nChurn Distribution:")
print(churn_counts)
print(f"\nOverall Churn Rate: {churn_rate:.2f}%")
print(f"Customer Retention Rate: {retention_rate:.2f}%")

In [None]:
# Visualization 1: Churn Distribution
# Both panels share one explicit label order. Without it the bar order
# follows first appearance in the data and the pie order follows
# value_counts() frequency ranking, so the hard-coded colours and the
# 'Retained'/'Churned' labels could end up attached to the wrong class.
churn_order = ['No', 'Yes']
colors = [config.NO_CHURN_COLOR, config.CHURN_COLOR]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Count plot
sns.countplot(data=df, x='Churn', order=churn_order, palette=colors, ax=ax1)
ax1.set_title('Churn Distribution (Count)', fontsize=14, fontweight='bold')
ax1.set_xlabel('Churn Status', fontsize=12)
ax1.set_ylabel('Number of Customers', fontsize=12)

# Add value labels
for container in ax1.containers:
    ax1.bar_label(container, fmt='%d')

# Pie chart: slices are put in churn_order so they line up with the labels
pie_counts = churn_counts.reindex(churn_order, fill_value=0)
explode = (0, 0.1)  # pull the 'Churned' slice out slightly
ax2.pie(pie_counts, labels=['Retained', 'Churned'], autopct='%1.1f%%',
        startangle=90, colors=colors, explode=explode, shadow=True)
ax2.set_title('Churn Rate Distribution', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig(config.FIGURES_DIR / 'churn_distribution.png', dpi=config.FIGURE_DPI, bbox_inches='tight')
plt.show()

print("✓ Saved: churn_distribution.png")

## 4. Demographic Analysis

In [None]:
# Visualization 2: Churn by Demographics
demographic_features = ['gender', 'SeniorCitizen', 'Partner', 'Dependents']
churn_palette = [config.NO_CHURN_COLOR, config.CHURN_COLOR]

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# axes.flat walks the 2x2 grid row by row, one panel per feature
for ax, feature in zip(axes.flat, demographic_features):
    # Row-normalised crosstab: churn split (in %) within each category
    ct = pd.crosstab(df[feature], df['Churn'], normalize='index').mul(100)

    ct.plot(kind='bar', ax=ax, color=churn_palette, width=0.7)
    ax.set_title(f'Churn Rate by {feature}', fontsize=12, fontweight='bold')
    ax.set_xlabel(feature, fontsize=11)
    ax.set_ylabel('Percentage (%)', fontsize=11)
    ax.legend(['No Churn', 'Churn'], loc='upper right')
    ax.tick_params(axis='x', rotation=45)

    # Annotate every bar with its percentage
    for bars in ax.containers:
        ax.bar_label(bars, fmt='%.1f%%')

plt.tight_layout()
plt.savefig(config.FIGURES_DIR / 'churn_by_demographics.png', dpi=config.FIGURE_DPI, bbox_inches='tight')
plt.show()

print("✓ Saved: churn_by_demographics.png")

## 5. Service Usage Analysis

In [None]:
# Visualization 3: Churn by Services
service_features = ['PhoneService', 'MultipleLines', 'InternetService',
                    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                    'TechSupport', 'StreamingTV', 'StreamingMovies']

# Churn rate (%) per category of each service feature, one dict per feature
service_churn_rates = [
    df.groupby(feature)['Churn'].apply(
        lambda x: (x == 'Yes').sum() / len(x) * 100
    ).to_dict()
    for feature in service_features
]

# Create interactive plotly figure: a 3x3 grid, one subplot per service
fig = make_subplots(
    rows=3, cols=3,
    subplot_titles=service_features,
    vertical_spacing=0.12,
    horizontal_spacing=0.1
)

for idx, feature in enumerate(service_features):
    row = idx // 3 + 1
    col = idx % 3 + 1

    # Row-normalised crosstab: churn split (in %) within each category
    ct = pd.crosstab(df[feature], df['Churn'], normalize='index') * 100

    # Every subplot adds a 'No Churn' and a 'Churn' trace. Only the first
    # pair is shown in the legend, otherwise each name would appear nine
    # times; legendgroup keeps legend clicks toggling all nine subplots.
    show_in_legend = idx == 0

    fig.add_trace(
        go.Bar(name='No Churn', x=ct.index, y=ct['No'],
               marker_color=config.NO_CHURN_COLOR,
               legendgroup='No Churn', showlegend=show_in_legend),
        row=row, col=col
    )
    fig.add_trace(
        go.Bar(name='Churn', x=ct.index, y=ct['Yes'],
               marker_color=config.CHURN_COLOR,
               legendgroup='Churn', showlegend=show_in_legend),
        row=row, col=col
    )

fig.update_layout(
    height=1000,
    title_text="Churn Rate by Service Features",
    showlegend=True,
    template=config.PLOTLY_TEMPLATE
)

fig.update_xaxes(tickangle=45)
fig.update_yaxes(title_text="Percentage (%)")

fig.write_html(config.FIGURES_DIR / 'churn_by_services.html')
fig.show()

print("✓ Saved: churn_by_services.html")

## 6. Contract and Payment Analysis

In [None]:
# Visualization 4: Contract and Payment Method Analysis
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
churn_palette = [config.NO_CHURN_COLOR, config.CHURN_COLOR]

# Row-normalised crosstabs: churn split (in %) within each category
ct_contract = pd.crosstab(df['Contract'], df['Churn'], normalize='index') * 100
ct_payment = pd.crosstab(df['PaymentMethod'], df['Churn'], normalize='index') * 100
ct_billing = pd.crosstab(df['PaperlessBilling'], df['Churn'], normalize='index') * 100

# The first three panels share the same layout, so they are drawn in one loop
panels = [
    (axes[0, 0], ct_contract, 'Churn Rate by Contract Type'),
    (axes[0, 1], ct_payment, 'Churn Rate by Payment Method'),
    (axes[1, 0], ct_billing, 'Churn Rate by Paperless Billing'),
]
for panel_ax, table, panel_title in panels:
    table.plot(kind='bar', ax=panel_ax, color=churn_palette)
    panel_ax.set_title(panel_title, fontsize=12, fontweight='bold')
    panel_ax.set_ylabel('Percentage (%)', fontsize=11)
    panel_ax.legend(['No Churn', 'Churn'])
    panel_ax.tick_params(axis='x', rotation=45)

# Combined: churn rate (%) for every Contract x PaymentMethod pair,
# with payment methods unstacked into columns
combined_ax = axes[1, 1]
contract_payment_churn = df.groupby(['Contract', 'PaymentMethod'])['Churn'].apply(
    lambda x: (x == 'Yes').sum() / len(x) * 100
).unstack()
contract_payment_churn.plot(kind='bar', ax=combined_ax, width=0.8)
combined_ax.set_title('Churn Rate by Contract and Payment Method', fontsize=12, fontweight='bold')
combined_ax.set_ylabel('Churn Rate (%)', fontsize=11)
combined_ax.legend(title='Payment Method', bbox_to_anchor=(1.05, 1), loc='upper left')
combined_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig(config.FIGURES_DIR / 'churn_by_contract_payment.png', dpi=config.FIGURE_DPI, bbox_inches='tight')
plt.show()

print("✓ Saved: churn_by_contract_payment.png")

## 7. Numerical Features Analysis

In [None]:
# Visualization 5: Distribution of Numerical Features
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges']

fig, axes = plt.subplots(2, 3, figsize=(18, 10))

# Row masks for the two churn classes, reused for every feature
retained_mask = df['Churn'] == 'No'
churned_mask = df['Churn'] == 'Yes'

for idx, feature in enumerate(numerical_features):
    # Top row: overlaid histograms of the feature per churn class
    ax1 = axes[0, idx]
    df.loc[retained_mask, feature].hist(
        bins=30, alpha=0.7, label='No Churn',
        color=config.NO_CHURN_COLOR, ax=ax1,
    )
    df.loc[churned_mask, feature].hist(
        bins=30, alpha=0.7, label='Churn',
        color=config.CHURN_COLOR, ax=ax1,
    )
    ax1.set_title(f'{feature} Distribution by Churn', fontsize=12, fontweight='bold')
    ax1.set_xlabel(feature, fontsize=11)
    ax1.set_ylabel('Frequency', fontsize=11)
    ax1.legend()

    # Bottom row: box plot of the feature grouped by churn status
    ax2 = axes[1, idx]
    df.boxplot(
        column=feature,
        by='Churn',
        ax=ax2,
        patch_artist=True,
        boxprops={'facecolor': config.PRIMARY_COLOR},
    )
    ax2.set_title(f'{feature} by Churn Status', fontsize=12, fontweight='bold')
    ax2.set_xlabel('Churn', fontsize=11)
    ax2.set_ylabel(feature, fontsize=11)
    # Relabel the two box positions explicitly
    plt.sca(ax2)
    plt.xticks([1, 2], ['No', 'Yes'])

plt.suptitle('Numerical Features Analysis', fontsize=14, fontweight='bold', y=1.00)
plt.tight_layout()
plt.savefig(config.FIGURES_DIR / 'numerical_features_analysis.png', dpi=config.FIGURE_DPI, bbox_inches='tight')
plt.show()

print("✓ Saved: numerical_features_analysis.png")

In [None]:
# Descriptive statistics of each numerical feature, split by churn status
separator = "\n" + "=" * 60 + "\n"
by_churn = df.groupby('Churn')

print("\nNumerical Features Summary by Churn Status:\n")
for feature in numerical_features:
    print(f"{feature}:")
    print(by_churn[feature].describe().round(2))
    print(separator)

## 8. Correlation Analysis

In [None]:
# Build an all-numeric copy of the data for the correlation matrix
from sklearn.preprocessing import LabelEncoder

df_corr = df.copy()
le = LabelEncoder()

# Integer-encode every object-dtype column except the customerID identifier.
# NOTE(review): integer codes give nominal categories an arbitrary order, so
# correlations involving those columns should be read with care.
categorical_cols = df_corr.select_dtypes(include=['object']).columns
for col in categorical_cols.drop('customerID', errors='ignore'):
    df_corr[col] = le.fit_transform(df_corr[col].astype(str))

# The identifier column carries no signal for correlation; remove it if present
df_corr = df_corr.drop(columns=['customerID'], errors='ignore')

In [None]:
# Visualization 6: Correlation Heatmap
# Pairwise correlations between all encoded columns
corr_matrix = df_corr.corr()

# Boolean mask that hides the upper triangle (diagonal included)
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))

plt.figure(figsize=(14, 12))

heatmap_ax = sns.heatmap(
    corr_matrix,
    mask=mask,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)
heatmap_ax.set_title('Feature Correlation Heatmap', fontsize=14, fontweight='bold', pad=20)

plt.tight_layout()
plt.savefig(config.FIGURES_DIR / 'correlation_heatmap.png', dpi=config.FIGURE_DPI, bbox_inches='tight')
plt.show()

print("✓ Saved: correlation_heatmap.png")

In [None]:
# Features most correlated with Churn
churn_correlations = corr_matrix['Churn'].sort_values(ascending=False)

# Rank by absolute correlation so strong negative relationships are listed
# as well; a plain descending sort would show only the most positive values.
# Churn's correlation with itself is dropped by label, and the printed
# values keep their sign.
feature_correlations = churn_correlations.drop('Churn')
strongest_features = feature_correlations.abs().sort_values(ascending=False).index[:10]

print("\nFeatures Most Correlated with Churn:\n")
print(feature_correlations.loc[strongest_features])

## 9. Tenure Analysis

In [None]:
# Visualization 7: Tenure vs Churn
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=('Tenure Distribution by Churn', 'Churn Rate by Tenure Group'),
    specs=[[{"type": "histogram"}, {"type": "bar"}]]
)

# Histogram of tenure for each churn class (left panel)
fig.add_trace(
    go.Histogram(x=df[df['Churn'] == 'No']['tenure'], name='No Churn',
                 marker_color=config.NO_CHURN_COLOR, opacity=0.7),
    row=1, col=1
)
fig.add_trace(
    go.Histogram(x=df[df['Churn'] == 'Yes']['tenure'], name='Churn',
                 marker_color=config.CHURN_COLOR, opacity=0.7),
    row=1, col=1
)

# Create tenure groups.
# include_lowest=True keeps customers with tenure == 0 in the first bin;
# with the default right-closed intervals they would fall outside every
# bin and be dropped from the churn-rate chart as NaN.
df['tenure_group'] = pd.cut(
    df['tenure'],
    bins=[0, 12, 24, 36, 48, 60, 72],
    labels=['0-1yr', '1-2yr', '2-3yr', '3-4yr', '4-5yr', '5-6yr'],
    include_lowest=True,
)

# Churn rate (%) per tenure group. observed=False is passed explicitly so
# every labelled group stays on the axis even if it has no customers.
tenure_churn = (
    df['Churn'].eq('Yes')
    .groupby(df['tenure_group'], observed=False)
    .mean()
    * 100
)

fig.add_trace(
    go.Bar(x=tenure_churn.index, y=tenure_churn.values,
           marker_color=config.CHURN_COLOR, name='Churn Rate'),
    row=1, col=2
)

fig.update_layout(
    height=500,
    title_text="Tenure Analysis",
    showlegend=True,
    barmode='overlay',  # draw the semi-transparent histograms on top of each other
    template=config.PLOTLY_TEMPLATE
)

fig.update_xaxes(title_text="Tenure (months)", row=1, col=1)
fig.update_xaxes(title_text="Tenure Group", row=1, col=2)
fig.update_yaxes(title_text="Count", row=1, col=1)
fig.update_yaxes(title_text="Churn Rate (%)", row=1, col=2)

fig.write_html(config.FIGURES_DIR / 'tenure_analysis.html')
fig.show()

print("✓ Saved: tenure_analysis.html")

## 10. Key Findings Summary

In [None]:
# Share (%) of month-to-month customers whose Churn label is 'Yes'
month_to_month_labels = df.loc[df['Contract'] == 'Month-to-month', 'Churn']
month_to_month_churn = month_to_month_labels.eq('Yes').mean() * 100

# Narrative summary; only the two formatted figures are computed from the data
findings = """
KEY FINDINGS FROM EXPLORATORY DATA ANALYSIS
==========================================

1. OVERALL CHURN RATE
   - Overall churn rate: {churn_rate:.2f}%
   - Dataset is imbalanced, requiring SMOTE or class weights

2. HIGH-RISK CUSTOMER SEGMENTS
   - Month-to-month contracts have significantly higher churn ({month_to_month_churn:.1f}%)
   - Electronic check payment method shows elevated churn risk
   - Customers without tech support are more likely to churn
   - Fiber optic internet users have higher churn rates

3. PROTECTIVE FACTORS
   - Long-term contracts (1-2 years) show much lower churn
   - Customers with longer tenure are more loyal
   - Having multiple services reduces churn likelihood

4. NUMERICAL INSIGHTS
   - Higher monthly charges correlate with increased churn
   - New customers (low tenure) are at highest risk
   - Total charges reflect both tenure and pricing effects

5. FEATURE ENGINEERING OPPORTUNITIES
   - Create tenure bins to capture non-linear relationships
   - Combine service features into "total services" count
   - Calculate charges per tenure month as efficiency metric
   - Interaction between contract type and payment method

6. BUSINESS RECOMMENDATIONS
   - Target month-to-month customers for contract upgrade incentives
   - Implement early engagement program for customers in first year
   - Promote value-added services (tech support, security) to at-risk segments
   - Review pricing strategy for high monthly charge customers
   - Encourage automatic payment methods over electronic checks
""".format(
    churn_rate=churn_rate,
    month_to_month_churn=month_to_month_churn
)

print(findings)

# Save findings to file.
# The reports directory is created first: only the figures directory is
# created earlier in the notebook, so writing here could otherwise fail.
config.REPORTS_DIR.mkdir(parents=True, exist_ok=True)
with open(config.REPORTS_DIR / 'eda_findings.txt', 'w', encoding='utf-8') as f:
    f.write(findings)

print("\n✓ Saved findings to: eda_findings.txt")

In [None]:
# Closing banner and a numbered list of the artefacts written by this notebook
banner = "=" * 60
generated_figures = [
    "churn_distribution.png",
    "churn_by_demographics.png",
    "churn_by_services.html",
    "churn_by_contract_payment.png",
    "numerical_features_analysis.png",
    "correlation_heatmap.png",
    "tenure_analysis.html",
]

print("\n" + banner)
print("EDA COMPLETE!")
print(banner)
print("\nGenerated Visualizations:")
for number, filename in enumerate(generated_figures, start=1):
    print(f"  {number}. {filename}")
print(f"\nAll visualizations saved to: {config.FIGURES_DIR}")
print(f"Findings saved to: {config.REPORTS_DIR / 'eda_findings.txt'}")