# Customer Churn Prediction - Part 1: Data Exploration & Preprocessing

## Overview
This notebook covers:
1. Data Loading and Initial Exploration
2. Data Quality Assessment
3. Exploratory Data Analysis (EDA)
4. Data Preprocessing
5. Feature Engineering


## Step 1: Import Libraries


In [None]:
# Data manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Set style for better-looking plots
# NOTE(review): the 'seaborn-v0_8-*' style names depend on the installed
# matplotlib version — TODO confirm against the environment this runs in.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Warnings
# NOTE(review): this filter silences every warning category for the rest of the
# kernel session, including pandas deprecation / FutureWarnings raised by later
# cells. Consider narrowing it to specific categories or modules.
import warnings
warnings.filterwarnings('ignore')

# Display settings: show every column, cap printed rows at 100
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

print("Libraries imported successfully!")


## Step 2: Load Dataset


In [None]:
# Read the raw customer table from CSV
# Note: Update the path to your dataset location
df = pd.read_csv('data/customer_data.csv')

# Unpack the shape once so the row/column counts can be reported separately
n_rows, n_cols = df.shape
print("Dataset loaded successfully!")
print("\nDataset shape: {}".format(df.shape))
print("Number of rows: {}".format(n_rows))
print("Number of columns: {}".format(n_cols))


## Step 3: Initial Data Exploration


In [None]:
# Preview the first 10 rows (shown as the cell's output)
df.head(10)


In [None]:
# Print the column summary (names, non-null counts, dtypes)
df.info()


In [None]:
# Summary statistics (with default arguments describe() covers the numeric columns)
df.describe()


In [None]:
# Per-column missing-value counts and their share of all rows
missing_values = df.isna().sum()
missing_percent = missing_values / len(df) * 100

missing_df = pd.DataFrame({
    'Column': missing_values.index,
    'Missing Count': missing_values.to_numpy(),
    'Missing Percentage': missing_percent.to_numpy(),
})

# Keep only the columns that actually have gaps, largest count first
has_gaps = missing_df['Missing Count'] > 0
missing_df = missing_df.loc[has_gaps].sort_values('Missing Count', ascending=False)

if missing_df.empty:
    print("No missing values found in the dataset!")
else:
    print("Missing Values Found:")
    print(missing_df)


In [None]:
# Flag rows that exactly repeat an earlier row, then count the flags
is_duplicate = df.duplicated()
duplicate_count = is_duplicate.sum()
print("Number of duplicate rows: {}".format(duplicate_count))


## Step 4: Target Variable Analysis (Churn)


In [None]:
# Churn distribution: absolute counts and percentage share of each label
churn_counts = df['Churn'].value_counts()
churn_percentages = df['Churn'].value_counts(normalize=True) * 100

print("Churn Distribution:")
print(churn_counts)
print("\nChurn Percentages:")
print(churn_percentages)

# Colours are looked up by label rather than by position: value_counts() orders
# labels by frequency, so a positional colour list would paint 'Yes' green
# whenever churners are the majority class. Unknown labels fall back to grey.
churn_palette = {'No': '#2ecc71', 'Yes': '#e74c3c'}
bar_colors = [churn_palette.get(label, '#95a5a6') for label in churn_counts.index]
pie_colors = [churn_palette.get(label, '#95a5a6') for label in churn_percentages.index]

# Visualize churn distribution
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Bar chart (rot=0 keeps the category labels horizontal)
churn_counts.plot(kind='bar', ax=axes[0], color=bar_colors, rot=0)
axes[0].set_title('Churn Distribution (Count)', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Churn', fontsize=12)
axes[0].set_ylabel('Count', fontsize=12)

# Pie chart
churn_percentages.plot(kind='pie', ax=axes[1], autopct='%1.1f%%', colors=pie_colors)
axes[1].set_title('Churn Distribution (Percentage)', fontsize=14, fontweight='bold')
axes[1].set_ylabel('')

plt.tight_layout()
plt.show()

# Guard the lookup: a sample without any 'Yes' rows has no such index entry
if 'Yes' in churn_percentages.index:
    print(f"\nChurn Rate: {churn_percentages['Yes']:.2f}%")
else:
    print("\nChurn Rate: 0.00% (no rows labelled 'Yes')")


## Step 5: Exploratory Data Analysis (EDA)

### 5.1 Categorical Features Analysis


In [None]:
# List of categorical columns
# NOTE(review): this list is not referenced by any other cell visible in this
# notebook; the analysis loop in this cell iterates over `key_categorical`.
categorical_cols = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 
                   'PhoneService', 'MultipleLines', 'InternetService',
                   'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                   'TechSupport', 'StreamingTV', 'StreamingMovies',
                   'Contract', 'PaperlessBilling', 'PaymentMethod']

# Analyze churn rate by each categorical feature
def analyze_categorical_churn(df, col):
    """Return the churn / no-churn split, in row percentages, per category of ``col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col`` and a ``'Churn'`` column labelled ``'Yes'`` / ``'No'``.
    col : str
        Name of the categorical column to group by.

    Returns
    -------
    pandas.DataFrame
        Indexed by the categories of ``col`` with the columns ``'No Churn %'``
        and ``'Churn %'``, sorted by ``'Churn %'`` in descending order.

    Raises
    ------
    ValueError
        If ``'Churn'`` contains a label other than ``'Yes'`` or ``'No'``.
    """
    churn_by_category = pd.crosstab(df[col], df['Churn'], normalize='index') * 100

    unexpected = [label for label in churn_by_category.columns
                  if label not in ('No', 'Yes')]
    if unexpected:
        raise ValueError(
            f"Unexpected 'Churn' labels {unexpected!r}; expected only 'Yes' and 'No'"
        )

    # Select the columns by label instead of relabelling by position, so a frame
    # that holds only one churn class still yields both output columns (the
    # absent class is 0%) rather than failing with a length mismatch.
    churn_by_category = churn_by_category.reindex(columns=['No', 'Yes'], fill_value=0.0)
    churn_by_category.columns = ['No Churn %', 'Churn %']
    return churn_by_category.sort_values('Churn %', ascending=False)

# Features whose churn split is printed and charted
key_categorical = ['Contract', 'PaymentMethod', 'InternetService', 'OnlineSecurity']
banner = '=' * 50

for col in key_categorical:
    print("\n" + banner)
    print("Churn Analysis for: " + col)
    print(banner)
    result = analyze_categorical_churn(df, col)
    print(result)

    # Horizontal bar chart of the churn share per category
    fig, ax = plt.subplots(figsize=(10, 6))
    result['Churn %'].plot(kind='barh', color='#e74c3c', ax=ax)
    ax.set_title(f'Churn Rate by {col}', fontsize=14, fontweight='bold')
    ax.set_xlabel('Churn Percentage (%)', fontsize=12)
    ax.set_ylabel(col, fontsize=12)
    fig.tight_layout()
    plt.show()


### 5.2 Numerical Features Analysis


In [None]:
# Continuous features examined in this section
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Inspect TotalCharges: it might be object dtype if the raw values contain spaces
total_charges_preview = df['TotalCharges']
print("TotalCharges data type: {}".format(total_charges_preview.dtype))
print("\nSample TotalCharges values:")
print(total_charges_preview.head(10))


In [None]:
# Coerce TotalCharges to a numeric dtype; entries that cannot be parsed become NaN
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Report how many entries are missing once the conversion is done
n_missing_total_charges = df['TotalCharges'].isna().sum()
print("Missing values in TotalCharges: {}".format(n_missing_total_charges))

# Summary statistics for the numerical features
print("\nNumerical Features Statistics:")
print(df.loc[:, numerical_cols].describe())


In [None]:
# One box plot per numerical feature, grouped by churn label
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for ax, col in zip(axes, numerical_cols):
    df.boxplot(column=col, by='Churn', ax=ax)
    ax.set_title(f'{col} by Churn', fontsize=12, fontweight='bold')
    ax.set_xlabel('Churn', fontsize=10)
    ax.set_ylabel(col, fontsize=10)

# Figure-level title shared by the three panels
fig.suptitle('Numerical Features Distribution by Churn', fontsize=14, fontweight='bold')
fig.tight_layout()
plt.show()


In [None]:
# Histogram (top row) and density (bottom row) of each numerical feature, per churn label
fig, axes = plt.subplots(2, 3, figsize=(18, 10))

# (churn label, legend text, colour), listed in drawing order
churn_series_styles = [('No', 'No Churn', '#2ecc71'), ('Yes', 'Churn', '#e74c3c')]

for idx, col in enumerate(numerical_cols):
    hist_ax = axes[0, idx]
    density_ax = axes[1, idx]

    # Overlaid histograms, one per churn label
    for churn_label, legend_text, shade in churn_series_styles:
        df.loc[df['Churn'] == churn_label, col].hist(
            ax=hist_ax, alpha=0.7, label=legend_text, color=shade
        )
    hist_ax.set_title(f'{col} Distribution', fontsize=12, fontweight='bold')
    hist_ax.set_xlabel(col, fontsize=10)
    hist_ax.set_ylabel('Frequency', fontsize=10)
    hist_ax.legend()

    # Density curves, one per churn label
    for churn_label, legend_text, shade in churn_series_styles:
        df.loc[df['Churn'] == churn_label, col].plot(
            kind='density', ax=density_ax, label=legend_text, color=shade
        )
    density_ax.set_title(f'{col} Density Plot', fontsize=12, fontweight='bold')
    density_ax.set_xlabel(col, fontsize=10)
    density_ax.set_ylabel('Density', fontsize=10)
    density_ax.legend()

fig.tight_layout()
plt.show()


### 5.3 Correlation Analysis


In [None]:
# Pairwise correlation between the numerical features (corr() defaults)
correlation_matrix = df.loc[:, numerical_cols].corr()

# Annotated heatmap with the colour scale centred on zero
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
    ax=ax,
)
ax.set_title('Correlation Matrix - Numerical Features', fontsize=14, fontweight='bold')
fig.tight_layout()
plt.show()

print("\nCorrelation Matrix:")
print(correlation_matrix)


## Step 6: Data Preprocessing

### 6.1 Handle Missing Values


In [None]:
# Check missing values again
print("Missing values before handling:")
print(df.isnull().sum())

# Fill missing values in TotalCharges
# Missing TotalCharges likely means new customers (tenure = 0)
# Assign the result back instead of calling fillna(inplace=True) on the
# df['TotalCharges'] selection: under pandas Copy-on-Write that selection is a
# temporary object, so an in-place fill on it never reaches `df`.
df['TotalCharges'] = df['TotalCharges'].fillna(0)

missing_after = df.isnull().sum()
print("\nMissing values after handling:")
print(missing_after)

# Only TotalCharges was filled here, so check the whole frame before claiming it is clean
remaining_missing = int(missing_after.sum())
if remaining_missing == 0:
    print("\nAll missing values handled!")
else:
    print(f"\nWarning: {remaining_missing} missing value(s) remain in other columns")


### 6.2 Handle Inconsistent Categorical Values


In [None]:
# Collapse the "service not available" labels into a plain 'No'
columns_to_fix = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                  'TechSupport', 'StreamingTV', 'StreamingMovies', 'MultipleLines']
no_service_labels = ['No internet service', 'No phone service']

# Apply the replacement to all affected columns in one frame-level call
df[columns_to_fix] = df[columns_to_fix].replace(no_service_labels, 'No')

print("Categorical values standardized!")


### 6.3 Feature Engineering


In [None]:
# Create new features

# Average charge per month of tenure; defined as 0 when tenure is not positive.
# Vectorised instead of a row-wise apply: the divisor is masked to NaN wherever
# tenure <= 0 (so no division by zero happens), and those positions are then
# set to 0. Rows with positive tenure keep TotalCharges / tenure unchanged.
has_tenure = df['tenure'] > 0
df['AvgChargePerMonth'] = (
    df['TotalCharges']
    .div(df['tenure'].where(has_tenure))
    .where(has_tenure, 0)
)

# Tenure groups
def categorize_tenure(tenure):
    """Map a tenure value to one of four bucket labels.

    Parameters
    ----------
    tenure : int or float
        Tenure value to bucket; may be NaN.

    Returns
    -------
    str or float
        ``'0-12'`` for values up to 12, ``'13-24'`` up to 24, ``'25-48'`` up to
        48 and ``'49+'`` above that. NaN is returned for a missing tenure.
    """
    # NaN compares False against every bound below, so without this guard a
    # missing tenure would fall through to the last branch and be labelled '49+'.
    if pd.isna(tenure):
        return np.nan

    if tenure <= 12:
        return '0-12'
    elif tenure <= 24:
        return '13-24'
    elif tenure <= 48:
        return '25-48'
    else:
        return '49+'

# Bucket each customer's tenure by applying categorize_tenure element-wise
df['TenureGroup'] = df['tenure'].apply(categorize_tenure)

# Count of services (excluding basic phone/internet)
service_cols = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                'TechSupport', 'StreamingTV', 'StreamingMovies']
# eq('Yes') gives a boolean frame; summing across the columns counts the 'Yes'
# entries per row without a Python-level loop over rows.
df['ServiceCount'] = df[service_cols].eq('Yes').sum(axis=1)

# Report which engineered columns this cell added
print("Feature engineering completed!")
print("\nNew features created:")
for feature_name in ('AvgChargePerMonth', 'TenureGroup', 'ServiceCount'):
    print("- " + feature_name)


### 6.4 Encode Categorical Variables


In [None]:
# Create a copy for preprocessing so the encoding never modifies `df`
df_processed = df.copy()

# Binary encoding for Yes/No columns
binary_cols = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling',
               'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
               'TechSupport', 'StreamingTV', 'StreamingMovies']
yes_no_map = {'Yes': 1, 'No': 0}

# Column -> label mapping for every two-level column.
# MultipleLines is included because 'No phone service' was already replaced by 'No'.
binary_mappings = {col: yes_no_map for col in binary_cols}
binary_mappings['gender'] = {'Male': 1, 'Female': 0}
binary_mappings['MultipleLines'] = yes_no_map

for col, mapping in binary_mappings.items():
    n_missing_before = df_processed[col].isnull().sum()
    df_processed[col] = df_processed[col].map(mapping)
    # Series.map() turns any label absent from the mapping into NaN without
    # raising, so report every column where new NaNs appeared.
    n_unmapped = df_processed[col].isnull().sum() - n_missing_before
    if n_unmapped > 0:
        print(f"Warning: {n_unmapped} value(s) in '{col}' had an unexpected label and became NaN")

# One-hot encoding for multi-category columns
multi_category_cols = ['InternetService', 'Contract', 'PaymentMethod', 'TenureGroup']

# dtype=int keeps the dummy columns as 0/1 integers like the mapped columns
# above; the get_dummies default is bool on pandas >= 2.0.
df_processed = pd.get_dummies(
    df_processed,
    columns=multi_category_cols,
    prefix=multi_category_cols,
    dtype=int,
)

print("Categorical encoding completed!")
print(f"\nNew shape: {df_processed.shape}")


### 6.5 Prepare Features and Target


In [None]:
# Split the encoded frame into the feature matrix X and the 0/1 target y
target_encoding = {'Yes': 1, 'No': 0}
X = df_processed.drop(columns=['customerID', 'Churn'])
y = df_processed['Churn'].map(target_encoding)

print("Features shape: {}".format(X.shape))
print("Target shape: {}".format(y.shape))
print("\nFeature columns: {}".format(list(X.columns)))
print("\nTarget distribution:")
print(y.value_counts())


### 6.6 Save Processed Data


In [None]:
# Persist the feature matrix and target for the next notebook
import os

output_dir = 'data'
os.makedirs(output_dir, exist_ok=True)

features_path = 'data/X_processed.csv'
target_path = 'data/y_processed.csv'
X.to_csv(features_path, index=False)
y.to_csv(target_path, index=False)

print("Processed data saved successfully!")
print("Files saved:")
for saved_path in (features_path, target_path):
    print("- " + saved_path)


## Summary

### Key Findings:
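1. Dataset contains customer information with churn status
2. Missing values in TotalCharges (entries that could not be parsed as numbers) were filled with 0
3. Categorical variables encoded appropriately: two-level columns mapped to 1/0, multi-category columns one-hot encoded
4. New features created for better prediction: AvgChargePerMonth, TenureGroup, ServiceCount
5. Data ready for model training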

### Next Steps:
- Proceed to the Model Building notebook, using the saved `data/X_processed.csv` and `data/y_processed.csv`
