In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                            f1_score, confusion_matrix, roc_auc_score, roc_curve,
                            classification_report)
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline as make_imb_pipeline
import warnings
warnings.filterwarnings('ignore')

# Read the raw churn table from the notebook's working directory
data = pd.read_csv('/content/telco_customer_churn.csv')

# ======================
# 1. Dataset Description
# ======================
print("=" * 50)
print("1. DATASET DESCRIPTION")
print("=" * 50)
print("\nDataset Shape:", data.shape)

# Every remaining overview is a heading line followed by the summary object.
for heading, summary in (
    ("\nFirst 5 Rows:", data.head()),
    ("\nData Types:", data.dtypes),
    ("\nMissing Values:", data.isnull().sum()),
    ("\nData Description (Numerical Features):", data.describe()),
):
    print(heading)
    print(summary)

# Check for blank strings, which may stand in for missing values.
# Cells are stripped before the comparison so that entries holding only
# whitespace (e.g. " ") are counted too; an exact match against '' would
# silently overlook them.
print("\nChecking for empty strings in categorical columns:")
for col in data.select_dtypes(include=['object']).columns:
    # astype(str) keeps the .str accessor safe on mixed-type object columns;
    # real NaNs turn into the text 'nan' and are therefore not counted here.
    empty_count = (data[col].astype(str).str.strip() == '').sum()
    if empty_count > 0:
        print(f"{col}: {empty_count} empty values")

# Coerce TotalCharges to a numeric dtype; entries that cannot be parsed as
# numbers become NaN instead of raising.
# Plain reassignment (no inplace=True) avoids pandas' FutureWarning.
total_charges = pd.to_numeric(data['TotalCharges'], errors='coerce')
data['TotalCharges'] = total_charges
print("\nMissing Values after conversion:")
print(data.isna().sum())

# Replace the NaNs produced above with 0 (likely new customers)
data['TotalCharges'] = total_charges.fillna(0)

# ======================
# 2. Correlation Analysis
# ======================
print(f"\n{'=' * 50}")
print("2. CORRELATION ANALYSIS")
print("=" * 50)

# Encode the target as 1 ('Yes') / 0 (anything else) so it can take part in
# the correlation matrix; int64 keeps it inside the dtype filter below.
data['Churn_numeric'] = (data['Churn'] == 'Yes').astype('int64')

# Pairwise correlations across every integer/float column
numeric_cols = data.select_dtypes(include=['int64', 'float64']).columns
corr_matrix = data[numeric_cols].corr()

plt.figure(figsize=(15, 10))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0)
plt.title("Correlation Heatmap")
plt.tight_layout()
plt.show()

# Correlation of each numeric column with the target, strongest positive first
print("\nTop features correlated with Churn:")
corr_with_churn = corr_matrix['Churn_numeric'].sort_values(ascending=False)
print(corr_with_churn)

# ======================
# 3. Exploratory Data Analysis
# ======================
print(f"\n{'=' * 50}")
print("3. EXPLORATORY DATA ANALYSIS")
print("=" * 50)

# 3.1 Target Variable Distribution
# Left panel: absolute counts per class; right panel: class shares as a pie.
_, (count_ax, pie_ax) = plt.subplots(1, 2, figsize=(12, 5))

sns.countplot(x='Churn', data=data, ax=count_ax)
count_ax.set_title("Distribution of Churn")

data['Churn'].value_counts().plot.pie(autopct='%1.1f%%', ax=pie_ax)
pie_ax.set_title("Churn Percentage")
pie_ax.set_ylabel("")  # hide the automatic y-axis label next to the pie

plt.tight_layout()
plt.show()

# 3.2 Categorical Features vs Churn
# One count plot per feature, laid out on a 3x3 grid in the order listed.
categorical_panels = [
    ('gender', "Churn by Gender"),
    ('SeniorCitizen', "Churn by Senior Citizen Status"),
    ('Partner', "Churn by Partner Status"),
    ('Dependents', "Churn by Dependents"),
    ('PhoneService', "Churn by Phone Service"),
    ('InternetService', "Churn by Internet Service"),
    ('Contract', "Churn by Contract Type"),
    ('PaperlessBilling', "Churn by Paperless Billing"),
    ('PaymentMethod', "Churn by Payment Method"),
]

plt.figure(figsize=(18, 15))
for position, (feature, panel_title) in enumerate(categorical_panels, start=1):
    plt.subplot(3, 3, position)
    sns.countplot(x=feature, hue='Churn', data=data)
    if feature == 'PaymentMethod':
        # Only this panel gets rotated tick labels
        plt.xticks(rotation=45)
    plt.title(panel_title)
plt.tight_layout()
plt.show()

# 3.3 Numerical Features vs Churn
# Top row: box plots of each numeric feature split by churn.
# Bottom row: tenure buckets and per-class density histograms.
plt.figure(figsize=(18, 10))

plt.subplot(2, 3, 1)
sns.boxplot(x='Churn', y='tenure', data=data)
plt.title("Churn by Tenure")

plt.subplot(2, 3, 2)
sns.boxplot(x='Churn', y='MonthlyCharges', data=data)
plt.title("Churn by Monthly Charges")

plt.subplot(2, 3, 3)
sns.boxplot(x='Churn', y='TotalCharges', data=data)
plt.title("Churn by Total Charges")

# Create tenure groups.
# pd.cut builds right-closed, left-open bins by default, so the first bin
# would be (0, 12] and any row with tenure == 0 would become NaN and vanish
# from the count plot below. include_lowest=True closes the first bin on the
# left so those rows are assigned to '0-12'.
data['tenure_group'] = pd.cut(
    data['tenure'],
    bins=[0, 12, 24, 36, 48, 60, 72],
    labels=['0-12', '12-24', '24-36', '36-48', '48-60', '60-72'],
    include_lowest=True,
)

plt.subplot(2, 3, 4)
sns.countplot(x='tenure_group', hue='Churn', data=data)
plt.title("Churn by Tenure Groups")
plt.xticks(rotation=45)

# common_norm=False normalises each churn class on its own, so the two
# distributions are comparable despite the class imbalance.
plt.subplot(2, 3, 5)
sns.histplot(data=data, x='MonthlyCharges', hue='Churn', element='step', stat='density', common_norm=False)
plt.title("Monthly Charges Distribution by Churn")

plt.subplot(2, 3, 6)
sns.histplot(data=data, x='TotalCharges', hue='Churn', element='step', stat='density', common_norm=False)
plt.title("Total Charges Distribution by Churn")
plt.tight_layout()
plt.show()

# ======================
# 4. Data Preprocessing
# ======================
print(f"\n{'=' * 50}")
print("4. DATA PREPROCESSING")
print("=" * 50)

# Remove the identifier plus the helper columns added during the analysis
data = data.drop(columns=['customerID', 'Churn_numeric', 'tenure_group'])

# Features are every column except the label; the label is mapped to 1/0
X = data.drop(columns='Churn')
y = data['Churn'].map({'Yes': 1, 'No': 0})

# Group feature names by dtype so each group gets its own transformer
categorical_cols = X.select_dtypes(include=['object']).columns
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns

print("\nCategorical Features:", list(categorical_cols))
print("\nNumerical Features:", list(numeric_cols))

# Scale numeric columns and one-hot encode categorical ones; categories not
# seen during fit are ignored at transform time instead of raising.
numeric_step = ('num', StandardScaler(), numeric_cols)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# 70/30 split, stratified so both parts keep the original class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42)

print(f"\nTraining set size: {len(X_train)} samples")
print(f"Test set size: {len(X_test)} samples")

# Relative class frequencies in the training labels
print("\nClass distribution in training set:")
print(y_train.value_counts(normalize=True))

# ======================
# 5. Model Training
# ======================
print(f"\n{'=' * 50}")
print("5. MODEL TRAINING")
print("=" * 50)

# Function to train and evaluate models
def train_evaluate_model(model, model_name, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and report its test-set performance.

    Prints a classification report, shows a confusion-matrix heatmap and,
    when the model exposes ``predict_proba``, a ROC curve.

    Args:
        model: Estimator providing ``fit`` and ``predict``; ``predict_proba``
            is optional.
        model_name: Label used in the log line and the plot titles.
        X_train: Training features.
        y_train: Training labels.
        X_test: Held-out features.
        y_test: Held-out labels.

    Returns:
        dict with the keys ``model``, ``accuracy``, ``precision``, ``recall``,
        ``f1`` and ``roc_auc``; ``roc_auc`` is ``None`` when the model has no
        ``predict_proba``.
    """
    print(f"\nTraining {model_name}...")
    model.fit(X_train, y_train)

    predicted = model.predict(X_test)
    # Keep only the second probability column (the class scored by the ROC)
    if hasattr(model, 'predict_proba'):
        positive_proba = model.predict_proba(X_test)[:, 1]
    else:
        positive_proba = None
    can_rank = positive_proba is not None

    # Scalar metrics, collected directly in the structure that is returned
    summary = {
        'model': model_name,
        'accuracy': accuracy_score(y_test, predicted),
        'precision': precision_score(y_test, predicted),
        'recall': recall_score(y_test, predicted),
        'f1': f1_score(y_test, predicted),
        'roc_auc': roc_auc_score(y_test, positive_proba) if can_rank else None,
    }

    matrix = confusion_matrix(y_test, predicted)

    print("\nClassification Report:")
    print(classification_report(y_test, predicted))

    # Confusion matrix heatmap: rows are actual classes, columns predictions
    class_names = ['No Churn', 'Churn']
    plt.figure(figsize=(6, 4))
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names)
    plt.title(f"Confusion Matrix - {model_name}")
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()

    # The ROC curve needs scores, so it is skipped without probabilities
    if can_rank:
        fpr, tpr, _ = roc_curve(y_test, positive_proba)
        auc_value = summary['roc_auc']
        plt.figure(figsize=(6, 4))
        plt.plot(fpr, tpr, label=f'{model_name} (AUC = {auc_value:.2f})')
        plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'ROC Curve - {model_name}')
        plt.legend()
        plt.show()

    return summary

# Create a function to handle model training with SMOTE
def train_model_with_smote(model, params, X_train, y_train):
    """Tune ``model`` inside a preprocess -> SMOTE -> model pipeline.

    The pipeline is built from the module-level ``preprocessor``, a
    ``SMOTE(random_state=42)`` step and ``model``. Because SMOTE is a pipeline
    step, it is applied within each cross-validation fit rather than once on
    the whole training set.

    Args:
        model: Unfitted classifier placed at the end of the pipeline.
        params: Parameter grid for ``GridSearchCV``; keys address pipeline
            steps, e.g. ``'<step name>__<parameter>'``.
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The ``best_estimator_`` of the grid search, selected by F1 score
        over 5 stratified folds.
    """
    search = GridSearchCV(
        estimator=make_imb_pipeline(preprocessor, SMOTE(random_state=42), model),
        param_grid=params,
        scoring='f1',
        cv=StratifiedKFold(n_splits=5),
        n_jobs=-1,  # use every available core
    )
    # fit() returns the search object itself, so the result can be chained
    return search.fit(X_train, y_train).best_estimator_

# Decision Tree
# Grid keys are prefixed with the pipeline step name of the classifier.
dt_params = dict(
    decisiontreeclassifier__max_depth=[3, 5, 7, None],
    decisiontreeclassifier__min_samples_split=[2, 5, 10],
    decisiontreeclassifier__criterion=['gini', 'entropy'],
)

# Tune on the training split, then evaluate the winner on the test split
best_dt = train_model_with_smote(
    model=DecisionTreeClassifier(random_state=42),
    params=dt_params,
    X_train=X_train,
    y_train=y_train,
)

dt_results = train_evaluate_model(
    best_dt, "Decision Tree", X_train, y_train, X_test, y_test)

# Logistic Regression
# Grid keys are prefixed with the pipeline step name of the classifier.
# NOTE(review): class_weight='balanced' is searched although SMOTE already
# rebalances the training folds; confirm the extra grid dimension is wanted.
lr_params = dict(
    logisticregression__C=[0.01, 0.1, 1, 10],
    logisticregression__solver=['liblinear', 'lbfgs'],
    logisticregression__class_weight=['balanced', None],
)

# Tune on the training split, then evaluate the winner on the test split
best_lr = train_model_with_smote(
    model=LogisticRegression(random_state=42, max_iter=1000),
    params=lr_params,
    X_train=X_train,
    y_train=y_train,
)

lr_results = train_evaluate_model(
    best_lr, "Logistic Regression", X_train, y_train, X_test, y_test)

# Neural Network
# Grid keys are prefixed with the pipeline step name of the classifier.
nn_params = dict(
    mlpclassifier__hidden_layer_sizes=[(50,), (100,), (50, 30)],
    mlpclassifier__alpha=[0.0001, 0.001],
    mlpclassifier__activation=['relu', 'tanh'],
    mlpclassifier__learning_rate_init=[0.001, 0.01],
)

# Early stopping holds out 20% of each fit's data and stops after 20 epochs
# without improvement, capped at 1000 iterations.
# NOTE(review): scikit-learn documents learning_rate='adaptive' as applying to
# solver='sgd' only; no solver is set here, so confirm it has any effect.
nn_settings = dict(
    random_state=42,
    max_iter=1000,
    early_stopping=True,
    validation_fraction=0.2,
    n_iter_no_change=20,
    learning_rate='adaptive',
)

# Tune on the training split, then evaluate the winner on the test split
best_nn = train_model_with_smote(
    model=MLPClassifier(**nn_settings),
    params=nn_params,
    X_train=X_train,
    y_train=y_train,
)

nn_results = train_evaluate_model(
    best_nn, "Neural Network", X_train, y_train, X_test, y_test)

# ======================
# 6. Model Comparison
# ======================
print(f"\n{'=' * 50}")
print("6. MODEL COMPARISON")
print("=" * 50)

# One row per model, one column per metric
results = pd.DataFrame([dt_results, lr_results, nn_results])

# Bar chart per metric on a 2x2 grid, all sharing the 0..1 value range
metric_panels = [
    ('accuracy', "Accuracy Comparison"),
    ('precision', "Precision Comparison"),
    ('recall', "Recall Comparison"),
    ('f1', "F1 Score Comparison"),
]

plt.figure(figsize=(15, 10))
for slot, (metric, panel_title) in enumerate(metric_panels, start=1):
    plt.subplot(2, 2, slot)
    sns.barplot(x='model', y=metric, data=results)
    plt.title(panel_title)
    plt.xticks(rotation=45)
    plt.ylim(0, 1)

plt.tight_layout()
plt.show()

# ROC Curve comparison
# Overlay the test-set ROC curve of every tuned model on a single figure.
models = {
    'Decision Tree': best_dt,
    'Logistic Regression': best_lr,
    'Neural Network': best_nn
}

plt.figure(figsize=(10, 8))
for label, fitted in models.items():
    # A curve can only be drawn for models that expose probabilities
    if not hasattr(fitted, 'predict_proba'):
        continue
    positive_proba = fitted.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, positive_proba)
    area = roc_auc_score(y_test, positive_proba)
    plt.plot(fpr, tpr, label=f'{label} (AUC = {area:.2f})')

plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve Comparison')
plt.legend()
plt.show()

# Display all results
print("\nModel Performance Comparison:")
print(results)

# Best Parameters
# Print the full parameter set of the final classifier step of each pipeline.
for heading, tuned_pipeline, step_name in (
    ("\nBest Decision Tree Parameters:", best_dt, 'decisiontreeclassifier'),
    ("\nBest Logistic Regression Parameters:", best_lr, 'logisticregression'),
    ("\nBest Neural Network Parameters:", best_nn, 'mlpclassifier'),
):
    print(heading)
    print(tuned_pipeline.named_steps[step_name].get_params())

# Feature Importance (Decision Tree)
# Shows the highest-ranked features of the tuned tree as a horizontal bar
# chart. At most 10 bars are drawn; with fewer features available, the chart
# shrinks to that number instead of failing on mismatched lengths.
tree_step = best_dt.named_steps['decisiontreeclassifier']
if hasattr(tree_step, 'feature_importances_'):
    # Get the fitted preprocessor from the pipeline
    fitted_preprocessor = best_dt.named_steps['columntransformer']

    # Rebuild the transformed column order: numeric columns first, then the
    # one-hot columns, matching the transformer order in the preprocessor.
    ohe = fitted_preprocessor.named_transformers_['cat']
    cat_features = ohe.get_feature_names_out(categorical_cols)
    all_features = np.concatenate([numeric_cols, cat_features])

    # argsort is ascending, so the tail holds the most important features
    importances = tree_step.feature_importances_
    top_n = min(10, len(importances))
    indices = np.argsort(importances)[-top_n:]
    bar_positions = range(len(indices))

    plt.figure(figsize=(12, 8))
    plt.title(f"Top {top_n} Important Features (Decision Tree)")
    plt.barh(bar_positions, importances[indices], align='center')
    plt.yticks(bar_positions, [all_features[i] for i in indices])
    plt.xlabel("Relative Importance")
    plt.tight_layout()
    plt.show()

FileNotFoundError: [Errno 2] No such file or directory: '/content/telco_customer_churn.csv'

# New Section