<a href="https://colab.research.google.com/github/moin963khan/Data-/blob/main/Customer_Churn_Analysis_and_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ----------------------------------------------------------------------------
# PROJECT: Customer Churn Prediction for a Telecom Company
# AUTHOR: Your Name
# DATE: 2025-09-07
# ----------------------------------------------------------------------------

# STEP 1: IMPORT LIBRARIES
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def load_and_clean_data(filepath):
    """
    Loads the dataset from a CSV file and performs initial cleaning.

    Steps:
    - Reads the CSV into a DataFrame
    - Converts 'TotalCharges' to numeric (entries that cannot be parsed
      become NaN) and fills the resulting gaps with the column median
    - Drops the 'customerID' column

    Args:
        filepath: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        The cleaned DataFrame.

    Raises:
        KeyError: If 'TotalCharges' or 'customerID' is not a column of the
            loaded data.
    """
    print("Loading and cleaning data...")
    df = pd.read_csv(filepath)

    # Convert TotalCharges to numeric, coercing errors to NaN
    total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')

    # Fill missing TotalCharges with the median. The result is assigned back
    # to the column instead of calling fillna(..., inplace=True) on
    # df['TotalCharges']: that chained in-place form raises a FutureWarning
    # and no longer updates df once pandas uses copy-on-write.
    df['TotalCharges'] = total_charges.fillna(total_charges.median())

    # Drop customerID as it is not needed for analysis
    df = df.drop(columns='customerID')

    print("Data loading and cleaning complete.")
    return df

def perform_eda(df):
    """
    Runs the exploratory plots and writes each one to a PNG file.

    Charts produced, in order:
    - Pie chart of the 'Churn' value counts
    - Count plot of 'Contract' split by 'Churn'
    - Count plot of 'InternetService' split by 'Churn'

    Args:
        df: DataFrame containing 'Churn', 'Contract' and 'InternetService'
            columns.

    Returns:
        None. Progress messages are printed and the figures are saved under
        fixed relative file names.
    """
    print("Performing Exploratory Data Analysis (EDA)...")
    sns.set_style('whitegrid')

    # Pie chart: share of each Churn value across all rows
    plt.figure(figsize=(7, 7))
    outcome_totals = df['Churn'].value_counts()
    plt.pie(
        outcome_totals,
        labels=outcome_totals.index,
        autopct='%1.1f%%',
        startangle=140,
        colors=['#4CAF50', '#F44336'],
    )
    plt.title('Overall Customer Churn Rate', fontsize=16)
    pie_file = 'churn_rate_pie_chart.png'
    plt.savefig(pie_file, bbox_inches='tight')
    plt.close()
    print(f"Saved: {pie_file}")

    # The two count plots differ only in column, palette, labels and output
    # file, so they are described as data and drawn by one loop.
    bar_charts = (
        ('Contract', 'viridis', 'Churn Rate by Contract Type',
         'Contract Type', 'churn_by_contract.png'),
        ('InternetService', 'plasma', 'Churn Rate by Internet Service',
         'Internet Service Type', 'churn_by_internet_service.png'),
    )
    for column, palette, title, x_label, out_file in bar_charts:
        plt.figure(figsize=(10, 6))
        sns.countplot(x=column, hue='Churn', data=df, palette=palette)
        plt.title(title, fontsize=16)
        plt.xlabel(x_label, fontsize=12)
        plt.ylabel('Number of Customers', fontsize=12)
        plt.legend(title='Churn')
        plt.savefig(out_file, bbox_inches='tight')
        plt.close()
        print(f"Saved: {out_file}")

    print("EDA complete. Visualizations saved.")

def prepare_data_for_modeling(df):
    """
    Prepares the dataframe for machine learning.

    Steps:
    - Works on a copy, so the dataframe passed in is left untouched
    - Maps the binary 'Yes'/'No' columns that are present to 1/0
      (any value other than 'Yes', including missing values, becomes 0)
    - Converts the remaining categorical features to numerical columns
      using one-hot encoding, dropping the first level of each feature

    Args:
        df: Cleaned DataFrame, e.g. as returned by load_and_clean_data.

    Returns:
        A new DataFrame with the encoded columns.
    """
    print("Preparing data for modeling...")
    # Create a copy to avoid modifying the original EDA dataframe
    df_model = df.copy()

    # Convert binary 'Yes'/'No' columns to 1/0 with a vectorised comparison
    # rather than a per-row Python lambda; the resulting values are the same.
    binary_cols = ('Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn')
    for col in binary_cols:
        if col in df_model.columns:
            df_model[col] = (df_model[col] == 'Yes').astype(int)

    # Use one-hot encoding for other categorical columns
    df_model = pd.get_dummies(df_model, drop_first=True)

    print("Data preparation complete.")
    return df_model

def train_and_evaluate_model(df_model):
    """
    Fits a Logistic Regression classifier on an 80/20 stratified split and
    reports how it performs on the held-out 20%.

    Args:
        df_model: Encoded DataFrame whose 'Churn' column is the target;
            every other column is used as a feature.

    Returns:
        None. Accuracy and the classification report are printed, and the
        confusion matrix heatmap is saved as 'confusion_matrix.png'.
    """
    print("Training and evaluating the model...")

    # Target column versus everything else
    features = df_model.drop(columns='Churn')
    target = df_model['Churn']

    # Fixed seed keeps the split reproducible; stratify preserves the
    # churn/no-churn ratio in both partitions.
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        target,
        test_size=0.2,
        random_state=42,
        stratify=target,
    )

    # Build the classifier and fit it on the training partition
    classifier = LogisticRegression(max_iter=1000, solver='liblinear')
    classifier.fit(X_train, y_train)

    # Score the held-out partition
    y_pred = classifier.predict(X_test)

    # Display names shared by the text report and the heatmap axes
    class_names = ['No Churn', 'Churn']

    print("\n--- Model Evaluation ---")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=class_names))

    # Confusion matrix rendered as an annotated heatmap
    matrix = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(
        matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.xlabel('Predicted Label', fontsize=12)
    plt.ylabel('True Label', fontsize=12)
    plt.title('Confusion Matrix', fontsize=16)
    plt.savefig('confusion_matrix.png', bbox_inches='tight')
    plt.close()
    print("Saved: confusion_matrix.png")
    print("--- End of Evaluation ---")


def main():
    """
    Runs the full pipeline end to end: load and clean the CSV, produce the
    EDA charts, encode the data, then train and evaluate the model.
    """
    csv_path = 'WA_Fn-UseC_-Telco-Customer-Churn.csv'

    # Load and clean the raw data
    customers = load_and_clean_data(csv_path)

    # EDA charts are drawn from the cleaned frame, before any encoding
    perform_eda(customers)

    # Encode the data, then train and evaluate the model on it
    encoded = prepare_data_for_modeling(customers)
    train_and_evaluate_model(encoded)

    print("\nProject execution finished successfully!")

# Run the pipeline only when this file is executed directly (or as a notebook
# cell), not when it is imported as a module.
if __name__ == '__main__':
    main()

Loading and cleaning data...
Data loading and cleaning complete.
Performing Exploratory Data Analysis (EDA)...
Saved: churn_rate_pie_chart.png


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TotalCharges'].fillna(median_total_charges, inplace=True)


Saved: churn_by_contract.png
Saved: churn_by_internet_service.png
EDA complete. Visualizations saved.
Preparing data for modeling...
Data preparation complete.
Training and evaluating the model...

--- Model Evaluation ---
Accuracy: 0.8041

Classification Report:
              precision    recall  f1-score   support

    No Churn       0.85      0.90      0.87      1035
       Churn       0.66      0.55      0.60       374

    accuracy                           0.80      1409
   macro avg       0.75      0.72      0.73      1409
weighted avg       0.80      0.80      0.80      1409

Saved: confusion_matrix.png
--- End of Evaluation ---

Project execution finished successfully!
