<a href="https://colab.research.google.com/github/mab2004/ml-churn-prediction-pipeline/blob/main/ml_churn_prediction_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup and Data Loading

In [None]:
!pip install scikit-learn pandas joblib

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

FILE_PATH = "Telco-Customer-Churn.csv"

# Load and clean the dataset in a single non-mutating chain (no inplace=True),
# so `df` is always derived directly from the file contents.
df = (
    pd.read_csv(FILE_PATH)
    # Every remaining non-target column becomes a feature below, so the
    # identifier column is removed here.
    .drop(columns='customerID')
    .assign(
        # errors='coerce' converts unparseable TotalCharges entries to NaN
        # instead of raising.
        TotalCharges=lambda frame: pd.to_numeric(frame['TotalCharges'], errors='coerce'),
        # Encode the target as 1 ('Yes') / 0 ('No'); any other label maps to NaN.
        Churn=lambda frame: frame['Churn'].map({'Yes': 1, 'No': 0}),
    )
)

# Fail fast with a clear message if the target holds anything other than
# 'Yes'/'No' (including missing values), rather than passing NaN labels on.
if df['Churn'].isna().any():
    raise ValueError("Column 'Churn' contains values other than 'Yes'/'No'.")

# Separate features (X) and target (y)
X = df.drop(columns='Churn')
y = df['Churn']

# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
# NOTE(review): the split is not stratified on y — consider stratify=y if the
# class balance across train/test matters; confirm before changing, since it
# alters which rows land in each split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Data loaded, cleaned, and split successfully.")

Data loaded, cleaned, and split successfully.


## Define Preprocessing Pipeline

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Group the feature columns by dtype: numeric columns vs. object columns.
numerical_features = list(X.select_dtypes(include='number').columns)
categorical_features = list(X.select_dtypes(include='object').columns)

# Numeric branch: fill missing values with the column median, then standardize.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown='ignore' makes categories not seen during
# fitting encode without raising an error.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its branch. Columns in neither group are
# passed through unchanged (remainder='passthrough').
preprocessor = ColumnTransformer(
    [
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='passthrough',
    n_jobs=-1,
)

print("Preprocessing pipeline defined.")

Preprocessing pipeline defined.


## Hyperparameter Tuning with GridSearchCV

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Full model: preprocessing followed by a seeded random forest classifier.
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Search space; the 'classifier__' prefix routes each value to the
# 'classifier' step of the pipeline.
param_grid = dict(
    classifier__n_estimators=[100, 200],
    classifier__max_depth=[5, 10, None],
    classifier__min_samples_leaf=[2, 4],
)

# Exhaustive search with 5-fold cross-validation, scored by accuracy and run
# on all available cores.
grid_search = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

print("Starting GridSearchCV...")
grid_search.fit(X_train, y_train)
print("GridSearchCV complete.")

# Report the winning configuration and how it scores on the held-out test set.
best_pipeline = grid_search.best_estimator_
print("\nBest Parameters:", grid_search.best_params_)

best_cv_score = grid_search.best_score_
print(f"Best CV Score: {best_cv_score:.4f}")

test_set_accuracy = best_pipeline.score(X_test, y_test)
print(f"Final Test Set Accuracy: {test_set_accuracy:.4f}")

Starting GridSearchCV...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
GridSearchCV complete.

Best Parameters: {'classifier__max_depth': 10, 'classifier__min_samples_leaf': 4, 'classifier__n_estimators': 100}
Best CV Score: 0.8017
Final Test Set Accuracy: 0.8091


## Export the Complete Pipeline

In [8]:
import joblib

filename = 'churn_prediction_pipeline.joblib'

# Persist the best pipeline found by the grid search as a single artifact.
joblib.dump(value=best_pipeline, filename=filename)
print(f"Pipeline successfully exported to {filename}.")

# Round-trip check: reload the artifact and score it on the test set.
loaded_pipeline = joblib.load(filename)
print("Pipeline loaded successfully.")

loaded_test_accuracy = loaded_pipeline.score(X_test, y_test)
print(f"Loaded Pipeline Test Accuracy: {loaded_test_accuracy:.4f}")

Pipeline successfully exported to churn_prediction_pipeline.joblib.
Pipeline loaded successfully.
Loaded Pipeline Test Accuracy: 0.8091
