In [None]:
import pandas as pd  # For data manipulation and analysis
import numpy as np  # For numerical computations
import matplotlib.pyplot as plt  # For creating static, animated, and interactive visualizations
import seaborn as sns  # For statistical data visualization based on Matplotlib
from tabulate import tabulate  # For pretty-printing tabular data

from sklearn import metrics  # For model evaluation metrics
from sklearn.ensemble import RandomForestClassifier  # For building a random forest classifier
from sklearn.model_selection import train_test_split  # To split data into train and test sets
from sklearn.metrics import recall_score  # To calculate recall score for classification models
from sklearn.metrics import classification_report  # To generate classification performance report
from sklearn.metrics import confusion_matrix  # To create a confusion matrix for model evaluation
from sklearn.tree import DecisionTreeClassifier  # For building a Decision Tree classifier
from imblearn.combine import SMOTEENN  # For handling class imbalance using SMOTE and ENN
import pickle # For storing and loading Python objects
from sklearn.pipeline import Pipeline # For creating a machine learning Pipeline
from sklearn.compose import ColumnTransformer # For applying transformations to specific columns of a DataFrame
from sklearn.preprocessing import OneHotEncoder # For converting categorical variables into dummy/indicator variables
from imblearn.pipeline import Pipeline as ImbPipeline # For creating a pipeline that can handle imbalanced datasets
from sklearn.base import BaseEstimator, TransformerMixin # For creating custom transformers

In [44]:
# Load dataset from CSV file
df = pd.read_csv(r"D:\Stuff\Data Science\Machine Learning Models\Customer_churn_model\churn_updated.csv")
# Display the first few rows of the dataframe
df.head()

Unnamed: 0.1,Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,...,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,tenure_group
0,0,Female,0,Yes,No,No,No phone service,DSL,No,Yes,...,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No,1 - 12
1,1,Male,0,No,No,Yes,No,DSL,Yes,No,...,No,No,No,One year,No,Mailed check,56.95,1889.5,No,25 - 36
2,2,Male,0,No,No,Yes,No,DSL,Yes,Yes,...,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1 - 12
3,3,Male,0,No,No,No,No phone service,DSL,Yes,No,...,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No,37 - 48
4,4,Female,0,No,No,Yes,No,Fiber optic,No,No,...,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1 - 12


In [None]:
# Convert TotalCharges to numeric
class ConvertTotalCharges(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X = X.copy()
        X['TotalCharges'] = pd.to_numeric(X['TotalCharges'], errors='coerce')
        return X


In [None]:
# Separate target
X = df.copy()
y = np.where(X['Churn'] == 'Yes', 1, 0)
X = X.drop('Churn', axis=1)

# Categorical and numerical features
cat_cols = X.select_dtypes(include='object').columns.tolist()

# Column Transformer for encoding
column_transformer = ColumnTransformer(transformers=[
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False), cat_cols)
], remainder='passthrough')

# Create the pipeline
pipe = ImbPipeline(steps=[
    ('convert_totalcharges', ConvertTotalCharges()),
    ('encoder', column_transformer),
    ('resample', SMOTEENN()),
    ('model', RandomForestClassifier(n_estimators=100, criterion='gini',
                                     max_depth=6, min_samples_leaf=8,
                                     random_state=100))
])

In [50]:
import os
os.environ['LOKY_MAX_CPU_COUNT'] = '12'

In [None]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [55]:
# Display Pipeline steps 
from sklearn import set_config
set_config(display='diagram')

In [56]:
# Fit the pipeline
pipe.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [57]:
# Predict
y_pred = pipe.predict(X_test)

In [60]:
pipe.named_steps

{'convert_totalcharges': ConvertTotalCharges(),
 'encoder': ColumnTransformer(remainder='passthrough',
                   transformers=[('onehot',
                                  OneHotEncoder(drop='first',
                                                handle_unknown='ignore',
                                                sparse_output=False),
                                  ['gender', 'Partner', 'Dependents',
                                   'PhoneService', 'MultipleLines',
                                   'InternetService', 'OnlineSecurity',
                                   'OnlineBackup', 'DeviceProtection',
                                   'TechSupport', 'StreamingTV',
                                   'StreamingMovies', 'Contract',
                                   'PaperlessBilling', 'PaymentMethod',
                                   'tenure_group'])]),
 'resample': SMOTEENN(),
 'model': RandomForestClassifier(max_depth=6, min_samples_leaf=8, random_state=100)}

In [64]:
print(pipe.named_steps['encoder'].get_feature_names_out())

['onehot__gender_Male' 'onehot__Partner_Yes' 'onehot__Dependents_Yes'
 'onehot__PhoneService_Yes' 'onehot__MultipleLines_No phone service'
 'onehot__MultipleLines_Yes' 'onehot__InternetService_Fiber optic'
 'onehot__InternetService_No' 'onehot__OnlineSecurity_No internet service'
 'onehot__OnlineSecurity_Yes' 'onehot__OnlineBackup_No internet service'
 'onehot__OnlineBackup_Yes' 'onehot__DeviceProtection_No internet service'
 'onehot__DeviceProtection_Yes' 'onehot__TechSupport_No internet service'
 'onehot__TechSupport_Yes' 'onehot__StreamingTV_No internet service'
 'onehot__StreamingTV_Yes' 'onehot__StreamingMovies_No internet service'
 'onehot__StreamingMovies_Yes' 'onehot__Contract_One year'
 'onehot__Contract_Two year' 'onehot__PaperlessBilling_Yes'
 'onehot__PaymentMethod_Credit card (automatic)'
 'onehot__PaymentMethod_Electronic check'
 'onehot__PaymentMethod_Mailed check' 'onehot__tenure_group_13 - 24'
 'onehot__tenure_group_25 - 36' 'onehot__tenure_group_37 - 48'
 'onehot__ten

In [63]:
# Evaluate the model's performance by calculating the accuracy on the resampled test set
model_score = pipe.score(X_test, y_test)
print(f"Model Score: {model_score:.2%}")

accuracy = metrics.accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2%}")

Model Score: 76.19%
Accuracy: 76.19%
