## Objective - Predict whether a customer will churn (Yes/No) using historical customer data.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_auc_score, f1_score
from scipy.stats import loguniform


In [2]:
data = pd.read_csv(r'C:\Users\Paras\Desktop\parasonly\ML Projects\data\CustomerChurn.csv')
data.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
data.info()

<class 'pandas.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   str    
 1   gender            7043 non-null   str    
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   str    
 4   Dependents        7043 non-null   str    
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   str    
 7   MultipleLines     7043 non-null   str    
 8   InternetService   7043 non-null   str    
 9   OnlineSecurity    7043 non-null   str    
 10  OnlineBackup      7043 non-null   str    
 11  DeviceProtection  7043 non-null   str    
 12  TechSupport       7043 non-null   str    
 13  StreamingTV       7043 non-null   str    
 14  StreamingMovies   7043 non-null   str    
 15  Contract          7043 non-null   str    
 16  PaperlessBilling  7043 non-null   str    
 17  Paymen

In [4]:
# converting the TotalCharges from categorical to numerical
data['TotalCharges'] = pd.to_numeric(data.TotalCharges, errors='coerce')

# Convert the SeniorCitizen column to categorical
data['SeniorCitizen'] = data['SeniorCitizen'].replace({1: 'Yes', 0: 'No'})

In [5]:
data.info()

<class 'pandas.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   str    
 1   gender            7043 non-null   str    
 2   SeniorCitizen     7043 non-null   object 
 3   Partner           7043 non-null   str    
 4   Dependents        7043 non-null   str    
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   str    
 7   MultipleLines     7043 non-null   str    
 8   InternetService   7043 non-null   str    
 9   OnlineSecurity    7043 non-null   str    
 10  OnlineBackup      7043 non-null   str    
 11  DeviceProtection  7043 non-null   str    
 12  TechSupport       7043 non-null   str    
 13  StreamingTV       7043 non-null   str    
 14  StreamingMovies   7043 non-null   str    
 15  Contract          7043 non-null   str    
 16  PaperlessBilling  7043 non-null   str    
 17  Paymen

#### There are 11 missing values in TotalCharges column. We can either drop these rows or fill them with mean/median. Here, we will drop these rows.

In [6]:
data = data.dropna()

In [7]:
data.info()

<class 'pandas.DataFrame'>
Index: 7032 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7032 non-null   str    
 1   gender            7032 non-null   str    
 2   SeniorCitizen     7032 non-null   object 
 3   Partner           7032 non-null   str    
 4   Dependents        7032 non-null   str    
 5   tenure            7032 non-null   int64  
 6   PhoneService      7032 non-null   str    
 7   MultipleLines     7032 non-null   str    
 8   InternetService   7032 non-null   str    
 9   OnlineSecurity    7032 non-null   str    
 10  OnlineBackup      7032 non-null   str    
 11  DeviceProtection  7032 non-null   str    
 12  TechSupport       7032 non-null   str    
 13  StreamingTV       7032 non-null   str    
 14  StreamingMovies   7032 non-null   str    
 15  Contract          7032 non-null   str    
 16  PaperlessBilling  7032 non-null   str    
 17  PaymentMeth

In [29]:
# Adding a feature to know whether the customer's contract is long term or short term 
data['IsLongTermContract'] = data['Contract'].apply(lambda x: 1 if x == 'Two year' or 'One year' else 0)

In [9]:
# Separating the X_variables and y_variable
X= data.drop(['Churn','customerID'],axis=1)
y = data['Churn'].values

In [10]:
y = y.map({'Yes':1,'No':0})

In [11]:
X.head(5)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,IsLongTermContract
0,Female,No,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,1
1,Male,No,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,1
2,Male,No,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1
3,Male,No,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,1
4,Female,No,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,1


In [12]:
numeric_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
categorical_features = [col for col in X.columns if col not in numeric_features]

In [13]:
categorical_features

['gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'IsLongTermContract']

In [14]:
# Feature Scaling
numeric_transformer = Pipeline(steps=[('scaler',StandardScaler())])

# Encoding for categorical columns
categorical_transformer = Pipeline(steps=[('onehot',OneHotEncoder(handle_unknown='ignore',sparse_output=True))])

preprocessor = ColumnTransformer(transformers=[
    ('num',numeric_transformer,numeric_features),
    ('cat',categorical_transformer,categorical_features)],
    remainder='drop')

In [15]:
# Splitting the data into Train-Test Split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [16]:
len(X_train), len(X_test)

(5274, 1758)

## Model Building (Baseline Models)

### 1. Logistic Regression

In [24]:
lr_pipeline = ImbPipeline(steps = [
    ('preprocessor',preprocessor),
    ('smote',SMOTE(random_state = 42)),
    ('model',LogisticRegression(random_state=42, max_iter=2000))
])

lr_pipeline.fit(X_train, y_train)

y_pred1 = lr_pipeline.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred1))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred1))
print("\nClassification Report:\n", classification_report(y_test, y_pred1))
print("ROC AUC Score:", roc_auc_score(y_test, y_pred1))

Accuracy: 0.7457337883959044

Confusion Matrix:
 [[952 348]
 [ 99 359]]

Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.73      0.81      1300
           1       0.51      0.78      0.62       458

    accuracy                           0.75      1758
   macro avg       0.71      0.76      0.71      1758
weighted avg       0.80      0.75      0.76      1758

ROC AUC Score: 0.7580752435337589


## 2. Random Forest

In [None]:
rf_pipeline = ImbPipeline(steps=[
    ('preprocess', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('model', RandomForestClassifier(random_state=42))
])

rf_pipeline.fit(X_train, y_train)

y_pred2 = rf_pipeline.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred2))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred2))
print("\nClassification Report:\n", classification_report(y_test, y_pred2))
print("ROC AUC Score:", roc_auc_score(y_test, y_pred2))

Accuracy: 0.7696245733788396
/nConfusion Matrix:
 [[1102  198]
 [ 207  251]]
/nClassification Report:
               precision    recall  f1-score   support

           0       0.84      0.85      0.84      1300
           1       0.56      0.55      0.55       458

    accuracy                           0.77      1758
   macro avg       0.70      0.70      0.70      1758
weighted avg       0.77      0.77      0.77      1758

ROC AUC Score: 0.6978636210950622


### 3. SVM

In [23]:
svm_pipeline = ImbPipeline(steps=[
    ('preprocess', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('svc', SVC(probability=True,random_state=42))
])

svm_pipeline.fit(X_train, y_train)

y_pred3 = svm_pipeline.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred3))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred3))
print("\nClassification Report:\n", classification_report(y_test, y_pred3))
print("ROC AUC Score:", roc_auc_score(y_test, y_pred3))

Accuracy: 0.7616609783845278

Confusion Matrix:
 [[1004  296]
 [ 123  335]]

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.77      0.83      1300
           1       0.53      0.73      0.62       458

    accuracy                           0.76      1758
   macro avg       0.71      0.75      0.72      1758
weighted avg       0.80      0.76      0.77      1758

ROC AUC Score: 0.7518743701713134


### 4. Xgboost

In [17]:
xgb_pipeline = ImbPipeline(steps=[
    ('preprocess', preprocessor),   
    ('smote', SMOTE(random_state=42)),
    ('model', XGBClassifier(random_state = 42))
])

xgb_pipeline.fit(X_train, y_train)
y_pred4 = xgb_pipeline.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred4))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred4))
print("Classification Report:\n", classification_report(y_test, y_pred4))
print("ROC AUC Score:", roc_auc_score(y_test, y_pred4))

Accuracy: 0.764505119453925
Confusion Matrix:
 [[1098  202]
 [ 212  246]]
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.84      0.84      1300
           1       0.55      0.54      0.54       458

    accuracy                           0.76      1758
   macro avg       0.69      0.69      0.69      1758
weighted avg       0.76      0.76      0.76      1758

ROC AUC Score: 0.6908666442727578


## Hyperparameter Optimization (RandomizedSearchCV)


### 1. Logistic Regression HPO


In [25]:
lr_param_dist = {           
    'model__penalty': ['l1', 'l2'],
    'model__C': loguniform(1e-4, 10),
    'model__solver': ['liblinear', 'saga'],            
}

lr_search = RandomizedSearchCV(
    lr_pipeline, lr_param_dist,
    n_iter=50, cv=5, scoring='f1', n_jobs=-1,
    random_state=42, verbose=2
)

lr_search.fit(X_train, y_train)

print(f"Best CV F1: {lr_search.best_score_:.4f}")
print(f"Best params: {lr_search.best_params_}")


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best CV F1: 0.6342
Best params: {'model__C': np.float64(0.09163741808778776), 'model__penalty': 'l1', 'model__solver': 'liblinear'}




### 2. Random Forest HPO


In [20]:
n_estimators = [int(x) for x in np.linspace(start=100, stop=1000, num=10)]
max_depth = [int(x) for x in np.linspace(start=10, stop=110, num=11)]
min_samples_leaf = [1, 2, 4, 10, 20, 50, 100]
min_samples_split = [2, 3, 4, 5, 8, 10, 20, 50, 100, 200]
max_features = ['sqrt', 'log2', 0.3, 0.5]

rf_param_dist = {
    'model__n_estimators': n_estimators,
    'model__max_depth': max_depth,
    'model__min_samples_split': min_samples_split,
    'model__min_samples_leaf': min_samples_leaf,
    'model__max_features': max_features
}

rf_search = RandomizedSearchCV(
    rf_pipeline, rf_param_dist, cv=5, n_iter=50, scoring='f1', n_jobs=-1,
    random_state=42, verbose=2)

rf_search.fit(X_train, y_train)

print(f"Best CV F1: {rf_search.best_score_:.4f}")
print(f"Best params: {rf_search.best_params_}")


Fitting 5 folds for each of 50 candidates, totalling 250 fits




Best CV F1: 0.6394
Best params: {'model__n_estimators': 400, 'model__min_samples_split': 4, 'model__min_samples_leaf': 20, 'model__max_features': 'sqrt', 'model__max_depth': 60}


### 3. SVM HPO


In [26]:
svm_param_dist = {
    'svc__C': loguniform(1e-2, 100),
    'svc__kernel': ['linear', 'rbf', 'poly'],
    'svc__gamma': ['scale', 'auto']
}

svm_search = RandomizedSearchCV(
    svm_pipeline, svm_param_dist,
    n_iter=50, cv=5, scoring='f1', n_jobs=-1,
    random_state=42, verbose=2
)

svm_search.fit(X_train, y_train)

print(f"Best CV F1: {svm_search.best_score_:.4f}")
print(f"Best params: {svm_search.best_params_}")


Fitting 5 folds for each of 50 candidates, totalling 250 fits




Best CV F1: 0.6312
Best params: {'svc__C': np.float64(2.950706670790534), 'svc__gamma': 'auto', 'svc__kernel': 'rbf'}


### 4. XGBoost HPO


In [21]:
xgb_param_dist = {
    'model__n_estimators': [int(x) for x in np.linspace(start=50, stop=500, num=10)],
    'model__max_depth': [int(x) for x in np.linspace(start=3, stop=10, num=7)],
    'model__learning_rate': loguniform(0.1, 0.3)
}

xgb_search = RandomizedSearchCV(
    xgb_pipeline, xgb_param_dist,
    n_iter=50, cv=5, scoring='f1', n_jobs=-1,
    random_state=42, verbose=2
)

xgb_search.fit(X_train, y_train)

print(f"Best CV F1: {xgb_search.best_score_:.4f}")
print(f"Best params: {xgb_search.best_params_}")


Fitting 5 folds for each of 50 candidates, totalling 250 fits




Best CV F1: 0.6451
Best params: {'model__learning_rate': np.float64(0.10077933409688876), 'model__max_depth': 3, 'model__n_estimators': 50}


## Testing All Best Models and Comparison


In [27]:
# Test all best models

models = {
    'Logistic Regression': lr_search.best_estimator_,
    'Random Forest': rf_search.best_estimator_,
    'SVM': svm_search.best_estimator_,
    'XGBoost': xgb_search.best_estimator_
}

results = {}
for name, model in models.items():
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]
    
    results[name] = {
        'F1': f1_score(y_test, y_pred),
        'ROC-AUC': roc_auc_score(y_test, y_proba),
        'Accuracy': accuracy_score(y_test, y_pred)
    }
    
    print(f"\n{name} Test Results:")
    print('Confusion Matrix: ')
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))

# Summary table
results_df = pd.DataFrame(results).T
print("\n=== FINAL COMPARISON ===")
print(results_df.round(4))


Logistic Regression Test Results:
Confusion Matrix: 
[[952 348]
 [ 99 359]]
              precision    recall  f1-score   support

           0       0.91      0.73      0.81      1300
           1       0.51      0.78      0.62       458

    accuracy                           0.75      1758
   macro avg       0.71      0.76      0.71      1758
weighted avg       0.80      0.75      0.76      1758


Random Forest Test Results:
Confusion Matrix: 
[[1003  297]
 [ 124  334]]
              precision    recall  f1-score   support

           0       0.89      0.77      0.83      1300
           1       0.53      0.73      0.61       458

    accuracy                           0.76      1758
   macro avg       0.71      0.75      0.72      1758
weighted avg       0.80      0.76      0.77      1758


SVM Test Results:
Confusion Matrix: 
[[968 332]
 [113 345]]
              precision    recall  f1-score   support

           0       0.90      0.74      0.81      1300
           1       0.51 

#### Multiple classifiers were evaluated using F1-score and ROC-AUC due to class imbalance. Results show similar performance across models. Logistic Regression achieved the highest recall for churners, while Random Forest provided better precision. Model selection depends on retention cost strategy.

In [28]:
import joblib 

#save the model
joblib.dump(xgb_search.best_estimator_, 'best_model.pkl')

['best_model.pkl']

#### XGBoost was selected as the final churn prediction model due to its highest ROC-AUC score, stable probability ranking, and ability to capture non-linear feature interactions. Although Logistic Regression achieved slightly higher recall, XGBoost provides greater flexibility through threshold tuning, enabling cost-sensitive churn intervention strategies. This makes XGBoost more suitable for production deployment where business trade-offs vary over time.