# Customer Churn Prediction

## Data Preparation and Feature Engineering

Import required libraries. 

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import joblib

Read and inspect data.

In [156]:
data = pd.read_csv("Telco-Customer-Churn.csv")
print(data.head())
print(data.columns)
print(data.shape)

   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies        Contract Pape

Find categorical features where there are 2 options for the label encoder. 

In [157]:
print(data["gender"].unique())
print(data["Partner"].unique())
print(data["Dependents"].unique())
print(data["PhoneService"].unique())
print(data["PaperlessBilling"].unique())

print(data["SeniorCitizen"].unique())

['Female' 'Male']
['Yes' 'No']
['No' 'Yes']
['No' 'Yes']
['Yes' 'No']
[0 1]


TotalCharges feature is an object that has 11 ' ' (empty string values) in the dataset, therefore those 11 columns are omitted.

In [158]:
print(data["TotalCharges"].dtype)

object


In [159]:
print((data["TotalCharges"] == ' ').sum())
print(data[data["TotalCharges"] == " "])

11
      customerID  gender  SeniorCitizen Partner Dependents  tenure  \
488   4472-LVYGI  Female              0     Yes        Yes       0   
753   3115-CZMZD    Male              0      No        Yes       0   
936   5709-LVOEQ  Female              0     Yes        Yes       0   
1082  4367-NUYAO    Male              0     Yes        Yes       0   
1340  1371-DWPAZ  Female              0     Yes        Yes       0   
3331  7644-OMVMY    Male              0     Yes        Yes       0   
3826  3213-VVOLG    Male              0     Yes        Yes       0   
4380  2520-SGTTA  Female              0     Yes        Yes       0   
5218  2923-ARZLG    Male              0     Yes        Yes       0   
6670  4075-WKNIU  Female              0     Yes        Yes       0   
6754  2775-SEFEE    Male              0      No        Yes       0   

     PhoneService     MultipleLines InternetService       OnlineSecurity  ...  \
488            No  No phone service             DSL                  Yes  .

In [160]:
data = data.drop(data[data["TotalCharges"] == " "].index)

print((data["TotalCharges"] == " ").sum())
print(data.shape)

0
(7032, 21)


Separate X and y from the data, drop customerID since it is not needed in the model. 

In [161]:
X = data.drop("Churn", axis=1)
y = data["Churn"]

X = X.drop("customerID", axis=1)

For the categorical variables that have 2 options, map them to 0 and 1's by the label encoder.

In [162]:
le = LabelEncoder()

le_cols = ["gender", "Partner", "Dependents", "PhoneService", "PaperlessBilling"]

for col in le_cols:
    X[col] = le.fit_transform(X[col])

print(X.head())

   gender  SeniorCitizen  Partner  Dependents  tenure  PhoneService  \
0       0              0        1           0       1             0   
1       1              0        0           0      34             1   
2       1              0        0           0       2             1   
3       1              0        0           0      45             0   
4       0              0        0           0       2             1   

      MultipleLines InternetService OnlineSecurity OnlineBackup  \
0  No phone service             DSL             No          Yes   
1                No             DSL            Yes           No   
2                No             DSL            Yes          Yes   
3  No phone service             DSL            Yes           No   
4                No     Fiber optic             No           No   

  DeviceProtection TechSupport StreamingTV StreamingMovies        Contract  \
0               No          No          No              No  Month-to-month   
1             

For the categorical features that have more than 2 options, use the one hot encoder to make those features processible.

In [163]:
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False).set_output(transform="pandas")
 
ohe_cols = ["MultipleLines", "InternetService", "OnlineSecurity", "OnlineBackup",
           "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies", "Contract", "PaymentMethod"]

ohe_transformed = ohe.fit_transform(X[ohe_cols])

print(ohe_transformed.columns)

Index(['MultipleLines_No', 'MultipleLines_No phone service',
       'MultipleLines_Yes', 'InternetService_DSL',
       'InternetService_Fiber optic', 'InternetService_No',
       'OnlineSecurity_No', 'OnlineSecurity_No internet service',
       'OnlineSecurity_Yes', 'OnlineBackup_No',
       'OnlineBackup_No internet service', 'OnlineBackup_Yes',
       'DeviceProtection_No', 'DeviceProtection_No internet service',
       'DeviceProtection_Yes', 'TechSupport_No',
       'TechSupport_No internet service', 'TechSupport_Yes', 'StreamingTV_No',
       'StreamingTV_No internet service', 'StreamingTV_Yes',
       'StreamingMovies_No', 'StreamingMovies_No internet service',
       'StreamingMovies_Yes', 'Contract_Month-to-month', 'Contract_One year',
       'Contract_Two year', 'PaymentMethod_Bank transfer (automatic)',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check'],
      dtype='object')


In [164]:
X_for_model = pd.concat([X, ohe_transformed], axis=1).drop(columns = ohe_cols)

print(X_for_model.head())
print(X_for_model.shape)
print(X_for_model.columns)

   gender  SeniorCitizen  Partner  Dependents  tenure  PhoneService  \
0       0              0        1           0       1             0   
1       1              0        0           0      34             1   
2       1              0        0           0       2             1   
3       1              0        0           0      45             0   
4       0              0        0           0       2             1   

   PaperlessBilling  MonthlyCharges TotalCharges  MultipleLines_No  ...  \
0                 1           29.85        29.85               0.0  ...   
1                 0           56.95       1889.5               1.0  ...   
2                 1           53.85       108.15               1.0  ...   
3                 0           42.30      1840.75               0.0  ...   
4                 1           70.70       151.65               1.0  ...   

   StreamingMovies_No  StreamingMovies_No internet service  \
0                 1.0                                  0.0  

In [165]:
y_for_model = le.fit_transform(y)

print(y_for_model)
print(y_for_model.size)

[0 0 1 ... 0 1 0]
7032


Split the data as train and standardize the data to ensure the convergence for the model.

In [166]:
X_train, X_test, y_train, y_test = train_test_split(X_for_model, y_for_model, test_size=0.2, random_state=42)

print(X_for_model.shape, y_for_model.shape, X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(7032, 40) (7032,) (5625, 40) (1407, 40) (5625,) (1407,)


In [167]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Logistic Regression Model 

In [168]:
lr_model = LogisticRegression()

lr_model.fit(X_train_scaled, y_train)

In [169]:
accuracy = lr_model.score(X_test_scaled, y_test)

print(f"Test set accuracy: {accuracy}")

Test set accuracy: 0.7867803837953091


In [170]:
X_combined = np.vstack([X_train_scaled, X_test_scaled])
y_combined = np.hstack([y_train, y_test])

print(X_combined.shape)
print(y_combined.shape)

(7032, 40)
(7032,)


Check the cross validation scores for the default model.

In [171]:
cv_scores = cross_val_score(lr_model, X_combined, y_combined, cv=5, scoring="accuracy")
print(f"Cross Validation scores: {cv_scores}")
print(f"Mean Cross Validation Accuracy: {cv_scores.mean():.5f}")

Cross Validation scores: [0.81449893 0.78891258 0.79943101 0.82432432 0.78662873]
Mean Cross Validation Accuracy: 0.80276


Use a grid search to find the best configurations for the Logistic Regression model.

In [172]:
parameter_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['lbfgs', 'saga'],  
    'max_iter': [100, 200, 500]     
} 

grid_search = GridSearchCV(lr_model, parameter_grid, cv=5, scoring="accuracy", n_jobs=-1)
grid_search.fit(X_combined, y_combined)

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Score: {grid_search.best_score_:.5f}")

Best Parameters: {'C': 100, 'max_iter': 200, 'solver': 'saga'}
Best Cross-Validation Score: 0.80447


Test the best model that is determined from the grid search in test data.

In [173]:
best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test_scaled)

bm_test_accuracy = accuracy_score(y_test, y_pred)
print(f"Best Model Test Set Accuracy: {bm_test_accuracy:.5f}")

Best Model Test Set Accuracy: 0.79033


Evaluate and report the results from the Logistic Regression model, and save the best model found. 

In [174]:
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.89      0.86      1033
           1       0.63      0.52      0.57       374

    accuracy                           0.79      1407
   macro avg       0.73      0.70      0.71      1407
weighted avg       0.78      0.79      0.78      1407



In [175]:
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Confusion Matrix:
[[919 114]
 [181 193]]


In [176]:
joblib.dump(best_model, 'logistic_regression_model.pkl')

['logistic_regression_model.pkl']