In [36]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    accuracy_score
)

In [38]:
# Read the churn training data from its CSV export into a DataFrame.
# NOTE(review): the path is anchored at the filesystem root; confirm it
# matches the environment this notebook is run in.
csv_path = "/customer_churn_dataset-training-master.csv"
df = pd.read_csv(csv_path)

# Preview the first rows to sanity-check the columns and their values.
df.head()

Unnamed: 0,CustomerID,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction,Churn
0,2.0,30.0,Female,39.0,14.0,5.0,18.0,Standard,Annual,932.0,17.0,1.0
1,3.0,65.0,Female,49.0,1.0,10.0,8.0,Basic,Monthly,557.0,6.0,1.0
2,4.0,55.0,Female,14.0,4.0,6.0,18.0,Basic,Quarterly,185.0,3.0,1.0
3,5.0,58.0,Male,38.0,21.0,7.0,7.0,Standard,Monthly,396.0,29.0,1.0
4,6.0,23.0,Male,32.0,20.0,5.0,8.0,Basic,Monthly,617.0,20.0,1.0


Basic Cleaning

In [39]:
# Discard every row that contains at least one missing value, then store
# CustomerID as text so it is handled as a label rather than a quantity.
# NOTE(review): the preview shows float IDs (e.g. 2.0), so the resulting
# strings presumably keep the ".0" suffix — confirm that is acceptable.
df = df.dropna().assign(
    CustomerID=lambda frame: frame["CustomerID"].astype(str)
)

# (rows, columns) remaining after cleaning.
df.shape

(440832, 12)

Target Variable

In [40]:
# Validate the target BEFORE casting: astype(int) on its own would silently
# truncate an unexpected value (e.g. 0.5 -> 0) instead of flagging it.
unexpected_churn = df.loc[~df["Churn"].isin([0, 1]), "Churn"]
if not unexpected_churn.empty:
    raise ValueError(
        "Churn must contain only 0 or 1; found unexpected values: "
        f"{unexpected_churn.unique()[:10].tolist()}"
    )

# Store the validated labels as integers.
df["Churn"] = df["Churn"].astype(int)

# Modelling target: a copy of the 0/1 Churn column under its own name.
df["Churn_binary"] = df["Churn"]

# Class balance of the target.
df["Churn_binary"].value_counts()

Unnamed: 0_level_0,count
Churn_binary,Unnamed: 1_level_1
1,249999
0,190833


Feature Selection

In [41]:
# Predictor columns grouped by kind. The concatenation below fixes the column
# order of X: numeric columns first, then the categorical ones.
numeric_features = [
    "Age",
    "Tenure",
    "Usage Frequency",
    "Support Calls",
    "Payment Delay",
    "Total Spend",
]
categorical_features = ["Gender", "Subscription Type", "Contract Length"]

# Feature matrix and target vector.
X = df[numeric_features + categorical_features]
y = df["Churn_binary"]

Encode Categorical Variables

In [42]:
# One-hot encode the categorical columns of X. With drop_first=True each
# k-level variable contributes k-1 indicator columns (its first level is
# omitted and acts as the reference level).
X_encoded = pd.get_dummies(data=X, drop_first=True)

# (rows, columns) of the encoded feature matrix.
X_encoded.shape

(440832, 11)

Train–Test Split

In [43]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions aligned between the two partitions, and the fixed random_state
# makes the split repeatable.
split_parts = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

# (rows, columns) of the training and test feature matrices.
(X_train.shape, X_test.shape)

((352665, 11), (88167, 11))

Train Logistic Regression Model

In [44]:
# Settings for the baseline classifier; anything not listed here keeps the
# scikit-learn default.
# NOTE(review): the features are passed in unscaled — confirm the solver
# converges within max_iter without emitting a ConvergenceWarning.
# NOTE(review): verify that n_jobs has any effect for a two-class problem in
# the installed scikit-learn version.
logreg_options = {"max_iter": 1000, "n_jobs": -1}
log_model = LogisticRegression(**logreg_options)

# Fit on the training partition. fit() returns the estimator itself, so the
# fitted model is also what the cell displays.
log_model.fit(X=X_train, y=y_train)

Predictions

In [45]:
# Predicted class label for every row of the held-out test partition.
y_pred = log_model.predict(X=X_test)

# Peek at the first ten predictions.
y_pred[0:10]

array([1, 0, 1, 0, 0, 1, 0, 0, 1, 1])

# **Model Evaluation**

Accuracy

In [46]:
# Fraction of test rows whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.889584538432747

Confusion Matrix

In [47]:
# Rows are the true classes and columns the predicted classes, both in
# sorted label order (0, then 1).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[34362,  3805],
       [ 5930, 44070]])

Classification Report

In [48]:
# Per-class precision, recall, F1 and support, followed by the averaged rows.
# The report is a pre-formatted string, so it is printed rather than displayed.
evaluation_report = classification_report(y_true=y_test, y_pred=y_pred)
print(evaluation_report)

              precision    recall  f1-score   support

           0       0.85      0.90      0.88     38167
           1       0.92      0.88      0.90     50000

    accuracy                           0.89     88167
   macro avg       0.89      0.89      0.89     88167
weighted avg       0.89      0.89      0.89     88167



## Model Evaluation Summary

- Logistic Regression was used to predict customer churn.
- The model provides a reasonable baseline: about 89% accuracy on the held-out 20% test split (88,167 customers).
- Recall for churned customers (class 1) is particularly important, as it reflects the model’s ability to identify customers at risk; here it is 0.88, meaning 5,930 of the 50,000 churners in the test set were missed.
- This model can help businesses proactively target customers likely to churn.
