In [1]:
import pickle

import pandas as pd
import numpy as np
import matplotlib.ticker as mticker
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Load the customer churn table; the customerID column becomes the row index.
df = pd.read_csv(
    "../data/customer_churn_data.csv",
    index_col="customerID",
)
# Preview the first rows to sanity-check the columns that were read.
df.head()

Unnamed: 0_level_0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
customerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
# Process and clean data
#   1. Parse TotalCharges as numbers; errors="coerce" turns any entry that
#      cannot be parsed into NaN instead of raising.
#   2. Remove the literal text " (automatic)" from the PaymentMethod labels.
#   3. Drop every row that contains a missing value in any column.
df = df.assign(
    TotalCharges=lambda frame: pd.to_numeric(frame["TotalCharges"], errors="coerce"),
    PaymentMethod=lambda frame: frame["PaymentMethod"].str.replace(
        " (automatic)", "", regex=False
    ),
).dropna()

In [4]:
# Label Encoding
# Convert the two-valued text columns to 0/1 integers.
# Series.map yields NaN for any value that is not a key of the mapping, so
# running this cell a second time on already-encoded columns would blank them.
features = ["Partner", "Dependents", "PhoneService", "Churn", "PaperlessBilling"]
yes_no_codes = {"Yes": 1, "No": 0}
gender_codes = {"Female": 1, "Male": 0}

for col in features:
    df[col] = df[col].map(yes_no_codes)

# gender uses its own vocabulary, so it gets a separate mapping.
df["gender"] = df["gender"].map(gender_codes)

In [5]:
# One Hot Encoding
# Each column listed below is replaced by one indicator column per distinct
# value it holds; all other columns pass through pd.get_dummies unchanged.
features = [
    "MultipleLines",
    "InternetService",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
    "Contract",
    "PaymentMethod",
]

df = pd.get_dummies(data=df, columns=features)

In [6]:
# Scale Down Numerical Features
# Rescale the continuous columns with MinMaxScaler (default output range
# [0, 1]); every other column is carried over untouched.
# NOTE(review): the scaler is fitted on all rows here, before the train/test
# split performed in a later cell, so test-set minima/maxima influence the
# scaling. Consider fitting on the training rows only.
features = ["tenure", "MonthlyCharges", "TotalCharges"]

df_base = df.drop(columns=features)
df_numerical = df.reindex(columns=features)

# Learn the per-column min/max, then apply the transform to the same rows.
scaler = MinMaxScaler().fit(df_numerical)
transformed_numerical = scaler.transform(df_numerical)

# The scaler returns a bare array; restore the column labels and row index.
scaled_df = pd.DataFrame(transformed_numerical, index=df.index, columns=features)

# Scaled columns first, followed by the untouched remainder.
df = pd.concat([scaled_df, df_base], axis=1)

In [7]:
# Create 80/20 Train/Test split
# Features are every column except the Churn target.
X = df.drop(columns="Churn")
y = df["Churn"]

# stratify=y keeps the churn ratio the same in both partitions; the fixed
# random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# max_iter=1000 raises the solver's iteration cap above scikit-learn's default.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Hard class predictions and per-class probabilities for the held-out rows.
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

print(f"Train Accuracy: {model.score(X_train,y_train)}")
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
# symmetric, but keeping the documented order avoids silent mistakes if this
# line is ever changed to an asymmetric metric such as precision or recall.
print(f"Test Accuracy: {accuracy_score(y_test, y_pred)}")

Train Accuracy: 0.8044444444444444
Test Accuracy: 0.8024164889836531


In [8]:
# Persist the fitted classifier to disk with pickle (imported in the top
# import cell together with the other dependencies).
# NOTE(review): only `model` is written here; the fitted `scaler` from the
# scaling cell is not saved, so any code that loads this file must reproduce
# the same preprocessing - confirm the scaler is persisted elsewhere.
model_file = "churn_model.pkl"
with open(model_file, "wb") as file:
    pickle.dump(model, file)