In [1]:
!pip install numpy pandas matplotlib scikit-learn



In [2]:
!python -m pip install --upgrade pip



In [3]:
# Celll 1 - Import libraies and making result folder
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix, roc_auc_score)
import joblib

os.makedirs("results", exist_ok=True)

In [7]:
# Cell 2 - Loding Data Set and performing quice EDA
path = r'C:\Users\Love\Desktop\my_churn_analysis\churn-bigml-80.csv'
df = pd.read_csv(path)

print("Shape:", df.shape)
display(df.head())
print("\nColumns and dtypes:\n", df.dtypes)
print("\nMissing values per column:\n", df.isnull().sum())
print("\nTarget distribution (Churn):\n", df['Churn'].value_counts())

Shape: (2666, 20)


Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,KS,128,415,No,Yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,No,Yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,No,No,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,Yes,No,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,Yes,No,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False



Columns and dtypes:
 State                      object
Account length              int64
Area code                   int64
International plan         object
Voice mail plan            object
Number vmail messages       int64
Total day minutes         float64
Total day calls             int64
Total day charge          float64
Total eve minutes         float64
Total eve calls             int64
Total eve charge          float64
Total night minutes       float64
Total night calls           int64
Total night charge        float64
Total intl minutes        float64
Total intl calls            int64
Total intl charge         float64
Customer service calls      int64
Churn                        bool
dtype: object

Missing values per column:
 State                     0
Account length            0
Area code                 0
International plan        0
Voice mail plan           0
Number vmail messages     0
Total day minutes         0
Total day calls           0
Total day charge          0
Tot

In [5]:
# Cell 3 - Preprocessing 
df2 = df.copy()

if 'State' in df2.columns:
    df2 = df2.drop(columns=['State'])

df2['Internaational plan'] = df2['International plan'].map({'No':0, 'Yes':1})
df2['Voice mail plan'] = df2['Voice mail plan']. map({'No':0, 'Yes':1})

df2['Churn'] = df2['Churn'].map({False:0, True:1})

df2 = pd.get_dummies(df2, columns=['Area code'], prefix='Area', drop_first=True)

print("processed shape:", df2.shape)
display(df2.head())

processed shape: (2666, 21)


Unnamed: 0,Account length,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,...,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn,Internaational plan,Area_415,Area_510
0,128,No,1,25,265.1,110,45.07,197.4,99,16.78,...,91,11.01,10.0,3,2.7,1,0,0,True,False
1,107,No,1,26,161.6,123,27.47,195.5,103,16.62,...,103,11.45,13.7,3,3.7,1,0,0,True,False
2,137,No,0,0,243.4,114,41.38,121.2,110,10.3,...,104,7.32,12.2,5,3.29,0,0,0,True,False
3,84,Yes,0,0,299.4,71,50.9,61.9,88,5.26,...,89,8.86,6.6,7,1.78,2,0,1,False,False
4,75,Yes,0,0,166.7,113,28.34,148.3,122,12.61,...,121,8.41,10.1,3,2.73,3,0,1,True,False


In [8]:
# Cell 4 - rain-test split and scaling 
X = df.drop(columns=['Churn'])
y = df['Churn']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[numeric_cols])
X_test_scaled = scaler.transform(X_test[numeric_cols])

print("Train:", X_train.shape, "Test:", X_test.shape)

Train: (2132, 19) Test: (534, 19)


In [21]:
# Cell 5 - Train 3 models logistic, Decision Tree, RandomForest for lightweight run
models = {
    "LogisticRegression": LogisticRegression(solver='liblinear', class_weight='balanced', random_state=42),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=50, random_state=42)
}

results = {}
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    preds = model.predict(X_test_scaled)
    probs = model.predict_proba(X_test_scaled)[:,1] if hasattr(model, "predict_proba") else None

    results[name] = {
        "accuracy": accuracy_score(y_test, preds),
        "precision": precision_score(y_test, preds, zero_division=0),
        "recall": recall_score(y_test, preds),
        "f1": f1_score(y_test, preds),
        "roc_auc": roc_auc_score(y_test, probs) if probs is not None else None,
        "confusion_matrix": confusion_matrix(y_test, preds),
        "report": classification_report(y_test, preds)
    }

for k, v in results.items():
    print(f"\n== {k} ==")
    print("Accuracy:", v['accuracy'])
    print("Precision:", v['precision'])
    print("Recall:", v['recall'])
    print("F1:", v['f1'])
    print("ROC AUC:", v['roc_auc'])
    print("Confusion matrix:\n", v['confusion_matrix'])


== LogisticRegression ==
Accuracy: 0.6910112359550562
Precision: 0.2648648648648649
Recall: 0.6282051282051282
F1: 0.3726235741444867
ROC AUC: 0.6937134502923976
Confusion matrix:
 [[320 136]
 [ 29  49]]

== DecisionTree ==
Accuracy: 0.850187265917603
Precision: 0.48863636363636365
Recall: 0.5512820512820513
F1: 0.5180722891566265
ROC AUC: 0.7262989203778677
Confusion matrix:
 [[411  45]
 [ 35  43]]

== RandomForest ==
Accuracy: 0.9250936329588015
Precision: 1.0
Recall: 0.48717948717948717
F1: 0.6551724137931034
ROC AUC: 0.8387033288349077
Confusion matrix:
 [[456   0]
 [ 40  38]]


In [25]:
# Cell 6 - Save confusion matrix figures matplotlib only and model
import matplotlib.pyplot as plt
os.makedirs("results/figures", exist_ok=True)

for name, v in results.items():
    cm = v['confusion_matrix']
    plt.figure()
    plt.imshow(cm, interpolation='nearest')
    plt.title(f'Confusion Matrix -{name}')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    for (i, j), val in np.ndenumerate(cm):
        plt.text(j, i, int(val), ha='center', va='center')
    plt.savefig(f"results/figures/confudion_{name}.png")
    plt.close()

joblib.dump(scaler, "results/scaler.pk1")
joblib.dump(models['RandomForest'], "results/random_forest_50.pk1")

print("Saved figures & model to results/ folder.")

Saved figures & model to results/ folder.
