In [41]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (accuracy_score,precision_score,recall_score,roc_auc_score,classification_report,roc_curve)
import matplotlib.pyplot as plt


In [3]:
# Load the customer churn CSV and preview its first five rows.
csv_path = "customer-churn.csv"
df = pd.read_csv(csv_path)
df.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
# Column overview (dtypes and non-null counts), then the class balance of the target.
df.info()
churn_counts = df["Churn"].value_counts()
churn_counts


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


Churn
No     5174
Yes    1869
Name: count, dtype: int64

In [5]:
# Distinct values present in the Partner column.
partner_values = df["Partner"].unique()
partner_values


array(['Yes', 'No'], dtype=object)

In [6]:
# Distinct values present in the gender column.
gender_values = df["gender"].unique()
gender_values


array(['Female', 'Male'], dtype=object)

In [7]:
# Distinct values present in the SeniorCitizen column (already numeric).
senior_values = df["SeniorCitizen"].unique()
senior_values


array([0, 1])

In [8]:
# Distinct values present in the Dependents column.
dependents_values = df["Dependents"].unique()
dependents_values


array(['No', 'Yes'], dtype=object)

In [9]:
# Summary statistics for the tenure column.
tenure_summary = df["tenure"].describe()
tenure_summary


count    7043.000000
mean       32.371149
std        24.559481
min         0.000000
25%         9.000000
50%        29.000000
75%        55.000000
max        72.000000
Name: tenure, dtype: float64

In [10]:
# Summary statistics for the MonthlyCharges column.
monthly_charges_summary = df["MonthlyCharges"].describe()
monthly_charges_summary


count    7043.000000
mean       64.761692
std        30.090047
min        18.250000
25%        35.500000
50%        70.350000
75%        89.850000
max       118.750000
Name: MonthlyCharges, dtype: float64

In [11]:
# TotalCharges should be numeric, yet pandas loaded it with object dtype.
total_charges = df["TotalCharges"]
total_charges.dtype



dtype('O')

In [12]:
# Count the rows whose TotalCharges is a single blank space (11 in this dataset).
blank_mask = df["TotalCharges"] == " "
blank_mask.sum()


np.int64(11)

In [13]:
# errors="coerce" turns unparseable entries (the blank strings) into NaN
# instead of raising.
total_charges_numeric = pd.to_numeric(df["TotalCharges"], errors="coerce")
df["TotalCharges"] = total_charges_numeric

In [14]:
# Drop every row that contains a missing value (the TotalCharges entries that
# were coerced to NaN). Rebinding instead of inplace=True avoids mutating the
# frame in place across cells and keeps the result chainable.
df = df.dropna()

In [15]:
# customerID is only an identifier, so it is removed before modelling.
# Rebind rather than mutate in place, and pass errors="ignore" so that
# re-running this cell after the column is already gone is a no-op instead
# of a KeyError.
df = df.drop(columns="customerID", errors="ignore")

In [16]:
# Re-inspect the frame after cleaning: row count, remaining columns and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7032 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7032 non-null   object 
 1   SeniorCitizen     7032 non-null   int64  
 2   Partner           7032 non-null   object 
 3   Dependents        7032 non-null   object 
 4   tenure            7032 non-null   int64  
 5   PhoneService      7032 non-null   object 
 6   MultipleLines     7032 non-null   object 
 7   InternetService   7032 non-null   object 
 8   OnlineSecurity    7032 non-null   object 
 9   OnlineBackup      7032 non-null   object 
 10  DeviceProtection  7032 non-null   object 
 11  TechSupport       7032 non-null   object 
 12  StreamingTV       7032 non-null   object 
 13  StreamingMovies   7032 non-null   object 
 14  Contract          7032 non-null   object 
 15  PaperlessBilling  7032 non-null   object 
 16  PaymentMethod     7032 non-null   object 
 17  

#### target encoding

In [17]:
# Encode the target: "Yes" -> 1, "No" -> 0.
# The dtype guard makes the cell safe to re-run: mapping an already-encoded
# column a second time would match no key and turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df["Churn"]):
    df["Churn"] = df["Churn"].map({"Yes": 1, "No": 0})

# Fail loudly if any label was neither "Yes" nor "No" (it would be NaN here).
assert df["Churn"].isin([0, 1]).all(), "unexpected label in Churn column"

In [18]:
# Spot-check the first ten encoded target values.
encoded_target = df["Churn"]
encoded_target.head(10)

0    0
1    0
2    1
3    0
4    1
5    1
6    0
7    0
8    1
9    0
Name: Churn, dtype: int64

#### categorical encoding

In [19]:
# One-hot encode the remaining categorical columns; drop_first=True omits the
# first level of each so the dummy columns are not perfectly collinear.
df = pd.get_dummies(data=df, drop_first=True)

#### data splitting

In [20]:
# Separate the target column from the predictors.
y = df["Churn"]
X = df.drop(columns="Churn")

# Hold out 20% of the rows for testing. Stratifying on y keeps the churn ratio
# the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


#### scaling

In [21]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits so no test information leaks in.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [34]:
def helper(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model
        Estimator exposing ``fit`` and ``predict`` plus either
        ``predict_proba`` or ``decision_function``. It is fitted in place.
    X_train, y_train
        Features and labels passed to ``model.fit``.
    X_test, y_test
        Held-out features and labels used for every metric.

    Returns
    -------
    dict
        Keys ``"Accuracy"``, ``"Precision"``, ``"Recall"`` and ``"ROC-AUC"``
        mapped to the corresponding metric values.
    """
    model.fit(X_train, y_train)
    predicted_labels = model.predict(X_test)

    # ROC-AUC ranks samples by a continuous score: use column 1 of
    # predict_proba when the model offers it, otherwise fall back to the raw
    # decision_function output.
    if not hasattr(model, "predict_proba"):
        ranking_scores = model.decision_function(X_test)
    else:
        ranking_scores = model.predict_proba(X_test)[:, 1]

    metrics = {}
    metrics["Accuracy"] = accuracy_score(y_test, predicted_labels)
    metrics["Precision"] = precision_score(y_test, predicted_labels)
    metrics["Recall"] = recall_score(y_test, predicted_labels)
    metrics["ROC-AUC"] = roc_auc_score(y_test, ranking_scores)
    return metrics


#### knn

In [30]:
# k-nearest-neighbours classifier (k=5), evaluated on the standardized features.
knn = KNeighborsClassifier(n_neighbors=5)
knn_scores = helper(
    model=knn,
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
)
knn_scores


{'Accuracy': 0.7533759772565742,
 'Precision': 0.5361930294906166,
 'Recall': 0.5347593582887701,
 'ROC-AUC': 0.766680816478664}

#### logistic regression

In [31]:
# Logistic regression with a raised iteration cap (max_iter=1000), evaluated
# on the standardized features.
log_reg = LogisticRegression(max_iter=1000)
log_scores = helper(
    model=log_reg,
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
)
log_scores


{'Accuracy': 0.8038379530916845,
 'Precision': 0.6475903614457831,
 'Recall': 0.5748663101604278,
 'ROC-AUC': 0.8356727976766699}

#### svm linear

In [32]:
# Linear-kernel SVM on the standardized features.
# probability=True makes SVC fit an internal cross-validated calibration whose
# data shuffling is random; fixing random_state keeps the probabilities (and
# therefore the reported ROC-AUC) reproducible across runs.
svm_linear = SVC(kernel="linear", probability=True, random_state=42)
svm_linear_scores = helper(svm_linear, X_train_scaled, X_test_scaled, y_train, y_test)
svm_linear_scores


{'Accuracy': 0.798862828713575,
 'Precision': 0.6408668730650154,
 'Recall': 0.553475935828877,
 'ROC-AUC': 0.8272773863571653}

#### svm rbf

In [36]:
# RBF-kernel SVM on the standardized features.
# probability=True makes SVC fit an internal cross-validated calibration whose
# data shuffling is random; fixing random_state keeps the probabilities (and
# therefore the reported ROC-AUC) reproducible across runs.
svm_rbf = SVC(kernel="rbf", probability=True, random_state=42)
svm_rbf_scores = helper(svm_rbf, X_train_scaled, X_test_scaled, y_train, y_test)
svm_rbf_scores


{'Accuracy': 0.7867803837953091,
 'Precision': 0.6258503401360545,
 'Recall': 0.4919786096256685,
 'ROC-AUC': 0.7908536995718819}

#### decision trees

In [38]:
# Single decision tree with a fixed seed, evaluated on the unscaled features.
dt = DecisionTreeClassifier(random_state=42)
dt_scores = helper(
    model=dt,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)
dt_scores

{'Accuracy': 0.7185501066098081,
 'Precision': 0.4701086956521739,
 'Recall': 0.4625668449197861,
 'ROC-AUC': 0.636638004669438}

#### naive bayes

In [39]:
# Gaussian naive Bayes with default settings, evaluated on the standardized features.
nb = GaussianNB()
nb_scores = helper(
    model=nb,
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
)
nb_scores


{'Accuracy': 0.644633972992182,
 'Precision': 0.4183937823834197,
 'Recall': 0.8636363636363636,
 'ROC-AUC': 0.8101824290395555}

#### random forest

In [42]:
# Random forest of 100 trees with a fixed seed, evaluated on the unscaled features.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_scores = helper(
    model=rf,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)
rf_scores

{'Accuracy': 0.7896233120113717,
 'Precision': 0.6258064516129033,
 'Recall': 0.5187165775401069,
 'ROC-AUC': 0.8164903116927489}

#### gradient boosting

In [43]:
# Gradient boosting with default hyperparameters and a fixed seed, evaluated
# on the unscaled features.
gb = GradientBoostingClassifier(random_state=42)
gb_scores = helper(
    model=gb,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)
gb_scores


{'Accuracy': 0.7953091684434968,
 'Precision': 0.6378205128205128,
 'Recall': 0.5320855614973262,
 'ROC-AUC': 0.8406619005958451}

#### comparing all

In [44]:
# Gather every model's metric dict into one table (one row per model) and
# rank the models by ROC-AUC, best first.
scores_by_model = {
    "KNN": knn_scores,
    "Logistic Regression": log_scores,
    "SVM Linear": svm_linear_scores,
    "SVM RBF": svm_rbf_scores,
    "Decision Tree": dt_scores,
    "Naive Bayes": nb_scores,
    "Random Forest": rf_scores,
    "Gradient Boosting": gb_scores,
}
results = pd.DataFrame.from_dict(scores_by_model, orient="index")

results.sort_values(by="ROC-AUC", ascending=False)


Unnamed: 0,Accuracy,Precision,Recall,ROC-AUC
Gradient Boosting,0.795309,0.637821,0.532086,0.840662
Logistic Regression,0.803838,0.64759,0.574866,0.835673
SVM Linear,0.798863,0.640867,0.553476,0.827277
Random Forest,0.789623,0.625806,0.518717,0.81649
Naive Bayes,0.644634,0.418394,0.863636,0.810182
SVM RBF,0.78678,0.62585,0.491979,0.790854
KNN,0.753376,0.536193,0.534759,0.766681
Decision Tree,0.71855,0.470109,0.462567,0.636638
