In [1]:
# 1. True
# 2. False

In [3]:
import pandas as pd 
import numpy as np 

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, recall_score

# Reading the data
churn = pd.read_csv('Customer_Churn.csv')
churn.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
churn['Churn_numb'] = churn['Churn'].map({'No': 0, 'Yes': 1})
churn.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,Churn_numb
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No,0
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,No,No,No,One year,No,Mailed check,56.95,1889.5,No,0
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No,0
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1


# Defining input and target

In [5]:
X = churn[['SeniorCitizen', 'tenure', 'MonthlyCharges']]
Y = churn['Churn_numb']

In [7]:
Y.value_counts(normalize = True)

0    0.73463
1    0.26537
Name: Churn_numb, dtype: float64

# Splitting the data

In [8]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)

# Logistic Regression

In [10]:
# Building the model
logit_md = LogisticRegression().fit(X_train, Y_train)

# Predicting on the test dataset 
logit_pred = logit_md.predict_proba(X_test)[:, 1]

# Changing likelihood to labels
logit_label = np.where(logit_pred < 0.35, 0, 1)

print(f"The LR accuracy is {accuracy_score(Y_test, logit_label)}")
print(f"The LR recall is {recall_score(Y_test, logit_label)}")

The LR accuracy is 0.7757274662881476
The LR recall is 0.6149732620320856


# Random Forest

In [11]:
# Building the model
RF_md = RandomForestClassifier().fit(X_train, Y_train)

# Predicting on the test dataset 
RF_pred = RF_md.predict_proba(X_test)[:, 1]

# Changing likelihood to labels
RF_label = np.where(RF_pred < 0.35, 0, 1)

print(f"The RF accuracy is {accuracy_score(Y_test, RF_label)}")
print(f"The RF recall is {recall_score(Y_test, RF_label)}")

The RF accuracy is 0.7430801987224982
The RF recall is 0.5721925133689839


# Gradient Boosting

In [13]:
# Building the model
GB_md = GradientBoostingClassifier().fit(X_train, Y_train)

# Predicting on the test dataset 
GB_pred = GB_md.predict_proba(X_test)[:, 1]

# Changing likelihood to labels
GB_label = np.where(GB_pred < 0.35, 0, 1)

print(f"The GB accuracy is {accuracy_score(Y_test, GB_label)}")
print(f"The GB recall is {recall_score(Y_test, GB_label)}")

The GB accuracy is 0.7707594038325053
The GB recall is 0.6122994652406417


In [14]:
# I would use the logistic regression model to predict churn because it has higher accuracy and recall.