In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, recall_score

## Reading the data-file 
customer_churn = pd.read_csv('Customer_Churn.csv')
customer_churn.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [2]:
## Creating Churn_numb
customer_churn['Churn_numb'] = np.where(customer_churn['Churn'] == 'No', 0, 1)
customer_churn.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,Churn_numb
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No,0
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,No,No,No,One year,No,Mailed check,56.95,1889.5,No,0
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No,0
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1


In [5]:
## Defining the input and target
X = customer_churn[['SeniorCitizen', 'tenure', 'MonthlyCharges']]
Y = customer_churn['Churn_numb']

customer_churn['Churn_numb'].value_counts() / customer_churn.shape[0]

0    0.73463
1    0.26537
Name: Churn_numb, dtype: float64

In [6]:
## Splitting the data 
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)

In [9]:
Y_train.value_counts() / Y_train.shape[0]

0    0.734647
1    0.265353
Name: Churn_numb, dtype: float64

In [10]:
Y_test.value_counts() / Y_test.shape[0]

0    0.734564
1    0.265436
Name: Churn_numb, dtype: float64

# Logistic Regression

In [13]:
## Building the model 
logit_md = LogisticRegression().fit(X_train, Y_train)

## Predicting on the test dataset
logi_pred = logit_md.predict_proba(X_test)[:, 1]

## Changing likelihoods into labels
logi_label = np.where(logi_pred < 0.35, 0, 1)

print('The accuracy of the logistic model is:', accuracy_score(Y_test, logi_label))
print('The recall of the logistic model is:', recall_score(Y_test, logi_label))

The accuracy of the logistic model is: 0.7480482611781405
The recall of the logistic model is: 0.6149732620320856


# Random Forest

In [14]:
## Building the model 
RF_md = RandomForestClassifier().fit(X_train, Y_train)

## Predicting on the test dataset
RF_pred = RF_md.predict_proba(X_test)[:, 1]

## Channing likelihoods into labels
RF_label = np.where(RF_pred < 0.35, 0, 1)

print('The accuracy of the RF model is:', accuracy_score(Y_test, RF_label))
print('The recall of the RF model is:', recall_score(Y_test, RF_label))

The accuracy of the RF model is: 0.7409510290986515
The recall of the RF model is: 0.5454545454545454


# Gradient Boosting

In [15]:
## Building the model
GBC_md = GradientBoostingClassifier().fit(X_train, Y_train)

## Predicting on the test dataset
GBC_pred = GBC_md.predict_proba(X_test)[:, 1]

## Changing likelihoods into labels 
GBC_label = np.where(GBC_pred < 0.35, 0, 1)

print('The accuracy of the GBC model is:', accuracy_score(Y_test, GBC_label))
print('The recall of the GB model is:', recall_score(Y_test, GBC_label))

The accuracy of the GBC model is: 0.7693399574166075
The recall of the GB model is: 0.6310160427807486


In [16]:
# Based on my results, I would use the gradient boosting model to predict customer churn.