In [5]:
import pandas as pd 
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

## Reading the data-file 
customer_churn = pd.read_csv('Customer_Churn.csv')
customer_churn.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [6]:
## Creating Churn_numb
customer_churn['Customer_Churn'] = np.where(customer_churn['Churn'] == 'No', 0, 1)
customer_churn.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,Customer_Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No,0
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,No,No,No,One year,No,Mailed check,56.95,1889.5,No,0
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No,0
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1


In [7]:
customer_churn['Customer_Churn'].value_counts() / customer_churn.shape[0]

0    0.73463
1    0.26537
Name: Customer_Churn, dtype: float64

In [11]:
## Defining the input and target variables
X = customer_churn[['tenure', 'MonthlyCharges']]
Y = customer_churn['Customer_Churn']

## Splitting the data 
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)

## Scaling the data 
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [9]:
Y_train.value_counts() / Y_train.shape[0]

0    0.734647
1    0.265353
Name: Customer_Churn, dtype: float64

In [10]:
Y_test.value_counts() / Y_test.shape[0]

0    0.734564
1    0.265436
Name: Customer_Churn, dtype: float64

# Logistic Regression

In [12]:
## Building the model
logit_md = LogisticRegression().fit(X_train, Y_train)

## Predicting on test
logit_pred = logit_md.predict_proba(X_test)[:, 1]

## Changing likelihoods to labels
logit_label = np.where(logit_pred < 0.4, 0, 1)

print('The accuracy of the logistic regression is', accuracy_score(Y_test, logit_label))
print('The recall of the logistic regression is', recall_score(Y_test, logit_label))

The accuracy of the logistic regression is 0.7714691270404542
The recall of the logistic regression is 0.5802139037433155


# Random Forest

In [13]:
## Building the model 
RF_md = RandomForestClassifier(n_estimators = 500).fit(X_train, Y_train)

## Predicting on test
RF_pred = RF_md.predict_proba(X_test)[:, 1]

## Changing likelihoods to labels
RF_label = np.where(RF_pred < 0.4, 0, 1)

print('The accuracy of the random forest is', accuracy_score(Y_test, RF_label))
print('The recall of the random forest is', recall_score(Y_test, RF_label))

The accuracy of the random forest is 0.7253371185237757
The recall of the random forest is 0.5374331550802139


# Support Vector Machine

In [14]:
## Building the model 
svm_md = SVC(kernel = 'linear', probability = True).fit(X_train, Y_train)

## Predicting on test
svm_pred = svm_md.predict_proba(X_test)[:, 1]

## Changing likelihoods to labels
svm_label = np.where(svm_pred < 0.4, 0, 1)

print('The accuracy of the support vector machine is', accuracy_score(Y_test, svm_label))
print('The recall of the support vector machine is', recall_score(Y_test, svm_label))

The accuracy of the support vector machine is 0.7686302342086586
The recall of the support vector machine is 0.56951871657754


In [None]:
## Based on my results, I would use logistic regression to predict customer churn.