In [1]:
## Importing pandas
import pandas as pd

## Reading csv file 
churn = pd.read_csv('Churn_Data.csv')
churn.head()

Unnamed: 0,CustomerId,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCreditCard,IsActiveMember,EstimatedSalary,Churn
0,15634602,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,15647311,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,15619304,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,15701354,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,15737888,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [2]:
## Importing statsmodels
import statsmodels.formula.api as smf

## Building the logistics regresion 
logit_md = smf.logit(formula = 'Churn ~ Age + EstimatedSalary + CreditScore + Balance + NumOfProducts', data = churn)

## Extrating model results
results = logit_md.fit()
results.summary()

Optimization terminated successfully.
         Current function value: 0.460428
         Iterations 6


0,1,2,3
Dep. Variable:,Churn,No. Observations:,9986.0
Model:,Logit,Df Residuals:,9980.0
Method:,MLE,Df Model:,5.0
Date:,"Tue, 06 Jul 2021",Pseudo R-squ.:,0.0896
Time:,23:57:08,Log-Likelihood:,-4597.8
converged:,True,LL-Null:,-5050.3
Covariance Type:,nonrobust,LLR p-value:,2.183e-193

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-3.8613,0.225,-17.153,0.000,-4.302,-3.420
Age,0.0633,0.002,26.544,0.000,0.059,0.068
EstimatedSalary,6.219e-07,4.57e-07,1.361,0.173,-2.74e-07,1.52e-06
CreditScore,-0.0008,0.000,-2.832,0.005,-0.001,-0.000
Balance,4.912e-06,4.49e-07,10.936,0.000,4.03e-06,5.79e-06
NumOfProducts,-0.0355,0.045,-0.782,0.434,-0.125,0.054


In [3]:
## Importing RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

## Defining the input and target
X = churn[['Age', 'EstimatedSalary', 'CreditScore', 'Balance', 'NumOfProducts']]
Y = churn['Churn']

## Building the random forest model
RF_md = RandomForestClassifier().fit(X, Y)

## Predicting the churn of customer with the following
## Age = 50, EstimatedSalary = 100,000, CreditScore = 600
## Balance = 100,000, NumOfProducts = 2
new_customer = [[50, 100000, 600, 100000, 2]]

RF_md.predict_proba(new_customer)

array([[0.81, 0.19]])

In [4]:
from sklearn.preprocessing import MinMaxScaler

## Defining the input and target
X = churn[['Age', 'EstimatedSalary', 'CreditScore', 'Balance', 'NumOfProducts']]
scaler = MinMaxScaler(feature_range = (-1, 1)).fit(X)
X_trans = scaler.transform(X)
X_trans

array([[-0.35135135,  0.01346979,  0.076     , -1.        , -1.        ],
       [-0.37837838,  0.12541748,  0.032     , -0.33193704, -1.        ],
       [-0.35135135,  0.1393087 , -0.392     ,  0.27271435,  0.33333333],
       ...,
       [-0.51351351, -0.57921982,  0.436     , -1.        , -1.        ],
       [-0.35135135, -0.07114189,  0.688     , -0.40154738, -0.33333333],
       [-0.72972973, -0.61817154,  0.768     ,  0.03741555, -1.        ]])

In [5]:
## Importing svm and MinMaxScaler
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler

## Defining the input (standardize to -1 to 1 scale)
X = churn[['Age', 'EstimatedSalary', 'CreditScore', 'Balance', 'NumOfProducts']]
scaler = MinMaxScaler(feature_range = (-1, 1)).fit(X)
X = scaler.transform(X)

## Defining the target variable
Y = churn['Churn']

## Building the support vector machine model
SVM_md = SVC(kernel = 'linear', probability = True).fit(X, Y)

## Predicting the churn of customer with the following
## Age = 50, EstimatedSalary = 100,000, CreditScore = 600
## Balance = 100,000, NumOfProducts = 2
new_customer = [[50, 100000, 600, 100000, 2]]
new_customer = scaler.transform(new_customer)

SVM_md.predict_proba(new_customer)

array([[0.76457376, 0.23542624]])

In [7]:
## Importing all the needed libraries
import pandas as pd 
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score

## Reading the csv file 
churn = pd.read_csv('Churn_Data.csv')

## Defining the input variables (standardizing to [-1, 1])
X = churn[['Age', 'EstimatedSalary', 'CreditScore', 'Balance', 'NumOfProducts']]
scaler = MinMaxScaler(feature_range = (-1, 1)).fit(X)
X = scaler.transform(X)

## Defining the target variable
Y = churn['Churn']

## split into train (80%) and test (20%) 
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.20, random_state = None, stratify = Y)

########################
## Logisti Regression ##
########################

logit_md = LogisticRegression().fit(X_train, Y_train)

## Predicting on test dataset
logit_preds = logit_md.predict(X_test)
logit_preds = logit_preds[:, ]

## Change to 0-1 using cutoff = 0.3
logit_preds = np.where(logit_preds > 0.3, 1, 0)

## Reporting the confusion matrix, accuracy and recall
print('Logistic regression confusion matrix:', confusion_matrix(Y_test, logit_preds))
print('The accuracy of logistic regression is:', accuracy_score(Y_test, logit_preds))
print('The recall of logistic regression is:', recall_score(Y_test, logit_preds))

###################
## Random Forest ##
###################

RF_md = RandomForestClassifier(n_estimators = 500).fit(X_train, Y_train)

## Predicting on test dataset
RF_preds = RF_md.predict_proba(X_test)
RF_preds = RF_preds[:, 1]

## Change to 0-1 using cutoff = 0.3
RF_preds = np.where(RF_preds > 0.3, 1, 0)

## Reporting the confusion matrix, accuracy and recall
print('Random forest confusion matrix:', confusion_matrix(Y_test, RF_preds))
print('The accuracy of random forest is:', accuracy_score(Y_test, RF_preds))
print('The recall of random forest is:', recall_score(Y_test, RF_preds))

#########
## SVM ##
#########

SVM_md = SVC(kernel = 'linear', probability = True).fit(X_train, Y_train)

## Predicting on test dataset
SVM_preds = SVM_md.predict_proba(X_test)
SVM_preds = SVM_preds[:, 1]

## Change to 0-1 using cutoff = 0.3
SVM_preds = np.where(SVM_preds > 0.3, 1, 0)

## Reporting the confusion matrix, accuracy and recall
print('SVM confusion matrix:', confusion_matrix(Y_test, SVM_preds))
print('The accuracy of SVM is:', accuracy_score(Y_test, SVM_preds))
print('The recall of SVM is:', recall_score(Y_test, SVM_preds))

Logistic regression confusion matrix: [[1526   65]
 [ 372   35]]
The accuracy of logistic regression is: 0.7812812812812813
The recall of logistic regression is: 0.085995085995086
Random forest confusion matrix: [[1323  268]
 [ 170  237]]
The accuracy of random forest is: 0.7807807807807807
The recall of random forest is: 0.5823095823095823
SVM confusion matrix: [[1591    0]
 [ 407    0]]
The accuracy of SVM is: 0.7962962962962963
The recall of SVM is: 0.0
