# Logistic Regression 

In [159]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [160]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [161]:
# Load Data
df = pd.read_csv('customer_churn_dataset.csv')
df.head()

Unnamed: 0,CustomerID,Tenure,MonthlyCharges,TotalCharges,ContractType,TechSupport,Complaints,Churn
0,1,39,74.42,4265.19,One Year,Yes,No,No
1,2,52,56.04,2302.31,Monthly,No,No,Yes
2,3,29,21.06,567.51,Monthly,Yes,Yes,No
3,4,15,95.38,1917.01,Two Year,No,Yes,No
4,5,43,65.06,3377.32,One Year,No,No,No


In [162]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58 entries, 0 to 57
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   CustomerID      58 non-null     int64  
 1   Tenure          58 non-null     int64  
 2   MonthlyCharges  58 non-null     float64
 3   TotalCharges    58 non-null     float64
 4   ContractType    58 non-null     object 
 5   TechSupport     58 non-null     object 
 6   Complaints      58 non-null     object 
 7   Churn           58 non-null     object 
dtypes: float64(2), int64(2), object(4)
memory usage: 3.8+ KB


In [163]:
LE = LabelEncoder()
df['Complaints'] = LE.fit_transform(df['Complaints'])
df['TechSupport'] = LE.fit_transform(df['TechSupport'])                                   

In [164]:
df['ContractType'].value_counts()

ContractType
One Year    20
Two Year    20
Monthly     18
Name: count, dtype: int64

In [165]:
df = pd.get_dummies(df, columns=['ContractType'], drop_first=True)

In [166]:
df.head()

Unnamed: 0,CustomerID,Tenure,MonthlyCharges,TotalCharges,TechSupport,Complaints,Churn,ContractType_One Year,ContractType_Two Year
0,1,39,74.42,4265.19,1,0,No,True,False
1,2,52,56.04,2302.31,0,0,Yes,False,False
2,3,29,21.06,567.51,1,1,No,False,False
3,4,15,95.38,1917.01,0,1,No,False,True
4,5,43,65.06,3377.32,0,0,No,True,False


In [167]:
# Split dataset into input and output variables
x = df.drop(columns=['CustomerID','Churn'])
y = df['Churn'].apply(lambda x: 1 if x=='Yes' else 0)

In [168]:
x.head()

Unnamed: 0,Tenure,MonthlyCharges,TotalCharges,TechSupport,Complaints,ContractType_One Year,ContractType_Two Year
0,39,74.42,4265.19,1,0,True,False
1,52,56.04,2302.31,0,0,False,False
2,29,21.06,567.51,1,1,False,False
3,15,95.38,1917.01,0,1,False,True
4,43,65.06,3377.32,0,0,True,False


In [169]:
y.head()

0    0
1    1
2    0
3    0
4    0
Name: Churn, dtype: int64

In [170]:
x_train,x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 3)

In [171]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.fit_transform(x_test)

In [172]:
x_train[0]

array([ 0.6573479 ,  0.20206494,  0.76793909, -0.83887049, -0.91651514,
        1.51185789, -0.73029674])

In [173]:
x_test[0]

array([ 1.06394103,  0.44857745, -0.5790083 ,  1.        , -0.57735027,
        1.        , -0.70710678])

In [174]:
logreg = LogisticRegression()
logreg.fit(x_train,y_train)

In [175]:
logreg_y_pred = logreg.predict(x_test)

In [176]:
logreg.predict_proba(x_test)[:,1]

array([0.32070822, 0.24427838, 0.74971174, 0.1228881 , 0.27814079,
       0.74971174, 0.20373556, 0.20022333, 0.20377761, 0.43599528,
       0.41056289, 0.20022333])

In [177]:
x_test.shape

(12, 7)

In [178]:
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [179]:
print('Classification Report: \n' ) 
print(classification_report(y_test, logreg_y_pred))
print('Accuracy Score:', accuracy_score(y_test,logreg_y_pred))
print('Precision Score:',precision_score(y_test,logreg_y_pred))
print('Recall Score:',recall_score(y_test,logreg_y_pred))
print('F1 Score:', f1_score(y_test, logreg_y_pred))
print('ROC and AUC Score:',roc_auc_score(y_test,logreg.predict_proba(x_test)[:,1]))
      

Classification Report: 

              precision    recall  f1-score   support

           0       0.80      1.00      0.89         8
           1       1.00      0.50      0.67         4

    accuracy                           0.83        12
   macro avg       0.90      0.75      0.78        12
weighted avg       0.87      0.83      0.81        12

Accuracy Score: 0.8333333333333334
Precision Score: 1.0
Recall Score: 0.5
F1 Score: 0.6666666666666666
ROC and AUC Score: 0.5625


# Decision Tree Algorithm

In [180]:
from sklearn.tree import DecisionTreeClassifier

In [181]:
DTC = DecisionTreeClassifier()
DTC.fit(x_train,y_train)
DTC_y_pred = DTC.predict(x_test)



In [182]:
print('Decision Tree Report: \n' ) 
print(classification_report(y_test, DTC_y_pred))
print('Accuracy Score:', accuracy_score(y_test,DTC_y_pred))
print('Precision Score:',precision_score(y_test,DTC_y_pred))
print('Recall Score:',recall_score(y_test,DTC_y_pred))
print('F1 Score:', f1_score(y_test, DTC_y_pred))
print('ROC and AUC Score:',roc_auc_score(y_test,DTC.predict_proba(x_test)[:,1]))
      

Decision Tree Report: 

              precision    recall  f1-score   support

           0       0.60      0.75      0.67         8
           1       0.00      0.00      0.00         4

    accuracy                           0.50        12
   macro avg       0.30      0.38      0.33        12
weighted avg       0.40      0.50      0.44        12

Accuracy Score: 0.5
Precision Score: 0.0
Recall Score: 0.0
F1 Score: 0.0
ROC and AUC Score: 0.375


# Random Forest

In [183]:
from sklearn.ensemble import RandomForestClassifier

In [188]:
RF = RandomForestClassifier(n_estimators=10, criterion = 'gini', random_state=5)
RF.fit(x_train, y_train)
RF_y_pred = RF.predict(x_test)

print('Random Forest Report: \n' ) 
print(classification_report(y_test, RF_y_pred))
print('Accuracy Score:', accuracy_score(y_test,RF_y_pred))
print('Precision Score:',precision_score(y_test,RF_y_pred))
print('Recall Score:',recall_score(y_test,RF_y_pred))
print('F1 Score:', f1_score(y_test, RF_y_pred))
print('ROC and AUC Score:',roc_auc_score(y_test,RF.predict_proba(x_test)[:,1]))
      

Random Forest Report: 

              precision    recall  f1-score   support

           0       0.75      0.75      0.75         8
           1       0.50      0.50      0.50         4

    accuracy                           0.67        12
   macro avg       0.62      0.62      0.62        12
weighted avg       0.67      0.67      0.67        12

Accuracy Score: 0.6666666666666666
Precision Score: 0.5
Recall Score: 0.5
F1 Score: 0.5
ROC and AUC Score: 0.59375


In [189]:
from sklearn.naive_bayes import GaussianNB, CategoricalNB, BernoulliNB, MultinomialNB

In [191]:
GNB = GaussianNB()
GNB.fit(x_train, y_train)
GNB_y_pred = RF.predict(x_test)

print('Random Forest Report: \n' ) 
print(classification_report(y_test, GNB_y_pred))
print('Accuracy Score:', accuracy_score(y_test,GNB_y_pred))
print('Precision Score:',precision_score(y_test,GNB_y_pred))
print('Recall Score:',recall_score(y_test,GNB_y_pred))
print('F1 Score:', f1_score(y_test, GNB_y_pred))
print('ROC and AUC Score:',roc_auc_score(y_test,GNB.predict_proba(x_test)[:,1]))
      

Random Forest Report: 

              precision    recall  f1-score   support

           0       0.75      0.75      0.75         8
           1       0.50      0.50      0.50         4

    accuracy                           0.67        12
   macro avg       0.62      0.62      0.62        12
weighted avg       0.67      0.67      0.67        12

Accuracy Score: 0.6666666666666666
Precision Score: 0.5
Recall Score: 0.5
F1 Score: 0.5
ROC and AUC Score: 0.5


In [192]:
BNB = BernoulliNB()
BNB.fit(x_train, y_train)
BNB_y_pred = RF.predict(x_test)

print('Random Forest Report: \n' ) 
print(classification_report(y_test, BNB_y_pred))
print('Accuracy Score:', accuracy_score(y_test,BNB_y_pred))
print('Precision Score:',precision_score(y_test,BNB_y_pred))
print('Recall Score:',recall_score(y_test,BNB_y_pred))
print('F1 Score:', f1_score(y_test, BNB_y_pred))
print('ROC and AUC Score:',roc_auc_score(y_test,BNB.predict_proba(x_test)[:,1]))
      

Random Forest Report: 

              precision    recall  f1-score   support

           0       0.75      0.75      0.75         8
           1       0.50      0.50      0.50         4

    accuracy                           0.67        12
   macro avg       0.62      0.62      0.62        12
weighted avg       0.67      0.67      0.67        12

Accuracy Score: 0.6666666666666666
Precision Score: 0.5
Recall Score: 0.5
F1 Score: 0.5
ROC and AUC Score: 0.6875


# K-Nearest Neighbours

In [197]:
from sklearn.neighbors import KNeighborsClassifier

In [202]:
KNN = KNeighborsClassifier(n_neighbors=3, p=2)
KNN.fit(x_train, y_train)
KNN_y_pred = RF.predict(x_test)

print('K-Neive Neighbours Report: \n' ) 
print(classification_report(y_test, KNN_y_pred))
print('Accuracy Score:', accuracy_score(y_test,KNN_y_pred))
print('Precision Score:',precision_score(y_test,KNN_y_pred))
print('Recall Score:',recall_score(y_test,KNN_y_pred))
print('F1 Score:', f1_score(y_test, KNN_y_pred))
print('ROC and AUC Score:',roc_auc_score(y_test,KNN.predict_proba(x_test)[:,1]))
      

K-Neive Neighbours Report: 

              precision    recall  f1-score   support

           0       0.75      0.75      0.75         8
           1       0.50      0.50      0.50         4

    accuracy                           0.67        12
   macro avg       0.62      0.62      0.62        12
weighted avg       0.67      0.67      0.67        12

Accuracy Score: 0.6666666666666666
Precision Score: 0.5
Recall Score: 0.5
F1 Score: 0.5
ROC and AUC Score: 0.71875


# Support Vector Machine Algorithm (SVM)

In [212]:
from sklearn.svm import SVC

In [213]:
SVC = SVC(kernel = 'linear', probability = True, random_state=5)
SVC.fit(x_train, y_train)
SVC_y_pred = RF.predict(x_test)

print('SVC Report: \n' ) 
print(classification_report(y_test, SVC_y_pred))
print('Accuracy Score:', accuracy_score(y_test,SVC_y_pred))
print('Precision Score:',precision_score(y_test,SVC_y_pred))
print('Recall Score:',recall_score(y_test,SVC_y_pred))
print('F1 Score:', f1_score(y_test, SVC_y_pred))
print('ROC and AUC Score:',roc_auc_score(y_test,SVC.predict_proba(x_test)[:,1]))
      

SVC Report: 

              precision    recall  f1-score   support

           0       0.75      0.75      0.75         8
           1       0.50      0.50      0.50         4

    accuracy                           0.67        12
   macro avg       0.62      0.62      0.62        12
weighted avg       0.67      0.67      0.67        12

Accuracy Score: 0.6666666666666666
Precision Score: 0.5
Recall Score: 0.5
F1 Score: 0.5
ROC and AUC Score: 0.8125


# Hyper Parameter Tuning

In [235]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [236]:
np.logspace(-3, 3, 7)

array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03])

In [237]:
param_dist_log_reg = {
    'C':np.logspace(-3,3,7), 
    'penalty': ['l1','l2'],
'solver': ['liblinear','saga']
}

In [238]:
random_search_log_reg = RandomizedSearchCV(logreg, param_dist_log_reg, n_iter=10, cv = 5, scoring ='accuracy')

In [239]:
random_search_log_reg.fit(x_train, y_train)

In [248]:
random_search_log_reg.get_params()

{'cv': 5,
 'error_score': nan,
 'estimator__C': 1.0,
 'estimator__class_weight': None,
 'estimator__dual': False,
 'estimator__fit_intercept': True,
 'estimator__intercept_scaling': 1,
 'estimator__l1_ratio': None,
 'estimator__max_iter': 100,
 'estimator__multi_class': 'deprecated',
 'estimator__n_jobs': None,
 'estimator__penalty': 'l2',
 'estimator__random_state': None,
 'estimator__solver': 'lbfgs',
 'estimator__tol': 0.0001,
 'estimator__verbose': 0,
 'estimator__warm_start': False,
 'estimator': LogisticRegression(),
 'n_iter': 10,
 'n_jobs': None,
 'param_distributions': {'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
  'penalty': ['l1', 'l2'],
  'solver': ['liblinear', 'saga']},
 'pre_dispatch': '2*n_jobs',
 'random_state': None,
 'refit': True,
 'return_train_score': False,
 'scoring': 'accuracy',
 'verbose': 0}

In [244]:
grid_search_log_reg = GridSearchCV(logreg, param_dist_log_reg, cv = 5, scoring ='accuracy')

In [245]:
grid_search_log_reg.fit(x_train, y_train)

In [249]:
grid_search_log_reg.get_params()

{'cv': 5,
 'error_score': nan,
 'estimator__C': 1.0,
 'estimator__class_weight': None,
 'estimator__dual': False,
 'estimator__fit_intercept': True,
 'estimator__intercept_scaling': 1,
 'estimator__l1_ratio': None,
 'estimator__max_iter': 100,
 'estimator__multi_class': 'deprecated',
 'estimator__n_jobs': None,
 'estimator__penalty': 'l2',
 'estimator__random_state': None,
 'estimator__solver': 'lbfgs',
 'estimator__tol': 0.0001,
 'estimator__verbose': 0,
 'estimator__warm_start': False,
 'estimator': LogisticRegression(),
 'n_jobs': None,
 'param_grid': {'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
  'penalty': ['l1', 'l2'],
  'solver': ['liblinear', 'saga']},
 'pre_dispatch': '2*n_jobs',
 'refit': True,
 'return_train_score': False,
 'scoring': 'accuracy',
 'verbose': 0}