In [184]:
import numpy as np 
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder

In [2]:
df= pd.read_csv('telco_churn.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,...,PaymentMethod,MonthlyCharges,TotalCharges,Churn,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,0,0,1,0,0,1,0,0,2,...,2,29.85,29.85,0,True,False,False,False,False,False
1,1,1,0,0,0,1,0,0,2,0,...,3,56.95,1889.5,0,False,False,True,False,False,False
2,2,1,0,0,0,1,0,0,2,2,...,3,53.85,108.15,1,True,False,False,False,False,False
3,3,1,0,0,0,0,1,0,2,0,...,0,42.3,1840.75,0,False,False,False,True,False,False
4,4,0,0,0,0,1,0,1,0,0,...,2,70.7,151.65,1,True,False,False,False,False,False


In [4]:
df= df.drop('Unnamed: 0', axis=1)

In [5]:
x=df.drop('Churn', axis=1)

In [6]:
x

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,...,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,0,1,0,0,1,0,0,2,0,...,1,2,29.85,29.85,True,False,False,False,False,False
1,1,0,0,0,1,0,0,2,0,2,...,0,3,56.95,1889.50,False,False,True,False,False,False
2,1,0,0,0,1,0,0,2,2,0,...,1,3,53.85,108.15,True,False,False,False,False,False
3,1,0,0,0,0,1,0,2,0,2,...,0,0,42.30,1840.75,False,False,False,True,False,False
4,0,0,0,0,1,0,1,0,0,0,...,1,2,70.70,151.65,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,1,0,1,1,1,2,0,2,0,2,...,1,3,84.80,1990.50,False,True,False,False,False,False
7039,0,0,1,1,1,2,1,0,2,2,...,1,1,103.20,7362.90,False,False,False,False,False,True
7040,0,0,1,1,0,1,0,2,0,0,...,1,2,29.60,346.45,True,False,False,False,False,False
7041,1,1,1,0,1,2,1,0,0,0,...,1,3,74.40,306.60,True,False,False,False,False,False


In [7]:
y=df['Churn']
y

0       0
1       0
2       1
3       0
4       1
       ..
7038    0
7039    0
7040    0
7041    1
7042    0
Name: Churn, Length: 7043, dtype: int64

### Test train split

In [8]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

## Decision Tree Classifier

In [9]:
from sklearn.tree import DecisionTreeClassifier

In [10]:
dt=DecisionTreeClassifier(criterion='gini', random_state=100, max_depth=6, min_samples_leaf=8)

In [11]:
dt.fit(x_train,y_train)

In [12]:
y_pred=dt.predict(x_test)
y_pred

array([0, 0, 1, ..., 1, 0, 0], dtype=int64)

In [13]:
dt.score(x_test,y_test)

0.7955997161107168

In [14]:
print(classification_report(y_test,y_pred,labels=[0,1]))

              precision    recall  f1-score   support

           0       0.83      0.92      0.87      1055
           1       0.64      0.44      0.52       354

    accuracy                           0.80      1409
   macro avg       0.73      0.68      0.69      1409
weighted avg       0.78      0.80      0.78      1409



In [15]:
print(metrics.confusion_matrix(y_test,y_pred))

[[967  88]
 [200 154]]


Accuracy is low due to imbalance dataset

### So SMOTEENN (UpSampling + ENN) is used

In [16]:
from imblearn.combine import SMOTEENN

In [17]:
sm=SMOTEENN()
x_resampled,y_resampled=sm.fit_resample(x,y)

In [18]:
xr_train,xr_test,yr_train,yr_test= train_test_split(x_resampled,y_resampled,test_size=0.2)

In [19]:
dt_smote= DecisionTreeClassifier(criterion='gini', random_state=100, max_depth=6, min_samples_leaf=8)

In [20]:
dt_smote.fit(xr_train,yr_train)

In [21]:
yr_pred= dt_smote.predict(xr_test)
yr_pred

array([0, 1, 0, ..., 0, 0, 0], dtype=int64)

In [22]:
dt_smote.score(xr_test,yr_test)

0.9379194630872483

In [23]:
print(metrics.classification_report(yr_test,yr_pred))

              precision    recall  f1-score   support

           0       0.95      0.92      0.93       556
           1       0.93      0.96      0.94       636

    accuracy                           0.94      1192
   macro avg       0.94      0.94      0.94      1192
weighted avg       0.94      0.94      0.94      1192



In [24]:
print(metrics.confusion_matrix(yr_test,yr_pred))

[[510  46]
 [ 28 608]]


# Random Forest Classifier

In [25]:
from sklearn.ensemble import RandomForestClassifier

In [26]:
rf= RandomForestClassifier(n_estimators=100,criterion='gini', random_state=100, max_depth=6, min_samples_leaf=8)

In [27]:
rf.fit(x_train,y_train)

In [28]:
y_pred=rf.predict(x_test)
y_pred

array([0, 0, 1, ..., 1, 0, 0], dtype=int64)

In [29]:
rf.score(x_test,y_test)

0.7991483321504613

In [30]:
print(classification_report(y_test,y_pred,labels=[0,1]))

              precision    recall  f1-score   support

           0       0.83      0.93      0.87      1055
           1       0.66      0.42      0.51       354

    accuracy                           0.80      1409
   macro avg       0.74      0.67      0.69      1409
weighted avg       0.78      0.80      0.78      1409



In [31]:
print(metrics.confusion_matrix(y_test,y_pred))

[[976  79]
 [204 150]]


Accuracy is low due to imbalance dataset

In [32]:
sm=SMOTEENN()
x_resampled,y_resampled=sm.fit_resample(x,y)

In [33]:
xr_train,xr_test,yr_train,yr_test= train_test_split(x_resampled,y_resampled,test_size=0.2)

In [34]:
rf_smote= RandomForestClassifier(n_estimators=100, criterion='gini', random_state=100, max_depth=6, min_samples_leaf=8)

In [35]:
rf_smote.fit(xr_train,yr_train)

In [36]:
yr_pred= rf_smote.predict(xr_test)
yr_pred

array([1, 1, 1, ..., 0, 0, 1], dtype=int64)

In [37]:
rf_smote.score(xr_test,yr_test)

0.9161016949152543

In [38]:
print(metrics.classification_report(yr_test,yr_pred))

              precision    recall  f1-score   support

           0       0.93      0.89      0.91       562
           1       0.91      0.94      0.92       618

    accuracy                           0.92      1180
   macro avg       0.92      0.92      0.92      1180
weighted avg       0.92      0.92      0.92      1180



In [39]:
print(metrics.confusion_matrix(yr_test,yr_pred))

[[502  60]
 [ 39 579]]


# Logistic Regression

In [40]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [41]:
scaler = StandardScaler()
xr_train_scaled = scaler.fit_transform(xr_train)
xr_test_scaled = scaler.transform(xr_test)

In [42]:
log_reg= LogisticRegression(max_iter=1000)

In [43]:
log_reg.fit(xr_train_scaled,yr_train)

In [44]:
y_pred=log_reg.predict(xr_test_scaled)

In [45]:
y_pred

array([1, 1, 1, ..., 0, 0, 1], dtype=int64)

In [46]:
log_reg.score(xr_test_scaled,yr_test)

0.9127118644067796

In [47]:
print(metrics.classification_report(yr_test,y_pred))

              precision    recall  f1-score   support

           0       0.92      0.90      0.91       562
           1       0.91      0.93      0.92       618

    accuracy                           0.91      1180
   macro avg       0.91      0.91      0.91      1180
weighted avg       0.91      0.91      0.91      1180



In [48]:
print(metrics.confusion_matrix(yr_test,y_pred))

[[503  59]
 [ 44 574]]


#  K-NN Classifier

In [49]:
from sklearn.neighbors import KNeighborsClassifier  

In [50]:
classifier= KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2 )  

In [51]:
classifier.fit(xr_train_scaled, yr_train)  

In [52]:
y_pred= classifier.predict(xr_test_scaled)  

In [53]:
y_pred

array([1, 1, 1, ..., 0, 0, 1], dtype=int64)

In [54]:
classifier.score(xr_test_scaled,yr_test)

0.9084745762711864

In [55]:
print(metrics.classification_report(yr_test,y_pred))

              precision    recall  f1-score   support

           0       0.94      0.86      0.90       562
           1       0.88      0.95      0.92       618

    accuracy                           0.91      1180
   macro avg       0.91      0.91      0.91      1180
weighted avg       0.91      0.91      0.91      1180



In [56]:
print(metrics.confusion_matrix(yr_test,y_pred))

[[484  78]
 [ 30 588]]


# Gradient Boosting Classifier

In [57]:
from sklearn.ensemble import GradientBoostingClassifier

In [58]:
gbc = GradientBoostingClassifier()

In [59]:
gbc.fit(xr_train,yr_train)

In [60]:
y_pred = gbc.predict(xr_test)

In [61]:
y_pred

array([1, 1, 1, ..., 0, 0, 1], dtype=int64)

In [62]:
gbc.score(xr_test,yr_test)

0.9305084745762712

In [63]:
print(metrics.classification_report(yr_test,y_pred))

              precision    recall  f1-score   support

           0       0.93      0.92      0.93       562
           1       0.93      0.94      0.93       618

    accuracy                           0.93      1180
   macro avg       0.93      0.93      0.93      1180
weighted avg       0.93      0.93      0.93      1180



In [64]:
print(metrics.confusion_matrix(yr_test,y_pred))

[[516  46]
 [ 36 582]]


- Gradient Boosting Classifier gives the best performance with highest accuracy

### HyperParameter Tunning for Gradient Boosting Classifier

In [65]:
param_grid = {'n_estimators':[100, 150, 200, 250, 300],
             'criterion': ['friedman_mse', 'squared_error', 'mse', 'mae'],
             'min_samples_split': [2,3,4,5,6,7,8,9,10],
             'min_samples_leaf': [1,3,5,7,9,11,13,15],'max_leaf_nodes': [3,6,8,9,12,15,18,24],
              'max_depth': [3,5,7,9,11,13,15,17,19],
              'learning_rate': [0.05, 0.1, 0.2, 0.3, 0.4, 0.5],
              'loss': ['deviance', 'exponential']
              }

In [66]:
gbc_optm = RandomizedSearchCV(estimator=gbc, param_distributions=param_grid,n_iter=100, verbose=3)
gbc_optm.fit(xr_train, yr_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV 1/5] END criterion=squared_error, learning_rate=0.4, loss=exponential, max_depth=7, max_leaf_nodes=3, min_samples_leaf=13, min_samples_split=10, n_estimators=100;, score=0.945 total time=   1.5s
[CV 2/5] END criterion=squared_error, learning_rate=0.4, loss=exponential, max_depth=7, max_leaf_nodes=3, min_samples_leaf=13, min_samples_split=10, n_estimators=100;, score=0.952 total time=   1.5s
[CV 3/5] END criterion=squared_error, learning_rate=0.4, loss=exponential, max_depth=7, max_leaf_nodes=3, min_samples_leaf=13, min_samples_split=10, n_estimators=100;, score=0.923 total time=   1.5s
[CV 4/5] END criterion=squared_error, learning_rate=0.4, loss=exponential, max_depth=7, max_leaf_nodes=3, min_samples_leaf=13, min_samples_split=10, n_estimators=100;, score=0.940 total time=   1.5s
[CV 5/5] END criterion=squared_error, learning_rate=0.4, loss=exponential, max_depth=7, max_leaf_nodes=3, min_samples_leaf=13, min_samples_sp

[CV 5/5] END criterion=mae, learning_rate=0.4, loss=exponential, max_depth=5, max_leaf_nodes=8, min_samples_leaf=13, min_samples_split=3, n_estimators=100;, score=nan total time=   0.0s
[CV 1/5] END criterion=friedman_mse, learning_rate=0.1, loss=deviance, max_depth=9, max_leaf_nodes=15, min_samples_leaf=9, min_samples_split=6, n_estimators=200;, score=nan total time=   0.0s
[CV 2/5] END criterion=friedman_mse, learning_rate=0.1, loss=deviance, max_depth=9, max_leaf_nodes=15, min_samples_leaf=9, min_samples_split=6, n_estimators=200;, score=nan total time=   0.0s
[CV 3/5] END criterion=friedman_mse, learning_rate=0.1, loss=deviance, max_depth=9, max_leaf_nodes=15, min_samples_leaf=9, min_samples_split=6, n_estimators=200;, score=nan total time=   0.0s
[CV 4/5] END criterion=friedman_mse, learning_rate=0.1, loss=deviance, max_depth=9, max_leaf_nodes=15, min_samples_leaf=9, min_samples_split=6, n_estimators=200;, score=nan total time=   0.0s
[CV 5/5] END criterion=friedman_mse, learning_

[CV 1/5] END criterion=mse, learning_rate=0.2, loss=exponential, max_depth=13, max_leaf_nodes=8, min_samples_leaf=1, min_samples_split=3, n_estimators=200;, score=nan total time=   0.0s
[CV 2/5] END criterion=mse, learning_rate=0.2, loss=exponential, max_depth=13, max_leaf_nodes=8, min_samples_leaf=1, min_samples_split=3, n_estimators=200;, score=nan total time=   0.0s
[CV 3/5] END criterion=mse, learning_rate=0.2, loss=exponential, max_depth=13, max_leaf_nodes=8, min_samples_leaf=1, min_samples_split=3, n_estimators=200;, score=nan total time=   0.0s
[CV 4/5] END criterion=mse, learning_rate=0.2, loss=exponential, max_depth=13, max_leaf_nodes=8, min_samples_leaf=1, min_samples_split=3, n_estimators=200;, score=nan total time=   0.0s
[CV 5/5] END criterion=mse, learning_rate=0.2, loss=exponential, max_depth=13, max_leaf_nodes=8, min_samples_leaf=1, min_samples_split=3, n_estimators=200;, score=nan total time=   0.0s
[CV 1/5] END criterion=squared_error, learning_rate=0.2, loss=exponent

[CV 5/5] END criterion=squared_error, learning_rate=0.1, loss=exponential, max_depth=9, max_leaf_nodes=12, min_samples_leaf=15, min_samples_split=8, n_estimators=200;, score=0.957 total time=   5.3s
[CV 1/5] END criterion=mae, learning_rate=0.3, loss=deviance, max_depth=11, max_leaf_nodes=15, min_samples_leaf=5, min_samples_split=4, n_estimators=300;, score=nan total time=   0.0s
[CV 2/5] END criterion=mae, learning_rate=0.3, loss=deviance, max_depth=11, max_leaf_nodes=15, min_samples_leaf=5, min_samples_split=4, n_estimators=300;, score=nan total time=   0.0s
[CV 3/5] END criterion=mae, learning_rate=0.3, loss=deviance, max_depth=11, max_leaf_nodes=15, min_samples_leaf=5, min_samples_split=4, n_estimators=300;, score=nan total time=   0.0s
[CV 4/5] END criterion=mae, learning_rate=0.3, loss=deviance, max_depth=11, max_leaf_nodes=15, min_samples_leaf=5, min_samples_split=4, n_estimators=300;, score=nan total time=   0.0s
[CV 5/5] END criterion=mae, learning_rate=0.3, loss=deviance, max

[CV 1/5] END criterion=squared_error, learning_rate=0.1, loss=exponential, max_depth=7, max_leaf_nodes=15, min_samples_leaf=15, min_samples_split=7, n_estimators=250;, score=0.954 total time=   7.2s
[CV 2/5] END criterion=squared_error, learning_rate=0.1, loss=exponential, max_depth=7, max_leaf_nodes=15, min_samples_leaf=15, min_samples_split=7, n_estimators=250;, score=0.951 total time=   7.0s
[CV 3/5] END criterion=squared_error, learning_rate=0.1, loss=exponential, max_depth=7, max_leaf_nodes=15, min_samples_leaf=15, min_samples_split=7, n_estimators=250;, score=0.951 total time=  30.3s
[CV 4/5] END criterion=squared_error, learning_rate=0.1, loss=exponential, max_depth=7, max_leaf_nodes=15, min_samples_leaf=15, min_samples_split=7, n_estimators=250;, score=0.954 total time=   7.8s
[CV 5/5] END criterion=squared_error, learning_rate=0.1, loss=exponential, max_depth=7, max_leaf_nodes=15, min_samples_leaf=15, min_samples_split=7, n_estimators=250;, score=0.954 total time=   7.1s
[CV 1

[CV 1/5] END criterion=squared_error, learning_rate=0.4, loss=exponential, max_depth=3, max_leaf_nodes=24, min_samples_leaf=15, min_samples_split=4, n_estimators=150;, score=0.953 total time=   2.5s
[CV 2/5] END criterion=squared_error, learning_rate=0.4, loss=exponential, max_depth=3, max_leaf_nodes=24, min_samples_leaf=15, min_samples_split=4, n_estimators=150;, score=0.957 total time=   2.8s
[CV 3/5] END criterion=squared_error, learning_rate=0.4, loss=exponential, max_depth=3, max_leaf_nodes=24, min_samples_leaf=15, min_samples_split=4, n_estimators=150;, score=0.956 total time=   2.9s
[CV 4/5] END criterion=squared_error, learning_rate=0.4, loss=exponential, max_depth=3, max_leaf_nodes=24, min_samples_leaf=15, min_samples_split=4, n_estimators=150;, score=0.958 total time=   2.8s
[CV 5/5] END criterion=squared_error, learning_rate=0.4, loss=exponential, max_depth=3, max_leaf_nodes=24, min_samples_leaf=15, min_samples_split=4, n_estimators=150;, score=0.953 total time=   2.8s
[CV 1

[CV 4/5] END criterion=friedman_mse, learning_rate=0.5, loss=deviance, max_depth=5, max_leaf_nodes=3, min_samples_leaf=9, min_samples_split=8, n_estimators=200;, score=nan total time=   0.0s
[CV 5/5] END criterion=friedman_mse, learning_rate=0.5, loss=deviance, max_depth=5, max_leaf_nodes=3, min_samples_leaf=9, min_samples_split=8, n_estimators=200;, score=nan total time=   0.0s
[CV 1/5] END criterion=mae, learning_rate=0.05, loss=exponential, max_depth=15, max_leaf_nodes=24, min_samples_leaf=1, min_samples_split=2, n_estimators=150;, score=nan total time=   0.0s
[CV 2/5] END criterion=mae, learning_rate=0.05, loss=exponential, max_depth=15, max_leaf_nodes=24, min_samples_leaf=1, min_samples_split=2, n_estimators=150;, score=nan total time=   0.0s
[CV 3/5] END criterion=mae, learning_rate=0.05, loss=exponential, max_depth=15, max_leaf_nodes=24, min_samples_leaf=1, min_samples_split=2, n_estimators=150;, score=nan total time=   0.0s
[CV 4/5] END criterion=mae, learning_rate=0.05, loss=e

[CV 1/5] END criterion=squared_error, learning_rate=0.3, loss=exponential, max_depth=19, max_leaf_nodes=12, min_samples_leaf=7, min_samples_split=6, n_estimators=300;, score=0.958 total time=   6.8s
[CV 2/5] END criterion=squared_error, learning_rate=0.3, loss=exponential, max_depth=19, max_leaf_nodes=12, min_samples_leaf=7, min_samples_split=6, n_estimators=300;, score=0.960 total time=   7.2s
[CV 3/5] END criterion=squared_error, learning_rate=0.3, loss=exponential, max_depth=19, max_leaf_nodes=12, min_samples_leaf=7, min_samples_split=6, n_estimators=300;, score=0.957 total time=   8.1s
[CV 4/5] END criterion=squared_error, learning_rate=0.3, loss=exponential, max_depth=19, max_leaf_nodes=12, min_samples_leaf=7, min_samples_split=6, n_estimators=300;, score=0.960 total time=   8.4s
[CV 5/5] END criterion=squared_error, learning_rate=0.3, loss=exponential, max_depth=19, max_leaf_nodes=12, min_samples_leaf=7, min_samples_split=6, n_estimators=300;, score=0.956 total time=   8.9s
[CV 1

[CV 5/5] END criterion=mse, learning_rate=0.4, loss=deviance, max_depth=7, max_leaf_nodes=8, min_samples_leaf=7, min_samples_split=3, n_estimators=100;, score=nan total time=   0.0s
[CV 1/5] END criterion=friedman_mse, learning_rate=0.05, loss=exponential, max_depth=13, max_leaf_nodes=8, min_samples_leaf=9, min_samples_split=6, n_estimators=300;, score=0.946 total time=   7.7s
[CV 2/5] END criterion=friedman_mse, learning_rate=0.05, loss=exponential, max_depth=13, max_leaf_nodes=8, min_samples_leaf=9, min_samples_split=6, n_estimators=300;, score=0.956 total time=   7.5s
[CV 3/5] END criterion=friedman_mse, learning_rate=0.05, loss=exponential, max_depth=13, max_leaf_nodes=8, min_samples_leaf=9, min_samples_split=6, n_estimators=300;, score=0.943 total time=  10.2s
[CV 4/5] END criterion=friedman_mse, learning_rate=0.05, loss=exponential, max_depth=13, max_leaf_nodes=8, min_samples_leaf=9, min_samples_split=6, n_estimators=300;, score=0.948 total time=  10.0s
[CV 5/5] END criterion=fri

[CV 1/5] END criterion=friedman_mse, learning_rate=0.2, loss=exponential, max_depth=19, max_leaf_nodes=9, min_samples_leaf=11, min_samples_split=6, n_estimators=100;, score=0.950 total time=   2.0s
[CV 2/5] END criterion=friedman_mse, learning_rate=0.2, loss=exponential, max_depth=19, max_leaf_nodes=9, min_samples_leaf=11, min_samples_split=6, n_estimators=100;, score=0.954 total time=   2.0s
[CV 3/5] END criterion=friedman_mse, learning_rate=0.2, loss=exponential, max_depth=19, max_leaf_nodes=9, min_samples_leaf=11, min_samples_split=6, n_estimators=100;, score=0.943 total time=   2.1s
[CV 4/5] END criterion=friedman_mse, learning_rate=0.2, loss=exponential, max_depth=19, max_leaf_nodes=9, min_samples_leaf=11, min_samples_split=6, n_estimators=100;, score=0.950 total time=   2.2s
[CV 5/5] END criterion=friedman_mse, learning_rate=0.2, loss=exponential, max_depth=19, max_leaf_nodes=9, min_samples_leaf=11, min_samples_split=6, n_estimators=100;, score=0.951 total time=   2.3s
[CV 1/5] E

370 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
110 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\piyush mahajan\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\piyush mahajan\anaconda3\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "C:\Users\piyush mahajan\anaconda3\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\piyush mahajan\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter

In [67]:
gbc_optm.best_estimator_

In [86]:
gbc_tunning = GradientBoostingClassifier(learning_rate=0.5, loss='exponential', max_depth=17,
                           max_leaf_nodes=8, min_samples_leaf=13,
                           min_samples_split=9, n_estimators=200)

In [87]:
gbc_tunning.fit(xr_train, yr_train)

In [88]:
y_pred = gbc_tunning.predict(xr_test)
y_pred

array([1, 1, 1, ..., 1, 0, 1], dtype=int64)

In [89]:
gbc_tunning.score(xr_test,yr_test)

0.9576271186440678

In [90]:
print(metrics.classification_report(yr_test,y_pred))

              precision    recall  f1-score   support

           0       0.96      0.96      0.96       562
           1       0.96      0.96      0.96       618

    accuracy                           0.96      1180
   macro avg       0.96      0.96      0.96      1180
weighted avg       0.96      0.96      0.96      1180



In [91]:
print(metrics.confusion_matrix(yr_test,y_pred))

[[537  25]
 [ 25 593]]


- After Oversampling the dataset, the model performance is pretty good. From all the model **Gradient Boost Classifier** performs better than all. So we dumb our model using pickle library

## Model Saving:

In [121]:
import pickle

In [122]:
filename = ' Churn Prediction Model.sav'
pickle.dump(gbc_tunning, open(filename,'wb'))

In [123]:
load_model = pickle.load(open(filename, 'rb'))

In [124]:
load_model.score(xr_test, yr_test)

0.9576271186440678

## Testing

In [166]:
gender = 'Male'
SeniorCitizen = 'No'
Partner = 'No'
Dependents = 'No'
tenure = 2
PhoneService ='Yes'
MultipleLines = 'No'
InternetService = 'DSL'
OnlineSecurity = 'Yes'
OnlineBackup = 'Yes'
DeviceProtection = 'No'
TechSupport = 'No'
StreamingTV = 'No'
StreamingMovies = 'No'
Contract = 'Month-to-month'
PaperlessBilling = 'Yes'
PaymentMethod = 'Mailed check'
MonthlyCharges = 53.85
TotalCharges = 108.15

In [167]:
data = [[gender, SeniorCitizen, Partner, Dependents, tenure, PhoneService, MultipleLines, InternetService, OnlineSecurity, OnlineBackup, DeviceProtection, TechSupport, StreamingTV, StreamingMovies, Contract, PaperlessBilling, PaymentMethod, MonthlyCharges, TotalCharges]]

In [168]:
df1 = pd.DataFrame(data, columns=['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges'])
df1.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,Male,No,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15


In [169]:
print(df1.dtypes)

gender               object
SeniorCitizen        object
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
dtype: object


In [170]:
for feature in df1.columns:
    if df1[feature].dtypes =='O':
        categorical_feature = feature
        print(categorical_feature)

gender
SeniorCitizen
Partner
Dependents
PhoneService
MultipleLines
InternetService
OnlineSecurity
OnlineBackup
DeviceProtection
TechSupport
StreamingTV
StreamingMovies
Contract
PaperlessBilling
PaymentMethod


In [171]:
encoder= LabelEncoder()
for feature in df1.columns:
    if df1[feature].dtypes == 'O':
        df1[feature] = encoder.fit_transform(df1[feature]) 

In [172]:
df1.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,53.85,108.15


In [173]:
labels = ["{0} - {1}".format(i, i + 11) for i in range(1, 72, 12)]

df1['tenure_group'] = pd.cut(df1.tenure, range(1, 80, 12), right=False, labels=labels)

In [174]:
df1_dummies = pd.get_dummies(df1)
df1_dummies.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,...,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,0,0,0,2,0,0,0,0,0,...,0,0,53.85,108.15,True,False,False,False,False,False


In [175]:
df1_dummies.drop(columns=['tenure'],axis=1, inplace=True)

In [176]:
df1_dummies

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,...,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,0,0,0,0,0,0,0,0,0,...,0,0,53.85,108.15,True,False,False,False,False,False


In [180]:
prediction = load_model.predict(df1_dummies)
probability = load_model.predict_proba(df1_dummies)[:,1]

In [181]:
print(prediction)

[1]


In [182]:
print(probability)

[0.99999997]


In [185]:
if prediction == 1:
    print("This Customer is likely to be Churned!")
    print(f"Confidence level is {np.round(probability*100, 2)}")
else:
    print("This Customer is likely to Continue!")
    print(f"Confidence level is {np.round(probability*100, 2)}")

This Customer is likely to be Churned!
Confidence level is [100.]
