### Import the required libraries


In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.metrics import confusion_matrix

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

#### Link to dataset is: https://www.kaggle.com/blastchar/telco-customer-churn

In [None]:
# Read the Telco customer-churn CSV (expected in the working directory).
csv_path = "WA_Fn-UseC_-Telco-Customer-Churn.csv"
data = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows of the raw data.
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


##### Dataset info: the data type and non-null count of each column

In [None]:
# Print each column's dtype and non-null count.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


#### Dataset overview: shape, feature names, missing values, and unique values per column

In [None]:
# Structural summary of the raw frame: size, column names, nulls, cardinality.
row_count, column_count = data.shape
print ("Number of rows in the dataset  : " , row_count)
print ("Number of Columns in the dataset : " , column_count)
print ("Number of Features : \n" , list(data.columns))
print ("Missing values :  ", data.isnull().sum().sum())
print ("Unique values :  \n", data.nunique())

Number of rows in the dataset  :  7043
Number of Columns in the dataset :  21
Number of Features : 
 ['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']
Missing values :   0
Unique values :  
 customerID          7043
gender                 2
SeniorCitizen          2
Partner                2
Dependents             2
tenure                73
PhoneService           2
MultipleLines          3
InternetService        3
OnlineSecurity         3
OnlineBackup           3
DeviceProtection       3
TechSupport            3
StreamingTV            3
StreamingMovies        3
Contract               3
PaperlessBilling       2
PaymentMethod          4
MonthlyCharges      1585
TotalCharges        6531
Churn                  2
dtype: int

####  `isnull()` reports no missing values, but `TotalCharges` is stored as text and contains blank strings. We convert those blanks to NaN, cast the column to a numeric type, and then fill the NaN values with an estimate computed from the other columns: MonthlyCharges × tenure.

In [None]:


# Whitespace-only entries become NaN first, then the column is parsed as numbers.
blank_as_nan = data["TotalCharges"].replace(r'\s+', np.nan, regex=True)
data['TotalCharges'] = pd.to_numeric(blank_as_nan)

In [None]:
# Estimated total charge per customer: monthly charge multiplied by tenure.
fill = data['MonthlyCharges'].mul(data['tenure'])

In [None]:
# Fill the NaN TotalCharges with the MonthlyCharges * tenure estimate.
# The result is assigned back to the column: fillna(inplace=True) on
# `data.TotalCharges` operates on an intermediate Series, which pandas warns
# about and which does not update `data` when Copy-on-Write is enabled.
data['TotalCharges'] = data['TotalCharges'].fillna(fill)

In [None]:
#data.isnull().sum()

####  Now we change the data type of the columns from object to a numerical data type, and apply label encoding and one-hot encoding to the columns of the dataset that contain categorical data.

In [None]:
# Work on an independent copy so the in-place edits made to `df` in later
# cells do not silently modify the cleaned `data` frame as well.
df = data.copy()

In [None]:
def changeColumnsToString(df):
    """Encode the Yes/No style columns of ``df`` in place as the strings '1' and '0'.

    'Yes' becomes '1'; 'No', 'No internet service' and 'No phone service'
    all become '0'. Every encoded value is a string, so each column keeps a
    single consistent type.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all of the columns listed in ``columnsNames``.
        It is modified in place; nothing is returned.

    Raises
    ------
    KeyError
        If any of the expected columns is missing from ``df``.
    """
    columnsNames=['Partner','Dependents','PhoneService','MultipleLines','OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies','PaperlessBilling','Churn']
    # Whole-value replacements applied after 'Yes' has been rewritten to '1'.
    negative_codes = {'No': '0', 'No internet service': '0', 'No phone service': '0'}
    for col in columnsNames:
        df[col] = df[col].astype('str').str.replace('Yes', '1').replace(negative_codes)

# Encode the Yes/No style columns of df in place.
changeColumnsToString(df)

# SeniorCitizen is stored as 0/1 integers; cast it to a boolean flag.
df['SeniorCitizen']=df['SeniorCitizen'].astype(bool)
# Make sure TotalCharges is numeric; values that cannot be parsed become NaN.
df['TotalCharges']=pd.to_numeric(df['TotalCharges'],errors='coerce')

In [None]:
#df.head()

In [None]:

# Show the distinct levels of each column that will be one-hot encoded.
labelled_columns = [
    ("Payment methods: ", "PaymentMethod"),
    ("Contract types: ", "Contract"),
    ("Gender: ", "gender"),
    ("Senior Citizen: ", "SeniorCitizen"),
    ("Internet Service Types: ", "InternetService"),
]
for label, column in labelled_columns:
    print(label, df[column].unique())

Payment methods:  ['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (automatic)']
Contract types:  ['Month-to-month' 'One year' 'Two year']
Gender:  ['Female' 'Male']
Senior Citizen:  [False  True]
Internet Service Types:  ['DSL' 'Fiber optic' 'No']


In [None]:

# Mark the multi-level columns as categorical before one-hot encoding them.
for column in ['gender', 'PaymentMethod', 'Contract', 'SeniorCitizen', 'InternetService']:
    df[column] = df[column].astype('category')
#df.dtypes

In [None]:
# One-hot encode each categorical column; the prefix identifies the source column.
dfPaymentDummies = pd.get_dummies(df['PaymentMethod'], prefix = 'payment')
dfContractDummies = pd.get_dummies(df['Contract'], prefix = 'contract')
dfGenderDummies = pd.get_dummies(df['gender'], prefix = 'gender')
dfSeniorCitizenDummies = pd.get_dummies(df['SeniorCitizen'], prefix = 'SC')
dfInternetServiceDummies = pd.get_dummies(df['InternetService'], prefix = 'IS')

# Optional inspection of the encoded frames. These are real comments rather
# than a string literal, so the cell no longer echoes the text as its output.
# print(dfPaymentDummies.head(3))
# print(dfContractDummies.head(3))
# print(dfGenderDummies.head(3))
# print(dfSeniorCitizenDummies.head(3))
# print(dfInternetServiceDummies.head(3))

'\nprint(dfPaymentDummies.head(3))\nprint(dfContractDummies.head(3))\nprint(dfGenderDummies.head(3))\nprint(dfSeniorCitizenDummies.head(3))\nprint(dfInternetServiceDummies.head(3))\n'

In [None]:

# Remove the original categorical columns now that their dummy encodings exist.
encoded_source_columns = ['gender','PaymentMethod','Contract','SeniorCitizen','InternetService']
df.drop(columns=encoded_source_columns, inplace=True)

# Append every dummy frame to the right of the remaining columns in one pass.
dummy_frames = [dfPaymentDummies, dfContractDummies, dfGenderDummies,
                dfSeniorCitizenDummies, dfInternetServiceDummies]
df = pd.concat([df, *dummy_frames], axis=1)
df.head(2)

Unnamed: 0,customerID,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,...,contract_Month-to-month,contract_One year,contract_Two year,gender_Female,gender_Male,SC_False,SC_True,IS_DSL,IS_Fiber optic,IS_No
0,7590-VHVEG,1,0,1,0,0,0,1,0,0,...,1,0,0,1,0,1,0,1,0,0
1,5575-GNVDE,0,0,34,1,0,1,0,1,0,...,0,1,0,0,1,1,0,1,0,0


In [None]:

# Replace spaces, hyphens and parentheses in the dummy column names with
# underscore-style identifiers. Renaming by name (instead of assigning a full
# positional list to df.columns) cannot mislabel a column if the column order
# or count ever changes, and re-running the cell is harmless.
df = df.rename(columns={
    'payment_Bank transfer (automatic)': 'payment_Bank_transfer_auto',
    'payment_Credit card (automatic)': 'payment_Credit_card_auto',
    'payment_Electronic check': 'payment_Electronic_check',
    'payment_Mailed check': 'payment_Mailed_check',
    'contract_Month-to-month': 'contract_Month_to_month',
    'contract_One year': 'contract_One_year',
    'contract_Two year': 'contract_Two_year',
    'IS_Fiber optic': 'IS_Fiber_optic',
})

In [None]:

# Every column except customerID is expected to hold numbers.
numericColumns=np.array(['Partner', 'Dependents', 'tenure', 'PhoneService',
       'MultipleLines', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'PaperlessBilling',
       'MonthlyCharges', 'TotalCharges', 'Churn',
       'payment_Bank_transfer_auto', 'payment_Credit_card_auto',
       'payment_Electronic_check', 'payment_Mailed_check',
       'contract_Month_to_month', 'contract_One_year', 'contract_Two_year',
       'gender_Female', 'gender_Male', 'SC_False', 'SC_True', 'IS_DSL',
       'IS_Fiber_optic', 'IS_No'])

# Parse each listed column as numeric; unparseable entries become NaN.
df[numericColumns] = df[numericColumns].apply(pd.to_numeric, errors='coerce')
#df.dtypes

#### Finally, after dropping the non-essential columns and applying one-hot encoding and label encoding, our final dataset is:

In [None]:
# Preview the first five rows of the fully encoded frame.
df.head()

Unnamed: 0,customerID,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,...,contract_Month_to_month,contract_One_year,contract_Two_year,gender_Female,gender_Male,SC_False,SC_True,IS_DSL,IS_Fiber_optic,IS_No
0,7590-VHVEG,1,0,1,0,0,0,1,0,0,...,1,0,0,1,0,1,0,1,0,0
1,5575-GNVDE,0,0,34,1,0,1,0,1,0,...,0,1,0,0,1,1,0,1,0,0
2,3668-QPYBK,0,0,2,1,0,1,1,0,0,...,1,0,0,0,1,1,0,1,0,0
3,7795-CFOCW,0,0,45,0,0,1,0,1,1,...,0,1,0,0,1,1,0,1,0,0
4,9237-HQITU,0,0,2,1,0,0,0,0,0,...,1,0,0,1,0,1,0,0,1,0


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Class balance of the target: number of rows per Churn value.
df.Churn.value_counts()


0    5174
1    1869
Name: Churn, dtype: int64

In [None]:
# Split the data into training and testing sets by row position:
# the first 5001 rows train the models, the remaining rows are held out.
train_row_count = 5001
df_train = df.iloc[:train_row_count]
df_test = df.iloc[train_row_count:]

# Separate the held-out features from the Churn target.
Y_test = df_test['Churn']
X_test = df_test.drop(columns='Churn')

### <font color='red'>Up and Down sampling using the Sklearn</font>

In [None]:
# Separate majority and minority classes
df_majority = df_train[df_train.Churn==0]
df_minority = df_train[df_train.Churn==1]

In [None]:
# (rows, columns) of the majority and minority training subsets.
df_majority.shape,df_minority.shape

((3688, 30), (1313, 30))

In [None]:
from sklearn.utils import resample

In [None]:
# Draw minority-class rows with replacement until the minority class is as
# large as the majority class. The target size is read from df_majority
# rather than hard-coded, so it stays correct if the split changes.
df_minority_upsampled = resample(df_minority,
                                 replace=True,     # sample with replacement
                                 n_samples=len(df_majority),    # to match majority class
                                 random_state=123) # reproducible results

### Upsampling increases the size of the minority class in the dataset

In [None]:
# Stack the untouched majority rows on top of the upsampled minority rows.
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

In [None]:
# Show only a short preview instead of rendering the whole balanced frame.
df_upsampled.head()

Unnamed: 0,customerID,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,...,contract_Month_to_month,contract_One_year,contract_Two_year,gender_Female,gender_Male,SC_False,SC_True,IS_DSL,IS_Fiber_optic,IS_No
0,7590-VHVEG,1,0,1,0,0,0,1,0,0,...,1,0,0,1,0,1,0,1,0,0
1,5575-GNVDE,0,0,34,1,0,1,0,1,0,...,0,1,0,0,1,1,0,1,0,0
3,7795-CFOCW,0,0,45,0,0,1,0,1,1,...,0,1,0,0,1,1,0,1,0,0
6,1452-KIOVK,0,1,22,1,1,0,1,0,0,...,1,0,0,0,1,1,0,0,1,0
7,6713-OKOMC,0,0,10,0,0,1,0,0,0,...,1,0,0,1,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
358,9057-MSWCO,1,0,27,0,0,1,0,0,0,...,1,0,0,0,1,0,1,1,0,0
2,3668-QPYBK,0,0,2,1,0,1,1,0,0,...,1,0,0,0,1,1,0,1,0,0
4503,9658-WYUFB,0,0,17,1,1,1,1,0,0,...,1,0,0,1,0,1,0,0,1,0
1771,7156-MHUGY,0,0,13,1,0,0,0,0,0,...,1,0,0,0,1,0,1,0,1,0


In [None]:
# Shuffle the rows so the two classes are interleaved. The seed is fixed so a
# fresh Restart & Run All reproduces the same row order.
df_up = df_upsampled.sample(frac=1, random_state=123)

In [None]:
# Confirm both classes now have the same number of rows.
df_up['Churn'].value_counts()

1    3688
0    3688
Name: Churn, dtype: int64

In [None]:
# Split the upsampled data into the target (Churn) and the remaining columns.
Y_up = df_up['Churn']
X_up = df_up.drop(columns='Churn')

(X_up.shape, X_test.shape, Y_up.shape, Y_test.shape)

((7376, 29), (2042, 29), (7376,), (2042,))

In [None]:
# Drop the identifier column so only model features remain. errors='ignore'
# keeps the cell re-runnable once the column is already gone.
X_up = X_up.drop(columns='customerID', errors='ignore')
X_test = X_test.drop(columns='customerID', errors='ignore')

from sklearn.linear_model import LogisticRegression
# Fit on the upsampled training data, then predict the held-out rows.
logistic_model = LogisticRegression(penalty='l2', C=1.0, solver='liblinear').fit(X_up,Y_up)
y_pred = logistic_model.predict(X_test)

acc = accuracy_score(Y_test, y_pred)
prec = precision_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
f1 = f1_score(Y_test, y_pred)
print("accuracy_score : ", acc)
print("precision_score : ", prec)
print("recall_score : ", recall)
print("f1_score : ", f1)

target_names = ['class 0', 'class 1']
print(classification_report(Y_test, y_pred, target_names=target_names))
# True labels go first so rows are actual classes and columns are predicted
# classes, which is scikit-learn's convention for confusion_matrix.
cf_matrix=confusion_matrix(Y_test, y_pred)
cf_matrix

accuracy_score :  0.7487757100881489
precision_score :  0.5266418835192069
recall_score :  0.7643884892086331
f1_score :  0.623624358033749
              precision    recall  f1-score   support

     class 0       0.89      0.74      0.81      1486
     class 1       0.53      0.76      0.62       556

    accuracy                           0.75      2042
   macro avg       0.71      0.75      0.72      2042
weighted avg       0.79      0.75      0.76      2042



array([[1104,  131],
       [ 382,  425]], dtype=int64)

### Downsampling decreases the size of the majority class in the dataset

In [None]:
# Randomly keep as many majority-class rows as there are minority-class rows.
# Sampling is done WITHOUT replacement so no majority row is duplicated while
# others are thrown away; the target size is read from df_minority.
df_majority_downsampled = resample(df_majority,
                                 replace=False,    # sample without replacement
                                 n_samples=len(df_minority),     # to match minority class
                                 random_state=123) # reproducible results

# Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

In [None]:
# Shuffle the rows so the two classes are interleaved. The seed is fixed so a
# fresh Restart & Run All reproduces the same row order.
df_down = df_downsampled.sample(frac=1, random_state=123)

In [None]:
# Confirm both classes now have the same number of rows.
df_down['Churn'].value_counts()

1    1313
0    1313
Name: Churn, dtype: int64

In [None]:

# Split the downsampled data into the target (Churn) and the remaining columns.
Y_down = df_down['Churn']
X_down = df_down.drop(columns='Churn')

(X_down.shape, X_test.shape, Y_down.shape, Y_test.shape)

((2626, 29), (2042, 28), (2626,), (2042,))

In [None]:
# Drop the identifier column so only model features remain. errors='ignore'
# makes both drops safe whether or not the column was already removed, so the
# cell no longer depends on which earlier cells have been run.
X_down = X_down.drop(columns='customerID', errors='ignore')
X_test = X_test.drop(columns='customerID', errors='ignore')

from sklearn.linear_model import LogisticRegression
# Fit on the downsampled training data, then predict the held-out rows.
logistic_model = LogisticRegression(penalty='l2', C=1.0, solver='liblinear').fit(X_down, Y_down)
y_pred = logistic_model.predict(X_test)

acc = accuracy_score(Y_test, y_pred)
prec = precision_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
f1 = f1_score(Y_test, y_pred)
print("accuracy_score : ", acc)
print("precision_score : ", prec)
print("recall_score : ", recall)
print("f1_score : ", f1)

target_names = ['class 0', 'class 1']
print(classification_report(Y_test, y_pred, target_names=target_names))
# True labels go first so rows are actual classes and columns are predicted
# classes, which is scikit-learn's convention for confusion_matrix.
cf_matrix=confusion_matrix(Y_test, y_pred)
cf_matrix

accuracy_score :  0.7463271302644466
precision_score :  0.5233415233415234
recall_score :  0.7661870503597122
f1_score :  0.6218978102189782
              precision    recall  f1-score   support

     class 0       0.89      0.74      0.81      1486
     class 1       0.52      0.77      0.62       556

    accuracy                           0.75      2042
   macro avg       0.71      0.75      0.72      2042
weighted avg       0.79      0.75      0.76      2042



array([[1098,  130],
       [ 388,  426]], dtype=int64)

### <font color='red'>SMOTE</font>

SMOTE stands for Synthetic Minority Oversampling Technique. This is a statistical technique for increasing the number of cases in your dataset in a balanced way. The module works by generating new instances from existing minority cases that you supply as input.

In [None]:
# Class balance of the (unbalanced) training split.
df_train.Churn.value_counts()


0    3688
1    1313
Name: Churn, dtype: int64

In [None]:
# Training inputs for SMOTE: the target, and every column except the
# identifier and the target as features.
Y_train_smote = df_train['Churn']
X_train_smote = df_train.drop(columns=['customerID', 'Churn'])
(X_train_smote.shape, Y_train_smote.shape)

((5001, 28), (5001,))

In [None]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state = 2)
# Pass the labels as a plain 1-D NumPy array. Series.ravel() is deprecated in
# recent pandas releases; to_numpy() yields the same array.
X_train_res, y_train_res = sm.fit_resample(X_train_smote, Y_train_smote.to_numpy())

In [None]:
# Shapes of the SMOTE-resampled features and labels.
X_train_res.shape, y_train_res.shape

((7376, 28), (7376,))

In [None]:
# Number of resampled rows per class (0 first, then 1).
tuple(len(y_train_res[y_train_res == label]) for label in (0, 1))

(3688, 3688)

In [None]:

from sklearn.linear_model import LogisticRegression
# Fit on the SMOTE-resampled training data, then predict the held-out rows.
logistic_model = LogisticRegression(penalty='l2', C=1.0, solver='liblinear').fit(X_train_res, y_train_res)
y_pred = logistic_model.predict(X_test)

acc = accuracy_score(Y_test, y_pred)
prec = precision_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
f1 = f1_score(Y_test, y_pred)
print("accuracy_score : ", acc)
print("precision_score : ", prec)
print("recall_score : ", recall)
print("f1_score : ", f1)

target_names = ['class 0', 'class 1']
print(classification_report(Y_test, y_pred, target_names=target_names))
# True labels go first so rows are actual classes and columns are predicted
# classes, which is scikit-learn's convention for confusion_matrix.
cf_matrix=confusion_matrix(Y_test, y_pred)
cf_matrix

accuracy_score :  0.7850146914789422
precision_score :  0.6017391304347826
recall_score :  0.6223021582733813
f1_score :  0.6118479221927499
              precision    recall  f1-score   support

     class 0       0.86      0.85      0.85      1486
     class 1       0.60      0.62      0.61       556

    accuracy                           0.79      2042
   macro avg       0.73      0.73      0.73      2042
weighted avg       0.79      0.79      0.79      2042



array([[1257,  210],
       [ 229,  346]], dtype=int64)

### <font color='red'>ADASYN</font>

ADASYN is based on the idea of adaptively generating minority data samples according to their distributions: more synthetic data is generated for minority class samples that are harder to learn compared to those minority samples that are easier to learn.


In [None]:
# Training inputs for ADASYN: the target, and every column except the
# identifier and the target as features.
Y_train_adas = df_train['Churn']
X_train_adas = df_train.drop(columns=['customerID', 'Churn'])
(X_train_adas.shape, Y_train_adas.shape)

((5001, 28), (5001,))

In [None]:
from imblearn.over_sampling import ADASYN
sm = ADASYN(random_state = 2)
# Pass the labels as a plain 1-D NumPy array. Series.ravel() is deprecated in
# recent pandas releases; to_numpy() yields the same array.
X_train_ada, y_train_ada = sm.fit_resample(X_train_adas, Y_train_adas.to_numpy())

In [None]:
# Shapes of the ADASYN-resampled features and labels.
X_train_ada.shape, y_train_ada.shape

((7371, 28), (7371,))

In [None]:
# Number of resampled rows per class (0 first, then 1).
tuple(len(y_train_ada[y_train_ada == label]) for label in (0, 1))

(3688, 3683)

In [None]:

from sklearn.linear_model import LogisticRegression
# Fit on the ADASYN-resampled training data, then predict the held-out rows.
logistic_model = LogisticRegression(penalty='l2', C=1.0, solver='liblinear').fit(X_train_ada, y_train_ada)
y_pred = logistic_model.predict(X_test)

acc = accuracy_score(Y_test, y_pred)
prec = precision_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
f1 = f1_score(Y_test, y_pred)
print("accuracy_score : ", acc)
print("precision_score : ", prec)
print("recall_score : ", recall)
print("f1_score : ", f1)

target_names = ['class 0', 'class 1']
print(classification_report(Y_test, y_pred, target_names=target_names))
# True labels go first so rows are actual classes and columns are predicted
# classes, which is scikit-learn's convention for confusion_matrix.
cf_matrix=confusion_matrix(Y_test, y_pred)
cf_matrix

accuracy_score :  0.7928501469147894
precision_score :  0.6276391554702495
recall_score :  0.5881294964028777
f1_score :  0.607242339832869
              precision    recall  f1-score   support

     class 0       0.85      0.87      0.86      1486
     class 1       0.63      0.59      0.61       556

    accuracy                           0.79      2042
   macro avg       0.74      0.73      0.73      2042
weighted avg       0.79      0.79      0.79      2042



array([[1292,  229],
       [ 194,  327]], dtype=int64)

### <font color='red'>SMOTE + ENN</font>

SMOTE + ENN is another hybrid technique, in which a larger number of observations are removed from the sample space. ENN (Edited Nearest Neighbours) is an undersampling technique in which the nearest neighbors of each majority-class instance are found; if those nearest neighbors misclassify that instance, the instance is deleted.
Integrating this technique with the oversampled data produced by SMOTE gives a more extensive data-cleaning step. Here, samples from both classes that are misclassified by their nearest neighbors are removed, which results in a clearer and more concise class separation.

In [None]:
# Training inputs for SMOTE + ENN: the target, and every column except the
# identifier and the target as features.
Y_train_se = df_train['Churn']
X_train_se = df_train.drop(columns=['customerID', 'Churn'])
(X_train_se.shape, Y_train_se.shape)

((5001, 28), (5001,))

In [None]:
from imblearn.combine import SMOTEENN
sm = SMOTEENN(random_state = 2)
# Pass the labels as a plain 1-D NumPy array. Series.ravel() is deprecated in
# recent pandas releases; to_numpy() yields the same array.
X_train_sen, y_train_sen = sm.fit_resample(X_train_se, Y_train_se.to_numpy())

In [None]:
# Shapes of the SMOTE + ENN resampled features and labels.
X_train_sen.shape,y_train_sen.shape

((4246, 28), (4246,))

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit on the SMOTE + ENN resampled training data, then predict the held-out rows.
logistic_model = LogisticRegression(penalty='l2', C=1.0, solver='liblinear').fit(X_train_sen, y_train_sen)
y_pred = logistic_model.predict(X_test)

acc = accuracy_score(Y_test, y_pred)
prec = precision_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
f1 = f1_score(Y_test, y_pred)
print("accuracy_score : ", acc)
print("precision_score : ", prec)
print("recall_score : ", recall)
print("f1_score : ", f1)

target_names = ['class 0', 'class 1']
print(classification_report(Y_test, y_pred, target_names=target_names))
# True labels go first so rows are actual classes and columns are predicted
# classes, which is scikit-learn's convention for confusion_matrix.
cf_matrix=confusion_matrix(Y_test, y_pred)
cf_matrix

accuracy_score :  0.7605288932419196
precision_score :  0.5472496473906912
recall_score :  0.697841726618705
f1_score :  0.6134387351778656
              precision    recall  f1-score   support

     class 0       0.87      0.78      0.83      1486
     class 1       0.55      0.70      0.61       556

    accuracy                           0.76      2042
   macro avg       0.71      0.74      0.72      2042
weighted avg       0.79      0.76      0.77      2042



array([[1165,  168],
       [ 321,  388]], dtype=int64)