In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
import warnings
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

In [2]:
#read the data
churnData = pd.read_csv('customer_churn.csv',sep=",")
churnData.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
#Checking the values of the target
churnData['Churn'].value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

In [4]:
#Seperating the numerical columns
numericals = churnData.select_dtypes(include="number")
numericals.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
0,0,1,29.85
1,0,34,56.95
2,0,2,53.85
3,0,45,42.3
4,0,2,70.7


In [5]:
#Scaling the numericals
transformer = StandardScaler().fit(numericals)
standard_x = transformer.transform(numericals)
X = pd.DataFrame(standard_x)
X.head()

Unnamed: 0,0,1,2
0,-0.439916,-1.277445,-1.160323
1,-0.439916,0.066327,-0.259629
2,-0.439916,-1.236724,-0.36266
3,-0.439916,0.514251,-0.746535
4,-0.439916,-1.236724,0.197365


*Seperating the target and numericals*

In [6]:
X.columns = numericals.columns
X.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
0,-0.439916,-1.277445,-1.160323
1,-0.439916,0.066327,-0.259629
2,-0.439916,-1.236724,-0.36266
3,-0.439916,0.514251,-0.746535
4,-0.439916,-1.236724,0.197365


In [7]:
y = churnData["Churn"]
y

0        No
1        No
2       Yes
3        No
4       Yes
       ... 
7038     No
7039     No
7040     No
7041    Yes
7042     No
Name: Churn, Length: 7043, dtype: object

In [8]:
#Spliting the training and testing X-y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [9]:
#Calculating the number of rows of the numerical columns for being used for the modelling (30% of the numerical data)
len(X_test)

2113

In [10]:
#applying logistic classification model
classing = LogisticRegression(random_state=0, multi_class = 'ovr').fit(X_train, y_train)
predictions = classing.predict(X_test)

In [11]:
#Using confusion matrix to check the efficiency of the model
confusion_matrix(y_test,predictions)

array([[1420,  119],
       [ 317,  257]], dtype=int64)

*(tn, fp, fn, tp) We have 1420 No that have been predicted correctly, while we have 119 Yes that have been predicted incorrectly. On the other hand, we have 317 No that have been predicted incorrectly and 257 yes that have been predicted correctly. So the number of no answers being predicted wrong is higher than the number of No answers being predicted correctly.*

In [12]:
#accuracy
classing.score(X_test,y_test)

0.7936583057264552

*Even a simple model will give us more than 70% accuracy. Why?
Because we have an imbalanced data and the number of No answers are much less than yes. So the model is already good at predicting the yes answers as they are the majority. However, the model is not good at predicting the Nos. So, what we can do is increasing the number of no samples or reducing the number of yes samples to have a more balanced dataframe*

# Synthetic Minority Oversampling TEchnique (SMOTE) 

In [13]:
#Increasing the number of samples of the minority class by computing the difference between the two points and multiplying it by a random number between 0 and 1
from imblearn.over_sampling import SMOTE
smote = SMOTE()
#Taking the numerical values
X = churnData[['tenure', 'SeniorCitizen','MonthlyCharges']]
transformer = StandardScaler().fit(X)
X = transformer.transform(X)
#seperating the target
y = churnData['Churn']
#oversampling
X_sm, y_sm = smote.fit_resample(X, y)
y_sm.value_counts()

No     5174
Yes    5174
Name: Churn, dtype: int64

In [14]:
#applying logistic regression to predict the target
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.3, random_state=100)
classification = LogisticRegression(random_state=0, multi_class='ovr').fit(X_train, y_train)
predictions = classification.predict(X_test)
#Accuracy of the predictions
classification.score(X_test, y_test)

0.7394524959742351

In [16]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

          No       0.74      0.74      0.74      1557
         Yes       0.74      0.73      0.74      1548

    accuracy                           0.74      3105
   macro avg       0.74      0.74      0.74      3105
weighted avg       0.74      0.74      0.74      3105



# Downsampling by Tomeklinks

In [17]:
#using tomeklinks to reduce the number of noise in the bigger dataset 
from imblearn.under_sampling import TomekLinks

tl = TomekLinks(sampling_strategy='majority')
X_tl, y_tl = tl.fit_resample(X, y)
y_tl.value_counts()

No     4697
Yes    1869
Name: Churn, dtype: int64

In [19]:
#applying the logistic regression and predicting the churn 
X_train, X_test, y_train, y_test = train_test_split(X_tl, y_tl, test_size=0.3, random_state=100)
classification = LogisticRegression(random_state=0, multi_class='ovr').fit(X_train, y_train)
predictions = classification.predict(X_test)
#Calculating the accuracy
classification.score(X_test, y_test)

0.8040609137055837

In [20]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

          No       0.82      0.92      0.87      1397
         Yes       0.73      0.52      0.61       573

    accuracy                           0.80      1970
   macro avg       0.78      0.72      0.74      1970
weighted avg       0.80      0.80      0.79      1970



*The accuracy and recall and f1-score increased after using tomeklinks. So this is the best method to be chosen in case of this data for having a balanced data*