In [6]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings('ignore')


In [7]:
churnData = pd.read_csv('Customer-Churn.csv')
churnData.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.7,151.65,Yes


churnData['TotalCharges'] = pd.to_numeric(churnData['TotalCharges'], errors='coerce')
churnData['TotalCharges'] = churnData['TotalCharges'].fillna(np.mean(churnData['TotalCharges']))


Apply SMOTE for upsampling the data

Use logistic regression to fit the model and compute the accuracy of the model.
Use decision tree classifier to fit the model and compute the accuracy of the model.
Compare the accuracies of the two models

In [11]:
from sklearn.preprocessing import StandardScaler 
from imblearn.over_sampling import SMOTE 
smote = SMOTE() 
y = churnData['Churn'] 
X = churnData.drop(['Churn'],axis=1)
X = pd.get_dummies(X,drop_first=True)
X= pd.DataFrame(X)
transformer = StandardScaler().fit(X) 
X = transformer.transform(X) 
X_sm, y_sm = smote.fit_resample(X, y) 
y_sm.value_counts()



No     5174
Yes    5174
Name: Churn, dtype: int64

In [17]:
y.value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

In [12]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
model1 = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=0)
from sklearn.linear_model import LogisticRegression
model2 = LogisticRegression(solver='lbfgs')
#from sklearn.ensemble import RandomForestClassifier
#model3 = RandomForestClassifier(max_depth=2, random_state=0)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.2)

In [14]:
model_pipeline = [model1, model2]
model_names = ['Logistic Regression', 'Decision Tree Classifier']
scores = {}
i=0
for model in model_pipeline:
    mean_score = np.mean(cross_val_score(model, X_train, y_train, cv=25))
    scores[model_names[i]] = mean_score
    i = i+1
print(scores)

{'Logistic Regression': 0.8065926546063409, 'Decision Tree Classifier': 0.8796844174280203}


Apply TomekLinks for downsampling

It is important to remember that it does not make the two classes equal but only removes the points from the majority class that are close to other points in minority class.
Use logistic regression to fit the model and compute the accuracy of the model.
Use decision tree classifier to fit the model and compute the accuracy of the model.
Compare the accuracies of the two models.
You can also apply this algorithm one more time and check the how the imbalance in the two classes changed from the last time.

In [16]:
from imblearn.under_sampling import TomekLinks 
tome = TomekLinks() 
X_tl, y_tl = tome.fit_resample(X, y) 
y_tl.value_counts()

No     5047
Yes    1869
Name: Churn, dtype: int64

TomeLinks does not solve imbalance in the two classes

In [18]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_tl, y_tl, test_size=0.2)

In [19]:
scores1 = {}
i=0
for model in model_pipeline:
    mean_score = np.mean(cross_val_score(model, X_train1, y_train1, cv=25))
    scores[model_names[i]] = mean_score
    i = i+1
print(scores)

{'Logistic Regression': 0.7465883983531042, 'Decision Tree Classifier': 0.7772989278871631}
