In [1]:
import pandas as pd
import numpy as np
import datetime
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('Customer-Churn.csv')

In [3]:
data.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.7,151.65,Yes


In [4]:
data.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [5]:
data.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [6]:
nulls = pd.DataFrame(data.isna().sum()*100/len(data), columns=['percentage'])
nulls.sort_values('percentage', ascending = False)

Unnamed: 0,percentage
gender,0.0
SeniorCitizen,0.0
Partner,0.0
Dependents,0.0
tenure,0.0
PhoneService,0.0
OnlineSecurity,0.0
OnlineBackup,0.0
DeviceProtection,0.0
TechSupport,0.0


In [7]:
data['TotalCharges'] = data['TotalCharges'].replace([' '],'0')

In [8]:
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'])

In [9]:
X = data[['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']]
y = data['Churn']

In [10]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import train_test_split
scaler = StandardScaler()
X_train, X_test, y_train_, y_test_ = train_test_split(X, y, test_size=0.25, random_state=42)

scaler.fit(X_train)
X_train_, X_test_ = (scaler.transform(X_train), scaler.transform(X_test))

In [11]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train_, y_train_)
y_pred = logreg.predict(X_test_)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test_, y_test_)))

Accuracy of logistic regression classifier on test set: 0.80


In [12]:
from sklearn.metrics import classification_report
print(classification_report(y_test_, y_pred))

              precision    recall  f1-score   support

          No       0.82      0.93      0.87      1282
         Yes       0.70      0.46      0.55       479

    accuracy                           0.80      1761
   macro avg       0.76      0.69      0.71      1761
weighted avg       0.79      0.80      0.78      1761



In [13]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test_, y_pred)
print(confusion_matrix)

[[1189   93]
 [ 260  219]]


In [14]:
data['Churn'].value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

We have high data imbalance towards the no values

TRYING DIFFERENT WAYS TO DEAL WITH IMBALACE

In [15]:
from imblearn.over_sampling import SMOTE

smote = SMOTE()

X_sm, y_sm = smote.fit_resample(np.array(X), y)
y_sm.value_counts()

No     5174
Yes    5174
Name: Churn, dtype: int64

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.3, random_state=42)

logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

Accuracy of logistic regression classifier on test set: 0.74


In [17]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          No       0.77      0.71      0.74      1574
         Yes       0.72      0.78      0.75      1531

    accuracy                           0.74      3105
   macro avg       0.74      0.74      0.74      3105
weighted avg       0.74      0.74      0.74      3105



In [18]:
from imblearn.under_sampling import TomekLinks

tl = TomekLinks('majority')

X_tl, y_tl = tl.fit_resample(np.array(X), y)

y_tl.value_counts()

No     4620
Yes    1869
Name: Churn, dtype: int64

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X_tl, y_tl, test_size=0.3, random_state=42)

logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))


Accuracy of logistic regression classifier on test set: 0.79


In [20]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          No       0.82      0.89      0.86      1379
         Yes       0.67      0.54      0.60       568

    accuracy                           0.79      1947
   macro avg       0.75      0.71      0.73      1947
weighted avg       0.78      0.79      0.78      1947



Scaling and then using smote / tomek

In [21]:
X = data[['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']]
y = data['Churn']

In [22]:
scaler = StandardScaler()
X_train, X_test, y_train_, y_test_ = train_test_split(X, y, test_size=0.25, random_state=42)

scaler.fit(X_train)
X_train_, X_test_ = (scaler.transform(X_train), scaler.transform(X_test))

In [23]:
smote = SMOTE()

X_sm, y_sm = smote.fit_resample(np.array(X_train_), y_train_)
y_sm.value_counts()

No     3892
Yes    3892
Name: Churn, dtype: int64

In [24]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_sm, y_sm)
y_pred = logreg.predict(X_test_)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test_, y_test_)))

Accuracy of logistic regression classifier on test set: 0.74


In [25]:
from sklearn.metrics import classification_report
print(classification_report(y_test_, y_pred))

              precision    recall  f1-score   support

          No       0.90      0.73      0.80      1282
         Yes       0.51      0.77      0.62       479

    accuracy                           0.74      1761
   macro avg       0.71      0.75      0.71      1761
weighted avg       0.79      0.74      0.75      1761



In [26]:
X = data[['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']]
y = data['Churn']

In [27]:
scaler = StandardScaler()
X_train, X_test, y_train_, y_test_ = train_test_split(X, y, test_size=0.25, random_state=42)

scaler.fit(X_train)
X_train_, X_test_ = (scaler.transform(X_train), scaler.transform(X_test))

In [28]:
from imblearn.under_sampling import TomekLinks

tl = TomekLinks('majority')

X_tl, y_tl = tl.fit_resample(np.array(X_train_), y_train_)

y_tl.value_counts()

No     3489
Yes    1390
Name: Churn, dtype: int64

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X_tl, y_tl, test_size=0.3, random_state=42)

logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

Accuracy of logistic regression classifier on test set: 0.79


In [30]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          No       0.83      0.90      0.86      1048
         Yes       0.67      0.52      0.59       416

    accuracy                           0.79      1464
   macro avg       0.75      0.71      0.72      1464
weighted avg       0.78      0.79      0.78      1464

