In [11]:
import pandas as pd
import numpy as np
import datetime
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression

# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides any deprecation or convergence
# warnings raised by the libraries used below - consider narrowing it.
warnings.filterwarnings(action='ignore')

In [2]:
# Load the raw churn table; the path is relative to the notebook's working directory.
churn_csv_path = 'Customer-Churn.csv'
data = pd.read_csv(churn_csv_path)

In [3]:
# Percentage of NaN entries per column, shown from highest to lowest.
# Only real NaN values are counted here; blank strings are not detected by isna().
null_share = data.isna().sum() * 100 / len(data)
nulls = pd.DataFrame(null_share, columns=['percentage'])
nulls.sort_values(by='percentage', ascending=False)

Unnamed: 0,percentage
gender,0.0
SeniorCitizen,0.0
Partner,0.0
Dependents,0.0
tenure,0.0
PhoneService,0.0
OnlineSecurity,0.0
OnlineBackup,0.0
DeviceProtection,0.0
TechSupport,0.0


In [4]:
# Entries that are exactly a single space become the string '0' so the column
# can be parsed as a number in the next step.
total_charges_filled = data['TotalCharges'].replace(to_replace=[' '], value='0')
data['TotalCharges'] = total_charges_filled

In [5]:
# Parse TotalCharges into a numeric dtype; any value that cannot be parsed raises.
total_charges_numeric = pd.to_numeric(data['TotalCharges'], errors='raise')
data['TotalCharges'] = total_charges_numeric

In [6]:
# Feature matrix: the four numeric columns used by every model below.
feature_columns = ['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']
X = data[feature_columns]

# Target: the Churn label column.
y = data['Churn']

# SMOTE (oversampling the minority class)

In [7]:
from imblearn.over_sampling import SMOTE

# Oversample so that both Churn classes end up with the same number of rows.
# The sampler is seeded: SMOTE generates its synthetic rows at random, so without
# a fixed random_state the resampled data - and every score computed from it -
# would change on each run.
smote = SMOTE(random_state=42)

# The features are passed as a plain array, so X_sm carries no column names.
X_sm, y_sm = smote.fit_resample(np.array(X), y)

# Class counts after resampling.
y_sm.value_counts()

No     5174
Yes    5174
Name: Churn, dtype: int64

In [12]:
# Hold out 30% of the SMOTE-resampled rows for evaluation (fixed seed for a repeatable split).
# NOTE(review): the split happens after resampling, so the test set also contains
# synthetic rows - the scores below are not measured on untouched data.
X_train, X_test, y_train, y_test = train_test_split(
    X_sm,
    y_sm,
    test_size=0.3,
    random_state=42,
)

# Fit a default logistic regression and keep its test-set predictions for the reports below.
logreg = LogisticRegression().fit(X_train, y_train)
y_pred = logreg.predict(X_test)

test_accuracy = logreg.score(X_test, y_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(test_accuracy))

Accuracy of logistic regression classifier on test set: 0.73


In [14]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the predictions currently held in y_pred.
report_text = classification_report(y_test, y_pred)
print(report_text)

              precision    recall  f1-score   support

          No       0.77      0.68      0.72      1574
         Yes       0.71      0.79      0.75      1531

    accuracy                           0.73      3105
   macro avg       0.74      0.74      0.73      3105
weighted avg       0.74      0.73      0.73      3105



In [15]:
from sklearn.metrics import confusion_matrix

# Store the result under its own name. Assigning it back to `confusion_matrix`
# would replace the imported function with an array, and any later call to
# confusion_matrix(...) would then fail.
# Rows are the true labels and columns the predicted labels, in sorted label order.
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[1067  507]
 [ 316 1215]]


In [16]:
from sklearn.tree import DecisionTreeClassifier

In [23]:
# Fit a decision tree on the current train/test split.
# The tree is seeded because scikit-learn permutes the candidate features at random
# at each split; without a fixed random_state the fitted tree and the score below
# can change between runs.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Mean accuracy on the held-out rows.
model.score(X_test, y_test)


0.77487922705314

In [24]:
# Replace the earlier predictions in y_pred with the decision tree's test-set predictions.
y_pred = model.predict(X=X_test)

In [25]:
# Per-class metrics for whatever predictions y_pred currently holds
# (the decision tree's, when the cells are run in order).
tree_report = classification_report(y_test, y_pred)
print(tree_report)

              precision    recall  f1-score   support

          No       0.78      0.78      0.78      1574
         Yes       0.77      0.77      0.77      1531

    accuracy                           0.77      3105
   macro avg       0.77      0.77      0.77      3105
weighted avg       0.77      0.77      0.77      3105



# Tomek links (undersampling the majority class)

In [28]:
from imblearn.under_sampling import TomekLinks

# Drop majority-class rows that form Tomek links with a minority-class row.
# sampling_strategy is passed by keyword: recent imbalanced-learn releases no longer
# accept constructor arguments positionally.
tl = TomekLinks(sampling_strategy='majority')

# The features are passed as a plain array, so X_tl carries no column names.
X_tl, y_tl = tl.fit_resample(np.array(X), y)

# Class counts after undersampling.
y_tl.value_counts()

No     4620
Yes    1869
Name: Churn, dtype: int64

In [29]:
# 70/30 split of the Tomek-link-cleaned data, with the same seed as the SMOTE experiment.
# NOTE(review): the test rows come from the undersampled data, not from the raw table.
X_train, X_test, y_train, y_test = train_test_split(
    X_tl,
    y_tl,
    test_size=0.3,
    random_state=42,
)

# Default logistic regression; y_pred is reused by the report cell that follows.
logreg = LogisticRegression().fit(X_train, y_train)
y_pred = logreg.predict(X_test)

test_accuracy = logreg.score(X_test, y_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(test_accuracy))

Accuracy of logistic regression classifier on test set: 0.79


In [30]:
# Per-class metrics for the predictions currently in y_pred
# (logistic regression on the Tomek-links split, when run in order).
logreg_report = classification_report(y_test, y_pred)
print(logreg_report)

              precision    recall  f1-score   support

          No       0.82      0.89      0.86      1379
         Yes       0.67      0.54      0.60       568

    accuracy                           0.79      1947
   macro avg       0.75      0.71      0.73      1947
weighted avg       0.78      0.79      0.78      1947



In [31]:
# Decision tree on the Tomek-links split.
# Seeded so the fitted tree and its accuracy are repeatable: scikit-learn shuffles
# the candidate features at each split, so an unseeded tree can differ run to run.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Mean accuracy on the held-out rows.
model.score(X_test, y_test)

0.7370313302516692

In [32]:
# Overwrite y_pred with the decision tree's predictions for the held-out rows.
y_pred = model.predict(X=X_test)

In [33]:
# Per-class metrics for the predictions currently in y_pred
# (the decision tree's, when the cells are run in order).
tree_report = classification_report(y_test, y_pred)
print(tree_report)

              precision    recall  f1-score   support

          No       0.81      0.82      0.82      1379
         Yes       0.55      0.53      0.54       568

    accuracy                           0.74      1947
   macro avg       0.68      0.68      0.68      1947
weighted avg       0.73      0.74      0.74      1947



# Tomek links: second run (applied again to the already undersampled data)

In [35]:
from imblearn.under_sampling import TomekLinks

# Second Tomek-links pass, applied to the output of the first pass.
# sampling_strategy is passed by keyword: recent imbalanced-learn releases no longer
# accept constructor arguments positionally.
tl = TomekLinks(sampling_strategy='majority')

# NOTE(review): this statement overwrites its own inputs (X_tl, y_tl), so re-running
# the cell may remove further rows each time. Run it exactly once after the first pass.
X_tl, y_tl = tl.fit_resample(np.array(X_tl), y_tl)

# Class counts after the second pass.
y_tl.value_counts()

No     4453
Yes    1869
Name: Churn, dtype: int64

In [36]:
# 70/30 split of the twice-cleaned data (same seed as the earlier experiments).
# NOTE(review): as before, the held-out rows are taken from the resampled data.
X_train, X_test, y_train, y_test = train_test_split(
    X_tl,
    y_tl,
    test_size=0.3,
    random_state=42,
)

# Default logistic regression; its predictions feed the report in the next cell.
logreg = LogisticRegression().fit(X_train, y_train)
y_pred = logreg.predict(X_test)

test_accuracy = logreg.score(X_test, y_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(test_accuracy))

Accuracy of logistic regression classifier on test set: 0.80


In [37]:
# Per-class metrics for the predictions currently in y_pred
# (logistic regression on the second-pass split, when run in order).
logreg_report = classification_report(y_test, y_pred)
print(logreg_report)

              precision    recall  f1-score   support

          No       0.83      0.89      0.86      1337
         Yes       0.68      0.58      0.63       560

    accuracy                           0.80      1897
   macro avg       0.76      0.73      0.74      1897
weighted avg       0.79      0.80      0.79      1897



In [38]:
# Decision tree on the second-pass Tomek-links split.
# A fixed random_state keeps the fitted tree and its accuracy repeatable; scikit-learn
# shuffles the candidate features at each split, so an unseeded tree can vary.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Mean accuracy on the held-out rows.
model.score(X_test, y_test)

0.7496046389035319

In [39]:
# Overwrite y_pred with the decision tree's predictions for the held-out rows.
y_pred = model.predict(X=X_test)

In [40]:
# Per-class metrics for the predictions currently in y_pred
# (the decision tree's, when the cells are run in order).
tree_report = classification_report(y_test, y_pred)
print(tree_report)

              precision    recall  f1-score   support

          No       0.83      0.82      0.82      1337
         Yes       0.57      0.59      0.58       560

    accuracy                           0.75      1897
   macro avg       0.70      0.70      0.70      1897
weighted avg       0.75      0.75      0.75      1897

