In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

In [64]:
data = pd.read_csv('customer_churn.csv')

In [67]:
pd.set_option('display.max_columns', None)
display(data.head(5))
data.shape

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,0
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,0
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,1


(7043, 21)

In [66]:
data['Churn'] = data['Churn'].str.replace(r'No','0')
data['Churn'] = data['Churn'].str.replace(r'Yes','1')

In [5]:
data['Churn'] = pd.to_numeric(data['Churn'])
data.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                 int64
dtype: object

In [6]:
data2 = data[['tenure', 'SeniorCitizen', 'MonthlyCharges', 'Churn']].copy()
data2

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,Churn
0,1,0,29.85,0
1,34,0,56.95,0
2,2,0,53.85,1
3,45,0,42.30,0
4,2,0,70.70,1
...,...,...,...,...
7038,24,0,84.80,0
7039,72,0,103.20,0
7040,11,0,29.60,0
7041,4,1,74.40,1


In [7]:
data2.dtypes

tenure              int64
SeniorCitizen       int64
MonthlyCharges    float64
Churn               int64
dtype: object

In [8]:
data2.isna().sum()

tenure            0
SeniorCitizen     0
MonthlyCharges    0
Churn             0
dtype: int64

In [9]:
data2['Churn'].value_counts()

0    5174
1    1869
Name: Churn, dtype: int64

In [10]:
X = data2.drop('Churn',axis = 1)
y = data2['Churn']

In [11]:
# correlations_matrix = X.corr()
# sns.heatmap(correlations_matrix, annot=True)
# plt.show()

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [13]:
transformer = MinMaxScaler().fit(X_train)
X_train_scaled = pd.DataFrame(transformer.transform(X_train),columns=X.columns)

X_test_scaled = pd.DataFrame(transformer.transform(X_test),columns=X.columns)
X_train_scaled.head()

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges
0,0.083333,0.0,0.310448
1,0.055556,0.0,0.670647
2,0.694444,0.0,0.246766
3,0.944444,0.0,0.912935
4,0.333333,0.0,0.568657


In [38]:
y_train = y_train.reset_index(drop=True) 
y_test = y_test.reset_index(drop=True) 

In [39]:
# the bad model
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression(solver='lbfgs')
LR.fit(X_train_scaled, y_train)
LR.score(X_test_scaled, y_test)

0.787052810902896

In [40]:
#while accuracy is not absolutely terrible, a closer look reveals some serious problems
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

pred = LR.predict(X_test_scaled)

print("precision: ",precision_score(y_test,pred))
print("recall: ",recall_score(y_test,pred))
print("f1: ",f1_score(y_test,pred))

precision:  0.6319018404907976
recall:  0.44685466377440347
f1:  0.5235069885641678


In [41]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,pred)

array([[1180,  120],
       [ 255,  206]], dtype=int64)

In [42]:
#now for the good model

In [43]:
from imblearn.over_sampling import SMOTE

In [44]:
sm = SMOTE(k_neighbors=3)
X_train_SMOTE,y_train_SMOTE = sm.fit_resample(X_train_scaled,y_train)

In [45]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression(solver='lbfgs')
LR.fit(X_train_SMOTE, y_train_SMOTE)
pred = LR.predict(X_test_scaled)

print("precision: ",precision_score(y_test,pred))
print("recall: ",recall_score(y_test,pred))
print("f1: ",f1_score(y_test,pred))

precision:  0.4935251798561151
recall:  0.7440347071583514
f1:  0.5934256055363323


In [46]:
confusion_matrix(y_test,pred)

array([[948, 352],
       [118, 343]], dtype=int64)

In [47]:
# smote almost halfed the lower left field (predicted stay, actually left),
# which is what we want here

In [48]:
# Bonus:

In [49]:
from sklearn.utils import resample

In [50]:
train = pd.concat([X_train_scaled, y_train],axis=1)
train.head()

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,Churn
0,0.083333,0.0,0.310448,0
1,0.055556,0.0,0.670647,1
2,0.694444,0.0,0.246766,0
3,0.944444,0.0,0.912935,1
4,0.333333,0.0,0.568657,1


In [51]:
# separate majority/minority classes
no_data2 = train[train['Churn']==0]
yes_data2 = train[train['Churn']==1]

In [52]:
# undersample majority
no_data2_undersampled = resample(no_data2, replace=False, n_samples = len(yes_data2))

In [56]:
train_undersampled = pd.concat([yes_data2,no_data2_undersampled], axis=0)
train_undersampled.head()
train_undersampled.isna().sum()
# train_undersampled.shape

tenure            0
SeniorCitizen     0
MonthlyCharges    0
Churn             0
dtype: int64

In [57]:
y_train_under = train_undersampled['Churn'].copy()
X_train_under = train_undersampled.drop('Churn',axis = 1).copy()

In [59]:
LR = LogisticRegression(solver='lbfgs')
LR.fit(X_train_under, y_train_under)
pred = LR.predict(X_test_scaled)

print("precision: ",precision_score(y_test,pred))
print("recall: ",recall_score(y_test,pred))
print("f1: ",f1_score(y_test,pred))

precision:  0.4935064935064935
recall:  0.7418655097613883
f1:  0.5927209705372617


In [60]:
confusion_matrix(y_test,pred)

array([[949, 351],
       [119, 342]], dtype=int64)