In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

In [5]:
## load dataset
churnData = pd.read_csv('customer_churn.csv') # this file is in files_for_lab folder
churnData.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [12]:
## prepare variables
X = churnData[['tenure', 'SeniorCitizen','MonthlyCharges']]
y = churnData[['Churn']]
transformer = StandardScaler().fit(X)
X = transformer.transform(X)

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=100)

In [14]:
## model
model = LogisticRegression(random_state=0, solver='lbfgs',
                        multi_class='ovr').fit(X_train, y_train)

model.score(X_test, y_test)

0.7821149751596878

In [21]:
y_train['Churn'].value_counts()

No     3108
Yes    1117
Name: Churn, dtype: int64

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=100)

In [23]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()
X_sm, y_sm = smote.fit_sample(X_train, y_train)
y_sm['Churn'].value_counts()

Yes    3108
No     3108
Name: Churn, dtype: int64

In [24]:
## model
model = LogisticRegression(random_state=0, solver='lbfgs',
                        multi_class='ovr').fit(X_sm, y_sm)

model.score(X_test, y_test)

0.7210787792760823

In [25]:
from sklearn.metrics import confusion_matrix

y_pred=model.predict(X_test)
confusion_matrix(y_pred, y_test)

array([[1483,  203],
       [ 583,  549]])

In [28]:
pd.Series(y_pred).value_counts()

No     1686
Yes    1132
dtype: int64

In [29]:
y_test['Churn'].value_counts()

No     2066
Yes     752
Name: Churn, dtype: int64

In [30]:
from imblearn.under_sampling import TomekLinks

tl = TomekLinks('majority')
X_tl, y_tl = tl.fit_sample(X_train, y_train)

In [32]:
## model
model = LogisticRegression(random_state=0, solver='lbfgs',
                        multi_class='ovr').fit(X_tl, y_tl)

model.score(X_test, y_test)

0.7679205110007097

In [33]:
from sklearn.metrics import confusion_matrix

y_pred=model.predict(X_test)
confusion_matrix(y_pred, y_test)

array([[1776,  364],
       [ 290,  388]])

In [34]:
pd.Series(y_pred).value_counts()

No     2140
Yes     678
dtype: int64

In [35]:
y_test['Churn'].value_counts()

No     2066
Yes     752
Name: Churn, dtype: int64