In [405]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [406]:
# Load the Telco customer-churn CSV (path is relative to the notebook) and preview the first rows.
df = pd.read_csv('../data/external/WA_Fn-UseC_-Telco-Customer-Churn.csv')
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


# Data Preparation

In [407]:
# Normalise column names: lower-case them and replace spaces with underscores.
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Collect the object-dtype (string) columns so their values get the same treatment.
categorical_columns = [name for name, dtype in df.dtypes.items() if dtype == 'object']

for c in categorical_columns:
    lowered = df[c].str.lower()
    df[c] = lowered.str.replace(' ', '_')

In [408]:
# Transpose the preview so every column is shown as a row.
df.head().T

Unnamed: 0,0,1,2,3,4
customerid,7590-vhveg,5575-gnvde,3668-qpybk,7795-cfocw,9237-hqitu
gender,female,male,male,male,female
seniorcitizen,0,0,0,0,0
partner,yes,no,no,no,no
dependents,no,no,no,no,no
tenure,1,34,2,45,2
phoneservice,no,yes,yes,no,yes
multiplelines,no_phone_service,no,no,no_phone_service,no
internetservice,dsl,dsl,dsl,dsl,fiber_optic
onlinesecurity,no,yes,yes,yes,no


In [409]:
# Inspect the dtype pandas inferred for each column.
df.dtypes


customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges         object
churn                object
dtype: object

In [410]:
# totalcharges was read as object dtype but should be numeric. Parse it as numbers;
# entries that cannot be parsed become NaN (errors='coerce') and are then set to 0.
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce').fillna(0)



In [411]:
# Encode the target as an integer flag: 1 where churn is 'yes', 0 otherwise.
df['churn'] = df['churn'].eq('yes').astype(int)

# Setting Up the Validation Framework

In [412]:
from sklearn.model_selection import train_test_split

In [413]:
# Hold out 20% of the rows for testing, then take 25% of the remainder for validation
# (0.25 * 0.8 = 0.2 of the data), giving a 60/20/20 train/validation/test split.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# Replace the shuffled original row labels with a fresh 0..n-1 index on every split.
df_full_train, df_train, df_val, df_test = (
    part.reset_index(drop=True)
    for part in (df_full_train, df_train, df_val, df_test)
)


In [414]:
# Pull the target out of each modelling split as a NumPy array.
y_train = df_train['churn'].values
y_val = df_val['churn'].values
y_test = df_test['churn'].values

# Remove the target column from the feature frames so it cannot be used as an input.
# df_full_train is left untouched; it still carries churn.
for frame in (df_train, df_val, df_test):
    del frame['churn']

# Exploratory Data Analysis

In [415]:
# Count missing values per column in the full training split.
df_full_train.isnull().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [416]:
# Class balance of the target: proportion of rows with churn == 0 and churn == 1.
df_full_train.churn.value_counts(normalize=True)

churn
0    0.730032
1    0.269968
Name: proportion, dtype: float64

In [417]:
# Overall churn rate in the full training split (the mean of the 0/1 target).
# It is the baseline that the per-group churn rates are compared against.
global_churn_rate = df_full_train.churn.mean()
float(round(global_churn_rate, 2))

0.27

In [418]:
# Re-inspect dtypes after the totalcharges and churn conversions.
df.dtypes

customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges        float64
churn                 int64
dtype: object

In [419]:
# Columns treated as numerical features.
numerical = ['tenure', 'monthlycharges', 'totalcharges']

In [420]:
# Columns treated as categorical features. seniorcitizen is stored as int64 (0/1)
# but is handled as a category. customerid and churn are left out.
categorical = [
    'gender',
    'seniorcitizen',
    'partner',
    'dependents',
    'phoneservice',
    'multiplelines',
    'internetservice',
    'onlinesecurity',
    'onlinebackup',
    'deviceprotection',
    'techsupport',
    'streamingtv',
    'streamingmovies',
    'contract',
    'paperlessbilling',
    'paymentmethod',
]
# Number of distinct values per categorical column.
df_full_train[categorical].nunique()


gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

## Feature Importance

In [421]:
# Preview the full training split (it still includes the churn column).
df_full_train.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,5442-pptjy,male,0,yes,yes,12,yes,no,no,no_internet_service,...,no_internet_service,no_internet_service,no_internet_service,no_internet_service,two_year,no,mailed_check,19.7,258.35,0
1,6261-rcvns,female,0,no,no,42,yes,no,dsl,yes,...,yes,yes,no,yes,one_year,no,credit_card_(automatic),73.9,3160.55,1
2,2176-osjuv,male,0,yes,no,71,yes,yes,dsl,yes,...,no,yes,no,no,two_year,no,bank_transfer_(automatic),65.15,4681.75,0
3,6161-erdgd,male,0,yes,yes,71,yes,yes,dsl,yes,...,yes,yes,yes,yes,one_year,no,electronic_check,85.45,6300.85,0
4,2364-ufrom,male,0,no,no,30,yes,no,dsl,yes,...,no,yes,yes,no,one_year,no,electronic_check,70.4,2044.75,0


In [422]:
# Churn rate within each gender, for comparison against the global churn rate.
is_female = df_full_train.gender == 'female'
is_male = df_full_train.gender == 'male'
churn_female = float(df_full_train.loc[is_female, 'churn'].mean())
churn_male = float(df_full_train.loc[is_male, 'churn'].mean())
churn_female, churn_male


(0.27682403433476394, 0.2632135306553911)

In [423]:
# Churn rate for senior (seniorcitizen == 1) and non-senior (seniorcitizen == 0) customers.
senior_mask = df_full_train.seniorcitizen == 1
non_senior_mask = df_full_train.seniorcitizen == 0
churn_senior = float(df_full_train.loc[senior_mask, 'churn'].mean())
churn_non_senior = float(df_full_train.loc[non_senior_mask, 'churn'].mean())

# Risk ratio = group churn rate / global churn rate; a value above 1 means the group
# churns more often than the average customer.
non_senior_risk_ratio = float(churn_non_senior / global_churn_rate)
senior_risk_ratio = float(churn_senior / global_churn_rate)
non_senior_risk_ratio, senior_risk_ratio


(0.8974033167171667, 1.5312078272604588)

In [424]:
from IPython.display import display

In [425]:
# For every categorical feature, show the churn rate and row count of each value,
# plus its difference from and ratio to the global churn rate.
for c in categorical:
    print(c)
    group_stats = df_full_train.groupby(c).churn.agg(['mean', 'count'])
    df_group = group_stats.assign(
        diff=group_stats['mean'] - global_churn_rate,
        risk_ratio=group_stats['mean'] / global_churn_rate,
    )
    display(df_group)
    print()

gender


Unnamed: 0_level_0,mean,count,diff,risk_ratio
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.276824,2796,0.006856,1.025396
male,0.263214,2838,-0.006755,0.97498



seniorcitizen


Unnamed: 0_level_0,mean,count,diff,risk_ratio
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.24227,4722,-0.027698,0.897403
1,0.413377,912,0.143409,1.531208



partner


Unnamed: 0_level_0,mean,count,diff,risk_ratio
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.329809,2932,0.059841,1.221659
yes,0.205033,2702,-0.064935,0.759472



dependents


Unnamed: 0_level_0,mean,count,diff,risk_ratio
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.31376,3968,0.043792,1.162212
yes,0.165666,1666,-0.104302,0.613651



phoneservice


Unnamed: 0_level_0,mean,count,diff,risk_ratio
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.241316,547,-0.028652,0.89387
yes,0.273049,5087,0.003081,1.011412



multiplelines


Unnamed: 0_level_0,mean,count,diff,risk_ratio
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.257407,2700,-0.012561,0.953474
no_phone_service,0.241316,547,-0.028652,0.89387
yes,0.290742,2387,0.020773,1.076948



internetservice


Unnamed: 0_level_0,mean,count,diff,risk_ratio
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dsl,0.192347,1934,-0.077621,0.712482
fiber_optic,0.425171,2479,0.155203,1.574895
no,0.077805,1221,-0.192163,0.288201



onlinesecurity


Unnamed: 0_level_0,mean,count,diff,risk_ratio
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.420921,2801,0.150953,1.559152
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.153226,1612,-0.116742,0.56757



onlinebackup


Unnamed: 0_level_0,mean,count,diff,risk_ratio
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.404323,2498,0.134355,1.497672
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.217232,1915,-0.052736,0.80466



deviceprotection


Unnamed: 0_level_0,mean,count,diff,risk_ratio
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.395875,2473,0.125907,1.466379
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.230412,1940,-0.039556,0.85348



techsupport


Unnamed: 0_level_0,mean,count,diff,risk_ratio
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.418914,2781,0.148946,1.551717
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.159926,1632,-0.110042,0.59239



streamingtv


Unnamed: 0_level_0,mean,count,diff,risk_ratio
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.342832,2246,0.072864,1.269897
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.302723,2167,0.032755,1.121328



streamingmovies


Unnamed: 0_level_0,mean,count,diff,risk_ratio
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.338906,2213,0.068938,1.255358
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.307273,2200,0.037305,1.138182



contract


Unnamed: 0_level_0,mean,count,diff,risk_ratio
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
month-to-month,0.431701,3104,0.161733,1.599082
one_year,0.120573,1186,-0.149395,0.446621
two_year,0.028274,1344,-0.241694,0.10473



paperlessbilling


Unnamed: 0_level_0,mean,count,diff,risk_ratio
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.172071,2313,-0.097897,0.637375
yes,0.338151,3321,0.068183,1.25256



paymentmethod


Unnamed: 0_level_0,mean,count,diff,risk_ratio
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bank_transfer_(automatic),0.168171,1219,-0.101797,0.622928
credit_card_(automatic),0.164339,1217,-0.10563,0.608733
electronic_check,0.45589,1893,0.185922,1.688682
mailed_check,0.19387,1305,-0.076098,0.718121





## Categorical Feature Importance: Mutual Information

In [426]:
from sklearn.metrics import mutual_info_score

In [427]:
# Mutual information between the churn target and the contract column.
mutual_info_score(df_full_train.churn, df_full_train.contract)

0.0983203874041556

In [428]:
def mutual_info_churn_score(series):
    """Return the mutual information between `series` and the churn target.

    The target is read from the notebook-level `df_full_train.churn`.
    """
    churn_labels = df_full_train.churn
    return mutual_info_score(series, churn_labels)

In [429]:
# Score every categorical column against churn and list them from most to least informative.
mi = df_full_train[categorical].apply(mutual_info_churn_score)
mi.sort_values(ascending=False)

contract            0.098320
onlinesecurity      0.063085
techsupport         0.061032
internetservice     0.055868
onlinebackup        0.046923
deviceprotection    0.043453
paymentmethod       0.043210
streamingtv         0.031853
streamingmovies     0.031581
paperlessbilling    0.017589
dependents          0.012346
partner             0.009968
seniorcitizen       0.009410
multiplelines       0.000857
phoneservice        0.000229
gender              0.000117
dtype: float64

## Numerical Feature Importance: Correlation

In [430]:
# Largest tenure value in the full training split.
int(df_full_train.tenure.max())

72

In [431]:
# Absolute correlation of each numerical feature with churn (the sign is dropped by .abs()).
df_full_train[numerical].corrwith(df_full_train.churn).abs()

tenure            0.351885
monthlycharges    0.196805
totalcharges      0.196353
dtype: float64

#### Tenure

In [432]:
# Churn rate for customers with tenure of at most 2.
float(df_full_train[df_full_train.tenure <= 2].churn.mean())

0.5953420669577875

In [433]:
# Churn rate for customers with tenure above 2 and up to 12.
float(df_full_train[(df_full_train.tenure > 2) & (df_full_train.tenure <= 12)].churn.mean())

0.3994413407821229

In [434]:
# Churn rate for customers with tenure above 12.
float(df_full_train[df_full_train.tenure > 12].churn.mean())

0.17634908339788277

#### Monthly Charges

In [435]:
# Churn rate for customers whose monthly charges are at most 20.
float(df_full_train[df_full_train.monthlycharges <= 20].churn.mean())

0.08795411089866156

In [436]:
# Churn rate for customers whose monthly charges are above 20 and up to 50.
float(df_full_train[(df_full_train.monthlycharges > 20) & (df_full_train.monthlycharges <= 50)].churn.mean())

0.18340943683409436

In [437]:
# Churn rate for customers whose monthly charges are above 50.
float(df_full_train[df_full_train.monthlycharges > 50].churn.mean())

0.32499341585462205

# One-hot encoding

In [438]:
from sklearn.feature_extraction import DictVectorizer

In [439]:
# DictVectorizer one-hot encodes string values and passes numeric values through;
# sparse=False makes it return a dense NumPy array.
dv = DictVectorizer(sparse=False)
feature_columns = categorical + numerical

# Learn the feature vocabulary from the training records only...
train_dict = df_train[feature_columns].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

# ...and reuse that fitted vocabulary to encode the validation records.
val_dict = df_val[feature_columns].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [440]:
# Compare the raw training frame's shape with the encoded matrix's (rows, columns).
print(df_train.shape)
len(X_train), len(X_train[0])

(4225, 20)


(4225, 45)

# Training Logistic Regression For Binary Classification

In [441]:
from sklearn.linear_model import LogisticRegression

In [442]:
model = LogisticRegression(max_iter=10000)
# max_iter is raised far above the default of 100 so the solver can converge.
# NOTE(review): the numerical features are not scaled before fitting, which is a likely
# reason so many iterations are needed; scaling the inputs (e.g. a max-abs or standard
# scaler) is an alternative to try. X_train is a dense array here (DictVectorizer was
# created with sparse=False), not a sparse matrix.

model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,10000


In [443]:
# Learned weights, one per encoded feature column, rounded for display.
model.coef_[0].round(3)

array([ 0.675,  0.028, -0.67 ,  0.051, -0.018,  0.111, -0.156,  0.078,
        0.036, -0.003, -0.489,  0.679, -0.156, -0.017, -0.191,  0.073,
        0.151,  0.115, -0.156,  0.075,  0.283, -0.156, -0.093, -0.165,
        0.198, -0.048,  0.08 , -0.055, -0.007,  0.105, -0.01 ,  0.073,
       -0.04 ,  0.195, -0.094, -0.156,  0.283, -0.054, -0.156,  0.243,
        0.233, -0.156, -0.043, -0.069,  0.   ])

In [444]:
# Hard predictions: the predicted class label (0 or 1) for each training row.
model.predict(X_train)

array([0, 1, 1, ..., 1, 0, 1], shape=(4225,))

In [445]:
# Soft predictions: one row per customer with the probability of class 0 and of class 1.
model.predict_proba(X_train)

array([[0.92592804, 0.07407196],
       [0.32455043, 0.67544957],
       [0.36884497, 0.63115503],
       ...,
       [0.4883836 , 0.5116164 ],
       [0.97411627, 0.02588373],
       [0.30558907, 0.69441093]], shape=(4225, 2))

In [446]:
# Keep only the second column: the predicted probability of churn (class 1) on validation.
y_pred = model.predict_proba(X_val)[:, 1]


In [447]:
# Turn probabilities into a yes/no churn decision using a 0.5 threshold.
churn_decision = (y_pred >= 0.5)



In [448]:
# Validation accuracy: the share of rows where the decision matches the true label.
float((y_val == churn_decision).mean())

0.8026969481902059

# Model Interpretation

In [449]:

# Pair each encoded feature name with its rounded weight for easier reading.
dict(zip(dv.get_feature_names_out(), model.coef_[0].round(3).tolist()))


{'contract=month-to-month': 0.675,
 'contract=one_year': 0.028,
 'contract=two_year': -0.67,
 'dependents=no': 0.051,
 'dependents=yes': -0.018,
 'deviceprotection=no': 0.111,
 'deviceprotection=no_internet_service': -0.156,
 'deviceprotection=yes': 0.078,
 'gender=female': 0.036,
 'gender=male': -0.003,
 'internetservice=dsl': -0.489,
 'internetservice=fiber_optic': 0.679,
 'internetservice=no': -0.156,
 'monthlycharges': -0.017,
 'multiplelines=no': -0.191,
 'multiplelines=no_phone_service': 0.073,
 'multiplelines=yes': 0.151,
 'onlinebackup=no': 0.115,
 'onlinebackup=no_internet_service': -0.156,
 'onlinebackup=yes': 0.075,
 'onlinesecurity=no': 0.283,
 'onlinesecurity=no_internet_service': -0.156,
 'onlinesecurity=yes': -0.093,
 'paperlessbilling=no': -0.165,
 'paperlessbilling=yes': 0.198,
 'partner=no': -0.048,
 'partner=yes': 0.08,
 'paymentmethod=bank_transfer_(automatic)': -0.055,
 'paymentmethod=credit_card_(automatic)': -0.007,
 'paymentmethod=electronic_check': 0.105,
 'pay

In [450]:
# A reduced feature set for a smaller model whose weights are easy to inspect by hand.
small = ['contract', 'monthlycharges', 'tenure']

In [451]:
# Build record dicts restricted to the small feature set.
dict_train_small = df_train[small].to_dict(orient='records')
dict_val_small = df_val[small].to_dict(orient='records')

# A separate vectorizer for the small model, fitted on the training records only.
dv_small = DictVectorizer(sparse=False)
dv_small.fit(dict_train_small)

0,1,2
,dtype,<class 'numpy.float64'>
,separator,'='
,sparse,False
,sort,True


In [452]:
# Encoded column names produced by the small vectorizer.
dv_small.get_feature_names_out()

array(['contract=month-to-month', 'contract=one_year',
       'contract=two_year', 'monthlycharges', 'tenure'], dtype=object)

In [453]:
# Encode the small-feature training records with the fitted small vectorizer.
X_train_small = dv_small.transform(dict_train_small)

In [454]:
# Fit a logistic regression with default settings on the small feature matrix.
model_small = LogisticRegression()
model_small.fit(X_train_small, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [455]:
# Intercept (bias term) of the small model.
w0 = float(model_small.intercept_[0])
w0

-2.477957597647984

In [456]:
# Weight vector of the small model, in the order of dv_small's feature names.
w = model_small.coef_[0]
w.round(3)

array([ 0.971, -0.024, -0.948,  0.027, -0.036])

In [457]:
# Pair each small-model feature name with its rounded weight.
dict(zip(dv_small.get_feature_names_out(), w.round(3).tolist()))

{'contract=month-to-month': 0.971,
 'contract=one_year': -0.024,
 'contract=two_year': -0.948,
 'monthlycharges': 0.027,
 'tenure': -0.036}

In [None]:
# Logistic (sigmoid) function: maps a raw model score to a value between 0 and 1.
def sigmoid(z):
    """Return 1 / (1 + e^(-z)) as a built-in Python float."""
    denominator = 1 + np.exp(-z)
    return float(1 / denominator)

In [466]:
# Score one customer by hand with the rounded small-model weights:
# intercept (-2.48) + contract weight (0.971) + 50/month * 0.027 + 28 months tenure * (-0.036).
# NOTE(review): 0.971 is the contract=month-to-month weight, so this scores a
# month-to-month customer; a two-year contract would use -0.948 instead.
-2.48 + 0.971 + 50 * 0.027 + 28 * (-0.036)

-1.1669999999999998

In [465]:
# Convert the hand-computed score into a churn probability.
sigmoid(-1.16699999999)

0.23739767639020593

The customer is predicted not to churn: their estimated probability of churning is about 24%, which is below the 0.5 decision threshold.

# Using the Model

In [467]:
# Refit the vectorizer on the full training split (train + validation rows) before the final fit.
dict_full_train = df_full_train[categorical + numerical].to_dict(orient='records')
X_full_train = dv.fit_transform(dict_full_train)

# Target for the full training split (df_full_train still holds the churn column).
y_full_train = df_full_train.churn.values

In [468]:
# Train the final model on the full training split; this rebinds `model`.
model = LogisticRegression(max_iter=10000)
model.fit(X_full_train, y_full_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,10000


In [471]:
# Encode the test split with the vectorizer that was fitted on the full training data.
# Use transform, not fit_transform: refitting on the test records would rebuild the
# feature vocabulary from test data, so the columns could stop lining up with the
# ones the model was trained on (and test information would leak into preprocessing).
dict_test = df_test[categorical + numerical].to_dict(orient='records')
X_test = dv.transform(dict_test)

In [472]:
# Predicted probability of churn (class 1) for every test customer.
test_probabilities = model.predict_proba(X_test)
y_pred = test_probabilities[:, 1]

In [473]:
# Preview the thresholded decisions (True where the churn probability is at least 0.5).
y_pred >= 0.5

array([False, False, False, ..., False, False,  True], shape=(1409,))

In [475]:
# Apply the 0.5 threshold and compute test accuracy: the share of decisions matching y_test.
churn_decision = (y_pred >= 0.5)
(churn_decision == y_test).mean()

np.float64(0.8126330731014905)