In [36]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [37]:
#Data overview
df = pd.read_csv('../../datasets/kaggle/WA_Fn-UseC_-Telco-Customer-Churn.csv')
print(f'number of rows:{len(df)}')
df.head().T

number of rows:7043


Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


In [38]:
# Check Types are correct
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [39]:
# We expected TotalCharges would be numeric but is an object
# Lets take care of that
total_charges = pd.to_numeric(df.TotalCharges, errors="coerce")
df[total_charges.isnull()][['customerID','TotalCharges']]

Unnamed: 0,customerID,TotalCharges
488,4472-LVYGI,
753,3115-CZMZD,
936,5709-LVOEQ,
1082,4367-NUYAO,
1340,1371-DWPAZ,
3331,7644-OMVMY,
3826,3213-VVOLG,
4380,2520-SGTTA,
5218,2923-ARZLG,
6670,4075-WKNIU,


In [40]:
# Lets fill with 0 where function couldn't create a numeric value
df.TotalCharges = pd.to_numeric(df.TotalCharges, errors="coerce")
df.TotalCharges = df.TotalCharges.fillna(0)

In [41]:
# Let’s make it uniform by lowercasing everything and replacing spaces with underscores
df.columns = df.columns.str.lower().str.replace(' ', '_')
 
string_columns = list(df.dtypes[df.dtypes == 'object'].index)
 
for col in string_columns:
    df[col] = df[col].str.lower().str.replace(' ', '_')
df.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,7590-vhveg,female,0,yes,no,1,no,no_phone_service,dsl,no,...,no,no,no,no,month-to-month,yes,electronic_check,29.85,29.85,no
1,5575-gnvde,male,0,no,no,34,yes,no,dsl,yes,...,yes,no,no,no,one_year,no,mailed_check,56.95,1889.5,no
2,3668-qpybk,male,0,no,no,2,yes,no,dsl,yes,...,no,no,no,no,month-to-month,yes,mailed_check,53.85,108.15,yes
3,7795-cfocw,male,0,no,no,45,no,no_phone_service,dsl,yes,...,yes,yes,no,no,one_year,no,bank_transfer_(automatic),42.3,1840.75,no
4,9237-hqitu,female,0,no,no,2,yes,no,fiber_optic,no,...,no,no,no,no,month-to-month,yes,electronic_check,70.7,151.65,yes


In [42]:
# turn our attention to churn column, which is going to be our Target
df.churn = (df.churn == 'yes').astype(int)
df.churn.head()
df.churn.value_counts()

#(df.churn == 'yes')

0    5174
1    1869
Name: churn, dtype: int64

In [43]:
# Separate data for training and testing
from sklearn.model_selection import train_test_split
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=1)
print(f'rows in train data: {len(df_train_full)}')
print(f'rows in test data: {len(df_test)}')
df_train_full.head()

rows in train data: 5634
rows in test data: 1409


Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
1814,5442-pptjy,male,0,yes,yes,12,yes,no,no,no_internet_service,...,no_internet_service,no_internet_service,no_internet_service,no_internet_service,two_year,no,mailed_check,19.7,258.35,0
5946,6261-rcvns,female,0,no,no,42,yes,no,dsl,yes,...,yes,yes,no,yes,one_year,no,credit_card_(automatic),73.9,3160.55,1
3881,2176-osjuv,male,0,yes,no,71,yes,yes,dsl,yes,...,no,yes,no,no,two_year,no,bank_transfer_(automatic),65.15,4681.75,0
2389,6161-erdgd,male,0,yes,yes,71,yes,yes,dsl,yes,...,yes,yes,yes,yes,one_year,no,electronic_check,85.45,6300.85,0
3676,2364-ufrom,male,0,no,no,30,yes,no,dsl,yes,...,no,yes,yes,no,one_year,no,electronic_check,70.4,2044.75,0


In [44]:
# Let’s take the df_train_full dataframe and split it one more time into train and validation
df_train, df_val = train_test_split(df_train_full, test_size=0.33, random_state=11)

# Obtain Target feature
y_train = df_train.churn.values
y_val = df_val.churn.values
# Remove target from X features
del df_train['churn']
del df_val['churn']


# Begin Exploratory Data Analysis

In [45]:
#check for any missing values
df_train_full.isnull().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [48]:
# Calculate churn rate
global_mean = round(df_train_full.churn.mean(),3)
# The global_mean indicates that this dataset is 'imbalanced'
global_mean

0.27

In [50]:
# Let's divide our data in Numerical and Categorical variables
categorical = ['gender', 'seniorcitizen', 'partner', 'dependents',
               'phoneservice', 'multiplelines', 'internetservice',
               'onlinesecurity', 'onlinebackup', 'deviceprotection',
               'techsupport', 'streamingtv', 'streamingmovies',
               'contract', 'paperlessbilling', 'paymentmethod']
numerical = ['tenure', 'monthlycharges', 'totalcharges']

In [51]:
# Notice that our categorical values have few unique values
# Which is something good as it reduces the amount of cleaning
df_train_full[categorical].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

### Feature importance: Understanding the important features for our model

#### Churn Rate
It is a good idea to check the churn rate for each categorical variable and compare it with the global churn rate.
If the difference between the rates is small, the value is not important when predicting churn because this group of customers is not really different from the rest of the customers. On the other hand, if the difference is not small, something inside that group sets it apart from the rest. A machine learning algorithm should be able to pick this up and use it when making predictions.

In [56]:
# Let's start with the gender churn rate
female_mean = df_train_full[df_train_full.gender=='female'].churn.mean()
male_mean = df_train_full[df_train_full.gender=='male'].churn.mean()
print(f'female_mean:{female_mean}, male_mean:{male_mean}')
# Then by 'partner' 
partner_yes = df_train_full[df_train_full.partner=='yes'].churn.mean()
partner_no = df_train_full[df_train_full.partner=='no'].churn.mean()
print(f'partner_yes:{partner_yes}, partner_no:{partner_no}')
print(f'global churn rate: {global_mean}')

# we notice that clients with no partner are more likely to churn than the ones with a partner


female_mean:0.27682403433476394, male_mean:0.2632135306553911
partner_yes:0.20503330866025166, partner_no:0.3298090040927694
global churn rate: 0.27


#### Risk Ratio
In statistics, the ratio between probabilities in different groups is called the risk ratio, where risk refers to the risk of having the effect. In our case, the effect is churn, so it’s the risk of churning:

$risk = \frac{group \ rate}{ global \ rate} $

- When Ratio is close to 1: this group has the same level of risk as the rest of the population.
- When Ratio is lower than 1:  the group has lower risks: the churn rate in this group is smaller than the global churn. For example, the value 0.5 means that the clients in this group are two times less likely to churn than clients in general 
- When Ratio is greater than 1: the group is risky: there’s more churn in the group than in the population.

In [58]:
#Lets calculate the risk for each category
for col in categorical:
    df_group = df_train_full.groupby(by=col).churn.agg(['mean']) 
    df_group['diff'] = df_group['mean'] - global_mean
    df_group['risk'] = df_group['mean'] / global_mean
    display(df_group)

Unnamed: 0_level_0,mean,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.276824,0.006824,1.025274
male,0.263214,-0.006786,0.974865


Unnamed: 0_level_0,mean,diff,risk
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.24227,-0.02773,0.897297
1,0.413377,0.143377,1.531027


Unnamed: 0_level_0,mean,diff,risk
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.329809,0.059809,1.221515
yes,0.205033,-0.064967,0.759383


Unnamed: 0_level_0,mean,diff,risk
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.31376,0.04376,1.162074
yes,0.165666,-0.104334,0.613579


Unnamed: 0_level_0,mean,diff,risk
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.241316,-0.028684,0.893764
yes,0.273049,0.003049,1.011292


Unnamed: 0_level_0,mean,diff,risk
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.257407,-0.012593,0.953361
no_phone_service,0.241316,-0.028684,0.893764
yes,0.290742,0.020742,1.07682


Unnamed: 0_level_0,mean,diff,risk
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dsl,0.192347,-0.077653,0.712398
fiber_optic,0.425171,0.155171,1.574709
no,0.077805,-0.192195,0.288167


Unnamed: 0_level_0,mean,diff,risk
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.420921,0.150921,1.558967
no_internet_service,0.077805,-0.192195,0.288167
yes,0.153226,-0.116774,0.567503


Unnamed: 0_level_0,mean,diff,risk
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.404323,0.134323,1.497494
no_internet_service,0.077805,-0.192195,0.288167
yes,0.217232,-0.052768,0.804564


Unnamed: 0_level_0,mean,diff,risk
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.395875,0.125875,1.466205
no_internet_service,0.077805,-0.192195,0.288167
yes,0.230412,-0.039588,0.853379


Unnamed: 0_level_0,mean,diff,risk
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.418914,0.148914,1.551534
no_internet_service,0.077805,-0.192195,0.288167
yes,0.159926,-0.110074,0.59232


Unnamed: 0_level_0,mean,diff,risk
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.342832,0.072832,1.269747
no_internet_service,0.077805,-0.192195,0.288167
yes,0.302723,0.032723,1.121195


Unnamed: 0_level_0,mean,diff,risk
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.338906,0.068906,1.255209
no_internet_service,0.077805,-0.192195,0.288167
yes,0.307273,0.037273,1.138047


Unnamed: 0_level_0,mean,diff,risk
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
month-to-month,0.431701,0.161701,1.598893
one_year,0.120573,-0.149427,0.446568
two_year,0.028274,-0.241726,0.104718


Unnamed: 0_level_0,mean,diff,risk
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.172071,-0.097929,0.6373
yes,0.338151,0.068151,1.252412


Unnamed: 0_level_0,mean,diff,risk
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bank_transfer_(automatic),0.168171,-0.101829,0.622854
credit_card_(automatic),0.164339,-0.105661,0.608661
electronic_check,0.45589,0.18589,1.688482
mailed_check,0.19387,-0.07613,0.718036


We learn that:
- For gender, there is not much difference between females and males. Both means are approximately the same, and for both groups the risks are close to 1.

- Senior citizens tend to churn more than nonseniors: the risk of churning is 1.53 for seniors and 0.89 for nonseniors.

- People with a partner churn less than people with no partner. The risks are 0.75 and 1.22, respectively.

- People who use phone service are not at risk of churning: the risk is close to 1, and there’s almost no difference with the global churn rate. People who don’t use phone service are even less likely to churn: the risk is below 1, and the difference with the global churn rate is negative.

#### Mutual Information

we can measure the degree of dependency between a categorical variable and the target variable. If two variables are dependent, knowing the value of one variable gives us some information about another. On the other hand, if a variable is completely independent of the target variable, it’s not useful and can be safely removed from the dataset.

In [60]:
from sklearn.metrics import mutual_info_score
 
def calculate_mi(series):                                      
    return mutual_info_score(series, df_train_full.churn)      
 
df_mi = df_train_full[categorical].apply(calculate_mi)         
df_mi = df_mi.sort_values(ascending=False).to_frame(name='MI') 
# As we see, contract, onlinesecurity, and techsupport are among the most important features
df_mi

Unnamed: 0,MI
contract,0.09832
onlinesecurity,0.063085
techsupport,0.061032
internetservice,0.055868
onlinebackup,0.046923
deviceprotection,0.043453
paymentmethod,0.04321
streamingtv,0.031853
streamingmovies,0.031581
paperlessbilling,0.017589


#### CORRELATION COEFFICIENT
measure the dependency between a binary target variable and a numerical variable. We can pretend that the binary variable is numerical (containing only the numbers zero and one) and then use the classical methods from statistics to check for any dependency between these variables

In [62]:
df_train_full[numerical].corrwith(df_train_full.churn)

tenure           -0.351885
monthlycharges    0.196805
totalcharges     -0.196353
dtype: float64

- The correlation between tenure and churn is –0.35: it has a negative sign, so the longer customers stay, the less often they tend to churn. For customers staying with the company for two months or less, the churn rate is 60%; for customers with tenure between 3 and 12 months, the churn rate is 40%; and for customers staying longer than a year, the churn rate is 17%. So the higher the value of tenure, the smaller the churn rate (figure 3.21A).

- monthlycharges has a positive coefficient of 0.19, which means that customers who pay more tend to leave more often. Only 8% of those who pay less than $20 monthly churned; customers paying between $21 and $50 churn more frequently with a churn rate of 18%; and 32% of people paying more than $50 churned (figure 3.21B).

- totalcharges has a negative correlation, which makes sense: the longer people stay with the company, the more they have paid in total, so it’s less likely that they will leave. In this case, we expect a pattern similar to tenure. For small values, the churn rate is high; for larger values, it’s lower.

# Feature Engineering
### One hot encoding

In [67]:
# Need to create a traiing dictionary
train_dict = df_train[categorical + numerical].to_dict(orient='records')
train_dict

[{'gender': 'male',
  'seniorcitizen': 0,
  'partner': 'yes',
  'dependents': 'no',
  'phoneservice': 'yes',
  'multiplelines': 'no',
  'internetservice': 'dsl',
  'onlinesecurity': 'yes',
  'onlinebackup': 'yes',
  'deviceprotection': 'yes',
  'techsupport': 'yes',
  'streamingtv': 'yes',
  'streamingmovies': 'yes',
  'contract': 'two_year',
  'paperlessbilling': 'yes',
  'paymentmethod': 'bank_transfer_(automatic)',
  'tenure': 71,
  'monthlycharges': 86.1,
  'totalcharges': 6045.9},
 {'gender': 'female',
  'seniorcitizen': 1,
  'partner': 'yes',
  'dependents': 'no',
  'phoneservice': 'yes',
  'multiplelines': 'yes',
  'internetservice': 'fiber_optic',
  'onlinesecurity': 'no',
  'onlinebackup': 'no',
  'deviceprotection': 'yes',
  'techsupport': 'no',
  'streamingtv': 'yes',
  'streamingmovies': 'yes',
  'contract': 'one_year',
  'paperlessbilling': 'yes',
  'paymentmethod': 'credit_card_(automatic)',
  'tenure': 60,
  'monthlycharges': 100.5,
  'totalcharges': 6029.0},
 {'gender':

In [70]:
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)
# Obtain the X_train feature matrix
X_train = dv.transform(train_dict)
X_train.shape

(3774, 45)

# Logistic Regression

In [71]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='liblinear', random_state=1)
model.fit(X_train, y_train)

LogisticRegression(random_state=1, solver='liblinear')

In [73]:
# Use the validation data to obtain a validation data frame
val_dict = df_val[categorical + numerical].to_dict(orient='records') 
X_val = dv.transform(val_dict) 

In [74]:
y_pred = model.predict_proba(X_val)
y_pred

array([[0.76508733, 0.23491267],
       [0.73112969, 0.26887031],
       [0.6805478 , 0.3194522 ],
       ...,
       [0.94274623, 0.05725377],
       [0.38476886, 0.61523114],
       [0.93872731, 0.06127269]])