In [1]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format='retina'

import helper as h

In [10]:
# Load the dataset through the project helper module.
# NOTE(review): what split_telco_data returns is defined in helper.py (not shown here);
# the cells below use the result as a DataFrame with a `churn` column — confirm.
df = h.split_telco_data()

In [6]:
from importlib import reload

In [7]:
# Re-import the helper module so edits made to helper.py are picked up without restarting the kernel.
reload(h)

<module 'helper' from '/Users/nadina/Documents/Zoomcapm/ML-Zoomcamp/week3/helper.py'>

In [11]:
# Preview the first five rows, transposed so each column is shown as a row.
df.head().T

Unnamed: 0,0,1,2,3,4
gender,female,female,male,male,male
seniorcitizen,0,0,0,0,0
partner,no,no,yes,no,no
dependents,yes,no,no,no,yes
tenure,21,54,1,4,0
phoneservice,yes,yes,no,yes,yes
multiplelines,no,yes,no_phone_service,no,yes
internetservice,dsl,fiber_optic,dsl,fiber_optic,dsl
onlinesecurity,yes,no,no,no,yes
onlinebackup,no,yes,no,no,yes


#### Feature importance

In [14]:
# Global churn rate: the mean of the `churn` column, rounded to two decimals.
df["churn"].mean().round(2)

0.27

We need to check how much the churn rate within each group of a feature differs from the global churn rate (the mean of the target variable `churn`). We can do this manually, with a `for` loop over the features, or by using `mutual_info_score`.

In [19]:
# Churn rate among female customers: not much different from the global mean.
df.loc[df["gender"] == "female", "churn"].mean()

0.2708409173643975

In [20]:
# Churn rate among male customers.
df.loc[df["gender"] == "male", "churn"].mean()

0.26047800484932454

In [22]:
# Churn rate among customers who have a partner.
df.loc[df["partner"] == "yes", "churn"].mean()

0.20073260073260074

In [23]:
# Compare the churn rate of customers with a partner (pm) against the global
# churn rate (gm): absolute difference and ratio (pm / gm).
# `gm` is reused by later cells, so its name is kept.
gm = df["churn"].mean()
pm = df.loc[df["partner"] == "yes", "churn"].mean()
print('Difference: ', abs(gm - pm))
print('Ratio: ', pm / gm)

Difference:  0.06479810569267436
Ratio:  0.7559675618499149


In [24]:
# Re-import the helper module so edits made to helper.py are picked up without restarting the kernel.
reload(h)

<module 'helper' from '/Users/nadina/Documents/Zoomcapm/ML-Zoomcamp/week3/helper.py'>

In [26]:
# Names of the categorical feature columns, as provided by the helper module.
categorical = h.get_categorical()
# Show the first three names as a quick sanity check.
categorical[:3]

['gender', 'seniorcitizen', 'partner']

In [28]:
for col in categorical:
    # For each value of the feature: its churn rate (`mean`), the offset from the
    # global churn rate `gm` (`diff`), and the ratio to `gm` (`risk`).
    df_group = (
        df.groupby(col)["churn"]
        .agg(["mean"])
        .assign(
            diff=lambda stats: stats["mean"] - gm,
            risk=lambda stats: stats["mean"] / gm,
        )
    )
    display(df_group)

Unnamed: 0_level_0,mean,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.270841,0.00531,1.019998
male,0.260478,-0.005053,0.980971


Unnamed: 0_level_0,mean,diff,risk
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.237098,-0.028433,0.892922
1,0.413907,0.148377,1.558793


Unnamed: 0_level_0,mean,diff,risk
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.326446,0.060916,1.229411
yes,0.200733,-0.064798,0.755968


Unnamed: 0_level_0,mean,diff,risk
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.312326,0.046795,1.176233
yes,0.155674,-0.109856,0.586276


Unnamed: 0_level_0,mean,diff,risk
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.253623,-0.011908,0.955156
yes,0.266824,0.001293,1.004871


Unnamed: 0_level_0,mean,diff,risk
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.251397,-0.014134,0.946771
no_phone_service,0.253623,-0.011908,0.955156
yes,0.284105,0.018574,1.069952


Unnamed: 0_level_0,mean,diff,risk
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dsl,0.191851,-0.073679,0.722521
fiber_optic,0.415558,0.150028,1.56501
no,0.076606,-0.188924,0.288502


Unnamed: 0_level_0,mean,diff,risk
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.416014,0.150484,1.566727
no_internet_service,0.076606,-0.188924,0.288502
yes,0.145342,-0.120189,0.547363


Unnamed: 0_level_0,mean,diff,risk
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.398693,0.133162,1.501494
no_internet_service,0.076606,-0.188924,0.288502
yes,0.216531,-0.048999,0.815467


Unnamed: 0_level_0,mean,diff,risk
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.387706,0.122175,1.460117
no_internet_service,0.076606,-0.188924,0.288502
yes,0.226825,-0.038705,0.854234


Unnamed: 0_level_0,mean,diff,risk
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.413472,0.147941,1.557153
no_internet_service,0.076606,-0.188924,0.288502
yes,0.152855,-0.112676,0.575657


Unnamed: 0_level_0,mean,diff,risk
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.335418,0.069887,1.263197
no_internet_service,0.076606,-0.188924,0.288502
yes,0.298945,0.033415,1.125841


Unnamed: 0_level_0,mean,diff,risk
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.333333,0.067803,1.255348
no_internet_service,0.076606,-0.188924,0.288502
yes,0.30132,0.035789,1.134784


Unnamed: 0_level_0,mean,diff,risk
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
month-to-month,0.426533,0.161002,1.60634
one_year,0.117987,-0.147544,0.444343
two_year,0.028379,-0.237151,0.106878


Unnamed: 0_level_0,mean,diff,risk
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.16414,-0.10139,0.618159
yes,0.33594,0.070409,1.265164


Unnamed: 0_level_0,mean,diff,risk
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bank_transfer_(automatic),0.174475,-0.091056,0.65708
credit_card_(automatic),0.152404,-0.113126,0.573961
electronic_check,0.449921,0.18439,1.69442
mailed_check,0.190328,-0.075203,0.716782


In [29]:
def calculate_mi(series, target=None):
    """Return the mutual information score between a feature and a target.

    Parameters
    ----------
    series : array-like
        Feature labels, one per row.
    target : array-like, optional
        Target labels aligned with ``series``. When omitted, the notebook-level
        ``df.churn`` is used, so ``df[categorical].apply(calculate_mi)`` keeps
        working unchanged.

    Returns
    -------
    float
        The value of ``mutual_info_score(series, target)``.
    """
    if target is None:
        # Fall back to the global frame's target column (the original behaviour).
        target = df.churn
    return mutual_info_score(series, target)

In [34]:
# Mutual information of every categorical feature with churn, highest first.
# The most important categorical features are contract / online security / tech support.
df[categorical].apply(calculate_mi).sort_values(ascending=False)

contract            0.096652
onlinesecurity      0.063393
techsupport         0.060935
internetservice     0.053313
onlinebackup        0.045424
paymentmethod       0.042861
deviceprotection    0.042007
streamingtv         0.030844
streamingmovies     0.030705
paperlessbilling    0.019077
dependents          0.014129
partner             0.010227
seniorcitizen       0.010059
multiplelines       0.000654
gender              0.000069
phoneservice        0.000040
dtype: float64

Let's compare these scores with the results of a chi-squared test.

In [37]:
# Re-import the helper module so edits made to helper.py are picked up without restarting the kernel.
reload(h)

<module 'helper' from '/Users/nadina/Documents/Zoomcapm/ML-Zoomcamp/week3/helper.py'>

In [47]:
# Per-feature p-values for the categorical columns, computed by the helper module.
# NOTE(review): presumably a chi-squared test of each feature against churn — the
# implementation lives in helper.get_p_values and is not shown here; confirm.
h.get_p_values(df, categorical)

Unnamed: 0,Feature,P_value,is_significant
13,contract,6.193927e-203,True
7,onlinesecurity,1.016256e-145,True
10,techsupport,5.145607e-140,True
6,internetservice,7.180814e-123,True
15,paymentmethod,6.781539e-108,True
8,onlinebackup,1.412204e-102,True
9,deviceprotection,1.464842e-93,True
11,streamingtv,7.352374e-64,True
12,streamingmovies,1.7453929999999998e-63,True
14,paperlessbilling,1.419575e-46,True


In [49]:
# Compare p-values with mutual information scores, side by side.
# Both inputs are indexed by feature name, so concat aligns them row by row.
# The MI series is given a name so its column gets a readable header
# instead of the default `0`.
pd.concat(
    [
        h.get_p_values(df, categorical).set_index('Feature'),
        df[categorical]
        .apply(calculate_mi)
        .sort_values(ascending=False)
        .rename('mutual_info'),
    ],
    axis=1,
)

Unnamed: 0,P_value,is_significant,0
contract,6.193927e-203,True,0.096652
onlinesecurity,1.016256e-145,True,0.063393
techsupport,5.145607e-140,True,0.060935
internetservice,7.180814e-123,True,0.053313
paymentmethod,6.781539e-108,True,0.042861
onlinebackup,1.412204e-102,True,0.045424
deviceprotection,1.464842e-93,True,0.042007
streamingtv,7.352374e-64,True,0.030844
streamingmovies,1.7453929999999998e-63,True,0.030705
paperlessbilling,1.419575e-46,True,0.019077
