# Mutual Info

In [12]:
import pandas as pd
import numpy as np
from sklearn.metrics import mutual_info_score

In [13]:
# URL of the Telco customer churn CSV (hosted in the mlbookcamp-code repository, chapter 03)
data = 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv'

In [14]:
# read the CSV behind the `data` URL into a DataFrame
df = pd.read_csv(filepath_or_buffer=data)

# preview the first ten rows
df.head(n=10)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes
5,9305-CDSKC,Female,0,No,No,8,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,Yes
6,1452-KIOVK,Male,0,No,Yes,22,Yes,Yes,Fiber optic,No,...,No,No,Yes,No,Month-to-month,Yes,Credit card (automatic),89.1,1949.4,No
7,6713-OKOMC,Female,0,No,No,10,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,No,Mailed check,29.75,301.9,No
8,7892-POOKP,Female,0,Yes,No,28,Yes,Yes,Fiber optic,No,...,Yes,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes
9,6388-TABGU,Male,0,No,Yes,62,Yes,No,DSL,Yes,...,No,No,No,No,One year,No,Bank transfer (automatic),56.15,3487.95,No


In [15]:
# normalise the column labels: lowercase, underscores instead of spaces
df.columns = df.columns.str.lower().str.replace(' ', '_')

# columns stored with the generic 'object' dtype are treated as text columns
is_object_dtype = df.dtypes == 'object'
categorical_columns = is_object_dtype[is_object_dtype].index.tolist()

# apply the same lowercase/underscore normalisation to the values of every text column
for column_name in categorical_columns:
    lowered_values = df[column_name].str.lower()
    df[column_name] = lowered_values.str.replace(' ', '_')

In [16]:
# encode the churn label as an integer flag: 'yes' -> 1, 'no' -> 0
# NOTE: y_target is re-assigned to the raw churn column in a later cell
churn_encoding = {'yes': 1, 'no': 0}
y_target = df['churn'].map(churn_encoding)

In [17]:
# one-hot encode four of the text columns
# NOTE(review): dfenc is not referenced again in the visible cells
dfenc = pd.get_dummies(
    df[['gender', 'partner', 'phoneservice', 'paymentmethod']]
)

In [18]:
# text-valued feature columns; these are the ones scored with mutual information below
categorical_columns = [
    'gender',
    'partner',
    'dependents',
    'phoneservice',
    'multiplelines',
    'internetservice',
    'onlinesecurity',
    'onlinebackup',
    'deviceprotection',
    'techsupport',
    'streamingtv',
    'streamingmovies',
    'contract',
    'paperlessbilling',
    'paymentmethod',
]

# numeric feature columns
numeric_columns = ['seniorcitizen', 'tenure', 'monthlycharges', 'totalcharges']


In [19]:
# use the raw (string-valued) churn column as the target;
# this re-binds y_target, replacing the 0/1-encoded Series built in an earlier cell
y_target = df['churn']

In [20]:
# take a four-column subset of the frame for a quick first experiment
small_columns = ['gender', 'partner', 'phoneservice', 'paymentmethod']
dfsmall = df[small_columns]

In [21]:
# score every column of the small frame against the target,
# collecting (column name, MI score) pairs and printing each one as it is computed
mi_scores_small = []
for feature_name, feature_values in dfsmall.items():
    score = mutual_info_score(feature_values, y_target)
    print(f'{feature_name}: {score}')
    mi_scores_small.append((feature_name, score))


gender: 3.7082914405128786e-05
partner: 0.011453657253317984
phoneservice: 7.215949186982484e-05
paymentmethod: 0.044518668630902994


In [22]:
# order the (column, score) pairs from highest to lowest MI score
mi_scores_small = sorted(mi_scores_small, key=lambda pair: pair[1], reverse=True)

mi_scores_small

[('paymentmethod', 0.044518668630902994),
 ('partner', 0.011453657253317984),
 ('phoneservice', 7.215949186982484e-05),
 ('gender', 3.7082914405128786e-05)]

Mutual information:
* mutual_info_score is meant for categorical (discrete) variables only: both the feature and the target must be categorical.
* A high mutual information score means that if I know one variable, I can make a better guess about the other.

Example:
* Rain and umbrellas. When it's raining, you can guess that people will use an umbrella.
* Temperature and ice sales. When the temperature is high, you can guess that ice sales will increase.

mutual_info_score measures the strength of a dependency, not its direction. In contrast, correlation shows both strength and direction (+/-).

Suppose a feature has a high mutual_info_score and a negative correlation with the target. This means the feature has a strong relationship with the target, but the relationship is inverse (as the feature increases, the target tends to decrease).

Meaning of Mutual Info Score:
* MI tells how much knowing feature X reduces the uncertainty about target Y.
* If the MI score = 0, the feature and the target are independent.
* If the MI score > 0, the feature carries information about the target; the higher the score, the more informative the feature is.

In the sample above:
* paymentmethod (0.0445) has the highest score of the four, i.e. the strongest relationship with the target. It provides the most information for predicting the target.
* partner (0.0115) has a weak relationship: it gives some information, but limited.
* phoneservice (0.000072) has almost no relationship with the target and is not helpful for predicting it.
* gender (0.000037) has almost no relationship with the target and is not helpful for predicting it.
* paymentmethod is the feature most strongly related to the target (churn).
* partner, phoneservice, and gender are not strongly related to churn.



Next, calculate the MI score for all categorical columns.

In [23]:
# score every listed categorical column against the target,
# collecting (column name, MI score) pairs and printing each one as it is computed
mi_scores = []
for feature_name in categorical_columns:
    score = mutual_info_score(df[feature_name], y_target)
    print(f'{feature_name}: {score}')
    mi_scores.append((feature_name, score))


gender: 3.7082914405128786e-05
partner: 0.011453657253317984
dependents: 0.014467261139424592
phoneservice: 7.215949186982484e-05
multiplelines: 0.0008012658524292199
internetservice: 0.05557418477268879
onlinesecurity: 0.06467728245735829
onlinebackup: 0.04679232253922637
deviceprotection: 0.04391690927485155
techsupport: 0.06302103606897548
streamingtv: 0.031907975162527094
streamingmovies: 0.03200094959522297
contract: 0.09845305342598942
paperlessbilling: 0.019194399646111526
paymentmethod: 0.044518668630902994


In [24]:
# order the (column, score) pairs from highest to lowest MI score
mi_scores = sorted(mi_scores, key=lambda pair: pair[1], reverse=True)

mi_scores

[('contract', 0.09845305342598942),
 ('onlinesecurity', 0.06467728245735829),
 ('techsupport', 0.06302103606897548),
 ('internetservice', 0.05557418477268879),
 ('onlinebackup', 0.04679232253922637),
 ('paymentmethod', 0.044518668630902994),
 ('deviceprotection', 0.04391690927485155),
 ('streamingmovies', 0.03200094959522297),
 ('streamingtv', 0.031907975162527094),
 ('paperlessbilling', 0.019194399646111526),
 ('dependents', 0.014467261139424592),
 ('partner', 0.011453657253317984),
 ('multiplelines', 0.0008012658524292199),
 ('phoneservice', 7.215949186982484e-05),
 ('gender', 3.7082914405128786e-05)]

In [None]:
# another way to calculate mutual information, using apply

def calc_mi_score(series, target=None):
    """Return the mutual information score between one column and the target.

    Parameters
    ----------
    series : pandas.Series
        Feature column to score; ``DataFrame.apply`` passes each column in turn.
    target : array-like, optional
        Labels to score the column against. When omitted, the notebook-level
        ``y_target`` is used, so existing one-argument calls keep working.

    Returns
    -------
    float
        The value returned by ``mutual_info_score(series, target)``.
    """
    if target is None:
        # resolved at call time, so a re-bound y_target in another cell is picked up
        target = y_target
    return mutual_info_score(series, target)

df[categorical_columns].apply(calc_mi_score).sort_values(ascending=False)

contract            0.098453
onlinesecurity      0.064677
techsupport         0.063021
internetservice     0.055574
onlinebackup        0.046792
paymentmethod       0.044519
deviceprotection    0.043917
streamingmovies     0.032001
streamingtv         0.031908
paperlessbilling    0.019194
dependents          0.014467
partner             0.011454
multiplelines       0.000801
phoneservice        0.000072
gender              0.000037
dtype: float64