## Filter Method of Feature Selection

Kulback Leibler Divergence

In [3]:
import numpy as np
import pandas as pd
from scipy.special import rel_entr

In [4]:
# example of calculating the kl divergence (relative entropy)

# define distributions
p = np.array([0.10, 0.40, 0.50])
q = np.array([0.80, 0.15, 0.05])

In [5]:
def KL_Divergence(p, q, epsilon=10e-10):
    p = (p + epsilon)/np.sum(p + epsilon) # Epsilon to avoid division by zero
    
    q = (q + epsilon)/np.sum(q + epsilon)
    return np.sum(p*np.log(p/q))

print(KL_Divergence(p, q))
print(KL_Divergence(q, p)) # Not symmetric

1.3356800809389957
1.401299579859536


In [6]:
# confirm with scipy
# Difference cos input has zero
kl_pq = rel_entr(p, q)
print('KL(P || Q): %.3f nats' % sum(kl_pq))

kl_qp = rel_entr(q, p)
print('KL(Q || P): %.3f nats' % sum(kl_qp))

KL(P || Q): 1.336 nats
KL(Q || P): 1.401 nats


In [7]:
df = pd.read_csv('diabetes.csv')

In [8]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [9]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [10]:
# List predictor colomns to compare with target
predictors = df.columns[:-1].copy(); target = df.columns[-1]
predictors = list(predictors);

In [13]:
def MutualInformation(X, y, predictor_cols, target_col, epsilon=10e-10):
    """
    Mutual Information Calculated using Kulback Leibler Divergence
    
    inputs
    ..................
    X: Pandas dataframe of columns
    y: Pandas dataframe of target
    epsilon: small number to prevent division by 0; default:10e-10
    predictor_cols: predictor column names
    target_col: target name
    output
    ...................
    Pandas dataframe of column combinations and KL divergence score
    """
    def KL_Divergence(p, q, epsilon=epsilon):
        p = (p + epsilon)/np.sum(p + epsilon)

        q = (q + epsilon)/np.sum(q + epsilon)
        return np.sum(p*np.log(p/q))

    X = X.values; y = y.values
    KL_list = {}
    for i, col in enumerate(predictor_cols):
        array_index = col + '||' + target_col
        p = X[:, i]
        q = y
        KL_list[array_index] = KL_Divergence(p, q, epsilon)

    sorted_KL_list = sorted(KL_list.items(), key=lambda x: x[1], reverse=True)

    return pd.DataFrame(sorted_KL_list, columns=['Features', 'MI-Score (nats)'])


In [14]:
MutualInformation(df.iloc[:, :-1], df.iloc[:,-1], predictors, target)

BloodPressure||Outcome


Unnamed: 0,Features,MI-Score (nats)
0,BloodPressure||Outcome,12.320437
1,SkinThickness||Outcome,12.280656
2,BMI||Outcome,11.763647
3,Age||Outcome,11.664003
4,Insulin||Outcome,11.480395
5,DiabetesPedigreeFunction||Outcome,11.441139
6,Glucose||Outcome,11.258261
7,Pregnancies||Outcome,10.923133
