# Feature Subset Evaluation using Divergence Measures

## 1. Introduction
This notebook evaluates the newly generated feature subset using various divergence measures. These measures assess the difference between class-conditional probability densities, which helps determine the goodness of the feature subset.

## 2. Data Loading and Overview

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import entropy
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score

file_path = '../data/processed_data.csv'
data = pd.read_csv(file_path)

data.head()

Unnamed: 0,DX_GROUP,AGE_AT_SCAN,SEX,FIQ,VIQ,PIQ,EYE_STATUS_AT_SCAN,HANDEDNESS_L,HANDEDNESS_R,CURRENT_MED_STATUS_1
0,1,24.45,1,124.0,128.0,115.0,2,0,1,1.0
1,1,19.09,1,113.0,108.0,117.0,2,0,1,0.0
2,1,13.73,2,119.0,117.0,118.0,2,0,1,1.0
3,1,13.37,1,109.0,99.0,119.0,2,1,0,0.0
4,1,17.78,1,110.0,106.0,112.0,2,0,1,0.0


In [2]:
data.info()
data.describe(include='all')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 479 entries, 0 to 478
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   DX_GROUP              479 non-null    int64  
 1   AGE_AT_SCAN           479 non-null    float64
 2   SEX                   479 non-null    int64  
 3   FIQ                   479 non-null    float64
 4   VIQ                   479 non-null    float64
 5   PIQ                   479 non-null    float64
 6   EYE_STATUS_AT_SCAN    479 non-null    int64  
 7   HANDEDNESS_L          479 non-null    int64  
 8   HANDEDNESS_R          479 non-null    int64  
 9   CURRENT_MED_STATUS_1  479 non-null    float64
dtypes: float64(5), int64(5)
memory usage: 37.6 KB


Unnamed: 0,DX_GROUP,AGE_AT_SCAN,SEX,FIQ,VIQ,PIQ,EYE_STATUS_AT_SCAN,HANDEDNESS_L,HANDEDNESS_R,CURRENT_MED_STATUS_1
count,479.0,479.0,479.0,479.0,479.0,479.0,479.0,479.0,479.0,479.0
mean,1.538622,16.081138,1.175365,109.601253,109.931106,107.697286,1.240084,0.058455,0.580376,0.141962
std,0.499027,5.766685,0.380677,13.76091,14.536401,14.766318,0.427581,0.234847,0.494013,0.349376
min,1.0,6.47,1.0,72.0,73.0,59.0,1.0,0.0,0.0,0.0
25%,1.0,12.365,1.0,100.0,99.0,99.0,1.0,0.0,0.0,0.0
50%,2.0,14.7,1.0,109.0,109.0,108.0,1.0,0.0,1.0,0.0
75%,2.0,18.595,1.0,119.0,119.0,118.0,1.0,0.0,1.0,0.0
max,2.0,39.1,2.0,148.0,180.0,155.0,2.0,1.0,1.0,1.0


In [3]:
X = data.drop(columns=['DX_GROUP'])
y = data['DX_GROUP']

X_encoded = pd.get_dummies(X, drop_first=True)

In [4]:
from scipy.stats import entropy

def kl_divergence(p, q):
    return entropy(p, q)

p = np.random.rand(100)
q = np.random.rand(100)
p /= p.sum()
q /= q.sum()
kl_div = kl_divergence(p, q)
print(f"KL Divergence: {kl_div}")

KL Divergence: 0.5198352089679386


In [5]:
def bhattacharyya_distance(p, q):
    return -np.log(np.sum(np.sqrt(p * q)))

bhat_dist = bhattacharyya_distance(p, q)
print(f"Bhattacharyya Distance: {bhat_dist}")

Bhattacharyya Distance: 0.11757255337875054


In [6]:
def jeffreys_divergence(p, q):
    return np.sum((p - q) * (np.log2(p / q) - np.log2(q / p)))

jeff_div = jeffreys_divergence(p, q)
print(f"Jeffreys Divergence: {jeff_div}")

Jeffreys Divergence: 2.8817703784606685


In [7]:
def matusita_distance(p, q):
    return np.sqrt(np.sum((np.sqrt(p) - np.sqrt(q))**2))

matusita_dist = matusita_distance(p, q)
print(f"Matusita Distance: {matusita_dist}")

Matusita Distance: 0.4710074225778294


In [8]:
def kagan_divergence(p, q):
    return 0.5 * np.sum(((p - q)**2) / p)

kagan_div = kagan_divergence(p, q)
print(f"Kagan's Divergence: {kagan_div}")

Kagan's Divergence: 1.2682614888721016


In [11]:
import numpy as np
from scipy.stats import entropy
import math

# Separate FIQ values based on DX_GROUP
group_1_fiq = data[data['DX_GROUP'] == 1]['FIQ'].dropna()
group_2_fiq = data[data['DX_GROUP'] == 2]['FIQ'].dropna()

# Create histograms to estimate probability distributions
hist_1, bin_edges_1 = np.histogram(group_1_fiq, bins=10, density=True)
hist_2, bin_edges_2 = np.histogram(group_2_fiq, bins=10, density=True)

# Normalize histograms to ensure they sum to 1 (probability distributions)
hist_1 = hist_1 / np.sum(hist_1)
hist_2 = hist_2 / np.sum(hist_2)

# Define functions to calculate the required divergences and distances
def kl_divergence(p, q):
    return entropy(p, q)

def bhattacharyya_distance(p, q):
    return -np.log(np.sum(np.sqrt(p * q)))

def jeffreys_divergence(p, q):
    return np.sum((p - q) * np.log(p / q))

def matusita_distance(p, q):
    return np.sqrt(np.sum((np.sqrt(p) - np.sqrt(q))**2))

def kagan_divergence(p, q):
    return np.sum(p * np.log(p / q)) + np.sum(q * np.log(q / p))

# Calculate the divergences and distances
kl_div = kl_divergence(hist_1, hist_2)
bhattacharyya_dist = bhattacharyya_distance(hist_1, hist_2)
jeffreys_div = jeffreys_divergence(hist_1, hist_2)
matusita_dist = matusita_distance(hist_1, hist_2)
kagan_div = kagan_divergence(hist_1, hist_2)

# Create a dataframe to display the results
results = pd.DataFrame({
    'Measure': ['KL Divergence', 'Bhattacharyya Distance', 'Jeffreys Divergence', 'Matusita Distance', 'Kagan Divergence'],
    'Value': [kl_div, bhattacharyya_dist, jeffreys_div, matusita_dist, kagan_div]
})

results


Unnamed: 0,Measure,Value
0,KL Divergence,0.076803
1,Bhattacharyya Distance,0.0185
2,Jeffreys Divergence,0.148091
3,Matusita Distance,0.191465
4,Kagan Divergence,0.148091
