In [150]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import math
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer

## Exploratory analysis

In [151]:
data = pd.read_csv("../Data/ACME-HappinessSurvey2020.csv")

- X1: order delivered on time
- X2: contents were as expected
- X3: I ordered everything I wanted to order
- X4: I paid a good price
- X5: I am satisfied with the courier
- X6: the app is easy to order

In [152]:
feature_names = [
    "order delivered on time",
    "contents were as expected",
    "I ordered everything I wanted to order",
    "I paid a good price",
    "I am satisfied with the courier",
    "the app is easy to order"
]

In [153]:
data.head()

Unnamed: 0,Y,X1,X2,X3,X4,X5,X6
0,0,3,3,3,4,2,4
1,0,3,2,3,5,4,3
2,1,5,3,3,3,3,5
3,0,5,4,3,3,3,5
4,0,5,4,3,3,3,5


In [154]:
data["Y"].count()

126

In [155]:
data["Y"].value_counts()

1    69
0    57
Name: Y, dtype: int64

In [156]:
missing_cols = [col for col in data.columns if data[col].isnull().any()]
len(missing_cols)

0

The exploratory analysis concluded that the target variable is nearly evenly split between the two classes; there are only 126 rows; there is no missing data.

All the features are ordinal!

## Mutual Information

In [302]:
y = data["Y"]

In [316]:
feature_names = list(data.columns)
feature_names.remove("Y")
X = data[feature_names]


### Implementation

In [319]:
ct = pd.crosstab(data["Y"], data["X1"])
jp = ct.to_numpy()
jp = jp / len(X)

In [305]:
jp

array([[0.        , 0.00793651, 0.11111111, 0.15873016, 0.17460317],
       [0.00793651, 0.        , 0.04761905, 0.21428571, 0.27777778]])

In [306]:
def joint_entropy(x, y, p):
    sum = 0
    for x in range(x):
        for y in range(y):
            if p[x][y] == 0:
                
                continue
            sum += -(p[x][y] * np.log2((p[x][y])))
    
    return sum

In [309]:
def entropy(x):
    
    sum = 0
    for i in x:
        sum -= i * np.log2(i)
    return sum

In [307]:
h_x_y = joint_entropy(jp.shape[0], jp.shape[1], jp)

In [308]:
xpmf = jp.sum(axis=0)
ypmf = jp.sum(axis=1)

In [310]:
h_x = entropy(xpmf)
h_y = entropy(ypmf)

In [311]:
cond_x_y = h_x_y - h_y
cond_y_x = h_x_y - h_x

In [312]:
i_x_y = h_x - cond_x_y

In [313]:
i_x_y

0.5646106714581822

In [314]:
i_x_y = h_y - cond_y_x

In [315]:
i_x_y

0.5646106714581822

In [197]:
ct = pd.crosstab(data["Y"], data["X1"])
jp = ct.to_numpy()

yn = jp.shape[0]
xn = jp.shape[1]

ys = jp.sum(1)
xs = jp.sum(0)

total = 126
mi = 0
h_x = 0
h_y = 0

for y in range(0, yn):
    for x in range(0, xn):

        p_x = xs[x] / total
        p_y = ys[y] / total
        p_xy = jp[y, x] / total

        fraction = p_xy / ((p_x * p_y) + 10e-8)

        if fraction == 0.0:
            continue
    
        mi += p_xy * np.log(fraction)

print(f"Mutual information is : {mi}")

for x in range(0, xn):
    p_x = xs[x] / total
    h_x += p_x * np.log(p_x)

print(f"Entropy : {h_x}")

for y in range(0, yn):

    p_y = ys[y] / total
    h_y += p_y * np.log(p_y)

print(f"Entropy : {h_y}")

Mutual information is : 0.04970420601280024
Entropy : -1.0362422547383536
Entropy : -0.6886051523183013


In [188]:
jp.sum(0)

array([ 7, 14, 55, 33, 17])

In [194]:
jp

array([[ 1, 12, 24, 20],
       [ 0,  8, 16, 45]])

### Feature Selection

In [342]:
import numpy as np
import pandas as pd


class CategoricalMI:

    def __init__(self, data, target_name, feature_names):

        self.y = data[target_name]
        self.feature_names = feature_names
        self.X = data[feature_names]
        self.X = self.X.astype(str)
        self.y = self.y.astype(str)
        self.nrows = len(self.X)
        self.mis = {}


    def joint_entropy(self, x, y, p):
        sum = 0
        for x in range(x):
            for y in range(y):
                if p[x][y] == 0:

                    continue
                sum += -(p[x][y] * np.log2((p[x][y])))

        return sum

    def entropy(self, x):

        sum = 0
        for i in x:
            sum -= i * np.log2(i)
        return sum

    def creat_joint_pmt(self, fname):

        ct = pd.crosstab(self.y, self.X[fname])
        jp = np.array(ct)
        jp = jp / self.nrows

        return jp

    def run(self):

        mi_scores = []

        for fname in self.feature_names:

            jpmf = self.creat_joint_pmt(fname)
            h_x_y = self.joint_entropy(jpmf.shape[0], jpmf.shape[1], jpmf)

            xpmf = jpmf.sum(axis=0)
            ypmf = jpmf.sum(axis=1)

            h_x = self.entropy(xpmf)
            h_y = self.entropy(ypmf)

            cond_x_y = h_x_y - h_y
            cond_y_x = h_x_y - h_x

            i_x_y = h_x - cond_x_y

            mi_scores.append(i_x_y)
        
        mi_scores = sorted(zip(mi_scores, self.feature_names), key=lambda x : x[0], reverse=True)
        self.mis = {fn: mi for mi, fn in mi_scores}

    def report(self):

        for fname, mi in self.mis.items():
            
            print(f"{fname} mutual information with target : {mi}")


In [343]:
feature_names

['X1', 'X2', 'X3', 'X4', 'X5', 'X6']

In [344]:
cat_mi = CategoricalMI(data, "Y", feature_names)

In [345]:
cat_mi.run()

In [346]:
cat_mi.report()

X1 mutual information with target : 0.6022187855318999
X6 mutual information with target : 0.5646106714581822
X5 mutual information with target : 0.4794815437913742
X4 mutual information with target : 0.39708921405515984
X3 mutual information with target : 0.3322625056186468
X2 mutual information with target : 0.17166808026614677
