In [1]:
from math import log
from sklearn import datasets
from sklearn.cluster import KMeans
from collections import defaultdict, Counter

In [2]:
def iris_clusters(use_columns=None):
    """Cluster the iris dataset with k-means and return labels and clusters.

    Args:
        use_columns: Column indices of ``iris.data`` to use as features.
            ``None`` (the default) uses every column.

    Returns:
        A ``(iris.target, predictions)`` tuple, where ``predictions`` is the
        fitted model's cluster assignment for each row of the selected data.

    Note:
        The number of clusters equals ``len(iris.target_names)``. No
        ``random_state`` is passed to ``KMeans``.
    """
    iris = datasets.load_iris()
    # Restrict the feature matrix to the requested columns, if any.
    features = iris.data if use_columns is None else iris.data[:, use_columns]
    kmeans = KMeans(n_clusters=len(iris.target_names))
    kmeans.fit(features)
    return iris.target, kmeans.predict(features)

def entropy(frequencies, base=2):
    """Return the Shannon entropy of a distribution given as raw counts.

    Args:
        frequencies: Iterable of non-negative counts (for example
            ``Counter(...).values()``). They are normalized by their sum, so
            they need not add up to 1. One-shot iterables are accepted.
        base: Logarithm base; 2 (the default) yields bits.

    Returns:
        The entropy ``-sum(p * log(p, base))`` over the normalized counts.
        Zero counts contribute nothing (the ``0 * log 0 = 0`` convention),
        and an empty or all-zero input yields ``0.0``.
    """
    # Materialize once: supports generators and lets the total be computed a
    # single time instead of twice per element.
    counts = list(frequencies)
    total = sum(counts)
    if not total:
        return 0.0
    # Skip zero counts, for which log() would raise ValueError.
    probabilities = (count / total for count in counts if count)
    return -1 * sum(p * log(p, base) for p in probabilities)

def conditional_entropy(y, x, base=2):
    """Compute the conditional entropy H(Y|X) from paired samples.

    The samples of ``y`` are grouped by the value of ``x`` they are paired
    with; the result is the average of each group's entropy, weighted by the
    fraction of paired samples that fall in that group.

    Args:
        y: Iterable of hashable outcomes whose uncertainty is measured.
        x: Iterable of hashable conditioning values, paired with ``y``
            position by position.
        base: Logarithm base passed to ``entropy``; 2 (the default) yields
            bits.

    Returns:
        The weighted average entropy of ``y`` within each ``x`` group, or 0
        when there are no paired samples.
    """
    groups = defaultdict(list)
    paired = 0
    for y_value, x_value in zip(y, x):
        groups[x_value].append(y_value)
        paired += 1
    # Normalize by the number of samples actually grouped so the weights sum
    # to 1 even if zip() truncated one of the inputs.
    return sum(
        len(members) / paired * entropy(Counter(members).values(), base)
        for members in groups.values()
    )

def information_gain(p, q, base=2):
    """Return the information gained about ``p`` by knowing ``q``.

    Computed as H(P) - H(P|Q). The quantity is symmetric, so it is the same
    as ``entropy(Counter(q).values()) - conditional_entropy(q, p)``.

    Args:
        p: Iterable of hashable labels.
        q: Iterable of hashable labels paired with ``p`` position by position.
        base: Logarithm base; 2 (the default) yields bits.

    Returns:
        The entropy of ``p`` minus the conditional entropy of ``p`` given
        ``q``.
    """
    marginal = entropy(Counter(p).values(), base)
    remaining = conditional_entropy(p, q, base)
    return marginal - remaining

# Cluster on progressively larger prefixes of the feature columns (column 0
# alone up to columns 0 through 3) and report how well the clusters explain
# the true labels.
column_subsets = [[0], [0, 1], [0, 1, 2], [0, 1, 2, 3]]
report_format = 'Using %d columns in feature vector, conditional entropy = %.2f bits, information gain = %.2f bits'
for use_columns in column_subsets:
    actuals, modeled = iris_clusters(use_columns)
    report_values = (
        len(use_columns),
        conditional_entropy(actuals, modeled),
        information_gain(actuals, modeled),
    )
    print(report_format % report_values)

Using 1 columns in feature vector, conditional entropy = 0.99 bits, information gain = 0.60 bits
Using 2 columns in feature vector, conditional entropy = 0.56 bits, information gain = 1.02 bits
Using 3 columns in feature vector, conditional entropy = 0.44 bits, information gain = 1.14 bits
Using 4 columns in feature vector, conditional entropy = 0.39 bits, information gain = 1.19 bits
