In [1]:
import numpy as np
import pandas as pd

In [2]:
datafile = "../../data/processed/bc.csv"
df = pd.read_csv(datafile, index_col="Sample")

In [3]:
df.columns

Index(['Clump Thickness', 'Cell Size Uniformity', 'Cell Shape Uniformity',
       'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei',
       'Bland Chromatin', 'Normal Nuclei', 'Mitoses', 'Class'],
      dtype='object')

In [4]:
df.dtypes

Clump Thickness                int64
Cell Size Uniformity           int64
Cell Shape Uniformity          int64
Marginal Adhesion              int64
Single Epithelial Cell Size    int64
Bare Nuclei                    int64
Bland Chromatin                int64
Normal Nuclei                  int64
Mitoses                        int64
Class                          int64
dtype: object

In [5]:
y = df['Class']
x = df[df.columns[:-1]]

In [6]:
y = pd.get_dummies(y)

In [7]:
def entropy(p_col):
    """Calculate the entropy of a column of labels
    
    The column should be a 2-class classification output, with
    the label `1` if it is a member of the class or `0` if it
    is not.
    
    Parameters
    ----------
    p_col : numpy.ndarray
        Array of one-hot labels. Can optionally be a pandas.Series
    
    Returns
    -------
    float
        Entropy of the array
    """
    if p_col.shape[0] == 0:
        p = 0.0
    else:
        p = np.sum(p_col) / p_col.shape[0]
    if p == 0.0:
        fp = 0.0
    else:
        fp = -p * np.log(p) / np.log(2)
    if 1-p == 0.0:
        sp = 0.0
    else:
        sp = (1 - p) * np.log(1 - p) / np.log(2)
    return fp - sp

In [8]:
assert entropy(y[2]) == entropy(y[4])

In [9]:
def gini(p_col):
    if p_col.shape[0] == 0:
        p = 0.0
    else:
        p = np.sum(p_col) / p_col.shape[0]
    return 2 * p * (1 - p)

In [10]:
assert gini(y[2]) == gini(y[4])

In [11]:
def misclassification(p_col):
    if p_col.shape[0] == 0:
        p = 0.0
    else:
        p = np.sum(p_col) / p_col.shape[0]
    return 1 - np.max([p, 1-p])

In [12]:
assert misclassification(y[2]) == misclassification(y[4])

In [13]:
df.head()

Unnamed: 0_level_0,Clump Thickness,Cell Size Uniformity,Cell Shape Uniformity,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nuclei,Mitoses,Class
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1000025,5,1,1,1,2,1,3,1,1,2
1002945,5,4,4,5,7,10,3,2,1,2
1015425,3,1,1,1,2,2,3,1,1,2
1016277,6,8,8,1,3,4,3,7,1,2
1017023,4,1,1,3,2,1,3,1,1,2


In [14]:
np.where(x['Bare Nuclei'] % 1 != 0.0)

(array([], dtype=int64),)

In [15]:
x['Clump Thickness'].unique()

array([ 5,  3,  6,  4,  8,  1,  2,  7, 10,  9])

In [16]:
def split_attribute(x, y, impurity=entropy):
    min_entropy = np.finfo(np.float64).max # 1e308 on test system
    columns = x.columns
    xt = x.copy()
    xt['Labels'] = y
    split_column = None
    split_value = None
    for c in x.columns:
        for v in x[c].unique():
            left_split = xt[xt[c] <= v]
            right_split = xt[xt[c] > v]
            left_impurity = impurity(left_split['Labels'])
            right_impurity = impurity(right_split['Labels'])
            e = left_impurity + right_impurity
            if e < min_entropy:
                min_entropy = e
                split_column = c
                split_value = v
    return split_column, split_value

In [17]:
split_attribute(x, y[2], impurity=misclassification)

('Cell Size Uniformity', 3)

In [18]:
split_attribute(x, y[2], impurity=gini)

('Cell Size Uniformity', 3)

In [19]:
split_attribute(x, y[2], impurity=entropy)

('Cell Size Uniformity', 4)