In [12]:
import numpy as np
import pandas as pd
from scipy.stats import mode

In [13]:
datafile = "../../data/processed/bc.csv"
df = pd.read_csv(datafile, index_col='Sample')
cols = df.columns
x = df[cols[:-1]]
y = pd.get_dummies(df[cols[-1]])[2]

In [14]:
x.head()

Unnamed: 0_level_0,Clump Thickness,Cell Size Uniformity,Cell Shape Uniformity,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nuclei,Mitoses
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1000025,5,1,1,1,2,1,3,1,1
1002945,5,4,4,5,7,10,3,2,1
1015425,3,1,1,1,2,2,3,1,1
1016277,6,8,8,1,3,4,3,7,1
1017023,4,1,1,3,2,1,3,1,1


In [15]:
y.head()

Sample
1000025    1
1002945    1
1015425    1
1016277    1
1017023    1
Name: 2, dtype: uint8

In [16]:
y.value_counts()

1    444
0    239
Name: 2, dtype: int64

In [17]:
cc = [x[c].value_counts() for c in x.columns]
for _ in cc:
    print(_)

1     139
5     128
3     104
4      79
10     69
2      50
8      44
6      33
7      23
9      14
Name: Clump Thickness, dtype: int64
1     373
10     67
3      52
2      45
4      38
5      30
8      28
6      25
7      19
9       6
Name: Cell Size Uniformity, dtype: int64
1     346
10     58
2      58
3      53
4      43
5      32
7      30
6      29
8      27
9       7
Name: Cell Shape Uniformity, dtype: int64
1     393
3      58
2      58
10     55
4      33
8      25
5      23
6      21
7      13
9       4
Name: Marginal Adhesion, dtype: int64
2     376
3      71
4      48
1      44
6      40
5      39
10     31
8      21
7      11
9       2
Name: Single Epithelial Cell Size, dtype: int64
1     402
10    132
5      30
2      30
3      28
8      21
4      19
9       9
7       8
6       4
Name: Bare Nuclei, dtype: int64
3     161
2     160
1     150
7      71
4      39
5      34
8      28
10     20
9      11
6       9
Name: Bland Chromatin, dtype: int64
1     432
10     60
3      

In [18]:
def entropy(p_col):
    """Calculate the entropy of a column of labels
    
    The column should be a 2-class classification output, with
    the label `1` if it is a member of the class or `0` if it
    is not.
    
    Parameters
    ----------
    p_col : numpy.ndarray
        Array of one-hot labels. Can optionally be a pandas.Series
    
    Returns
    -------
    float
        Entropy of the array
    """
    if p_col.shape[0] == 0:
        p = 0.0
    else:
        p = np.sum(p_col) / p_col.shape[0]
    if p == 0.0:
        fp = 0.0
    else:
        fp = -p * np.log(p) / np.log(2)
    if 1-p == 0.0:
        sp = 0.0
    else:
        sp = (1 - p) * np.log(1 - p) / np.log(2)
    return fp - sp

In [19]:
def split_attribute(x, y, impurity=entropy):
    min_entropy = np.finfo(np.float64).max # 1e308 on test system
    columns = x.columns
    xt = x.copy()
    xt['Labels'] = y
    split_column = None
    split_value = None
    for c in x.columns:
        for v in x[c].unique():
            left_split = xt[xt[c] <= v]
            right_split = xt[xt[c] > v]
            left_impurity = impurity(left_split['Labels'])
            right_impurity = impurity(right_split['Labels'])
            e = left_impurity + right_impurity
            if e < min_entropy:
                min_entropy = e
                split_column = c
                split_value = v
    return split_column, split_value

In [20]:
max_leaves = 10
min_impurity = 0.10

In [132]:
class Node():
    def __init__(self, impurity=entropy, min_impurity=0.1, depth=0, max_depth=4):
        self.impurity = impurity
        self.attribute_ = None
        self.value_ = None
        self.min_impurity = min_impurity
        self.depth = depth
        self.max_depth = max_depth
        self.left_ = None
        self.right_ = None
        self.is_leaf_ = False
        self.label_ = None
        self.confidence_ = None
        self.training_impurity_ = None
    
    def fit(self, data):
        column_names = data.columns
        x = data[column_names[:-1]]
        y = data[column_names[-1]]
        # Calculate impurity
        impurity = self.impurity(y)
        self.training_impurity_ = impurity
        # Decide if I should split
        if impurity > self.min_impurity and self.depth < self.max_depth:
            # Calculate the split
            self.attribute_, self.value_ = split_attribute(x, y, impurity=self.impurity)
            self.left_ = Node(impurity=self.impurity, min_impurity=self.min_impurity,
                              depth=self.depth+1, max_depth=self.max_depth)
            self.right_ = Node(impurity=self.impurity, min_impurity=self.min_impurity,
                               depth=self.depth+1, max_depth=self.max_depth)
            left_data = data[data[self.attribute_] <= self.value_]
            right_data = data[data[self.attribute_] > self.value_]
            self.left_.fit(left_data)
            self.right_.fit(right_data)
        else:
            self.is_leaf_ = True
            self.label_ = mode(y)
            self.confidence_ = self.impurity(y)
    
    def predict(self, x):
        if self.is_leaf_:
            return self.label_.mode
        else:
            # Determine left or right
            if x[self.attribute_] <= self.value_:
                a = self.left_.predict(x)
                return a
            else:
                a = self.right_.predict(x)
                return a
    
    def print_node(self):
        if self.is_leaf_:
            return self.__repr__()
        else:
            return {self.__repr__():
                       [
                           self.left_.print_node(),
                           self.right_.print_node()
                       ]
                   }
    
    def html_print(self):
        s = "<table border=1 style=\"text-align:center\"><tr style=\"text-align:center\">"
        if self.is_leaf_:
            s += "<td style=\"text-align:center\" bgcolor=\"green\">"
            s += self.__repr__()
            s += "</td>"
        else:
            s += "<td colspan=2 style=\"text-align:center\">"
            s += self.__repr__()
            s += "</td>"
            s += "</tr><tr>"
            s += "<td style=\"text-align:center\" width=50%>"
            s += self.left_.html_print()
            s += "</td><td style=\"text-align:center\" width=50%>"
            s += self.right_.html_print()
            s += "</td>"
        s += "</tr></table>"
        return s
    
    def __repr__(self):
        if self.is_leaf_:
            return f"Leaf with impurity {self.training_impurity_:0.2f}"
        else:
            return f"Node with impurity {self.training_impurity_:0.2f}, "\
                   f"split on \"{self.attribute_}\" with value {self.value_}"


In [133]:
n = Node()

In [134]:
xt = x.copy()
xt['Label'] = y
n.fit(xt)

In [135]:
n

Node with impurity 0.93, split on "Cell Size Uniformity" with value 4

In [136]:
_ = n
while True:
    if _ is not None:
        print(_)
        _ = _.right_
    else:
        break

Node with impurity 0.93, split on "Cell Size Uniformity" with value 4
Node with impurity 0.13, split on "Clump Thickness" with value 10
Leaf with impurity 0.00


In [137]:
_ = n
while True:
    if _ is not None:
        print(_)
        _ = _.left_
    else:
        break

Node with impurity 0.93, split on "Cell Size Uniformity" with value 4
Node with impurity 0.56, split on "Clump Thickness" with value 8
Node with impurity 0.44, split on "Normal Nuclei" with value 8
Node with impurity 0.37, split on "Cell Shape Uniformity" with value 6
Leaf with impurity 0.36


In [138]:
n.html_print()

'<table border=1 style="text-align:center"><tr style="text-align:center"><td colspan=2 style="text-align:center">Node with impurity 0.93, split on "Cell Size Uniformity" with value 4</td></tr><tr><td style="text-align:center" width=50%><table border=1 style="text-align:center"><tr style="text-align:center"><td colspan=2 style="text-align:center">Node with impurity 0.56, split on "Clump Thickness" with value 8</td></tr><tr><td style="text-align:center" width=50%><table border=1 style="text-align:center"><tr style="text-align:center"><td colspan=2 style="text-align:center">Node with impurity 0.44, split on "Normal Nuclei" with value 8</td></tr><tr><td style="text-align:center" width=50%><table border=1 style="text-align:center"><tr style="text-align:center"><td colspan=2 style="text-align:center">Node with impurity 0.37, split on "Cell Shape Uniformity" with value 6</td></tr><tr><td style="text-align:center" width=50%><table border=1 style="text-align:center"><tr style="text-align:center

In [139]:
n.predict(x=df.iloc[1])

array([1], dtype=uint8)

In [140]:
df.iloc[1]

Clump Thickness                 5
Cell Size Uniformity            4
Cell Shape Uniformity           4
Marginal Adhesion               5
Single Epithelial Cell Size     7
Bare Nuclei                    10
Bland Chromatin                 3
Normal Nuclei                   2
Mitoses                         1
Class                           2
Name: 1002945, dtype: int64

In [141]:
a = n.predict(x=df.iloc[1])

In [142]:
a

array([1], dtype=uint8)

In [144]:
y.iloc[1]

1