In [1]:
#An implementation of Algorithm

In [2]:
import pandas as pd
import numpy as np
import heapq
import math

In [3]:
# Read in the dataset
df = pd.DataFrame(pd.read_csv('../data/compas-binary.csv'))

In [4]:
x = df.as_matrix()[:,:13]

y = df.as_matrix()[:,13]

In [5]:
# Association Rule Mining (Only one feature)

#support
#supp = [(x[:,i]*y).mean() for i in range(13)]
#supp

In [6]:
#confidence
conf1 = [sum(x[:,i]*y)/sum(x[:,i]) for i in range(13)]
conf1

[0.3667168674698795,
 0.592274678111588,
 0.5300462249614792,
 0.5184782608695652,
 0.45773618016964024,
 0.32459016393442625,
 0.45069360675512665,
 0.44644229291532195,
 0.4233735747820255,
 0.4932330827067669,
 0.28986197049024276,
 0.37864823348694315,
 0.6614535418583257]

In [7]:
#confidence
conf0 = [sum((x[:,i]==0)*y)/sum((x[:,i]==0)) for i in range(13)]
conf0

[0.48557089084065247,
 0.4481314432989691,
 0.45573665707893896,
 0.45415065976281943,
 0.4676032110091743,
 0.4923509759099701,
 0.7527272727272727,
 0.7275,
 0.711558854718982,
 0.45544199390353235,
 0.5382854764877236,
 0.4822479928635147,
 0.37143460807099093]

In [8]:
#x_idx = [conf1[i]>0.5 or conf0[i]>0.5 for i in range(len(conf1))]
"""
Because Using both conf1 and conf0 would select out too many features, 
which is hard for the algorithm to run out,
we just use conf1 to select out a small fraction of feature.
"""
x_idx = [conf1[i]>0.5 for i in range(len(conf1))]
x_idx[0] = True # in the CORELS paper, gender is an important feature, so I add it manually
x_idx

[True,
 True,
 True,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True]

In [9]:
#select out these features
x = x[:,x_idx]

In [10]:
nrule = x.shape[1]
ndata = len(y)

In [11]:
"""
calculate z, which is for the equivalent points bound
z is the vector defined in algorithm 5 of the CORELS paper
z is a binary vector indicating the data with a minority lable in its equivalent set
"""
z = pd.DataFrame([-1]*ndata).as_matrix()
# enumerate through theses samples
for i in range(ndata):
    #if z[i]==-1, this sample i has not been put into its equivalent set
    if z[i] == -1:
        tag1 = np.array([True]*ndata)
        for j in range(nrule):
            rule_label = x[i][j]
            #tag1 indicates which samples have exactly the same features with sample i
            tag1 = (x[:,j] == rule_label)*tag1
            
        y_l = y[tag1]
        pred = int(y_l.sum()/len(y_l) > 0.5)
        #tag2 indicates the samples in a equiv set which have the minority label
        tag2 = (y_l != pred)
        z[tag1,0] = tag2

z

array([[0],
       [1],
       [0],
       ...,
       [0],
       [0],
       [1]])

In [12]:
def calcul(prefix, x, y):
    """
    Function for calculating the predictions, number of data captured,
    number of data incorrectly captured by the leaves, and b0 (defined in (28) in the CORELS paper).
    """
    prediction = []
    num_captured = []
    num_captured_incorrect = []
    B0 = [] # b0 is defined in (28)
    for i in range(len(prefix)):
        tag = np.array([True]*ndata)
        for j in range(len(prefix[i])):
            rule_index = abs(prefix[i][j])-1
            rule_label = int(prefix[i][j]>0)
            tag = (x[:,rule_index] == rule_label)*tag
            
        # the y's of these data captured by leaf prefix[i]
        y_leaf = y[tag]
        
        #b0 is defined in (28)
        b0 = tag.dot(z)[0]/ndata
        B0.append(b0)
        
        num_cap = len(y_leaf)
        num_captured.append(num_cap)
        
        if len(y_leaf)>0:
            pred = int(y_leaf.sum()/len(y_leaf) > 0.5)
            prediction.append(pred)
            num_cap_incor = sum(y_leaf != pred)
            num_captured_incorrect.append(num_cap_incor)
        else:
            prediction.append(0)
            num_captured_incorrect.append(0)
        
    return prediction, num_captured, num_captured_incorrect, B0

In [19]:
class CacheTree:
    """
    A tree data structure.
    prefix: a 2-d tuple to encode the leaves
    prediction: a list to record the predictions of leaves
    num_captured: a list to record number of data captured by the leaves
    num_captured_incorrect: a list to record number of data incorrectly captured by the leaves
    """
    def __init__(self, x, y, prefix,
                 lamb, prior_metric = None,
                 prediction=None,
                 num_captured=None,
                 num_captured_incorrect=None,
                 deadleaf = None,
                 splitleaf = None,
                 lbound=None,
                 B0 = None):
        self.prefix = prefix
        self.prediction = prediction
        self.num_captured = num_captured
        self.num_captured_incorrect = num_captured_incorrect
        self.deadleaf = deadleaf #a list indicate which leaves will never be split (support bound)
        self.splitleaf = splitleaf #a queue of list indicating which leaves will be split
        self.lbound = lbound #a list of lower bound
        self.B0 = B0 # a list of b0
        
        ndata = len(y)
        l = len(prefix)
        if prediction==None:
            self.prediction, self.num_captured, self.num_captured_incorrect, self.B0 = calcul(self.prefix, x, y)
            self.deadleaf = [0]*l
            self.splitleaf = [[1]*l]
            self.lbound = [sum(self.num_captured_incorrect[:i]+self.num_captured_incorrect[i+1:])/ndata + lamb*l 
                           for i in range(l)]
        
        # which metrics to use for the priority queue
        if (prior_metric=="curiosity"):
            self.curiosity = min([self.lbound[i]/((ndata-self.num_captured[i])/len(y)) 
                                  for i in range(l) if self.splitleaf[0][i]==1])
        elif (prior_metric=="bound"):
            self.curiosity = min([self.lbound[i]
                                  for i in range(l) if self.splitleaf[0][i]==1])
        elif (prior_metric=="entropy"): 
            self.p = [self.num_captured_incorrect[i]/self.num_captured[i] 
                      if self.num_captured[i]!=0 else 0 for i in range(l)]
            self.entropy = [(-self.p[i]*math.log2(self.p[i])-(1-self.p[i])*math.log2(1-self.p[i]))*self.num_captured[i] 
                            if self.p[i]!=0 and self.p[i]!=1 else 0 for i in range(l)]
            self.curiosity = min([sum(self.entropy[:i]+self.entropy[i+1:])/(ndata-self.num_captured[i]) for i in range(l)])
        elif (prior_metric=="gini"):
            self.p = [self.num_captured_incorrect[i]/self.num_captured[i] 
                      if self.num_captured[i]!=0 else 0 for i in range(l)]
            self.giniindex = [(2*self.p[i]*(1-self.p[i]))*self.num_captured[i] for i in range(l)]
            self.curiosity = min([sum(self.giniindex[:i]+self.giniindex[i+1:])/(ndata-self.num_captured[i]) for i in range(l)])
        else:
            self.curiosity = 0

            
    def get_prefix(self):
        # return the encoded tree
        return self.prefix
    
    def get_pred(self):
        # return a list of length len(prefix)
        # the predictions of all leaves
        return self.prediction
    
    def get_cap(self):
        # return a list of length len(prefix)
        # the number of captured points of all leaves
        return self.num_captured
    
    def get_ncc(self):
        # return a list of length len(prefix)
        # the number of incorrectly captured points of all leaves
        return self.num_captured_incorrect
    
    def get_deadleaf(self):
        # return a list of length len(prefix)
        # indicating whether or not the leaf is dead (because of the support bound)
        return self.deadleaf
    
    def get_splitleaf(self):
        # return a queue of lists of length len(prefix)
        # indicating whether or not the leaf will be split
        return self.splitleaf
    
    def set_deadleaf(self,i):
        # set leaf i to be dead
        self.deadleaf[i] = 1
        return
    
        
    def get_lbound(self):
        # return a list of length len(prefix)
        # the lower bound of the tree with leaf i as d0, the rest as dp
        return self.lbound
    
    
    def get_curiosity(self):
        # return the curiosity (to be used as metrics in priority queue)
        return self.curiosity
    
    def get_B0(self):
        # return a list of length len(prefix)
        # b0
        return self.B0
    
    def __lt__(self, other):
        # define <, which will be used in the priority queue
        return self.curiosity<other.curiosity

In [20]:
class Eliminate:
    """
    A data structure to record and identify
    whether a tree has been visited/pruned
    """
    def __init__(self, elim_dict = None, 
                 eliminated = None):
        self.elim_dict = {} # record these trees we have visited
        
    def eliminate(self, prefix):
        self.elim_dict[tuple(sorted(prefix))] = 1
        
    def is_duplicated(self, prefix):
        # if a tree is in the self.elim_dict, then we have already visited it
        if tuple(sorted(prefix)) in self.elim_dict.keys():
            #print("Eliminated!")
            return True
        return False

In [21]:
def Risk(tree,ndata,lamb):
    return tree.get_lbound()[0]+(tree.get_ncc()[0])/ndata+lamb*len(tree.get_prefix())

In [29]:
def bbound(x, y, lamb, prior_metric = None, MAXDEPTH = 4):
    """
    An implementation of Algorithm
    ## one copy of tree
    ## mark which leaves to split
    """
    
    d_c = None # the tree with the smallest risk
    R_c = 1 # the smallest risk

    nrule = x.shape[1]
    ndata = len(y)
    print("nrule:", nrule)
    print("ndata:", ndata)

    # initialize the queue to include all trees of just one split
    queue = []
    for r in range(1, nrule+1):
        tree0 = CacheTree(prefix = ((-r,),(r,)), x = x, y = y, lamb=lamb, prior_metric=prior_metric)
        heapq.heappush(queue, (tree0.get_curiosity(),tree0))
        "queue.append(tree0)"
        R = Risk(tree0,ndata,lamb)
        if R<R_c:
            d_c = tree0.get_prefix()
            R_c = R
    
    E = Eliminate()
    
    COUNT = 0 #count the total number of trees in the queue
    while (queue):
        "tree = queue.pop(0)"
        (curio, tree)=heapq.heappop(queue)
        d = tree.get_prefix()
        
        COUNT = COUNT+1
        #print("=======COUNT=======",COUNT)
        #print("d",d)
        #print("R",tree.get_lbound()[0]+(tree.get_ncc()[0])/len(y))
        
        # if we have visited this tree or it has been pruned
        if E.is_duplicated(d):
            continue
        else:
            E.eliminate(d)
        
        # enumerate through all the leaves
        for i in range(len(d)):
            # if the leaf is dead, then continue
            if tree.get_deadleaf()[i]==1:
                continue
            
            #(Lower bound on antecedent support)
            # if this bound doesnot hold, set the leaf to be dead, and continue
            if tree.get_cap()[i]/ndata/2 < lamb:
                tree.set_deadleaf(i)
                #print("==============dead==============",i)
                continue
            
            # the leaves we are going to split
            spl = tree.get_splitleaf()
            split_next = spl[1:] # the leaves to be split after this round
            
            # 0 for not split; 1 for split
            if spl[0][i]==0:
                continue

            # we are going to split leaf i, and get 2 new leaves
            # we will add the two new leaves to the end of the list
            splitleaf_list = [split_next[k][:i]+split_next[k][i+1:]+split_next[k][i]*2
                              for k in range(len(split_next))]
            
            d0 = d[i] #d0 is the leaf we are going to split
            dp = d[:i]+d[i+1:] #dp is the rest
            
            
            # Restrict the depth of the tree
            if len(d0)>=MAXDEPTH:
                continue
            
            
            lb = tree.get_lbound()[i] # the lower bound 
            b0 = tree.get_B0()[i] # the b0 defined in (28) of the paper
            
            #The equivalent points bound+look ahead bound and the hierarchical objective lower bound
            if lb+b0+lamb>=R_c or lb>=R_c:
                # split the leaf d0 with feature j
                for j in range(1, nrule+1):
                    if (j not in d0)and(-j not in d0):
                        # split leaf d0 with feature j, and get 2 leaves l1 and l2
                        l1 = d0+(-j,)
                        l2 = d0+(j,)
                        t = dp+(l1, l2) # t is the new tree
                        #print("t",t)
                        
                        # if tree t is duplicated, continue
                        if E.is_duplicated(t):
                            continue
                        
                        # for the two new leaves, calculate their predictions, 
                        # num of data captured, num of data incorrectly captured, and b0
                        pred_l, cap_l, incorr_l, B0_l = calcul((l1,l2),x,y)
                        
                        # calculate the bounds for each leaves in the new tree
                        loss_l1 = (incorr_l[0])/len(y)
                        loss_l2 = (incorr_l[1])/len(y)
                        loss_d0 = tree.get_ncc()[i]/len(y)
                        delta = loss_l1+loss_l2-loss_d0+lamb
                        old_lbound = tree.get_lbound()[:i]+tree.get_lbound()[i+1:]
                        new_lbound = [b+delta for b in old_lbound]+[tree.get_lbound()[i]+loss_l2+lamb,tree.get_lbound()[i]+loss_l1+lamb]
                        
                        #binary vector indicating split or not
                        splitleaf1 = [1]*(len(t)) #all leaves labeled as to be split
                        splitleaf2 = [0]*(len(t)-2)+[1,1] #l1,l2 labeled as to be split
                        splitleaf3 = [1]*(len(t)-2)+[0,0] #dp labeled as to be split
                        
                        splitleaf_array = np.array(splitleaf_list)
                        
                        #(Lower bound on accurate antecedent support)
                        a_l = (sum(cap_l)-sum(incorr_l))/ndata - sum(cap_l)/ndata/2
                        
                        if a_l < lamb:
                        # if the bound doesn't hold, we need to split the leaf l1/l2 further
                            
                            if len(splitleaf_list)>0:
                                split_l1_l2 = splitleaf_array[:,-1].sum()+splitleaf_array[:,-2].sum()
                                
                                # if dp will have been split
                                if splitleaf_array.sum()-split_l1_l2>0:
                                    
                                    # if l1/l2 will have been split
                                    if split_l1_l2>0:
                                        sl = splitleaf_list+[splitleaf1]
                                    
                                    # if l1/l2 will not have been split, we need to split l1/l2
                                    else:
                                        sl = splitleaf_list+[splitleaf2]
                                
                                # and we need to split leaves in dp, if dp will not have been split
                                else:
                                    
                                    # if l1/l2 will have been split
                                    if split_l1_l2>0:
                                        sl = splitleaf_list+[splitleaf3]
                                    
                                    # if l1/l2 will not have been split, we need to split l1/l2
                                    else:
                                        sl = splitleaf_list+[splitleaf2]+[splitleaf3]
                                
                                
                            
                        else:
                            
                            if len(splitleaf_list)>0:
                                split_l1_l2 = splitleaf_array[:,-1].sum()+splitleaf_array[:,-2].sum()
                                
                                # if dp will have been split
                                if splitleaf_array.sum()-split_l1_l2>0:
                                    sl = splitleaf_list+[splitleaf1]
                                
                                # and we need to split leaves in dp, if dp will not have been split
                                else:
                                    sl = splitleaf_list+[splitleaf3]
                                    
                        
                        #construct the new tree
                        tree_new = CacheTree(x = x, y = y, prefix = t,
                                             prediction = tree.get_pred()[:i]+tree.get_pred()[i+1:]+pred_l,
                                             num_captured = tree.get_cap()[:i]+tree.get_cap()[i+1:]+cap_l,
                                             num_captured_incorrect = tree.get_ncc()[:i]+tree.get_ncc()[i+1:]+incorr_l,
                                             deadleaf = tree.get_deadleaf()[:i]+tree.get_deadleaf()[i+1:]+[0,0],
                                             splitleaf = sl,
                                             lbound = new_lbound,
                                             B0 = tree.get_B0()[:i]+tree.get_B0()[i+1:]+B0_l,
                                             lamb = lamb,
                                             prior_metric=prior_metric
                                            )
                       
                        "queue.append(tree_new)"
                        heapq.heappush(queue, (tree_new.get_curiosity(),tree_new))
                        R = Risk(tree_new,ndata,lamb)
                        if R<R_c:
                            d_c = t
                            R_c = R
                        
            else:
                # split the leaf d0 with feature j
                for j in range(1, nrule+1):
                    if (j not in d0)and(-j not in d0):
                        # split leaf d0 with feature j, and get 2 leaves l1 and l2
                        l1 = d0+(-j,)
                        l2 = d0+(j,)
                        t = dp+(l1, l2) # t is the new tree
                        #print("t",t)
                        
                        # if tree t is duplicated, continue
                        if E.is_duplicated(t):
                            continue
                        
                        # for the two new leaves, calculate their predictions, 
                        # num of data captured, num of data incorrectly captured, and b0
                        pred_l, cap_l, incorr_l, B0_l = calcul((l1,l2),x,y)
                        
                        # calculate the bounds for each leaves in the new tree
                        loss_l1 = (incorr_l[0])/len(y)
                        loss_l2 = (incorr_l[1])/len(y)
                        loss_d0 = tree.get_ncc()[i]/len(y)
                        delta = loss_l1+loss_l2-loss_d0+lamb
                        old_lbound = tree.get_lbound()[:i]+tree.get_lbound()[i+1:]
                        new_lbound = [b+delta for b in old_lbound]+[tree.get_lbound()[i]+loss_l2+lamb,tree.get_lbound()[i]+loss_l1+lamb]
                        
                        #binary vector indicating split or not
                        splitleaf1 = [1]*len(t) #all leaves labeled as to be split
                        splitleaf2 = [0]*(len(t)-2)+[1,1] #l1,l2 labeled as to be split
                        
                        splitleaf_array = np.array(splitleaf_list)
                        
                        #(Lower bound on accurate antecedent support)
                        a_l = (sum(cap_l)-sum(incorr_l))/ndata - sum(cap_l)/ndata/2
                        if a_l < lamb:
                            # if the bound doesn't hold, we need to split the leaf l1/l2 further
                            
                            
                            if len(splitleaf_list)>0:
                                split_l1_l2 = splitleaf_array[:,-1].sum()+splitleaf_array[:,-2].sum()
                                
                                # if l1/l2 will have been split
                                if split_l1_l2>0:
                                    sl = splitleaf_list+[splitleaf1]
                                    
                                # if l1/l2 will not have been split, we need to split l1/l2
                                else:
                                    sl = splitleaf_list+[splitleaf2]
                                
                        else:
                            sl = splitleaf_list+[splitleaf1]
                        
                        #construct the new tree
                        tree_new = CacheTree(x = x, y = y, prefix = t,
                                             prediction = tree.get_pred()[:i]+tree.get_pred()[i+1:]+pred_l,
                                             num_captured = tree.get_cap()[:i]+tree.get_cap()[i+1:]+cap_l,
                                             num_captured_incorrect = tree.get_ncc()[:i]+tree.get_ncc()[i+1:]+incorr_l,
                                             deadleaf = tree.get_deadleaf()[:i]+tree.get_deadleaf()[i+1:]+[0,0],
                                             splitleaf = sl,
                                             lbound = new_lbound,
                                             B0 = tree.get_B0()[:i]+tree.get_B0()[i+1:]+B0_l,
                                             lamb = lamb,
                                             prior_metric=prior_metric
                                            )

                        "queue.append(tree_new)"
                        heapq.heappush(queue, (tree_new.get_curiosity(),tree_new))
                        R = Risk(tree_new,ndata,lamb)
                        if R<R_c:
                            d_c = t
                            R_c = R


    print("d_c", d_c)
    print("R_c", R_c)
    print("COUNT", COUNT)

array([2])

#### compas-binary

In [23]:
%%time
# 4 rules, all data, lambda = 0.0025############

bbound(x[:,1:],y,prior_metric="curiosity")

TypeError: bbound() missing 1 required positional argument: 'lamb'

In [26]:
%%time
# algorithm1_2splits, 3 rules, all data, lambda = 0.0025############

bbound(x[:,2:], y, lamb=0.04, prior_metric="curiosity")

nrule: 3
ndata: 6907
[]


IndexError: too many indices for array

In [825]:
%%time
# algorithm1_2splits, 5 rules, all data, lambda = 0.04

bbound(x,y)

nrule: 5
ndata: 6907
d_c ((-5,), (5,))
R_c 0.5210829593166353
COUNT 55005
CPU times: user 3min 31s, sys: 24 ms, total: 3min 31s
Wall time: 3min 31s


In [None]:
dddd