In [2]:
from collections import namedtuple, Counter, defaultdict
import math
import numpy as np
from sklearn import tree

In [129]:
from sklearn.model_selection import train_test_split
import pandas as pd

d_meta = pd.read_csv('MetaClassifier_TripletFeatures.csv')
labels = list(d_meta)
data = d_meta.as_matrix()[:,:] 

Y = data[:, -1]
X = data[:,0:-1]

training_set_X, test_set_X, training_set_Y, test_set_Y, = train_test_split(X, Y, test_size=0.33)

In [131]:
PET_and_MRI_length = 60
Additional_cost_for_imaging = 1000000
feature_costs = {'1':20, '2':20,'3':0,'4':2*60+247+Additional_cost_for_imaging, '5':30, '6':45, '7':45,'8':10,'9':45,'10':10,'11':12,'12':10,'13':10}
model_info = {}
with open("Models_MetaClassifier.csv", "r") as ins:
    for line in ins:
        info = line.rstrip('\n').rstrip('\r').split(",")
        model_info[info[0]] = info[1:]

global model_cost 
model_cost = []
for i in labels[:-1]:
    model_num = i[i.index('l')+1:]
    model_string = 'Model '+str(model_num)
    cost = 0
    for feature in model_info[model_string]:
        if feature != "":
            cost += feature_costs[feature]
    model_cost.append(cost)
    
global model_condition_cost 
model_condition_cost = {}
for i in labels[:-1]:
    model_num1 = i[i.index('l')+1:]
    model_string = 'Model '+str(model_num1)
    model_condition_cost[int(model_num1)] = {}
    for j in labels[:-1]:
        model_num2 = j[j.index('l')+1:]
        model_string2 = 'Model '+str(model_num2)
        
        features1 = model_info[model_string]
        features2 = model_info[model_string2]
        additional_cost = 0
        for f in features2:
            if f not in features1:
                additional_cost += feature_costs[f]
                
        model_condition_cost[int(model_num1)][int(model_num2)] = additional_cost
        
#print model_condition_cost

In [125]:
class DecisionNode(object):
    """Makes Decision Node Class"""
    def __init__(self,feature=None,left=None,right=None,classes = None):
        self.left = left
        self.right = right
        self.feature = feature
        self.classes = classes
    
    def is_leaf(self):
        return self.left == None and self.right == None

    def __repr__(self):
        if self.feature == None:
            return "Decision Leaf with classes " + str(self.classes)
        return "Decision node for feature " + str(self.feature)

class DecisionTree(object):
    """Decision Tree Class"""
    def __init__(self):
        self.root_node = None
        
    def predict(self, X):
        predicted_classes = []

        for sample in X:
            c = None
            current_node = self.root_node
            while c is None:
                if current_node.is_leaf():
                    try:
                        c = int(current_node.classes)
                    except:
                        ones = sum(current_node.classes)
                        zeros = len(current_node.classes)-ones
                        if ones > zeros:
                            c = 1
                        else:
                            c = 0
                    
                else:
                    key_value = sample[current_node.feature]
                    if key_value == 0:
                        current_node = current_node.left
                    elif key_value == 1:
                        current_node = current_node.right
            predicted_classes.append(c)
        return predicted_classes
    
    def fit(self,samples,outcome_variable,features_to_ignore):
        """Takes in training data and builds a decision tree
        samples = X values as list of list
        outcome_varaible = Y value
        """
        training_samples = [(s, t) for s, t in zip(samples, outcome_variable)]
        predicting_features = list(range(len(samples[0])))
        print predicting_features
        for i in features_to_ignore:
            del predicting_features[predicting_features.index(i)]
        print predicting_features
        self.root_node = self.build_decision_tree(training_samples,predicting_features)
        
    def build_decision_tree(self,samples,features):
        
        classes = [sample[1] for sample in samples]
        if len(set(classes)) == 1:
            root_node = DecisionNode(feature=None, left=None, right=None, classes = classes[0])
    
        elif features == []:
            return  DecisionNode(feature=None, left=None, right=None, classes = [sample[1] for sample in samples])

        else:
            best_feature = self.select_best_feature(samples,features,classes)
            print "BEST FEATURE THIS ITERATION IS ", best_feature
            if best_feature == 'stop':
                #print "STOPPING"
                root_node = DecisionNode(feature=None, left=None, right=None, classes = [sample[1] for sample in samples])
                return root_node
            
            best_feature_values = [s[0][best_feature] for s in samples]
            if len(best_feature_values) == 1:
                #print "making leaf"
                root_node = DecisionNode(feature = best_feature, classes = best_feature_values[0])
            else:
                #do left hand side
                left_samples = [s for s in samples if s[0][best_feature] == 0]
                left_node = self.build_decision_tree(left_samples,features)
                
                #do right hand side
                right_samples = [s for s in samples if s[0][best_feature] == 1]
                right_node = self.build_decision_tree(right_samples,features)

                root_node = DecisionNode(feature = best_feature, classes = best_feature_values, left = left_node, right= right_node)

        return root_node
    
    
    def print_tree(self,labels):
        curr_node = self.root_node
        print self.__str__(curr_node,0)

    def __str__(self, node, depth=0):
        ret = ""
        # Print right branch
        if node.right != None:
            ret += self.__str__(node.right,depth + 1)
        # Print own value
        if node.feature != None:
            ret += "\n" + ("    "*depth) + str(node.feature)
        else:
            ret += "\n" + ("    "*depth) #+ "Class: "+str(node.classes)
        # Print left branch
        if node.left != None:
            ret += self.__str__(node.left,depth + 1)
        return ret
    
    
    def select_best_feature(self, samples, features, classes):
        """
        Find score for all remaining features, choose the one that maximizes
        the score function and delete this feature from consideration
        """
        gain_factors = [(self.score_function(samples, feat, classes, features), feat)
                        for feat in features]
        gain_factors.sort()
        print "GAIN FACTORS", gain_factors
        best_feature = gain_factors[-1][1]
        if gain_factors[-1][0] <= 0:
            return "stop"
        features.pop(features.index(best_feature))
        return best_feature


    def information_gain(self, samples, feature, classes):
        """
        Information gain is the measure of the difference in entropy from before
        to after the samples are split on the given feature values. In other
        words, how much uncertainty in the samples was reduced after splitting
        them on the given feature.
        """
        #print "splitting by", feature
        N = len(samples)
        samples_partition = defaultdict(list)
        for s in samples:
            samples_partition[s[0][feature]].append(s)
        feature_entropy = 0.0
        for partition in samples_partition.values():
            sub_classes = [s[1] for s in partition]
            feature_entropy += (len(partition) / float(N)) * self.entropy(sub_classes)
        #print "Child is", feature_entropy
        p = self.entropy(classes)
        #print "Parent is", p
        return p, feature_entropy

    @staticmethod
    def entropy(dataset):
        """Measure of the amount of uncertainty in the given dataset."""

        N = len(dataset)
        counter = Counter(dataset)
        #rint counter
        return sum([-1.0*(counter[k] / float(N))*math.log(counter[k] / float(N),2) for k in counter])
    
    def cost(self, feature):
        global model_cost
        return model_cost[feature]
        #return c[feature]
    
    def cost_conditional(self, feature, feature2):
        global model_condition_cost
        return model_condition_cost[feature][feature2]
    
    def P_L(self, samples, feature, classes):
        parent_entrop, child_entrop = self.information_gain(samples, feature, classes)
        #print "parent",parent_entrop
        #print "child",child_entrop
        return 1- (child_entrop)/float(parent_entrop)
    
    def P_L_conditional(self, samples, feature, classes, features_left):
        left = [i for i in samples if i[0][feature] == 0]
        right = [i for i in samples if i[0][feature] == 1]
        
        hypothetical_featuresleft = [i for i in features_left]
        hypothetical_featuresleft.pop(hypothetical_featuresleft.index(feature))
        
        sums = 0
        #left split
        if left != []:
            for j in hypothetical_featuresleft:
                #print "conditional try", j
                pl = self.P_L(left, j, classes)
                #print pl,self.cost_conditional(feature,j)
                sums += pl*self.cost_conditional(feature,j)
                
        #right split
        if right != []:
            for j in hypothetical_featuresleft:
                #print "conditional try", j
                pl = self.P_L(right, j, classes)
                #print pl,self.cost_conditional(feature,j)
                sums += pl*self.cost_conditional(feature,j)
                
        return sums
    
    
    def Estimated_Cost(self, samples, feature, classes, features_left):
        PL = self.P_L(samples, feature, classes)
        #print "PL is", PL
        if PL > 1:
            print "PLLLLL", PL
        if PL == 1:
            return PL*self.cost(feature)
        first_part = PL*self.cost(feature)
        second_part = (1-PL)*self.P_L_conditional(samples, feature, classes, features_left) #need_features_left
        #print "2nd",second_part
        return first_part + second_part
    
    
    
    def score_function(self,samples, feature, classes, features_left):
        #print "FEATURE IS", feature
        info_gain1, info_gain2 = self.information_gain(samples, feature, classes)
        print "         Info gain is",info_gain1- info_gain2
        estimated_cost = self.Estimated_Cost(samples, feature, classes, features_left)
        if estimated_cost == 0:
            return 0
        print "Estimated cost for feature",feature, "is ", estimated_cost
        #print "Score is ", (info_gain1-info_gain2)/estimated_cost
        return (info_gain1-info_gain2)/float((estimated_cost))
    
    






In [6]:
"""Function to print the Decision Tree Classifier from sklearn to compare results """
def get_code(tree, feature_names):
        left      = tree.tree_.children_left
        right     = tree.tree_.children_right
        threshold = tree.tree_.threshold
        features  = [feature_names[i] for i in tree.tree_.feature]
        value = tree.tree_.value

        def recurse(left, right, threshold, features, node):
                if (threshold[node] != -2):
                        print "if ( " + features[node] + " <= " + str(threshold[node]) + " ) {"
                        if left[node] != -1:
                                recurse (left, right, threshold, features,left[node])
                        print "} else {"
                        if right[node] != -1:
                                recurse (left, right, threshold, features,right[node])
                        print "}"
                else:
                        print "return " + str(value[node])

        recurse(left, right, threshold, features, 0)
        

In [6]:

samples = np.array([[0,0,1,0,1],[1,1,1,1,0],[1,0,0,1,0],[1,0,0,1,0],[1,0,0,1,0]])
targets = np.array([1,0,1,0,1])

#samples = [[0,0,0,0,1],[1,1,1,1,0],[0,0,0,1,0],[1,0,0,1,0]]
#targets = [1,0,0,1]

#samples = [[0,0,1,0,1],[1,1,1,1,0],[0,0,0,1,0],[1,0,0,1,0]]
#targets = [1,0,1,1]

#samples = [[0,0,1,0,1],[1,1,1,1,0],[0,0,0,1,1],[1,0,0,1,0],[0,1,1,1,0],[1,1,0,0,0]]
#targets = [1,0,1,1,0,1]

#samples = [[1,1,1,0,1],[1,1,0,1,0],[0,0,0,1,1],[0,0,0,1,0],[0,1,1,1,0],[1,1,0,0,0]]
#targets = [1,0,1,0,0,1]


d = DecisionTree()
d.fit(samples,targets)
d.print_tree()

print "\nSklearn tree"
dtree = tree.DecisionTreeClassifier(criterion = 'entropy')
dtree = dtree.fit(samples,targets)
get_code(dtree,['0','1','2','3','4','5'])


print d.predict([[0,0,0,1,0]])


BEST FEATURE THIS ITERATION IS  1
BEST FEATURE THIS ITERATION IS  4
BEST FEATURE THIS ITERATION IS  stop

    Class: 0
1
        Class: 1
    4
        Class: [1, 0, 1]

Sklearn tree
if ( 1 <= 0.5 ) {
if ( 4 <= 0.5 ) {
return [[ 1.  2.]]
} else {
return [[ 0.  1.]]
}
} else {
return [[ 1.  0.]]
}
[1]


In [5]:
from sklearn import tree
dtree = tree.DecisionTreeClassifier(criterion = 'entropy')

dtree = dtree.fit(training_set_X,training_set_Y)
get_code(dtree,[str(i) for i in range(8191)])

if ( 306 <= 0.5 ) {
if ( 49 <= 0.5 ) {
if ( 295 <= 0.5 ) {
if ( 25 <= 0.5 ) {
return [[ 129.    0.]]
} else {
if ( 91 <= 0.5 ) {
if ( 157 <= 0.5 ) {
if ( 34 <= 0.5 ) {
if ( 14 <= 0.5 ) {
if ( 158 <= 0.5 ) {
if ( 209 <= 0.5 ) {
if ( 13 <= 0.5 ) {
return [[ 13.   1.]]
} else {
return [[ 1.  0.]]
}
} else {
return [[ 1.  0.]]
}
} else {
return [[ 2.  0.]]
}
} else {
return [[ 2.  0.]]
}
} else {
return [[ 3.  0.]]
}
} else {
return [[ 10.   0.]]
}
} else {
return [[ 41.   0.]]
}
}
} else {
if ( 98 <= 0.5 ) {
return [[ 2.  0.]]
} else {
return [[ 0.  1.]]
}
}
} else {
if ( 17 <= 0.5 ) {
if ( 218 <= 0.5 ) {
if ( 100 <= 0.5 ) {
return [[ 3.  0.]]
} else {
return [[ 0.  3.]]
}
} else {
return [[ 11.   0.]]
}
} else {
return [[ 0.  4.]]
}
}
} else {
if ( 291 <= 0.5 ) {
if ( 362 <= 0.5 ) {
if ( 169 <= 0.5 ) {
return [[ 5.  0.]]
} else {
if ( 38 <= 0.5 ) {
return [[ 0.  1.]]
} else {
return [[ 1.  0.]]
}
}
} else {
if ( 195 <= 0.5 ) {
if ( 329 <= 0.5 ) {
return [[ 0.  4.]]
} else {
return [[ 5. 

In [21]:
d = DecisionTree()
d.fit(training_set_X,training_set_Y)
d.print_tree()

y_pred = d.predict(test_set_X)

count = []
recall = []
for i in range(len(y_pred)):
    if y_pred[i] == test_set_Y[i]:
        count.append(1)
    else:
        count.append(0)
        
    if test_set_Y[i] == 1:
        if y_pred[i] == 1:
            recall.append(1)
        else:
            recall.append(0)
        


BEST FEATURE THIS ITERATION IS  346
BEST FEATURE THIS ITERATION IS  13
BEST FEATURE THIS ITERATION IS  323
BEST FEATURE THIS ITERATION IS  14
BEST FEATURE THIS ITERATION IS  4
BEST FEATURE THIS ITERATION IS  370
BEST FEATURE THIS ITERATION IS  345
BEST FEATURE THIS ITERATION IS  248
BEST FEATURE THIS ITERATION IS  25
BEST FEATURE THIS ITERATION IS  10
BEST FEATURE THIS ITERATION IS  3
BEST FEATURE THIS ITERATION IS  22
BEST FEATURE THIS ITERATION IS  102
BEST FEATURE THIS ITERATION IS  165
BEST FEATURE THIS ITERATION IS  238
BEST FEATURE THIS ITERATION IS  254
BEST FEATURE THIS ITERATION IS  173
BEST FEATURE THIS ITERATION IS  7
BEST FEATURE THIS ITERATION IS  stop
BEST FEATURE THIS ITERATION IS  266
BEST FEATURE THIS ITERATION IS  157
BEST FEATURE THIS ITERATION IS  109
BEST FEATURE THIS ITERATION IS  365
BEST FEATURE THIS ITERATION IS  360
BEST FEATURE THIS ITERATION IS  376
BEST FEATURE THIS ITERATION IS  15
BEST FEATURE THIS ITERATION IS  99
BEST FEATURE THIS ITERATION IS  36
BEST 

In [22]:
print sum(count)/float(len(count))
print sum(recall)/float(len(recall))

0.910526315789
0.851351351351


In [7]:
d = DecisionTree()
d.fit(training_set_X,training_set_Y)
d.print_tree()

y_pred = d.predict(test_set_X)

count = []
recall = []
for i in range(len(y_pred)):
    if y_pred[i] == test_set_Y[i]:
        count.append(1)
    else:
        count.append(0)
        
    if test_set_Y[i] == 1:
        if y_pred[i] == 1:
            recall.append(1)
        else:
            recall.append(0)

BEST FEATURE THIS ITERATION IS  306
BEST FEATURE THIS ITERATION IS  164
BEST FEATURE THIS ITERATION IS  236
BEST FEATURE THIS ITERATION IS  14
BEST FEATURE THIS ITERATION IS  91
BEST FEATURE THIS ITERATION IS  3
BEST FEATURE THIS ITERATION IS  22
BEST FEATURE THIS ITERATION IS  11
BEST FEATURE THIS ITERATION IS  10
BEST FEATURE THIS ITERATION IS  13
BEST FEATURE THIS ITERATION IS  7
BEST FEATURE THIS ITERATION IS  254
BEST FEATURE THIS ITERATION IS  4
BEST FEATURE THIS ITERATION IS  165
BEST FEATURE THIS ITERATION IS  stop
BEST FEATURE THIS ITERATION IS  280
BEST FEATURE THIS ITERATION IS  102
BEST FEATURE THIS ITERATION IS  248
BEST FEATURE THIS ITERATION IS  107
BEST FEATURE THIS ITERATION IS  235
BEST FEATURE THIS ITERATION IS  15
BEST FEATURE THIS ITERATION IS  141
BEST FEATURE THIS ITERATION IS  314
BEST FEATURE THIS ITERATION IS  42
BEST FEATURE THIS ITERATION IS  246
BEST FEATURE THIS ITERATION IS  48
BEST FEATURE THIS ITERATION IS  1
BEST FEATURE THIS ITERATION IS  204
BEST FEA

In [29]:
print sum(count)/float(len(count))
print sum(recall)/float(len(recall))

0.910526315789
0.86301369863


In [32]:
d = DecisionTree()
d.fit(training_set_X,training_set_Y)
d.print_tree()

y_pred = d.predict(test_set_X)

count = []
recall = []
for i in range(len(y_pred)):
    if y_pred[i] == test_set_Y[i]:
        count.append(1)
    else:
        count.append(0)
        
    if test_set_Y[i] == 1:
        if y_pred[i] == 1:
            recall.append(1)
        else:
            recall.append(0)

BEST FEATURE THIS ITERATION IS  258
BEST FEATURE THIS ITERATION IS  299
BEST FEATURE THIS ITERATION IS  1
BEST FEATURE THIS ITERATION IS  276
BEST FEATURE THIS ITERATION IS  10
BEST FEATURE THIS ITERATION IS  14
BEST FEATURE THIS ITERATION IS  3
BEST FEATURE THIS ITERATION IS  13
BEST FEATURE THIS ITERATION IS  22
BEST FEATURE THIS ITERATION IS  11
BEST FEATURE THIS ITERATION IS  260
BEST FEATURE THIS ITERATION IS  257
BEST FEATURE THIS ITERATION IS  285
BEST FEATURE THIS ITERATION IS  7
BEST FEATURE THIS ITERATION IS  209
BEST FEATURE THIS ITERATION IS  stop
BEST FEATURE THIS ITERATION IS  25
BEST FEATURE THIS ITERATION IS  40
BEST FEATURE THIS ITERATION IS  224
BEST FEATURE THIS ITERATION IS  30
BEST FEATURE THIS ITERATION IS  0
BEST FEATURE THIS ITERATION IS  91
BEST FEATURE THIS ITERATION IS  26
BEST FEATURE THIS ITERATION IS  165
BEST FEATURE THIS ITERATION IS  177
BEST FEATURE THIS ITERATION IS  164
BEST FEATURE THIS ITERATION IS  stop
BEST FEATURE THIS ITERATION IS  173
BEST FEA

In [33]:
print sum(count)/float(len(count))
print sum(recall)/float(len(recall))

0.910526315789
0.835616438356


In [132]:
d = DecisionTree()
#d.fit(training_set_X,training_set_Y,[3,15,26,36,46,47,48,49,50,51,52,53,54])
d.fit(training_set_X,training_set_Y,[3,15,26,36,46,47,48,49,50,51,52,53,54,92,102,112,113,114,115,116,117,118,119,120,157,167,168,169,170,171,172,173,174,175,212,213,214,215,216,217,218,219,220]+list(range(257,293)))
d.print_tree(labels)

y_pred = d.predict(test_set_X)

count = []
recall = []
for i in range(len(y_pred)):
    if y_pred[i] == test_set_Y[i]:
        count.append(1)
    else:
        count.append(0)
        
    if test_set_Y[i] == 1:
        if y_pred[i] == 1:
            recall.append(1)
        else:
            recall.append(0)
            
print sum(count)/float(len(count))
print sum(recall)/float(len(recall))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221,

In [61]:
def e(a,b,t):
    if a == 0:
        return -b/float(t)*math.log(b/float(t),2)
    elif b == 0:
        return -a/float(t)*math.log(a/float(t),2)
    return -a/float(t)*math.log(a/float(t),2)-b/float(t)*math.log(b/float(t),2)

print e(10,7,17)
print 1-(17/20.0*e(7,10,17)+3/20.0*e(1,2,3))/e(8,12,20)

0.0903175363266

print (+9/17.0*e(2,7,9))

print 1-0.91829/float(0.968549)

0.977417817528
0.00247291104332
0.404578856387
0.0518910246152


In [62]:
0.0024729*20+(1-0.0024729)*(0.33074*20+-0.00915*0+0.5822836*10367+0.311692*20+0.05189*0+0.311692*10367)

9257.792915785869

In [69]:
print 0.0024/9286.0
print 0.2948/7841.0
print 0
print 0.506/5425.0

2.58453586044e-07
3.75972452493e-05
0
9.32718894009e-05
