# In [3]:
import pandas as pd
import math
import sys


class Node(object):
    """A single node of the binary decision tree.

    The constructor takes as input:
    1) attribute: the name of the attribute this node splits on
    2) threshold: the threshold value of that attribute

    Every node additionally carries:
    - left: the left child of the node (None until assigned)
    - right: the right child of the node (None until assigned)
    - leaf: whether the node is a leaf node (False until assigned)
    - predict: for a leaf node, the prediction value, i.e. 0 for healthy
      and 1 for colic (None until assigned)
    """

    def __init__(self, attribute, threshold):
        # A fresh node has no children and is not a leaf; the code that
        # builds the tree fills these fields in afterwards.
        self.left = self.right = None
        self.leaf, self.predict = False, None
        self.attr, self.thres = attribute, threshold
        
def predict(node, row):
    """Walk the decision tree for one instance and return its prediction.

    Takes as input:
    1) node: the node of the tree at which the traversal starts
    2) row: a particular test or training instance, indexable by attribute name

    Returns the `predict` value stored in the leaf that is reached. If the
    instance's value is neither <= nor > the node's threshold (for example a
    NaN), the traversal stops and None is returned.
    """
    current = node
    while not current.leaf:
        value = row[current.attr]
        if value <= current.thres:
            # Values up to and including the threshold belong to the left subtree
            current = current.left
        elif value > current.thres:
            current = current.right
        else:
            # Incomparable value: no branch can be chosen
            return None
    return current.predict
    
def predictions(root, dataframe):
    """Return the percentage of rows of dataframe that the tree classifies correctly.

    Takes as input:
    1) root: the decision tree
    2) dataframe: the data to predict; its 'Diagnosis' column holds the
       expected value of every row

    The function iterates over the rows of the dataframe, calls predict() on
    each row and compares the result with the row's 'Diagnosis' value.

    Returns the share of correct predictions as a float percentage rounded to
    a whole percent (e.g. 57.0).

    Raises ZeroDivisionError if the dataframe has no rows.
    """
    num_data = dataframe.shape[0]
    num_correct = sum(
        1
        for _, row in dataframe.iterrows()
        if predict(root, row) == row['Diagnosis']
    )
    # The ratio is rounded to two decimals first (whole percents). Scaling a
    # rounded ratio by 100 can produce values such as 56.99999999999999, so
    # the scaled value is rounded once more to strip that float noise.
    return float(round(round(num_correct / num_data, 2) * 100))

        
def entropy(train, prediction):
    """Return the entropy (in bits) of the diagnosis column of the given data.

    Takes as input:
    1) train: the training data
    2) prediction: the name of the column holding the diagnosis (1 or 0)

    Returns 0 when the data contains no positive or no negative examples.
    """
    labels = train[prediction]
    positives = float((labels == 1).sum())
    negatives = float((labels == 0).sum())
    if positives == 0 or negatives == 0:
        # A pure (or empty) set carries no uncertainty
        return 0
    total = positives + negatives
    frac_pos = positives / total
    frac_neg = negatives / total
    return -frac_pos * math.log(frac_pos, 2) + -frac_neg * math.log(frac_neg, 2)

def remainder(train, train_parts, prediction):
    """Return the remainder: the size-weighted entropy of the given parts.

    Takes as input:
    1) train: the training dataset
    2) train_parts: the parts into which the data was divided by the threshold
    3) prediction: the name of the column holding the diagnosis

    Each part containing more than one row contributes its entropy weighted
    by its share of the rows of train; smaller parts contribute nothing.
    """
    total_rows = train.shape[0]
    weighted_entropies = (
        float(len(part) / total_rows) * entropy(part, prediction)
        for part in train_parts
        if len(part) > 1
    )
    # Start from a float so the result is a float even with no contributions
    return sum(weighted_entropies, float(0))
    
def information_gain(train, attribute, prediction, threshold):
    """Return the information gain of splitting train on attribute at threshold.

    Takes as input:
    1) train: the training dataset
    2) attribute: the selected attribute
    3) prediction: the name of the column holding the diagnosis
    4) threshold: the threshold value of the selected attribute

    The data is divided into the rows whose attribute value is <= threshold
    and the rows whose value is > threshold, which is the same rule predict()
    uses to choose a branch. The gain is the entropy of train minus the
    remainder of the two parts.
    """
    # "<=" (not "<") so that rows equal to the threshold are not silently
    # dropped from both parts; dropping them made a column whose values all
    # equal the threshold look like a perfect split.
    part1 = train[train[attribute] <= threshold]
    part2 = train[train[attribute] > threshold]
    info_gain = entropy(train, prediction) - remainder(train, [part1, part2], prediction)
    return info_gain

def calculate_threshold(train, attribute, prediction):
    """Return the threshold of attribute that maximizes the information gain.

    Takes as input:
    1) train: the training dataset
    2) attribute: an attribute of the training data
    3) prediction: the name of the column holding the diagnosis

    The distinct values of the attribute are sorted and the midpoint between
    every pair of successive values is tried as a threshold. The first
    midpoint with the highest information gain is returned. If the attribute
    has fewer than two distinct values there is nothing to try and 0 is
    returned.
    """
    distinct = sorted({float(v) for v in train[attribute].tolist()})
    best_gain = float("-inf")
    best_threshold = 0
    for lower, upper in zip(distinct, distinct[1:]):
        midpoint = (lower + upper) / 2
        gain = information_gain(train, attribute, prediction, midpoint)
        # Strict comparison keeps the earliest midpoint on ties
        if gain > best_gain:
            best_gain, best_threshold = gain, midpoint
    return best_threshold

def select_attribute(train, columns, prediction):
    """Pick the attribute (and its threshold) with the highest information gain.

    Takes as input:
    1) train: the training dataset
    2) columns: a list of columns of the training data to choose from
    3) prediction: the name of the column holding the diagnosis

    For every column the best threshold is computed and the information gain
    at that threshold is evaluated. The chosen attribute and threshold are
    printed and returned as a tuple. With no columns, (None, 0) is returned.
    """
    best_attr, threshold = None, 0
    best_gain = float("-inf")
    for candidate in columns:
        candidate_thres = calculate_threshold(train, candidate, prediction)
        gain = information_gain(train, candidate, prediction, candidate_thres)
        # Strict comparison: the earliest column wins when gains are equal
        if gain > best_gain:
            best_gain = gain
            best_attr, threshold = candidate, candidate_thres

    print("the attribute selected is",best_attr)
    print("the threshold value is: ",threshold)
    return best_attr, threshold

def _majority_leaf(p, n):
    """Build a leaf node predicting 1 when positives outnumber negatives, else 0."""
    leaf = Node(None, None)
    leaf.leaf = True
    leaf.predict = 1 if p > n else 0
    return leaf


def decision_tree(train, columns, prediction):
    """Build a decision tree for the training data and return its root node.

    Takes as input:
    1) train: the training dataset
    2) columns: a list of columns of the training data
    3) prediction: the name of the column holding the diagnosis

    The function selects the best attribute using information gain, divides
    the data into the rows with value <= threshold (left) and the rows with
    value > threshold (right) -- the same rule predict() follows -- and
    recursively calls itself on the two portions. When the data has only
    positive or only negative examples a leaf node is created and given the
    corresponding value.

    A leaf predicting the majority label is also created when no attribute can
    be selected or when the selected split leaves all rows on one side, since
    recursing on an unchanged data set would never terminate.
    """
    p = train[train[prediction] == 1].shape[0]
    n = train[train[prediction] == 0].shape[0]

    # If there are only positive or only negative examples, we have reached
    # the end of the tree.
    if p == 0 or n == 0:
        return _majority_leaf(p, n)

    best_attr, threshold = select_attribute(train, columns, prediction)
    if best_attr is None:
        # No columns to split on
        return _majority_leaf(p, n)

    # "<=" keeps rows equal to the threshold in the left portion instead of
    # dropping them from the tree altogether.
    part1 = train[train[best_attr] <= threshold]
    part2 = train[train[best_attr] > threshold]

    total = train.shape[0]
    if part1.shape[0] == total or part2.shape[0] == total:
        # The split separates nothing (e.g. identical rows with mixed labels)
        return _majority_leaf(p, n)

    tree = Node(best_attr, threshold)
    tree.left = decision_tree(part1, columns, prediction)
    tree.right = decision_tree(part2, columns, prediction)
    return tree


def print_tree(root, level):
    """Print the decision tree, one node per line, in pre-order.

    Takes as input:
    1) root: the decision tree
    2) level: the depth of root; it is only passed on (increased by one) to
       the recursive calls and does not affect what is printed

    A leaf prints its prediction value, any other node prints the name of its
    attribute. The left subtree is printed before the right subtree.
    """
    print(root.predict if root.leaf else root.attr)
    for child in (root.left, root.right):
        if child:
            print_tree(child, level + 1)

    
    
    
# Names of the attribute columns, in the order they appear in the data files.
_ATTRIBUTES = ['K', 'Na', 'CL', 'HCO', 'Endotoxin', 'Anioingap', 'PLA2', 'SDH', 'GLDH', 'TPP', 'Breath rate', 'PCV', 'Pulse rate', 'Fibrinogen', 'Dimer', 'FibPerDim']

# Maps the diagnosis label found in the data files to its numeric value.
_OUTCOME = {'colic.': 1, 'healthy.': 0}


def _load_dataset(path):
    """Read a headerless CSV file into a dataframe with named columns and a 0/1 'Diagnosis'.

    Raises KeyError if the file contains a diagnosis label other than
    'colic.' or 'healthy.'.
    """
    data = pd.read_csv(path, header=None)
    data.columns = _ATTRIBUTES + ['Diagnosis']
    data['Diagnosis'] = [_OUTCOME[item] for item in data['Diagnosis']]
    return data


if __name__ == '__main__':

    # Prepare the training data. The prompts are raw strings so that the
    # backslashes of the example Windows path are taken literally instead of
    # being parsed as (invalid) escape sequences.
    print(r"Enter the path of the train file eg(E:\waterloo_documents\AI\Assignment2\horseTrain.txt)")
    train_file_name = input()
    train_data = _load_dataset(train_file_name)

    # Build the decision tree and report its accuracy on the training data
    attributes = list(_ATTRIBUTES)
    root = decision_tree(train_data, attributes, 'Diagnosis')
    print("tree is")
    print_tree(root, 0)
    prediction = predictions(root, train_data)
    print("the prediction on training data is:", prediction,"%")

    # Prepare the test data and predict its outcome
    print(r"Enter the path of the test file eg(E:\waterloo_documents\AI\Assignment2\horseTest.txt)")
    test_file_name = input()
    test_data = _load_dataset(test_file_name)
    prediction_test = predictions(root, test_data)
    print("the prediction on test data is: ", prediction_test,"%")


# Sample output of a run (captured from the notebook):
#
# Enter the path of the train file eg(E:\waterloo_documents\AI\Assignment2\horseTrain.txt)
# E:\waterloo_documents\AI\Assignment2\horseTrain.txt
# the attribute selected is Endotoxin
# the threshold value is:  13.425
# the attribute selected is K
# the threshold value is:  3.55
# the attribute selected is Na
# the threshold value is:  141.5
# the attribute selected is GLDH
# the threshold value is:  24.65
# tree is
# Endotoxin
# K
# Na
# 1
# 0
# GLDH
# 0
# 1
# 1
# the prediction on training data is: 100.0 %
# Enter the path of the test file eg(E:\waterloo_documents\AI\Assignment2\horseTest.txt)
# E:\waterloo_documents\AI\Assignment2\horseTest.txt
# the prediction on test data is:  100.0 %
