In [142]:
#importing needed modules
from random import seed
from random import randrange
from csv import reader

In [143]:
#function for loading the csv file
def load_csv(filename):
    """Read a CSV file and return its rows as a list of lists of strings.

    Parameters:
        filename: path of the CSV file to read.

    Returns:
        A list with one entry per non-blank line of the file; each entry is
        the list of string fields produced by ``csv.reader``.

    Blank lines are skipped, because ``csv.reader`` yields them as empty
    lists and an empty row has no columns to convert or split on.
    """
    import sys

    # csv.reader wants a text-mode file opened with newline="" on Python 3,
    # but a binary-mode file on Python 2 (whose open() has no newline kwarg).
    if sys.version_info[0] >= 3:
        handle = open(filename, "r", newline="")
    else:
        handle = open(filename, "rb")

    # The with-block guarantees the handle is closed even if parsing fails.
    with handle:
        return [row for row in reader(handle) if row]



In [144]:
#function for converting string column to float
def str_column_to_float(dataset,column):
    """Convert one column of every row from string to float, in place.

    Parameters:
        dataset: sequence of mutable rows (e.g. lists) holding strings.
        column: index of the field to convert in each row.

    Returns:
        None; the rows of ``dataset`` are modified directly.
    """
    for record in dataset:
        text = record[column]
        # Surrounding whitespace is removed before parsing the number.
        record[column] = float(text.strip())
        
        

In [145]:
#function for calculating accuracy_percentage
def accuracy_metric(actual,predicted):
    """Return the percentage of positions where ``predicted`` equals ``actual``.

    Parameters:
        actual: sequence of expected values.
        predicted: sequence of predicted values, indexed in parallel with
            ``actual``.

    Returns:
        A float: matching positions divided by ``len(actual)``, times 100.
    """
    total = len(actual)
    # Count the indices of `actual` at which both sequences agree.
    hits = sum(1 for position in range(total) if actual[position] == predicted[position])
    return hits / float(total) * 100



In [146]:
#function for splitting a dataset based on an attribute and an attribute value
def test_split(index,value,dataset):
    left,right=list(),list()
    for row in dataset:
        if row[index]<value:
            left.append(row)
        else:
            right.append(row)
    return left,right



In [147]:
#function for calculating the Gini Index for a split dataset
def gini_index(groups,class_values):
    """Compute the size-weighted Gini impurity of a candidate split.

    Parameters:
        groups: iterable of row groups (e.g. the ``(left, right)`` pair from
            ``test_split``); the class label of each row is its last element.
        class_values: the class labels to measure proportions for.

    Returns:
        A float: for each non-empty group, the sum over ``class_values`` of
        ``p * (1 - p)`` (``p`` being the share of the group's rows carrying
        that label), multiplied by the group's share of all rows, and summed
        over the groups. Returns 0.0 when every group is empty.

    Each group's impurity is weighted by its relative size so that a tiny
    pure group cannot mask a large impure one; an unweighted sum would score
    those two groups as if they were equally important.
    """
    total_rows = float(sum(len(group) for group in groups))
    if total_rows == 0:
        return 0.0

    gini=0.0
    for group in groups:
        size=len(group)
        if size==0:
            # An empty group contributes nothing (and would divide by zero).
            continue
        # Collect the labels once per group instead of once per class.
        labels=[row[-1] for row in group]
        impurity=0.0
        for class_value in class_values:
            proportion=labels.count(class_value)/float(size)
            impurity+=proportion*(1.0-proportion)
        gini+=impurity*(size/total_rows)
    return gini



In [148]:
#function for selecting the best split point for a dataset
def get_split(dataset):
    """Try every (column, observed value) pair and keep the lowest-scoring split.

    Parameters:
        dataset: list of rows; the last element of each row is used as the
            class label and every other column is a candidate split column.

    Returns:
        A dict with keys ``"index"`` (column of the chosen split),
        ``"value"`` (threshold) and ``"groups"`` (the pair returned by
        ``test_split`` for that choice). If no candidate scores below 999
        the placeholders ``999``, ``999`` and ``None`` are returned.
    """
    class_values = list(set(row[-1] for row in dataset))

    # Placeholder result; replaced by the first candidate scoring under 999.
    best = {"index": 999, "value": 999, "groups": None}
    best_score = 999

    feature_count = len(dataset[0]) - 1  # last column is the label
    for index in range(feature_count):
        for candidate in dataset:
            threshold = candidate[index]
            groups = test_split(index, threshold, dataset)
            score = gini_index(groups, class_values)
            # Strict comparison: on ties the earliest candidate is kept.
            if score < best_score:
                best_score = score
                best = {"index": index, "value": threshold, "groups": groups}
    return best

In [149]:
#function for creating a terminal node value
def to_terminal(group):
    """Return the most frequent class label among the rows of ``group``.

    Parameters:
        group: non-empty list of rows whose last element is the class label.

    Returns:
        The label with the highest count in ``group``.
    """
    labels = []
    for row in group:
        labels.append(row[-1])
    # Rank each distinct label by how often it occurs in the group.
    return max(set(labels), key=lambda label: labels.count(label))



In [150]:
#function for creating child splits for making a node or a terminal
def split(node,max_depth,min_size,depth):
    """Grow the tree below ``node`` in place, recursively.

    Parameters:
        node: dict produced by ``get_split``; its ``"groups"`` entry is
            consumed (removed) and ``"left"`` / ``"right"`` entries are added.
        max_depth: depth at which both children are forced to be terminals.
        min_size: a child group with this many rows or fewer becomes a
            terminal instead of being split again.
        depth: depth of ``node`` itself.

    Returns:
        None; ``node`` is modified directly. Each child is either a class
        label (terminal) or another node dict.
    """
    left_rows, right_rows = node["groups"]
    del node["groups"]

    # One side is empty: nothing was separated, so both children share a
    # single terminal built from all the rows.
    if not (left_rows and right_rows):
        leaf = to_terminal(left_rows + right_rows)
        node["left"] = leaf
        node["right"] = leaf
        return

    # Depth limit reached: stop growing and label both sides.
    if depth >= max_depth:
        left_leaf, right_leaf = to_terminal(left_rows), to_terminal(right_rows)
        node["left"] = left_leaf
        node["right"] = right_leaf
        return

    # Handle the left child first, then the right one, with the same rule.
    for side, rows in (("left", left_rows), ("right", right_rows)):
        if len(rows) > min_size:
            child = get_split(rows)
            node[side] = child
            split(child, max_depth, min_size, depth + 1)
        else:
            node[side] = to_terminal(rows)
        
        

In [151]:
#function for building a decision tree
def build_tree(train,max_depth,min_size):
    """Build a decision tree from the training rows.

    Parameters:
        train: list of training rows passed to ``get_split``.
        max_depth: forwarded to ``split``.
        min_size: forwarded to ``split``.

    Returns:
        The root node dict, after ``split`` has grown the tree beneath it.
    """
    tree = get_split(train)
    # The root sits at depth 1; split() grows the tree in place.
    split(tree, max_depth, min_size, 1)
    return tree



In [152]:
#function for making a prediction with a decision tree
def predict(node,row):
    """Walk the tree from ``node`` and return the terminal value reached for ``row``.

    Parameters:
        node: node dict with ``"index"``, ``"value"``, ``"left"`` and
            ``"right"`` entries; a child that is a dict is another node, any
            other child is returned as the prediction.
        row: sequence of field values to classify.

    Returns:
        The terminal value found at the end of the path selected by ``row``.
    """
    current = node
    while True:
        # Go left when the tested field is strictly below the threshold.
        if row[current["index"]] < current["value"]:
            branch = "left"
        else:
            branch = "right"
        child = current[branch]
        if not isinstance(child, dict):
            return child
        current = child
        
        

In [153]:
#main function for bringing all the things together
def mainAlgo(train,test,max_depth,min_size):
    """Train a tree on ``train``, predict every row of ``test`` and print the results.

    Parameters:
        train: training rows handed to ``build_tree``.
        test: rows to predict; the last element of each row is treated as
            the expected label.
        max_depth: forwarded to ``build_tree``.
        min_size: forwarded to ``build_tree``.

    Returns:
        None. One "Expected/Predicted" line is printed per test row, followed
        by the accuracy percentage from ``accuracy_metric``.
    """
    tree = build_tree(train, max_depth, min_size)
    predictions = [predict(tree, row) for row in test]
    actual = [row[-1] for row in test]

    # Side-by-side listing of expected and predicted labels.
    for expected, guessed in zip(actual, predictions):
        print("Expected: "+str(expected)+"\tPredicted: "+str(guessed))

    accuracy = accuracy_metric(actual, predictions)
    print("Accuracy: "+str(accuracy)+"%")
    
    

In [154]:
#train & test the algorithm
# Input files for the two data splits
train_filename = "train.csv"
test_filename = "test.csv"

# Tree hyper-parameters passed on to mainAlgo
max_depth = 5
min_size = 5

# Load the training rows and convert every column (label included) to float
train = load_csv(train_filename)
for i in range(len(train[0])):
    str_column_to_float(train, i)

# Same treatment for the test rows
test = load_csv(test_filename)
for i in range(len(test[0])):
    str_column_to_float(test, i)

# Fit on train, predict test, print per-row results and the accuracy
mainAlgo(train, test, max_depth, min_size)



    


Expected: 0.0	Predicted: 0.0
Expected: 0.0	Predicted: 0.0
Expected: 0.0	Predicted: 0.0
Expected: 0.0	Predicted: 0.0
Expected: 0.0	Predicted: 0.0
Expected: 0.0	Predicted: 0.0
Expected: 0.0	Predicted: 0.0
Expected: 0.0	Predicted: 0.0
Expected: 0.0	Predicted: 0.0
Expected: 0.0	Predicted: 0.0
Expected: 0.0	Predicted: 0.0
Expected: 0.0	Predicted: 0.0
Expected: 0.0	Predicted: 0.0
Expected: 0.0	Predicted: 0.0
Expected: 0.0	Predicted: 0.0
Expected: 0.0	Predicted: 0.0
Expected: 0.0	Predicted: 0.0
Expected: 0.0	Predicted: 0.0
Expected: 0.0	Predicted: 0.0
Expected: 0.0	Predicted: 0.0
Expected: 0.0	Predicted: 0.0
Expected: 0.0	Predicted: 0.0
Expected: 0.0	Predicted: 0.0
Expected: 0.0	Predicted: 0.0
Expected: 0.0	Predicted: 0.0
Expected: 0.0	Predicted: 0.0
Expected: 0.0	Predicted: 0.0
Expected: 0.0	Predicted: 0.0
Expected: 0.0	Predicted: 0.0
Expected: 0.0	Predicted: 0.0
Expected: 0.0	Predicted: 0.0
Expected: 0.0	Predicted: 0.0
Expected: 0.0	Predicted: 0.0
Expected: 0.0	Predicted: 0.0
Expected: 0.0	