In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
# Any results you write to the current directory are saved as output.

/kaggle/input/liverpatient/liver_patient.csv


In [2]:
from __future__ import print_function # For Python 2 / 3 compatability

## CART Decision Tree

### Dataset used to build the model: ILPD (Indian Liver Patient Dataset) Data Set 

### Taken from https://archive.ics.uci.edu/ml/datasets/ILPD+(Indian+Liver+Patient+Dataset)

### Source of dataset: 
    1. Bendi Venkata Ramana 
    ramana.bendi '@' gmail.com 
    Associate Professor, 
    Department of Information Technology, 
    Aditya Instutute of Technology and Management, 
    Tekkali - 532201, Andhra Pradesh, India. 

    2. Prof. M. Surendra Prasad Babu 
    drmsprasadbabu '@' yahoo.co.in 
    Deptartment of Computer Science & Systems Engineering, 
    Andhra University College of Engineering, 
    Visakhapatnam-530 003 Andhra Pradesh, India. 

    3. Prof. N. B. Venkateswarlu
    venkat_ritch '@' yahoo.com 
    Department of Computer Science and Engineering, 
    Aditya Instutute of Technology and Management, 
    Tekkali - 532201, Andhra Pradesh, India.

#### The construction of this CART Algorithm is massively helped by Josh Gordon - https://github.com/random-forests.

## Data Preparation

#### First of all download the dataset from the source stated above and add the data to your notebook. For this particular example, the data (which named liver_patient.csv) will be placed inside a folder named liverpatient and liverpatient is located inside input folder (kaggle's practice). But you can do it your own way too.

In [3]:
"""Create the column names and load the dataset with it"""

col_names = ['age', 'sex', 'total_bilirubin', 'direct_bilirubin', 'alkaline', 'alamine',
            'aspartate', 'total_protein', 'albumin', 'A/G Ratio', 'label']

dataset = pd.read_csv("../input/liverpatient/liver_patient.csv", header=None, names=col_names)

"""Split the dataset into features/attributes and target/label"""

feature_cols = ['age', 'sex', 'total_bilirubin', 'direct_bilirubin', 'alkaline', 'alamine',
            'aspartate', 'total_protein', 'albumin', 'A/G Ratio']

classification_data = dataset[feature_cols] # Features
classification_label = dataset.label # Target variable

In [4]:
"""Create array for each dataset."""

dataset_array = dataset.to_numpy()
class_dataset = classification_data.to_numpy()
label_dataset = classification_label.to_numpy()

"""Also create a header for the Questions."""

header = ['age', 'sex', 'total_bilirubin', 'direct_bilirubin', 'alkaline', 'alamine', 
          'aspartate', 'total_protein', 'albumin', 'A/G Ratio', 'label']

### Split Dataset into Training and Test Sets

In [5]:
to_be_splitted = pd.read_csv("../input/liverpatient/liver_patient.csv", header=None, names=col_names) #load the dataset

dataset_copy = to_be_splitted.copy()
train_set = dataset_copy.sample(frac=0.60, random_state=0)
test_set = dataset_copy.drop(train_set.index)

#### Run the code below to see the results of data splitting: the training set and test set.

In [6]:
print("Training Set: ")
display(train_set)
print("Test Set: ")
display(test_set)

Training Set: 


Unnamed: 0,age,sex,total_bilirubin,direct_bilirubin,alkaline,alamine,aspartate,total_protein,albumin,A/G Ratio,label
18,26,Female,0.6,0.2,142.0,12.0,32,5.7,2.4,0.75,1
170,20,Male,1.1,0.5,128.0,20.0,30,3.9,1.9,0.95,2
107,22,Male,0.8,0.2,300.0,57.0,40,7.9,3.8,0.90,2
98,60,Male,19.6,9.5,466.0,46.0,52,6.1,2.0,0.40,1
177,28,Female,1.0,0.3,90.0,18.0,108,6.8,3.1,0.80,2
...,...,...,...,...,...,...,...,...,...,...,...
161,65,Female,0.7,0.2,406.0,24.0,45,7.2,3.5,0.90,2
23,61,Male,0.7,0.2,145.0,53.0,41,5.8,2.7,0.87,1
186,33,Male,1.6,0.5,165.0,15.0,23,7.3,3.5,0.92,2
135,32,Male,0.7,0.2,165.0,31.0,29,6.1,3.0,0.96,2


Test Set: 


Unnamed: 0,age,sex,total_bilirubin,direct_bilirubin,alkaline,alamine,aspartate,total_protein,albumin,A/G Ratio,label
1,29,Male,1.0,0.3,75.0,25.0,26,5.1,2.9,1.30,1
3,45,Male,1.1,0.4,92.0,91.0,188,7.2,3.8,1.11,1
6,58,Male,0.4,0.1,100.0,59.0,126,4.3,2.5,1.40,1
9,35,Male,26.3,12.1,108.0,168.0,630,9.2,2.0,0.30,1
11,42,Male,0.8,0.2,127.0,29.0,30,4.9,2.7,1.20,1
...,...,...,...,...,...,...,...,...,...,...,...
190,84,Female,0.7,0.2,188.0,13.0,21,6.0,3.2,1.10,2
192,29,Male,0.8,0.2,156.0,12.0,15,6.8,3.7,1.10,2
193,35,Female,0.6,0.2,180.0,12.0,15,5.2,2.7,,2
198,18,Male,1.3,0.7,316.0,10.0,21,6.0,2.1,0.50,2


#### Data Preprocessing

In [7]:
# Effectively we can do that with this code (with Pandas):
train_mean = train_set.mean()
test_mean = test_set.mean()

print("Train Set: (Look at the Output) You can see the difference, as NaN values are replaced.")
train_set.fillna(train_mean).round(3) # Decimal values are rounded to 3 decimal places.

Train Set: (Look at the Output) You can see the difference, as NaN values are replaced.


Unnamed: 0,age,sex,total_bilirubin,direct_bilirubin,alkaline,alamine,aspartate,total_protein,albumin,A/G Ratio,label
18,26,Female,0.6,0.2,142.0,12.0,32,5.7,2.4,0.75,1
170,20,Male,1.1,0.5,128.0,20.0,30,3.9,1.9,0.95,2
107,22,Male,0.8,0.2,300.0,57.0,40,7.9,3.8,0.90,2
98,60,Male,19.6,9.5,466.0,46.0,52,6.1,2.0,0.40,1
177,28,Female,1.0,0.3,90.0,18.0,108,6.8,3.1,0.80,2
...,...,...,...,...,...,...,...,...,...,...,...
161,65,Female,0.7,0.2,406.0,24.0,45,7.2,3.5,0.90,2
23,61,Male,0.7,0.2,145.0,53.0,41,5.8,2.7,0.87,1
186,33,Male,1.6,0.5,165.0,15.0,23,7.3,3.5,0.92,2
135,32,Male,0.7,0.2,165.0,31.0,29,6.1,3.0,0.96,2


In [8]:
print("Test Set: (look at the Output) You can see the difference, as NaN values are replaced.")
test_set.fillna(test_mean).round(3) # Decimal values are rounded to 3 decimal places.

Test Set: (look at the Output) You can see the difference, as NaN values are replaced.


Unnamed: 0,age,sex,total_bilirubin,direct_bilirubin,alkaline,alamine,aspartate,total_protein,albumin,A/G Ratio,label
1,29,Male,1.0,0.3,75.0,25.0,26,5.1,2.9,1.300,1
3,45,Male,1.1,0.4,92.0,91.0,188,7.2,3.8,1.110,1
6,58,Male,0.4,0.1,100.0,59.0,126,4.3,2.5,1.400,1
9,35,Male,26.3,12.1,108.0,168.0,630,9.2,2.0,0.300,1
11,42,Male,0.8,0.2,127.0,29.0,30,4.9,2.7,1.200,1
...,...,...,...,...,...,...,...,...,...,...,...
190,84,Female,0.7,0.2,188.0,13.0,21,6.0,3.2,1.100,2
192,29,Male,0.8,0.2,156.0,12.0,15,6.8,3.7,1.100,2
193,35,Female,0.6,0.2,180.0,12.0,15,5.2,2.7,0.973,2
198,18,Male,1.3,0.7,316.0,10.0,21,6.0,2.1,0.500,2


## Decision Tree Classifier

In [9]:
class DecisionTreeClassifier:
    
    
    def __init__(self, tree):
        self.tree = DecisionTreeClassifier.build_tree(tree)
        
    def unique_vals(rows, col):
        # Used to find the unique values for "a" column in a dataset
        return set([row[col] for row in rows])
    
    def unique_label(rows):
        # Used to find the unique values for classification_label, note that there is only one column for label. # Rasyid
        return set([row for row in rows])
    
    def class_counts(rows):
        # Used for dataset array. Returns a dictionary of label -> count.
        counts = {}
        for row in rows:
            # in our dataset format, the label is always the last column
            label = row[-1]
            if label not in counts:
                counts[label] = 0
            counts[label] += 1
        return counts
        
    def partition(rows, question):
        # Used to split a dataset into true set and false set.
        true_rows, false_rows = [], []
        for row in rows:
            if question.match(row):
                true_rows.append(row)
            else:
                false_rows.append(row)
        return true_rows, false_rows
        
    def gini(rows):
        # Used to count the impurity.
        counts = DecisionTreeClassifier.class_counts(rows)
        impurity = 1
        for lbl in counts:
            prob_of_lbl = counts[lbl] / float(len(rows))
            impurity -= prob_of_lbl**2
        return impurity
    
    def info_gain(left, right, current_uncertainty):
        p = float(len(left)) / (len(left) + len(right))
        return current_uncertainty - p * DecisionTreeClassifier.gini(left) - (1 - p) * DecisionTreeClassifier.gini(right)
    
    def find_best_split(rows):
        best_gain = 0  # keep track of the best information gain
        best_question = None  # keep train of the feature / value that produced it
        current_uncertainty = DecisionTreeClassifier.gini(rows)
        n_features = len(rows[0]) - 1  # number of columns

        for col in range(n_features):  # for each feature

            values = set([row[col] for row in rows])  # unique values in the column

            for val in values:  # for each value

                question = Question(col, val)

                # try splitting the dataset
                true_rows, false_rows = DecisionTreeClassifier.partition(rows, question)

                # Skip this split if it doesn't divide the dataset.
                if len(true_rows) == 0 or len(false_rows) == 0:
                    continue

                # Calculate the information gain from this split
                gain = DecisionTreeClassifier.info_gain(true_rows, false_rows, current_uncertainty)

                if gain >= best_gain:
                    best_gain, best_question = gain, question
        return best_gain, best_question
    
    def build_tree(rows):
        gain, question = DecisionTreeClassifier.find_best_split(rows)

        if gain == 0:
            return Leaf(rows)

        # If we reach here, we have found a useful feature / value to partition on.
        true_rows, false_rows = DecisionTreeClassifier.partition(rows, question)

        # Recursively build the true branch.
        true_branch = DecisionTreeClassifier.build_tree(true_rows)

        # Recursively build the false branch.
        false_branch = DecisionTreeClassifier.build_tree(false_rows)

        return Decision_Node(question, true_branch, false_branch)

#### Question class: basically the decision node

In [10]:
class Question:
    
    
    # Question Class
    def __init__(self, column, value):
        self.column = column
        self.value = value
        
    def is_numeric(value):
        # To test if a value is numeric.
        return isinstance(value, int) or isinstance(value, float)
    
    def match(self, example):
        # Compare the feature value in an example to the feature value in this question.
        val = example[self.column]
        if Question.is_numeric(val):
            return val >= self.value
        else:
            return val == self.value

    def __repr__(self):
        # This is just a helper method to print the question in a readable format.
        condition = "=="
        if Question.is_numeric(self.value):
            condition = ">="
        return "Is %s %s %s?" % (
            header[self.column], condition, str(self.value))

#### Leaf class for predictions

In [11]:
class Leaf:
    
    
    def __init__(self, rows):
        self.predictions = DecisionTreeClassifier.class_counts(rows)
    
    def print_leaf(counts):
        total = sum(counts.values()) * 1.0
        probs = {}
        for lbl in counts.keys():
            probs[lbl] = str(int(counts[lbl] / total * 100)) + "%"
        return probs

#### Decision Nodes Class, basically recursively returns every node except leaves.

In [12]:
class Decision_Node:
    
    
    def __init__(self,
                 question,
                 true_branch,
                 false_branch):
        self.question = question
        self.true_branch = true_branch
        self.false_branch = false_branch
        
    def print_tree(node, spacing=""):
        # Base case: we've reached a leaf
        if isinstance(node, Leaf):
            print (spacing + "Predict", node.predictions)
            return

        # Print the question at this node
        print (spacing + str(node.question))

        # Call this function recursively on the true branch
        print (spacing + '--> True:')
        Decision_Node.print_tree(node.true_branch, spacing + "  ")

        # Call this function recursively on the false branch
        print (spacing + '--> False:')
        Decision_Node.print_tree(node.false_branch, spacing + "  ")
        
    def classify(row, node):
        if isinstance(node,Leaf):
            return node.predictions

        if node.question.match(row):
            return Decision_Node.classify(row, node.true_branch)
        else:
            return Decision_Node.classify(row, node.false_branch)
    
    def testing_result(testing_dataset, tree):
        for row in testing_dataset:
            print("Actual: %s. Predicted: %s" %
                (row[-1], Leaf.print_leaf(Decision_Node.classify(row, tree))))

#### Building the tree

In [13]:
d_tree = DecisionTreeClassifier.build_tree(dataset_array)
print("Tree:\n")
Decision_Node.print_tree(d_tree)

Tree:



AttributeError: type object 'DecisionTreeClassifier' has no attribute 'is_numeric'

#### Create the training and test set Decision Tree.

In [None]:
train_set = train_set.to_numpy()
test_set_rows = test_set.to_numpy()

evaluation_tree = DecisionTreeClassifier.build_tree(train_set)

#### Evaluate

In [None]:
Decision_Node.testing_result(test_set_rows, evaluation_tree)

### Conclusion about this algorithm:
#### By the results shown, the machine learning model might be overfitted.