In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable

import tensorflow as tf

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

# Data location

In [3]:
data_path = "data/clean_data.csv"

# Set seed

In [4]:
SEED = 42

# Load data

In [5]:
df = pd.read_csv(data_path)

In [6]:
df.shape

(11627, 40)

In [7]:
df.columns

Index(['Unnamed: 0', 'RANDID', 'SEX', 'TOTCHOL', 'AGE', 'SYSBP', 'DIABP',
       'CURSMOKE', 'CIGPDAY', 'BMI', 'DIABETES', 'BPMEDS', 'HEARTRTE',
       'GLUCOSE', 'educ', 'PREVCHD', 'PREVAP', 'PREVMI', 'PREVSTRK', 'PREVHYP',
       'TIME', 'PERIOD', 'HDLC', 'LDLC', 'DEATH', 'ANGINA', 'HOSPMI',
       'MI_FCHD', 'ANYCHD', 'STROKE', 'CVD', 'HYPERTEN', 'TIMEAP', 'TIMEMI',
       'TIMEMIFC', 'TIMECHD', 'TIMESTRK', 'TIMECVD', 'TIMEDTH', 'TIMEHYP'],
      dtype='object')

In [8]:
df.iloc[:, 2: 24].columns

Index(['SEX', 'TOTCHOL', 'AGE', 'SYSBP', 'DIABP', 'CURSMOKE', 'CIGPDAY', 'BMI',
       'DIABETES', 'BPMEDS', 'HEARTRTE', 'GLUCOSE', 'educ', 'PREVCHD',
       'PREVAP', 'PREVMI', 'PREVSTRK', 'PREVHYP', 'TIME', 'PERIOD', 'HDLC',
       'LDLC'],
      dtype='object')

In [9]:
df.iloc[:, 24: 32].columns

Index(['DEATH', 'ANGINA', 'HOSPMI', 'MI_FCHD', 'ANYCHD', 'STROKE', 'CVD',
       'HYPERTEN'],
      dtype='object')

In [10]:
print(df.iloc[:, 2: 24].shape[1], df.iloc[:, 24: 32].shape[1])

22 8


data = np.loadtxt(data_path, delimiter=",",dtype=float, skiprows=1)
amax_label = np.amax(data[:, 24:32], axis=1)
hist,bins = np.histogram(amax_label) 

plt.hist(amax_label, bins = bins) 
plt.title("any disease") 
plt.show()

# Node class

In [17]:
class Node():
    def __init__(self, left=None, right=None, information_gain=None,
                 threshold=None, column_index=None, value=None):
        
        #children
        self.left = left  
        self.right = right
        
        #decision nood
        self.information_gain = information_gain
        self.threshold = threshold
        self.column_index = column_index
        
        #leef nood
        self.value = value

# Tree class

In [32]:
class DecisionTree():
    def __init__(self, min_participant=2, max_depth=2):
        self.root = None
        
        self.min_participant = min_participant
        self.max_depth = max_depth
        
    def split_input_label(self, dataset):
        return data[:, :22], data[:, 22:]

    def build_tree(self, X, Y, depth=0):
        count_participant, _ = np.shape(X)
        
        dataset = np.concatenate((X, Y), axis=1)
        
        if count_participant >= self.min_participant and depth <= self.max_depth:
            split = self.get_split_value(dataset)
            if split['information_gain'] > 0 :
                left = self.build_tree(split['left'], depth=depth + 1)
                right = self.build_tree(split['right'], depth=depth + 1)

                return Node(left, right, split['information_gain'],
                            split['threshold'], split['column_index'])
        
        leaf_value = self.compute_leaf_value(Y)
        return Node(value=leaf_value)
    
    def get_split_value(self, dataset):
        max_gain = float('-inf')
        split = {}
        
        columns = dataset.shape[1]

        #for i in range(columns):
        #print(dataset.names)
        for i in range(columns):
            col = dataset[:, i]
            possible_thresholds = np.unique(col)
            
            for threshold in possible_thresholds:
                left, right = self.split(dataset, threshold)
                
                gain = self.compute_information_gain(dataset, left, right)
                if gain > max_gain:
                    max_gain = gain
                    split['information_gain'] = gain
                    split['threshold'] = threshold
                    split['left'] = left
                    split['right'] = right
                    split['column_index'] = i
        return split
    
    def split(self, dataset, threshold):
        left = dataset[np.where(dataset[:, 0] == 0)]
        right = dataset[np.where(dataset[:, 0] == 1)]
        
        return left, right
    
    def compute_information_gain(self, dataset, left, right):
        inputs, labels = self.split_input_label(dataset)
        left_inputs, left_labels = self.split_input_label(dataset)
        right_inputs, right_labels = self.split_input_label(dataset)
        
        weight_left = len(left) / len(dataset)
        weight_right = len(right) / len(dataset)
        
        childs_entropy = self.entropy(left_labels) * weight_left - self.entropy(right_labels) * weight_right
        information_gain = self.entropy(dataset) - childs_entropy
        
        return information_gain
    
    def entropy(self, labels):
        res = 0
        
        for value in [0, 1]: #possible values
            value_proportion = len(labels[labels == value]) / len(labels)
            res += -value_proportion * np.log2(value_proportion)
        return res
    
    def compute_leaf_value(self, label):
        values, counts = np.unique(label, return_counts=True)
        index = np.argmax(counts)
        return values[index]
        
    def fit(self, X, Y):
        self.root = self.build_tree(X, Y)
        
    def evaluate(self, x, decision_tree):
        #leaf
        if decision_tree.value != None:
            return decision_tree.value

        x_column_value = x[decision_tree.column_index]

        #print(x_column_value)
        #print(decision_tree.threshold)
        if x_column_value <= decision_tree.threshold:
            return self.evaluate(x, decision_tree.left)
        return self.evaluate(x, decision_tree.right)
    
    def predict(self, X):
        return [self.evaluate(x, self.root) for x in X]
    
    def predict_one_element(self, X):
        return [self.evaluate(x, self.root)]

# Split data set into train, validation and test

In [51]:
data = np.loadtxt(data_path, delimiter=",",dtype=float, skiprows=1)
X = data[:, 2:24]
Y = data[:, 30:31]
print(Y)
print(X)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, random_state=41)

[[1.]
 [1.]
 [0.]
 ...
 [0.]
 [0.]
 [0.]]
[[  1. 195.  39. ...   1.  45. 191.]
 [  1. 209.  52. ...   3.  31. 178.]
 [  0. 250.  46. ...   1.  45. 191.]
 ...
 [  0. 196.  39. ...   1.  52. 166.]
 [  0. 240.  46. ...   2.  52. 166.]
 [  0. 189.  50. ...   3.  52. 166.]]


# Train

In [52]:
decision_tree = DecisionTree(max_depth=20)
decision_tree.fit(X_train, Y_train)

# Test

In [53]:
Y_pred = decision_tree.predict(X_test) 

print(Y_pred)
print(Y_test.flatten())
accuracy_score(Y_test.flatten(), Y_pred)


[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,

0.7545141874462596