<a href="https://colab.research.google.com/github/munawwar22HU/CS-351-AI-Project/blob/main/decision.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [265]:
## For reference, this is the goal to achieve
# df = pd.read_csv("data.csv")
# train_df, test_df = train_test_split(df, test_size_proportion=0.2)
# tree = decision_tree_algorithm(train_df)
# accuracy = calculate_accuracy(test_df, tree)

In [266]:
## Importing required modules
 
import numpy as np 
import pandas as pd
 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
 
import random
from pprint import pprint


In [267]:
df = pd.read_csv("/content/HeartData.csv")
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,label
0,2,1.0,1.0,3,1,1.0,2.0,2,0.0,1,3.0,0.0,6.0,1
1,2,1.0,4.0,3,1,0.0,2.0,1,1.0,0,2.0,3.0,3.0,0
2,2,1.0,4.0,1,1,0.0,2.0,1,1.0,1,2.0,2.0,7.0,0
3,0,1.0,3.0,2,1,0.0,0.0,2,0.0,1,3.0,0.0,3.0,1
4,1,0.0,2.0,2,1,0.0,2.0,2,0.0,0,1.0,0.0,3.0,1



# Train - Test - Split


In [268]:
## Train-Test-Split
def train_test_split(df, test_size):

    # this checks whether the test_size is an integer or a proportion (float)
    # it is is a float, we convert it to an integer first
    if isinstance(test_size, float):
        test_size = round(test_size * len(df))

    indices = df.index.tolist()
    # print(indices)
    test_indices = random.sample(population=indices, k=test_size)
    test_df = df.loc[test_indices]  # this allows us to access only certain data rows (in this case the test_indices rows)
    train_df = df.drop(test_indices) # to remove the test dataframe from the training dataframe
    return train_df, test_df

In [269]:
random.seed(0)
train_df, test_df = train_test_split(df, test_size=0.3)
# test_df.head()

In [270]:
len(test_df)

90

In [271]:
len(train_df)

209

# Check Purity of Data

In [272]:
# this function will be used to check the purity of data
def check_purity(data):
    label_column = data[:, -1] # from the data 2d array, access only the last column (label) of every row
    unique_classes = np.unique(label_column) # from label_column, identify and choose all the unique columns
    # returns true if the data is pure, that means it contains only one class
    if len(unique_classes) == 1:
        return True
    # otherwise it returns false
    else: 
        return False

In [273]:
check_purity(train_df.values)

False

#Classify Data

In [274]:
def classify_data(data):
    label_column = data[:, -1]
    # this returns the unique classes along with the count of how many times each class appears in the data using return_counts
    classes, count_classes = np.unique(label_column, return_counts=True)

    index = count_classes.argmax()
    classification = classes[index]

    return classification

In [275]:
classes,count_Classes = np.unique(train_df.label,return_counts=True)
print(classes)
print(count_Classes)
classify_data(train_df.values)


[0 1]
[103 106]


1.0

#Get Potential Splits

In [276]:
### Potential Splits
def get_potential_splits(data):

    potential_splits = {}
    _, n_columns = data.shape
    for column_index in range(n_columns - 1):          # excluding the last column which is the label
        values = data[:, column_index]
        unique_values = np.unique(values)
        
        potential_splits[column_index] = unique_values
    
    return potential_splits


In [277]:
potential_splits = get_potential_splits(train_df.values)
pprint(potential_splits)

{0: array([0., 1., 2.]),
 1: array([0., 1.]),
 2: array([1., 2., 3., 4.]),
 3: array([0., 1., 2., 3., 4.]),
 4: array([0., 1.]),
 5: array([0., 1.]),
 6: array([0., 1., 2.]),
 7: array([0., 1., 2.]),
 8: array([0., 1.]),
 9: array([0., 1., 2.]),
 10: array([1., 2., 3.]),
 11: array([0., 1., 2., 3.]),
 12: array([3., 6., 7.])}


# Split Data

In [278]:
### Split data
def split_data(data, split_column, split_value):

    split_column_values = data[:,split_column]
    type_of_feature = feature_types[split_column]
    if type_of_feature == "continuous":
      data_below = data[split_column_values<= split_value]
      data_above = data[split_column_values>split_value]
    else:
      data_below = data[split_column_values == split_value]
      data_above = data[split_column_values !=split_value]
  
    return data_below, data_above



In [279]:
data_below,data_above = split_data(train_df.values,12,3)
pprint(np.unique(data_below[:,12]))
pprint(np.unique(data_above[:,12]))
pprint(len(data_below))
pprint(len(data_above))

array([3.])
array([6., 7.])
110
99


In [280]:
data_below,data_above = split_data(train_df.values,12,6)
pprint(np.unique(data_below[:,12]))
pprint(np.unique(data_above[:,12]))
pprint(len(data_below))
pprint(len(data_above))

array([6.])
array([3., 7.])
10
199


# Lowest Entropy Overall

In [281]:
def calculate_entropy(data):
  label_column = data[:,-1]
  _, counts = np.unique (label_column,return_counts = True)
  #print(counts)
  probabilities = counts/counts.sum()
  #print(probabilities)
  entropy = -sum(probabilities *np.log2(probabilities))
  #print(entropy)
  return entropy

In [282]:
def calculate_overall_entropy(data_below,data_above):
    n_data_points = len(data_below)+len(data_above)
    p_data_below = len(data_below) / n_data_points
    p_data_above = len(data_above) / n_data_points

    overall_entropy = (p_data_below*calculate_entropy(data_below) + p_data_above * calculate_entropy(data_above))


    return overall_entropy

In [283]:
entropy = calculate_overall_entropy(data_below=data_below,data_above=data_above)
pprint(entropy)

0.985289794061943


#Lowest GINI Index value

In [284]:
def calculate_gini(data):
  label_column = data[:,-1]
  _,counts = np.unique (label_column,return_counts = True)
  #print(counts)
  probabilities = counts/counts.sum()
  #print(probabilities)
  gini = 1 - sum(probabilities**2)
  #print(gini)
  return gini


In [285]:
def calculate_overall_gini(data_below,data_above):
    n_data_points = len(data_below)+len(data_above)
    p_data_below = len(data_below) / n_data_points
    p_data_above = len(data_above) / n_data_points

    overall_gini = (p_data_below*calculate_gini(data_below) + p_data_above * calculate_gini(data_above))


    return overall_gini

In [286]:
gini = calculate_overall_gini(data_below=data_below,data_above=data_above)
pprint(gini)

0.4904137914452645


In [287]:
def determine_best_split(data,potential_splits,use_entropy = True):
  
  #potential_splits  = get_potential_splits(data)
  if (use_entropy == True):
    overall_entropy = 9999
    for column_index in potential_splits:
        for value in potential_splits[column_index]:
            data_below, data_above = split_data(data, split_column=column_index, split_value=value)
            current_overall_entropy = calculate_overall_entropy(data_below, data_above)
            
            if current_overall_entropy <=overall_entropy:
                overall_entropy = current_overall_entropy
                best_split_column = column_index
                best_split_value = value
  else:
    overall_gini = 9999
    for column_index in potential_splits:
      for value in potential_splits[column_index]:
          data_below, data_above = split_data(data, split_column=column_index, split_value=value)
          current_overall_gini = calculate_overall_gini(data_below, data_above)
          
          if current_overall_gini <=overall_gini:
              overall_gini = current_overall_gini
              best_split_column = column_index
              best_split_value = value


  return best_split_column,best_split_value

In [288]:
split_column,split_value = determine_best_split(train_df.values,potential_splits,use_entropy = True)
pprint(split_column)
pprint(split_value)



12
3.0


#Determine Type of Feature

In [289]:
def determine_type_of_feature(df):
  feature_types = []
  n_unique_values_threshold = 15
  for column in df.columns:
    
    unique_values = df[column].unique()
    example_Value = unique_values[0]
    if (isinstance(example_Value,str) or len(unique_values)<=n_unique_values_threshold):
      feature_types.append("categorical")
    else:
      feature_types.append("continuous")
  return feature_types

In [290]:
feature_types = determine_type_of_feature(train_df)
pprint(feature_types)

['categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical',
 'categorical']


# Decision Tree Algorithm


In [291]:
#sub_tree = {question : [yes_answer,no_answer]}

In [292]:
def decision_tree_algorithm(df,counter = 0,min_samples = 2,max_depth = 5,use_entropy = True ):
  if counter == 0:
    global column_headers,feature_types
    column_headers = df.columns
    feature_types = determine_type_of_feature(df)
    data = df.values
  else:
    data = df
  # base case 
  if (check_purity(data) or (len(data)<min_samples) or (counter == max_depth)):
    classification  = classify_data(data)
    return classification
  # recursive part
  else:
      counter +=1 
      potential_splits = get_potential_splits(data)
      split_column,split_value = determine_best_split(data,potential_splits,use_entropy)
      data_below, data_above = split_data(data, split_column,split_value)
      if len(data_below) == 0 or len(data_above) == 0:
            classification = classify_data(data)
            return classification
      # instantiate sub tree
      feature_name = column_headers[split_column]
      type_of_feature = feature_types[split_column]
      if type_of_feature == "continuous":
        question = "{} <= {}".format(feature_name,split_value)
      else:
        question = "{} = {}".format(feature_name,split_value)
      sub_tree = {question: []}

      # find answers 
      yes_answer  =  decision_tree_algorithm(data_below,counter,min_samples,max_depth)
      no_answer = decision_tree_algorithm(data_above,counter,min_samples,max_depth)

      if yes_answer == no_answer:
          sub_tree = yes_answer
      else:
        sub_tree[question].append(yes_answer)
        sub_tree[question].append(no_answer)

      return sub_tree




# Classification

In [293]:
def classify_example(example,tree):
  
  question = list(tree.keys())[0]
  feature_name,comparision_operator,value = question.split()
  # ask question
  if comparision_operator == "<=":
    if example[feature_name]<=float(value):
      answer =  tree[question][0]
    else:
      answer =  tree[question][1]
  else:
      if str(example[feature_name])==value:
        answer =  tree[question][0]
      else:
        answer =  tree[question][1]

  # Base Case
  if not isinstance(answer,dict):
    return answer
  # Recursive part
  else:
    residual_tree = answer
    return classify_example(example,residual_tree)


# Accuracy


In [294]:
def evaluate(df,tree):
  df['classification'] = df.apply(classify_example,axis=1,args=(tree,))
  df['classification_correct'] = df.classification == df.label
  classes = np.unique(df.label)
  n = len(classes)
  confusion_matrix = np.zeros((n,n))
  for i in range(n):
    for j in range(n):
      confusion_matrix[i][j]= sum(np.logical_and(df.label == classes[i],df.classification== classes[j]))
  print("Confusion Matrix ")
  pprint(confusion_matrix,width=50)
  accuracy = df.classification_correct.mean()
  precision = np.sum(np.diag(confusion_matrix)) / np.sum(np.sum(confusion_matrix, axis = 0))
  recall = np.diag(confusion_matrix)/ np.sum(confusion_matrix,axis=1)
  print("Accuracy : ",accuracy)
  print("Recall Score Classwise : " ,recall)
  print("Precision : ",precision)
  



In [295]:
test_df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,label
197,1,0.0,4.0,0,1,0.0,2.0,2,0.0,0,1.0,0.0,3.0,1
215,2,0.0,4.0,2,1,0.0,0.0,1,0.0,1,2.0,2.0,3.0,1
20,2,1.0,1.0,0,1,0.0,2.0,1,1.0,0,2.0,0.0,3.0,1
132,1,1.0,4.0,3,1,0.0,2.0,2,1.0,0,1.0,0.0,3.0,1
261,2,1.0,4.0,2,0,0.0,2.0,1,1.0,1,2.0,1.0,3.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
254,2,0.0,3.0,3,1,0.0,1.0,1,0.0,0,2.0,0.0,3.0,1
263,1,1.0,3.0,1,1,1.0,0.0,1,0.0,1,2.0,1.0,6.0,0
283,1,1.0,2.0,2,1,0.0,2.0,2,0.0,0,1.0,0.0,7.0,1
210,2,0.0,4.0,3,1,1.0,0.0,2,1.0,0,2.0,2.0,7.0,0


In [296]:
random.seed(0)
train_df,test_df = train_test_split(df,test_size=0.3)
tree = decision_tree_algorithm(train_df,max_depth=3,use_entropy=False)
evaluate(test_df,tree)
pprint(tree,width=50)

Confusion Matrix 
array([[27.,  9.],
       [ 5., 49.]])
Accuracy :  0.8444444444444444
Recall Score Classwise :  [0.75       0.90740741]
Precision :  0.8444444444444444
{'thal = 3.0': [{'ca = 0.0': [1.0,
                              {'cp = 4.0': [0.0,
                                            1.0]}]},
                {'ca = 0.0': [{'exang = 1.0': [0.0,
                                               1.0]},
                              0.0]}]}
