In [2]:
import numpy as np 
import pandas as pd
import math

In [3]:
# Load the semicolon-delimited cardio training CSV into a DataFrame.
cardio_train_df = pd.read_csv('/content/cardio_train.csv', delimiter=';')
# Bare expression so the notebook renders the loaded frame.
cardio_train_df

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,19240,2,168,76.0,120,80,1,1,1,0,1,0
69996,99995,22601,1,158,126.0,140,90,2,2,0,0,1,1
69997,99996,19066,2,183,105.0,180,90,3,1,0,1,0,1
69998,99998,22431,1,163,72.0,135,80,1,2,0,0,0,1


In [4]:
# Positional split: X is every column except the last, y is the last column.
X = cardio_train_df.iloc[:,:-1]
y = cardio_train_df.iloc[:, -1]
# Column at position 2 -- presumably `gender`, going by the variable name;
# TODO confirm against the loaded column order.
gender = cardio_train_df.iloc[:, 2]
# Keep only the rows whose `gender` column equals 1.
gender_feature_value = cardio_train_df[cardio_train_df['gender'] == 1]
# Bare expression so the notebook renders the filtered frame.
gender_feature_value

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0
5,8,21914,1,151,67.0,120,80,2,2,0,0,0,0
6,9,22113,1,157,93.0,130,80,3,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
69993,99991,19699,1,172,70.0,130,90,1,1,0,0,1,1
69994,99992,21074,1,165,80.0,150,80,1,1,0,0,1,1
69996,99995,22601,1,158,126.0,140,90,2,2,0,0,1,1
69998,99998,22431,1,163,72.0,135,80,1,2,0,0,0,1


In [5]:
# calculate the total entropy for training examples
def calc_total_entropy(input_dataframe, target_prediction, target_unique_values):
  """Return the entropy (base-2 logarithm) of the target column over the whole frame.

  Parameters
  ----------
  input_dataframe : pandas.DataFrame
      Rows to measure.
  target_prediction : str
      Name of the target (label) column in ``input_dataframe``.
  target_unique_values : iterable
      Class labels to account for. A label with no rows in the frame
      contributes nothing to the result.

  Returns
  -------
  float or int
      Sum over classes of ``-p * log2(p)``, where ``p`` is the fraction of rows
      carrying that class. ``0`` is returned for an empty frame.
  """
  total_rows = input_dataframe.shape[0]
  if total_rows == 0:
    # No rows means no class distribution; avoid dividing by zero below.
    return 0
  total_entropy = 0
  for target_unique_value in target_unique_values:
    class_freq = input_dataframe[input_dataframe[target_prediction] == target_unique_value].shape[0]
    if class_freq == 0:
      # 0 * log2(0) is taken as 0 by convention; evaluating it directly gives NaN,
      # which would then poison every comparison that uses this entropy.
      continue
    class_probability = class_freq / total_rows
    total_entropy += -class_probability * np.log2(class_probability)
  return total_entropy

In [6]:
# calculate the entropy for a specific unique value of a feature
def calc_entropy(feature_value_dataframe, target_prediction, target_unique_values):
    """Return the entropy (base-2 logarithm) of the target column within one subset of rows.

    Parameters
    ----------
    feature_value_dataframe : pandas.DataFrame
        The rows to measure (the caller passes the rows sharing one feature value).
    target_prediction : str
        Name of the target (label) column.
    target_unique_values : iterable
        Class labels to account for; labels with no rows contribute zero.

    Returns
    -------
    float or int
        Sum over classes of ``-p * log2(p)``; ``0`` when no listed class occurs.
    """
    row_count = feature_value_dataframe.shape[0]
    labels = feature_value_dataframe[target_prediction]
    entropy = 0
    for target_unique_value in target_unique_values:
        matching_rows = feature_value_dataframe[labels == target_unique_value].shape[0]
        if matching_rows == 0:
            # An absent class adds nothing; this also avoids evaluating log2(0).
            term = 0
        else:
            share = matching_rows / row_count
            term = -share * np.log2(share)
        entropy += term
    return entropy

In [18]:
#the function calculates the information gain for a feature
def calc_information_gain(input_dataframe, feature_name, target_prediction, target_unique_values):
  """Return the information gain obtained by splitting the frame on one feature.

  The gain is the entropy of the target over the whole frame minus the
  weighted sum of the target entropies within each distinct feature value,
  where each weight is the fraction of rows holding that value.

  Parameters
  ----------
  input_dataframe : pandas.DataFrame
      Rows to evaluate.
  feature_name : str
      Column to split on.
  target_prediction : str
      Name of the target (label) column.
  target_unique_values : iterable
      Class labels passed through to the entropy helpers.

  Returns
  -------
  float
      Whole-frame entropy plus the (negative) weighted per-value entropy.
  """
  row_count = input_dataframe.shape[0]
  baseline_entropy = calc_total_entropy(input_dataframe, target_prediction, target_unique_values)
  distinct_values = np.unique(input_dataframe[feature_name])

  # One (entropy, weight) pair per distinct value of the feature.
  partitions = []
  for value in distinct_values:
    subset = input_dataframe[input_dataframe[feature_name] == value]
    subset_entropy = calc_entropy(subset, target_prediction, target_unique_values)
    partitions.append((subset_entropy, subset.shape[0] / row_count))

  # Accumulate the negated weighted entropy so it can simply be added to the baseline.
  weighted_entropy = 0
  for subset_entropy, weight in partitions:
    weighted_entropy += -subset_entropy * weight

  return baseline_entropy + weighted_entropy

In [20]:
def highest_info_gain_feature(input_dataframe, target_prediction, target_unique_values):
  """Return the name of the column with the largest information gain.

  Every column except ``target_prediction`` is scored with
  ``calc_information_gain``. On ties the first column encountered wins,
  because only a strictly larger gain replaces the current best.

  Parameters
  ----------
  input_dataframe : pandas.DataFrame
      Rows to evaluate; must contain the target column.
  target_prediction : str
      Name of the target (label) column, excluded from the candidates.
  target_unique_values : iterable
      Class labels passed through to the gain computation.

  Returns
  -------
  object or None
      The winning column label, or ``None`` when no candidate scores above
      the initial threshold of -1 (for example when there are no candidates).
  """
  best_feature = None
  best_gain = -1
  for candidate in input_dataframe.columns.drop(target_prediction):
    gain = calc_information_gain(input_dataframe, candidate, target_prediction, target_unique_values)
    if gain > best_gain:
      best_gain = gain
      best_feature = candidate
  return best_feature

In [30]:
def generate_sub_tree(input_dataframe, feature_name, target_prediction, target_unique_values):
  """Build one level of the decision tree for ``feature_name``.

  For each distinct value of the feature, the branch becomes a leaf holding
  the class label when every row with that value belongs to a single class
  (a pure branch). Otherwise the branch is marked ``"?"`` so it can be
  expanded later. Rows belonging to pure branches are removed from the
  returned frame.

  Parameters
  ----------
  input_dataframe : pandas.DataFrame
      Rows to split. The caller's object is not modified; filtering produces
      new frames.
  feature_name : str
      Column to branch on.
  target_prediction : str
      Name of the target (label) column.
  target_unique_values : iterable
      Class labels to test each branch against.

  Returns
  -------
  tuple(dict, pandas.DataFrame)
      ``tree`` maps each feature value to a class label or ``"?"``, and the
      frame contains only the rows of the still-impure branches.
  """
  # Compute distinct values and their row counts in a single pass.
  feature_unique_values, feature_values_unique_counts = np.unique(
      input_dataframe[feature_name], return_counts=True)
  tree = {}
  for feature_value, value_count in zip(feature_unique_values, feature_values_unique_counts):
    feature_value_dataframe = input_dataframe[input_dataframe[feature_name] == feature_value]
    # Assume the branch is impure until a single class is shown to cover all its rows.
    tree[feature_value] = "?"
    # Iterate over the class labels, not over the characters of the column name.
    for target_unique_value in target_unique_values:
      class_count = feature_value_dataframe[
          feature_value_dataframe[target_prediction] == target_unique_value].shape[0]
      if class_count == value_count:
        # Pure branch: record the class and drop its rows from further splitting.
        tree[feature_value] = target_unique_value
        input_dataframe = input_dataframe[input_dataframe[feature_name] != feature_value]
        break
  return tree, input_dataframe

In [None]:
def generate_tree(root, prev_feature_value, input_dataframe, target_prediction, target_unique_values):
  if input_dataframe.shape[0] != 0:
    max_info_gain_feature = highest_info_gain_feature(input_dataframe, target_prediction, target_unique_values)
    tree, input_dataframe = generate_sub_tree(input_dataframe, max_info_gain_feature, target_prediction, target_unique_values)
    