In [2]:
# import
import numpy as np
import pandas as pd
import math
import random
from numpy import sqrt
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

## Basic
#### Section one
1. Entropy
> H(X) = -Sigma(p(x)*log(p(x)))
2. Information Gain
> IG = H(x)-Sigma(|xv/x|*H(xv))

3. Find the Best Split
4. Split the data into two branchs
5. Build the Decision Tree

#### Section two
1. split data
2. train by training data
3. prediction
4. F1-score
> F1-score = 2 * ((precision * recall / precision + recall))
5. Save answer

In [6]:
# load input data
input_data = pd.read_csv('lab2_basic_input.csv')

# basic value (can't modify)
max_depth = 2
depth = 0
min_samples_split = 2
n_features = input_data.shape[1] - 1
  # add own attribute(optimal)

# function count entropy
def entropy(data):
  p = 0
  n = 0

  ### START CODE HERE ###
  # Count the number of cases for each outcome (0 and 1)
  total = data.shape[0]  # Total number of instances
  p = data['hospital_death'].sum()  # Count of deaths (1s)
  n = total - p  # Count of no-deaths (0s)

  # Calculate probabilities for each outcome
  p1 = p / total  # Probability of death
  p2 = n / total  # Probability of survival

  # Compute entropy using the formula
  if p1 == 0 or p2 == 0:
      entropy_value = 0  # If one class is absent, entropy is 0
  else:
      entropy_value = -(p1 * math.log2(p1)) - (p2 * math.log2(p2))

  entropy_value = round(entropy_value, 4)
  ### END CODE HERE ###

  return entropy_value

# pass the check (V) ans_entropy = 0.9928
ans_entropy = entropy(input_data)
print("ans_entropy = ", ans_entropy)

ans_entropy =  0.9928


In [7]:
# count the information gain
def information_gain(data, mask):
  ### START CODE HERE ###
  left = data[mask]
  right = data[~mask]

  H_original = entropy(data)
  H_left = entropy(left)
  H_right = entropy(right)

  total_size = len(data)
  left_size = len(left)
  right_size = len(right)

  weighted_entropy = (left_size / total_size) * H_left + (right_size / total_size) * H_right

  # Information Gain = original entropy - weighted entropy
  ig = H_original - weighted_entropy
  ig = round(ig, 4)
  ### END CODE HERE ###

  return ig

# pass the check (V) ans_informationGain = 0.0385
temp1 = np.zeros((int(input_data.shape[0]/3), 1), dtype=bool)
temp2 = np.ones(((input_data.shape[0]-int(input_data.shape[0]/3), 1)), dtype=bool)
temp_mask = np.concatenate((temp1, temp2))
df_mask = pd.DataFrame(temp_mask, columns=['mask'])
ans_informationGain = information_gain(input_data, df_mask['mask'])
print("ans_informationGain = ", ans_informationGain)

ans_informationGain =  0.0385


In [8]:
# find the best split of the data
def find_best_split(data, impl_part):
  best_ig = -1e9
  best_threshold = 0
  best_feature = ''

  if impl_part == 'basic':
    # Implement the basic method using provided approach
    for feature in data.columns[:-1]:
      if feature == 'gender':
          mask = data['gender'] == 1
          ig = information_gain(data, mask)
          if ig > best_ig:
              best_ig = ig
              best_threshold = 'Gender Split'
              best_feature = feature

      else:
          # Handle continuous features
          sorted_data = data.sort_values(by=feature)
          for i in range(1, len(sorted_data)-1):
            if sorted_data[feature].iloc[i] == sorted_data[feature].iloc[i + 1]:
              continue
            else:
              # Compute the midpoint between consecutive values
              threshold = (sorted_data[feature].iloc[i] + sorted_data[feature].iloc[i + 1]) / 2
              mask = sorted_data[feature] <= threshold
              ig = information_gain(sorted_data, mask)
              # print("ig = ", ig)
              if ig > best_ig:
                  best_ig = ig
                  best_threshold = threshold
                  best_feature = feature
    ### END CODE HERE ###
  # else:
    ### START CODE HERE ###
    ### END CODE HERE ###

  return round(best_ig, 4), (round(best_threshold, 4) if isinstance(best_threshold, (int, float)) else best_threshold), best_feature



ans_ig, ans_value, ans_name = find_best_split(input_data, 'basic')
print("ans_ig = ", ans_ig)
print("ans_value = ", ans_value)
print("ans_name = ", ans_name)

ans_ig =  0.2146
ans_value =  99.5
ans_name =  glucose_apache


In [9]:
def make_partition(data, feature, threshold):
  ### START CODE HERE ###
  left = data[data[feature] <= threshold]
  right = data[data[feature] > threshold]
  ### END CODE HERE ###

  return left, right


# [Note] You have to save the value of "ans_left" into the output file
# Here, let's assume the best split is when we choose bmi as the feature and threshold as 21.0
left, right = make_partition(input_data, 'bmi', 21.0)
ans_left = left.shape[0]
print("ans_left = ", ans_left)

ans_left =  7


In [10]:
ans_features = []
ans_thresholds = []

def build_tree(data, max_depth, min_samples_split, depth):
  ### START CODE HERE ###
  if depth >= max_depth or len(data) < min_samples_split:
    label = data['hospital_death'].value_counts().idxmax()
    return label

  best_ig, threshold, feature = find_best_split(data, 'basic')

  # 檢查資訊增益是否大於 0
  if best_ig > 0:
      left, right = make_partition(data, feature, threshold)

      if len(left) == 0 or len(right) == 0:
          label = data['hospital_death'].value_counts().idxmax()
          return label
      else:
          question = "{} <= {}".format(feature, threshold)
          subtree = {question: []}

          # 遞迴地構建左子樹和右子樹
          left_subtree = build_tree(left, max_depth, min_samples_split, depth + 1)
          right_subtree = build_tree(right, max_depth, min_samples_split, depth + 1)

          if left_subtree == right_subtree:
              subtree = left_subtree
          else:
              subtree[question].append(left_subtree)
              subtree[question].append(right_subtree)
              ans_features.append(feature)
              ans_thresholds.append(threshold)
  else:
      label = data['hospital_death'].value_counts().idxmax()
      return label
  ### END CODE HERE ###

  return subtree


decisionTree = build_tree(input_data, max_depth, min_samples_split, depth)
print(decisionTree)
# [Note] You have to save the features in the "decisionTree" structure (from root to branch and leaf) into the output file
print(ans_features)
# [Note] You have to save the corresponding thresholds for the features in the "ans_features" list into the output file
print(ans_thresholds)


{'glucose_apache <= 99.5': [{'height <= 184.15': [0, 1]}, 1]}
['height', 'glucose_apache']
[184.15, 99.5]


In [24]:
def classify_data(instance, tree):
  equation = list(tree.keys())[0]
  if equation.split()[1] == '<=':
    temp_feature = equation.split()[0]
    temp_threshold = equation.split()[2]
    if instance[temp_feature] > float(temp_threshold):
      answer = tree[equation][1]
    else:
      answer = tree[equation][0]
  else:
    if instance[equation.split()[0]] in (equation.split()[2]):
      answer = tree[equation][0]
    else:
      answer = tree[equation][1]

  if not isinstance(answer, dict):
    return answer
  else:
    return classify_data(instance, answer)


def make_prediction(tree, data):
  ### START CODE HERE ###
  # [Note] You can call the function classify_data() to predict the label of each instance
  y_prediction = []

  for index, instance in data.iterrows():
      prediction = classify_data(instance, tree)
      y_prediction.append(prediction)
  ### END CODE HERE ###

  return y_prediction


def calculate_score(y_true, y_pred):
  score = f1_score(y_true, y_pred)
  score = round(score, 4)
  return score

In [22]:
basic = []
basic.append(ans_entropy)
basic.append(ans_informationGain)
basic.append([ans_ig, ans_value, ans_name])
basic.append(ans_left)
basic.append(ans_features + ans_thresholds)

In [23]:
num_train = 30
num_validation = 10

training_data = input_data.iloc[:num_train]
validation_data = input_data.iloc[-num_validation:]

y_train = training_data[['hospital_death']]
x_train = training_data.drop(['hospital_death'], axis=1)

y_validation = validation_data[['hospital_death']]
x_validation = validation_data.drop(['hospital_death'], axis=1)
y_validation = y_validation.values.flatten()

print(input_data.shape)
print(training_data.shape)
print(validation_data.shape)

max_depth = 2
depth = 0
min_samples_split = 2
n_features = x_train.shape[1]

(40, 11)
(30, 11)
(10, 11)


In [25]:
# decision_tree = build_tree(input_data, max_depth, min_samples_split, depth)
decision_tree = build_tree(training_data, max_depth, min_samples_split, depth)
y_pred = make_prediction(decision_tree, x_validation)

# [Note] You have to save the value of "ans_f1score" into your output file
# Please round your answer to 4 decimal place : ans_f1score = 0.4444
ans_f1score = calculate_score(y_validation, y_pred)
print("ans_f1score = ", ans_f1score)
# This is just for you to check your predictions : y_pred = [1, 1, 0, 1, 0, 0, 0, 0, 0, 1]
y_pred

ans_f1score =  0.4444


[1, 1, 0, 1, 0, 0, 0, 0, 0, 1]

In [26]:
basic.append(ans_f1score)
basic_path = 'lab2_basic.csv'

basic_df = pd.DataFrame({'Id': range(len(basic)), 'Ans': basic})
basic_df.set_index('Id', inplace=True)
basic_df

basic_df.to_csv(basic_path, header = True, index = True)