In [96]:
# import
import numpy as np
import pandas as pd
import math
import random
from numpy import sqrt
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [97]:
advanced_training_data = pd.read_csv('lab2_advanced_training.csv')
# advanced_training_data
advanced_testing_data = pd.read_csv('lab2_advanced_testing.csv')
# advanced_testing_data

#### 前處理

In [98]:
# 檢查哪些欄位有 NaN
nan_columns = advanced_training_data.columns[advanced_training_data.isna().any()]
print("Columns with NaN values:", nan_columns)

# 檢查每列中的 NaN 數量
nan_rows = advanced_training_data[advanced_training_data.isna().any(axis=1)]
print("Rows with NaN values:\n", nan_rows)

Columns with NaN values: Index([], dtype='object')
Rows with NaN values:
 Empty DataFrame
Columns: [age, bmi, gender, height, weight, pre_icu_los_days, arf_apache, bun_apache, creatinine_apache, gcs_eyes_apache, gcs_motor_apache, gcs_unable_apache, gcs_verbal_apache, glucose_apache, heart_rate_apache, hematocrit_apache, intubated_apache, map_apache, resprate_apache, sodium_apache, temp_apache, ventilated_apache, wbc_apache, apache_4a_hospital_death_prob, apache_4a_icu_death_prob, aids, cirrhosis, diabetes_mellitus, leukemia, hospital_death]
Index: []

[0 rows x 30 columns]


In [99]:
# 檢查 advanced_training_data 每個特徵的屬性
print(advanced_training_data.info())  # 檢查數據類型和非空數量

# 如果你想查看數值特徵的基本統計信息，可以使用 describe()
print(advanced_training_data.describe())  # 基本統計量：均值、標準差、最小值、最大值等

# 檢查每個特徵的唯一值數量
unique_counts = advanced_training_data.nunique()

# 篩選出只有 0 和 1 兩種唯一值的特徵
binary_features = unique_counts[unique_counts == 2].index

# 列出這些特徵
print("Features with only 0 and 1 values:")
print(advanced_training_data[binary_features].head())  # 查看這些特徵的前幾行資料

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8500 entries, 0 to 8499
Data columns (total 30 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   age                            8500 non-null   float64
 1   bmi                            8500 non-null   float64
 2   gender                         8500 non-null   int64  
 3   height                         8500 non-null   float64
 4   weight                         8500 non-null   float64
 5   pre_icu_los_days               8500 non-null   float64
 6   arf_apache                     8500 non-null   float64
 7   bun_apache                     8500 non-null   float64
 8   creatinine_apache              8500 non-null   float64
 9   gcs_eyes_apache                8500 non-null   float64
 10  gcs_motor_apache               8500 non-null   float64
 11  gcs_unable_apache              8500 non-null   float64
 12  gcs_verbal_apache              8500 non-null   f

In [100]:
# 檢查是否還有 NaN 值
if advanced_training_data.isna().sum().sum() == 0:
    print("No more NaN values.")
else:
    print("There are still NaN values.")

No more NaN values.


In [101]:
# 刪除任何有空缺資料的行
advanced_training_data_clean = advanced_training_data.dropna()

# 檢查刪除後的資料形狀
print("Shape after removing rows with missing data:", advanced_training_data_clean.shape)

# 刪除指定的特徵資料
features_to_drop = ['gender', 'arf_apache', 'intubated_apache', 'ventilated_apache', 'aids', 'cirrhosis', 'diabetes_mellitus', 'leukemia']
advanced_training_data_clean = advanced_training_data.drop(columns=features_to_drop)

# 檢查刪除後的資料形狀
print("Shape after removing specified features:", advanced_training_data_clean.shape)

# 刪除指定的特徵資料
features_to_drop = ['gender', 'arf_apache', 'intubated_apache', 'ventilated_apache', 'aids', 'cirrhosis', 'diabetes_mellitus', 'leukemia']
advanced_testing_data_clean = advanced_testing_data.drop(columns=features_to_drop)

# 檢查刪除後的資料形狀
print("Shape after removing specified features from testing data:", advanced_testing_data_clean.shape)


Shape after removing rows with missing data: (8500, 30)
Shape after removing specified features: (8500, 22)
Shape after removing specified features from testing data: (900, 21)


#### 建構Advance

In [102]:
### START CODE HERE ###
num_train = 8000
num_validation = 500

training_data = advanced_training_data_clean.iloc[:num_train]
validation_data = advanced_training_data_clean.iloc[-num_validation:]

y_train = training_data[['hospital_death']]
x_train = training_data.drop(['hospital_death'], axis=1)

y_validation = validation_data[['hospital_death']]
x_validation = validation_data.drop(['hospital_death'], axis=1)
y_validation = y_validation.values.flatten()

print(advanced_training_data.shape)
print(training_data.shape)
print(validation_data.shape)

max_depth = 10
depth = 0
min_samples_split = 5
n_trees = 100
n_features = int(np.sqrt(training_data.shape[1] - 1))
sample_size = 0.8
n_samples = int(training_data.shape[0] * sample_size)
### END CODE HERE ###

(8500, 30)
(8000, 22)
(500, 22)


#### Basic Function

In [123]:
def classify_data(instance, tree):
  equation = list(tree.keys())[0]
  if equation.split()[1] == '<=':
    temp_feature = equation.split()[0]
    temp_threshold = equation.split()[2]
    if instance[temp_feature] > float(temp_threshold):
      answer = tree[equation][1]
    else:
      answer = tree[equation][0]
  else:
    if instance[equation.split()[0]] in (equation.split()[2]):
      answer = tree[equation][0]
    else:
      answer = tree[equation][1]

  if not isinstance(answer, dict):
    return answer
  else:
    return classify_data(instance, answer)


def make_prediction(tree, data):
  ### START CODE HERE ###
  # [Note] You can call the function classify_data() to predict the label of each instance
  y_prediction = []

  for index, instance in data.iterrows():
      prediction = classify_data(instance, tree)
      y_prediction.append(prediction)
  ### END CODE HERE ###

  return y_prediction


def calculate_score(y_true, y_pred):
  score = f1_score(y_true, y_pred)
  score = round(score, 4)
  return score

In [116]:
def make_partition(data, feature, threshold):
  ### START CODE HERE ###
  left = data[data[feature] <= threshold]
  right = data[data[feature] > threshold]
  ### END CODE HERE ###

  return left, right

In [117]:
# function count entropy
def entropy(data):
  p = 0
  n = 0

  ### START CODE HERE ###
  # Count the number of cases for each outcome (0 and 1)
  total = data.shape[0]  # Total number of instances
  p = data['hospital_death'].sum()  # Count of deaths (1s)
  n = total - p  # Count of no-deaths (0s)

  # Calculate probabilities for each outcome
  p1 = p / total  # Probability of death
  p2 = n / total  # Probability of survival

  # Compute entropy using the formula
  if p1 == 0 or p2 == 0:
      entropy_value = 0  # If one class is absent, entropy is 0
  else:
      entropy_value = -(p1 * math.log2(p1)) - (p2 * math.log2(p2))

  entropy_value = round(entropy_value, 4)
  ### END CODE HERE ###

  return entropy_value

In [118]:
# count the information gain
def information_gain(data, mask):
  ### START CODE HERE ###
  left = data[mask]
  right = data[~mask]

  H_original = entropy(data)
  H_left = entropy(left)
  H_right = entropy(right)

  total_size = len(data)
  left_size = len(left)
  right_size = len(right)

  weighted_entropy = (left_size / total_size) * H_left + (right_size / total_size) * H_right

  # Information Gain = original entropy - weighted entropy
  ig = H_original - weighted_entropy
  ig = round(ig, 4)
  ### END CODE HERE ###

  return ig


In [119]:
# find the best split of the data
def find_best_split(data, impl_part):
  best_ig = -1e9
  best_threshold = 0
  best_feature = ''

  if impl_part == 'basic':
    # Implement the basic method using provided approach
    for feature in data.columns[:-1]:
      if feature == 'gender':
          mask = data['gender'] == 1
          ig = information_gain(data, mask)
          if ig > best_ig:
              best_ig = ig
              best_threshold = 'Gender Split'
              best_feature = feature

      else:
          # Handle continuous features
          sorted_data = data.sort_values(by=feature)
          for i in range(1, len(sorted_data)-1):
            if sorted_data[feature].iloc[i] == sorted_data[feature].iloc[i + 1]:
              continue
            else:
              # Compute the midpoint between consecutive values
              threshold = (sorted_data[feature].iloc[i] + sorted_data[feature].iloc[i + 1]) / 2
              mask = sorted_data[feature] <= threshold
              ig = information_gain(sorted_data, mask)
              # print("ig = ", ig)
              if ig > best_ig:
                  best_ig = ig
                  best_threshold = threshold
                  best_feature = feature
    ### END CODE HERE ###
  else:
    # Implement the basic method using provided approach
    for feature in data.columns[:-1]:
      if feature in ['gender', 'arf_apache', 'intubated_apache', 'ventilated_apache', 'aids', 'cirrhosis', 'diabetes_mellitus', 'leukemia']: #  arf_apache gcs_unable_apache intubated_apache ventilated_apache aids cirrhosis diabetes_mellitus leukemia
          mask = data['gender'] == 1
          ig = information_gain(data, mask)
          if ig > best_ig:
              best_ig = ig
              best_threshold = f'{feature} Split'
              best_feature = feature

      else:
          # Handle continuous features
          sorted_data = data.sort_values(by=feature)
          for i in range(1, len(sorted_data)-1):
            if sorted_data[feature].iloc[i] == sorted_data[feature].iloc[i + 1]:
              continue
            else:
              # Compute the midpoint between consecutive values
              threshold = (sorted_data[feature].iloc[i] + sorted_data[feature].iloc[i + 1]) / 2
              mask = sorted_data[feature] <= threshold
              ig = information_gain(sorted_data, mask)
              # print("ig = ", ig)
              if ig > best_ig:
                  best_ig = ig
                  best_threshold = threshold
                  best_feature = feature
    ### START CODE HERE ###
    ### END CODE HERE ###

  return round(best_ig, 4), (round(best_threshold, 4) if isinstance(best_threshold, (int, float)) else best_threshold), best_feature

In [120]:
ans_features = []
ans_thresholds = []

def build_tree(data, max_depth, min_samples_split, depth):
  ### START CODE HERE ###
  if depth >= max_depth or len(data) < min_samples_split:
    label = data['hospital_death'].value_counts().idxmax()
    return label

  best_ig, threshold, feature = find_best_split(data, 'basic')

  # 檢查資訊增益是否大於 0
  if best_ig > 0:
      left, right = make_partition(data, feature, threshold)

      if len(left) == 0 or len(right) == 0:
          label = data['hospital_death'].value_counts().idxmax()
          return label
      else:
          question = "{} <= {}".format(feature, threshold)
          subtree = {question: []}

          # 遞迴地構建左子樹和右子樹
          left_subtree = build_tree(left, max_depth, min_samples_split, depth + 1)
          right_subtree = build_tree(right, max_depth, min_samples_split, depth + 1)

          if left_subtree == right_subtree:
              subtree = left_subtree
          else:
              subtree[question].append(left_subtree)
              subtree[question].append(right_subtree)
              ans_features.append(feature)
              ans_thresholds.append(threshold)
  else:
      label = data['hospital_death'].value_counts().idxmax()
      return label
  ### END CODE HERE ###

  return subtree

#### Forest Function

In [112]:
def build_forest(data, n_trees, n_features, n_samples):
  ### START CODE HERE ###
  data_len = len(data)
  feature_list = data.columns.tolist()
  forest = []
  ### END CODE HERE ###

  # Create 'n_trees' number of trees and store each into the 'forest' list
  for i in range(n_trees):

    ### START CODE HERE ###
    selected_indices = np.random.choice(data_len, n_samples, replace=True)
    selected_datas = data.iloc[selected_indices]

    selected_features = np.random.choice(feature_list[:-1], n_features, replace=False)  # Exclude the target feature
    selected_features = selected_features.tolist()

    selected_features.append('hospital_death')
    ### END CODE HERE ###

    print(f"selected_datas = {selected_datas}")
    print(f"selected_features = {selected_features}")

    ### START CODE HERE ###
    # tree_data = pd.DataFrame()
    tree_data = selected_datas.copy()
    tree_data = tree_data[selected_features]
    ### END CODE HERE ###


    tree = build_tree(tree_data, max_depth, min_samples_split, depth)
    print(tree)
    forest.append(tree)

  return forest

In [114]:
forest = build_forest(training_data, n_trees, n_features, n_samples)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
...   ...                ...                ...         ...              ...   
499   ...              173.0               32.3        65.0             46.0   
1972  ...               64.0               20.0       126.0             30.0   
1970  ...               91.0               41.7       142.0             24.0   
3960  ...              102.0               27.0        50.0             48.0   
2720  ...               99.0               22.2        51.0             10.0   

      sodium_apache  temp_apache  wbc_apache  apache_4a_hospital_death_prob  \
5614          144.0         36.4        10.5                           0.01   
2138          117.0         36.1        19.8                           0.79   
4677          129.0         33.4         3.7                           0.32   
3374          139.0         36.9        20.9                           0.01   
6718          139.0         35.8        15.2               

#### Predict ans

In [124]:
def make_prediction_forest(forest, data):
  y_prediction = []
  predictions = []

  ### START CODE HERE ###
  for tree in forest:
      pred = make_prediction(tree, data)
      predictions.append(pred)

  predictions = np.array(predictions).T  # Transpose the predictions to get each sample's predictions from all trees

  # Loop through each row of 'predictions'
  for column_predictions in predictions:
      if np.sum(column_predictions == 1) > np.sum(column_predictions == 0):
          y_prediction.append(1)
      else:
          y_prediction.append(0)
    ### END CODE HERE ###

  return y_prediction

In [125]:
### START CODE HERE ###
pred_validation = make_prediction_forest(forest, x_validation)
score = calculate_score(y_validation, pred_validation)
print(score)
### END CODE HERE ###

0.7422


In [126]:
y_pred_test = make_prediction_forest(forest, advanced_testing_data)

#### Save the tree

In [127]:
advanced = []
for i in range(len(y_pred_test)):
  advanced.append(y_pred_test[i])

advanced_path = 'lab2_advanced.csv'

advanced_df = pd.DataFrame({'Id': range(len(advanced)), 'hospital_death': advanced})
advanced_df.set_index('Id', inplace=True)
advanced_df

advanced_df.to_csv(advanced_path, header = True, index = True)