In [5]:
import numpy as np

def gini_impurity(y):
  """
  Compute the Gini impurity of a collection of labels.

  Args:
    y: A list or array of labels.

  Returns:
    One minus the sum of the squared label frequencies, or 0 when y is empty.
  """
  total = len(y)
  if total == 0:
    return 0

  # Tally how many times each distinct label occurs.
  tally = {}
  for item in y:
    tally[item] = tally.get(item, 0) + 1

  # Start from 1 and remove each squared class proportion in turn.
  result = 1
  for occurrences in tally.values():
    result -= (occurrences / total) ** 2
  return result

def cal_entropy(y):
  """
  Calculate the Shannon entropy (base-2 logarithm) for a given set of labels.

  Args:
    y: A list or array of labels.

  Returns:
    The entropy. Returns 0.0 for an empty input, and positive zero (never
    negative zero) when every label is the same.
  """
  m = len(y)
  if m == 0:
    # No labels: return zero explicitly, matching the empty-input guard in
    # gini_impurity instead of relying on arithmetic over empty arrays.
    return 0.0
  # np.unique only reports labels that occur, so every probability is > 0
  # and log2 below never sees a zero.
  _, counts = np.unique(y, return_counts=True)
  probs = counts / m
  entropy = -np.sum(probs * np.log2(probs))
  # A single-class input gives -(1 * log2(1)) == -0.0, which formats as
  # "-0.0000". Adding +0.0 turns negative zero into positive zero and leaves
  # every other value unchanged.
  return entropy + 0.0


# Small worked example: two samples labelled 1 and three labelled 0.
labels = [1, 1, 0, 0, 0]

# Gini impurity is printed at full precision; entropy is rounded to 4 decimals.
print(f"Gini impurity: {gini_impurity(labels)}")
print(f"Entropy: {cal_entropy(labels):.4f}")

Gini impurity: 0.48
Entropy: 0.9710


In [8]:
def weighted_entropy(subsets, total_samples):
  """
  Size-weighted sum of the entropies of several label subsets.

  Args:
    subsets: An iterable of label collections (each must support len()).
    total_samples: The denominator used to weight each subset's entropy.

  Returns:
    The sum over all subsets of
    (len(subset) / total_samples) * cal_entropy(subset).
  """
  accumulated = 0
  for group in subsets:
    group_size, group_entropy = len(group), cal_entropy(group)
    accumulated += (group_size / total_samples) * group_entropy
  return accumulated

def information_gain(total_entropy, subsets, total_samples):
    """
    Entropy reduction obtained by partitioning a dataset into subsets.

    Args:
        total_entropy: Entropy of the labels before the split.
        subsets: The label collections produced by the split.
        total_samples: The denominator passed on to weighted_entropy.

    Returns:
        total_entropy minus the weighted entropy of the subsets.
    """
    return total_entropy - weighted_entropy(subsets, total_samples)

# Toy dataset, one row per sample. Column 1 holds the 'Likes English' flag and
# column 3 holds the 'Raise Salary' target (these are the indices used below).
# NOTE(review): columns 0 and 2 look like age and 'Likes AI' — confirm.
data = [
    [23, 0, 0, 0],
    [25, 1, 1, 0],
    [27, 1, 0, 1],
    [29, 0, 1, 1],
    [29, 0, 0, 0]
]

# Extract the target variable (Raise Salary), stored in the last column
raise_salary = [row[3] for row in data]

# Calculate entropy of the entire dataset (before any split)
total_entropy = cal_entropy(raise_salary)

# Split the target labels by the value of 'Likes English' (column 1)
subset_0 = [row[3] for row in data if row[1] == 0]
subset_1 = [row[3] for row in data if row[1] == 1]

# Total number of samples; used as the weighting denominator for each subset
total_samples = len(raise_salary)

# Calculate Gain for 'Likes English': parent entropy minus weighted child entropy
gain_likes_english = information_gain(total_entropy, [subset_0, subset_1], total_samples)

# Output the result, rounded to four decimal places
print(f"Entropy of the entire dataset: {total_entropy:.4f}")
print(f"Entropy of 'Likes English = 0' subset: {cal_entropy(subset_0):.4f}")
print(f"Entropy of 'Likes English = 1' subset: {cal_entropy(subset_1):.4f}")
print(f"Information Gain when splitting by 'Likes English': {gain_likes_english:.4f}")

Entropy of the entire dataset: 0.9710
Entropy of 'Likes English = 0' subset: 0.9183
Entropy of 'Likes English = 1' subset: 1.0000
Information Gain when splitting by 'Likes English': 0.0200


In [9]:
import numpy as np

# Index-aligned feature/target arrays: position i in each array describes the
# same sample, which is what lets the boolean masks below select salaries.
ages = np.array([23, 25, 27, 29, 29])
likes_english = np.array([0, 1, 1, 0, 0])
likes_ai = np.array([0, 1, 0, 1, 0])
salaries = np.array([200, 400, 300, 500, 400])

def cal_sse(data, mean=None):
    """
    Compute the sum of squared errors (SSE) of values around a reference mean.

    Args:
        data: Array-like of numeric values (NumPy array, list, or tuple).
        mean: Value to measure deviations from. When omitted, the arithmetic
            mean of `data` is used.

    Returns:
        The sum of squared deviations. Returns 0.0 when `data` is empty and
        no mean is supplied.
    """
    # Accept plain Python sequences as well as arrays; lists do not support
    # element-wise subtraction on their own.
    values = np.asarray(data)
    if mean is None:
        if values.size == 0:
            # np.mean of an empty array is NaN (with a RuntimeWarning); an
            # empty group contributes no error.
            return 0.0
        mean = np.mean(values)
    return np.sum((values - mean) ** 2)

# SSE when 'Likes AI' is the root node:
# partition salaries by the likes_ai flag using boolean masks.
likes_ai_0 = salaries[likes_ai == 0]
likes_ai_1 = salaries[likes_ai == 1]

# Each branch uses its own mean salary as the reference value.
mean_likes_ai_0 = np.mean(likes_ai_0)
mean_likes_ai_1 = np.mean(likes_ai_1)

# Squared error of each branch around its own mean.
sse_likes_ai_0 = cal_sse(likes_ai_0, mean_likes_ai_0)
sse_likes_ai_1 = cal_sse(likes_ai_1, mean_likes_ai_1)

# The SSE of the split is the sum of the two branch SSEs.
total_sse_likes_ai = sse_likes_ai_0 + sse_likes_ai_1
print(f"SSE when 'Likes AI' is the root node: {total_sse_likes_ai}")

# SSE when 'Age' is the root node with condition 'Age <= 24':
# one branch holds ages <= 24, the other holds ages > 24.
age_24 = salaries[ages <= 24]
age_above_24 = salaries[ages > 24]

# Each branch uses its own mean salary as the reference value.
mean_age_24 = np.mean(age_24)
mean_age_above_24 = np.mean(age_above_24)

# Squared error of each branch around its own mean.
sse_age_24 = cal_sse(age_24, mean_age_24)
sse_age_above_24 = cal_sse(age_above_24, mean_age_above_24)

# The SSE of the split is the sum of the two branch SSEs.
total_sse_age = sse_age_24 + sse_age_above_24
print(f"SSE when 'Age <= 24' is the root node: {total_sse_age}")

SSE when 'Likes AI' is the root node: 25000.0
SSE when 'Age <= 24' is the root node: 20000.0
