In [3]:
import numpy as np
import pandas as pd
from collections import Counter
from math import log2

# 数据集
# Toy data set: one row per customer, given as (income level, married?, purchased?).
data = pd.DataFrame(
    [
        ('高', '是', 1),
        ('中', '否', 0),
        ('低', '是', 1),
        ('高', '是', 0),
        ('中', '否', 1),
    ],
    columns=['收入', '是否已婚', '是否购买'],
)

In [4]:
# 计算信息熵
def entropy(data, target):
    """Return the base-2 Shannon entropy of column ``target`` in ``data``.

    Args:
        data: Table supporting ``len(data)`` and ``data[target]``
            (a pandas DataFrame in this notebook).
        target: Name of the column whose value distribution is measured.

    Returns:
        The entropy in bits. An empty table yields ``0``.
    """
    n_rows = len(data)
    class_sizes = Counter(data[target]).values()
    # Relative frequency of every distinct value in the column.
    shares = (size / n_rows for size in class_sizes)
    # H = sum over classes of -p * log2(p)
    return sum(-share * log2(share) for share in shares)

In [5]:
# 计算信息增益  
def information_gain(data, feature, target):
    """Return the entropy reduction obtained by splitting ``data`` on ``feature``.

    Args:
        data: pandas DataFrame holding both ``feature`` and ``target`` columns.
        feature: Name of the column used to partition the rows.
        target: Name of the column whose entropy is measured.

    Returns:
        Entropy of ``target`` over all rows minus the size-weighted entropy of
        ``target`` within each partition induced by the distinct feature values.
    """
    n_rows = len(data)
    parent_entropy = entropy(data, target)
    # One partition per distinct feature value, in order of first appearance.
    partitions = (
        data.loc[data[feature] == level] for level in data[feature].unique()
    )
    # Each partition's entropy is weighted by the share of rows it holds.
    children_entropy = sum(
        len(part) / n_rows * entropy(part, target) for part in partitions
    )
    return parent_entropy - children_entropy

In [6]:
# 计算基尼指数
def gini_index(data, target):
    """Return the Gini impurity of column ``target`` in ``data``.

    Args:
        data: Table supporting ``len(data)`` and ``data[target]``
            (a pandas DataFrame in this notebook).
        target: Name of the column whose value distribution is measured.

    Returns:
        ``1 - sum(p_k ** 2)`` over the distinct values of the column.
        An empty table yields ``1`` because the sum of squares is empty.
    """
    n_rows = len(data)
    squared_share_total = 0
    for class_size in Counter(data[target]).values():
        share = class_size / n_rows
        squared_share_total += share ** 2
    return 1 - squared_share_total


In [7]:
# 计算基尼增益
def gini_gain(data, feature, target):
    """Return the Gini-impurity reduction obtained by splitting on ``feature``.

    Args:
        data: pandas DataFrame holding both ``feature`` and ``target`` columns.
        feature: Name of the column used to partition the rows.
        target: Name of the column whose impurity is measured.

    Returns:
        Gini impurity of ``target`` over all rows minus the size-weighted Gini
        impurity of ``target`` within each partition induced by the distinct
        feature values.
    """
    n_rows = len(data)
    parent_gini = gini_index(data, target)
    children_gini = 0
    for level in data[feature].unique():
        part = data.loc[data[feature] == level]
        # Weight each partition by the share of rows it holds.
        weight = len(part) / n_rows
        children_gini += weight * gini_index(part, target)
    return parent_gini - children_gini

In [8]:
# 计算信息增益率
def information_gain_ratio(data, feature, target):
    """Return the information gain of ``feature`` normalised by its own entropy.

    Args:
        data: pandas DataFrame holding both ``feature`` and ``target`` columns.
        feature: Name of the column used to partition the rows.
        target: Name of the column whose entropy is measured.

    Returns:
        ``information_gain / entropy(feature)``, or ``0`` when the feature
        column has zero entropy (which would otherwise divide by zero).
    """
    gain = information_gain(data, feature, target)
    # Entropy of the feature column itself (the "split information").
    split_info = entropy(data, feature)
    return gain / split_info if split_info != 0 else 0

In [9]:
# 特征选择
# Column holding the class label to predict.
target = '是否购买'
# Candidate columns to evaluate as split features.
features = ['收入', '是否已婚']

In [10]:
# 计算每个特征的指标
# Metric names in the order they should appear as report columns.
metric_names = ['信息增益', '基尼增益', '信息增益率']

# Score every candidate feature with each split criterion.
results = {}
for feature in features:
    results[feature] = {
        '信息增益': information_gain(data, feature, target),
        '基尼增益': gini_gain(data, feature, target),
        '信息增益率': information_gain_ratio(data, feature, target),
    }

# One row per feature, one column per metric.
results_df = pd.DataFrame(results).T
# Select the columns by label instead of overwriting the labels positionally:
# a positional overwrite would silently mislabel the metrics if the column
# order of the transposed frame ever differed from the list above.
results_df = results_df[metric_names]
results_df = results_df.sort_values(by='信息增益', ascending=False)
print("特征选择结果：")
print(results_df)

# The best feature is the one with the highest information gain (first row
# after the descending sort).
best_feature = results_df.index[0]
print(f"最佳特征: {best_feature}")

特征选择结果：
          信息增益      基尼增益     信息增益率
收入    0.170951  0.080000  0.112325
是否已婚  0.019973  0.013333  0.020571
最佳特征: 收入


In [None]:
# 计算决策树深度
def calculate_tree_depth(data, target):
    """Return the maximum depth of a decision tree grown exhaustively on ``data``.

    At every node each remaining feature is tried as the split, and the result
    is the deepest branch over all of those choices. A node becomes a leaf
    (depth 0) when it has no rows, when all its rows share one target value,
    or when no feature can partition its rows any further.

    Every feature is explored at every node, so the cost grows quickly with
    the number of features; this is intended for small tables only.

    Args:
        data: pandas DataFrame containing the feature columns and ``target``.
        target: Name of the class-label column. It may sit at any position.

    Returns:
        The depth as a non-negative integer.
    """
    # Leaf: nothing to split.
    if len(data) == 0:
        return 0
    # Leaf: the node is already pure.
    if len(data[target].unique()) == 1:
        return 0

    depths = []
    for feature in data.columns:
        # Exclude the label by name rather than assuming it is the last column.
        if feature == target:
            continue
        feature_values = data[feature].unique()
        # A feature with a single value cannot partition the rows. Splitting
        # on it would recurse on the identical subset forever, which is what
        # happens with any feature that has already been used higher up.
        if len(feature_values) < 2:
            continue
        for value in feature_values:
            subset = data[data[feature] == value]
            depths.append(calculate_tree_depth(subset, target))

    # Leaf: impure, but no feature is left that can separate the rows.
    if not depths:
        return 0
    return max(depths) + 1