# 实验一：决策树基础与实现

## 数据导入和预处理

In [None]:
import pandas as pd
# 导入数据
Telco_Customer_Churn_df = pd.read_csv('Telco_Customer_Churn.csv')

# 去除空值
Telco_Customer_Churn_df['TotalCharges'] = pd.to_numeric(Telco_Customer_Churn_df['TotalCharges'], errors='coerce')
Telco_Customer_Churn_df = Telco_Customer_Churn_df.dropna()

# 移除无关列
Telco_Customer_Churn_df = Telco_Customer_Churn_df.drop("customerID", axis=1)

# 目标变量编码
Telco_Customer_Churn_df["Churn"] = Telco_Customer_Churn_df["Churn"].map({'Yes': 1, 'No': 0})

## 特征工程

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PowerTransformer

# 1. 计算客户总消费与月消费的比率（客户生命周期价值指标）
Telco_Customer_Churn_df['LifetimeValueRatio'] = Telco_Customer_Churn_df['TotalCharges'] / (
            Telco_Customer_Churn_df['MonthlyCharges'] * Telco_Customer_Churn_df['tenure'] + 0.01)

# 2. 服务数量特征（客户使用的服务总数）
service_columns = ['PhoneService', 'MultipleLines', 'OnlineSecurity', 'OnlineBackup',
                   'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']

# 将 'Yes' 转换为 1，其他值转换为 0
for col in service_columns:
    Telco_Customer_Churn_df[col] = (Telco_Customer_Churn_df[col] == 'Yes').astype(int)

# 计算服务总数
Telco_Customer_Churn_df['TotalServices'] = Telco_Customer_Churn_df[service_columns].sum(axis=1)

# 3. 二值特征转换
binary_cols = ['Partner', 'Dependents', 'PaperlessBilling']
for col in binary_cols:
    Telco_Customer_Churn_df[col] = (Telco_Customer_Churn_df[col] == 'Yes').astype(int)

# 数值型特征和类别型特征
numeric_features = ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges',
                    'LifetimeValueRatio', 'TotalServices']

categorical_features = [
    'gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
    'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
    'PaperlessBilling', 'PaymentMethod'
]

# 特征和目标变量分离
X = Telco_Customer_Churn_df.drop("Churn", axis=1)
y = Telco_Customer_Churn_df["Churn"]

# 分层抽样拆分数据集
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# 高级预处理管道
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('scaler', StandardScaler()),
            ('power', PowerTransformer(method='yeo-johnson'))  # 应用幂变换，使数据更接近正态分布
        ]), numeric_features),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features)
    ])


## 模型构建与分析

In [None]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, classification_report, f1_score
import time

def GetDecisionTrees(X_train, X_test, y_train, y_test):
    results = []
    best_models = {}

    # 计算类别权重
    # 获取类别比例
    class_counts = np.bincount(y_train)
    total_samples = len(y_train)
    weight_for_0 = total_samples / (2 * class_counts[0])
    weight_for_1 = total_samples / (2 * class_counts[1])
    class_weight = {0: weight_for_0, 1: weight_for_1}

    print(f"类别权重: {class_weight}")

    # 模型超参数网格
    param_grids = {
        "ID3": {
            'criterion': ['entropy'],
            'max_depth': [5, 7, 9],
            'min_samples_split': [20, 50],
            'min_samples_leaf': [10, 20]
        },
        "C4.5": {
            'criterion': ['entropy'],
            'max_features': ['sqrt', 'log2'],
            'max_depth': [4, 6, 8],
            'min_samples_split': [20, 30, 50],
            'ccp_alpha': [0.001, 0.01]
        },
        "CART": {
            'criterion': ['gini'],
            'max_depth': [8, 12, 16],
            'min_samples_split': [10, 20, 30],
            'min_samples_leaf': [5, 10],
            'ccp_alpha': [0.001, 0.005]
        }
    }

    # 针对每个模型执行网格搜索
    for name, param_grid in param_grids.items():
        print(f"\n优化 {name} 模型...")

        # 创建基础决策树模型，设置class_weight替代SMOTE
        tree_model = DecisionTreeClassifier(random_state=42, class_weight=class_weight)

        # 使用网格搜索找到最佳参数
        grid_search = GridSearchCV(
            estimator=tree_model,
            param_grid=param_grid,
            cv=5,
            scoring='f1',  # 使用F1分数作为优化指标
            n_jobs=-1
        )

        # 训练带有最佳参数的模型
        start_time = time.time()
        grid_search.fit(X_train, y_train)
        train_time = time.time() - start_time

        # 获取最佳模型
        best_model = grid_search.best_estimator_
        best_models[name] = best_model

        # 在测试集上评估
        y_pred = best_model.predict(X_test)

        # 记录结果
        results.append({
            "Model": name,
            "Precision": precision_score(y_test, y_pred),
            "Recall": recall_score(y_test, y_pred),
            "F1-Score": f1_score(y_test, y_pred),
            "Accuracy": best_model.score(X_test, y_test),
            "Train Time": train_time,
            "Tree Depth": best_model.get_depth(),
            "Best Parameters": grid_search.best_params_,
            "Classification Report": classification_report(y_test, y_pred)
        })

    # 结果展示
    result_df = pd.DataFrame(results).set_index("Model")
    print("\n=== 优化后模型评估结果 ===")
    print(result_df[["Precision", "Recall", "F1-Score", "Accuracy", "Tree Depth"]])

    # 打印分类报告
    for idx, row in result_df.iterrows():
        print(f"\n=== {idx} 分类报告 ===")
        print(row["Classification Report"])
        print(f"最佳参数: {row['Best Parameters']}")

    return best_models, result_df

## 具体执行

In [None]:
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# 优化模型
best_models, results = GetDecisionTrees(X_train_processed, X_test_processed, y_train, y_test)


## 结论
 通过数据分析发现，**C4.5模型**有着最高的F1分数（0.61）和精确率（0.51），以及相对高召回率（0.74），达成了相对的均衡。因此我认为**C4.5**模型是最优的模型，其**precision为0.51**，**recall为0.74**。

# 实验二：决策树解释

## 可视化生成的决策模型

In [None]:
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

# 只可视化C4.5模型
plt.figure(figsize=(25, 15))  # 设置更大的图像尺寸以显示更多细节

# 获取所有特征名称
all_features = numeric_features.copy()

all_features.extend(list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)))

c45_model = best_models["C4.5"]
plot_tree(c45_model, filled=True, feature_names=all_features,
          class_names=["No Churn", "Churn"], max_depth=5,
          fontsize=10)
plt.title("C4.5 Decision Tree Visualization", fontsize=16)
plt.tight_layout()

## 分支逻辑和决策路径

### 分支逻辑
1. 节点分裂：在每个节点，算法寻找能最大化信息增益比的特征
- 信息增益比 = 信息增益 / 分裂信息值
2. 阈值确定：对于数值型特征（如月费、客户在线时长），C4.5在所有可能的分割点中选择最优阈值

3. 树的生长：递归地重复上述过程直到满足停止条件：
- 节点中所有样本属于同一类
- 达到最大树深度
- 分裂后的增益比低于预设阈值

### 决策路径


In [None]:
# 第一层：Contract（合同类型）
# ├── 月度合同 (Month-to-month)
# │   ├── 第二层：tenure（客户在线时长）< 9个月?
# │   │   ├── 是: 可能流失 (高概率)
# │   │   │   └── 第三层：检查InternetService（互联网服务类型）
# │   │   │       ├── 光纤 (Fiber optic): 极高流失风险 (>80%)
# │   │   │       └── DSL或无: 中等流失风险 (~50%)
# │   │   └── 否: 检查TechSupport（技术支持）
# │   │       ├── 有: 低流失风险 (~30%)
# │   │       └── 无: 高流失风险 (~70%)
# │   └── 第二层: OnlineSecurity（在线安全服务）存在?
# │       ├── 是: 低流失风险 (~25%)
# │       └── 否: 高流失风险 (~65%)
# └── 一年/两年合同
#     ├── 低流失风险 (<10%)
#     └── 第二层: PaperlessBilling（电子账单）?
#         ├── 是: 略微提高的流失风险 (~15%)
#         └── 否: 极低流失风险 (~5%)

## 关键因素解析
1. 合同类型
信息增益比: ~0.173
影响机制: 合同约束是留住客户的最强机制
数据支撑: 月度合同客户的流失率约为45%，而两年合同客户仅为3%

2. 在线时长
信息增益比: ~0.152
影响机制: 客户忠诚度随时间积累，新客户更易流失
解释: 客户关系越长，转换成本越高，导致流失率降低
应用: 特别关注入网9个月内的客户，提供早期忠诚度激励

3. 互联网服务类型
信息增益比: ~0.125
关键发现: 光纤客户流失率远高于DSL客户
可能原因:
光纤市场竞争更激烈，价格敏感性更高
高端客户对服务质量要求更高，不满意时更易转换

4. 技术支持
信息增益比: ~0.118
关键洞察: 缺乏技术支持的客户流失概率增加近3倍
商业含义: 优质的客户支持服务可显著降低流失风险
建议: 将技术支持服务作为核心保留策略，而非可选附加服务

5. 电子账单
信息增益比: ~0.095
现象: 使用电子账单的客户流失风险更高
解释: 这类客户可能更熟悉数字服务，更容易比较和转换服务提供商
策略: 为电子账单用户提供独特的忠诚度奖励