In [1]:
# AdaBoost和随机森林应用于宫颈癌风险因素数据集
# 该数据集包含约32个具有临床意义的风险因素特征

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
from sklearn.impute import SimpleImputer
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import warnings
# Suppress every warning category for the rest of the session. Note that this
# also hides library deprecation warnings.
warnings.filterwarnings('ignore')

# Seed NumPy's global random number generator.
np.random.seed(42)

In [None]:
# 1. Load the cervical cancer risk factors dataset
print("加载宫颈癌风险因素数据集...")

# Source: UCI Machine Learning Repository.
# The CSV is read as-is. Missing entries in some columns are encoded as the
# string '?'; they are intentionally NOT converted here so that the exploration
# cell can count them and the preprocessing cell can convert them explicitly.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00383/risk_factors_cervical_cancer.csv"
data = pd.read_csv(url)

# Quick look at the raw frame (rich display when run as a notebook cell).
data.head()

In [None]:
# 2. Data exploration
n_rows, n_cols = data.shape

print("\n数据集基本信息:")
print(f"原始样本数量: {n_rows}")
print(f"原始特征数量: {n_cols}")

# The four screening/diagnostic outcome columns; report how many rows are
# positive in each.
target_columns = ['Hinselmann', 'Schiller', 'Citology', 'Biopsy']
print("\n目标变量分布:")
for target_name in target_columns:
    n_positive = data[target_name].sum()
    print(f"{target_name}: 阳性 {n_positive} 样本 ({n_positive/n_rows:.2%})")

# Biopsy is used as the single prediction target for the rest of the notebook.
target_col = 'Biopsy'
print(f"\n选择 {target_col} 作为主要目标变量")

# Percentage of NaN entries per column, keeping only columns that have any,
# largest first.
print("\n缺失值概况:")
null_share = data.isnull().sum() / n_rows * 100
missing_percent = null_share.loc[null_share > 0].sort_values(ascending=False)
print(missing_percent.head(10))

# Text-typed columns may encode missing values as the literal '?'; count those.
print("\n检查'?'值:")
for column_name in data.columns:
    if data[column_name].dtype != object:
        continue
    q_count = (data[column_name] == '?').sum()
    if q_count > 0:
        print(f"{column_name}: {q_count} '?'值 ({q_count/n_rows:.2%})")


In [None]:
# 3. Data preprocessing
print("\n进行数据预处理...")

# Turn the '?' missing-value placeholder into a real NaN. Doing this
# explicitly (rather than relying on the coercion below) documents that '?'
# is the only value expected to be non-numeric.
data = data.replace('?', np.nan)

# Convert every column that is still text-typed to numeric. Anything that
# cannot be parsed becomes NaN (errors='coerce').
for col in data.columns:
    if data[col].dtype == object:
        data[col] = pd.to_numeric(data[col], errors='coerce')

# Column names grouped by clinical theme. These lists are reference material
# only: nothing else in this notebook reads them.
demographic_features = ['Age', 'Number of sexual partners', 'First sexual intercourse',
                        'Num of pregnancies', 'Smokes', 'Smokes (years)', 'Smokes (packs/year)']

contraceptive_features = ['Hormonal Contraceptives', 'Hormonal Contraceptives (years)',
                          'IUD', 'IUD (years)']

std_features = ['STDs', 'STDs (number)', 'STDs:condylomatosis', 'STDs:cervical condylomatosis',
               'STDs:vaginal condylomatosis', 'STDs:vulvo-perineal condylomatosis',
               'STDs:syphilis', 'STDs:pelvic inflammatory disease', 'STDs:genital herpes',
               'STDs:molluscum contagiosum', 'STDs:AIDS', 'STDs:HIV', 'STDs:Hepatitis B',
               'STDs:HPV']

other_medical_features = ['STDs: Time since first diagnosis', 'STDs: Time since last diagnosis',
                         'Dx:Cancer', 'Dx:CIN', 'Dx:HPV', 'Dx',
                         'Dx:Hinselmann', 'Dx:Schiller', 'Dx:Citology']

# Drop all four outcome columns from the feature matrix so that the other
# screening results cannot leak into the predictors; keep only the chosen
# target as y.
features = data.drop(target_columns, axis=1)
y = data[target_col]

# Missing-value handling
print("\n处理缺失值...")

# Median imputation for every column.
# NOTE(review): the imputer is fitted on ALL rows before the train/test split
# performed in a later cell, so test rows contribute to the medians. Consider
# fitting it on the training split only (e.g. inside a Pipeline).
imputer = SimpleImputer(strategy='median')
features_imputed = pd.DataFrame(
    imputer.fit_transform(features),
    columns=features.columns,
    index=features.index,  # keep the row labels so the boolean masks below align
)

# Override for STD-related columns: a missing value is treated as 0 instead of
# the median.
# NOTE(review): the substring test 'STD' also matches the
# 'STDs: Time since ... diagnosis' columns if they are present, so their
# missing values become 0 as well -- confirm that this is intended.
std_cols = [col for col in features.columns if 'STD' in col]
for col in std_cols:
    na_indices = features[col].isna()
    features_imputed.loc[na_indices, col] = 0

# Final feature matrix used by all models.
X = features_imputed

print(f"\n预处理后的特征数量: {X.shape[1]}")
print(f"预处理后的样本数量: {X.shape[0]}")

# List every feature the models will see.
print("\n模型将使用的特征列表:")
for i, col in enumerate(X.columns):
    print(f"{i+1}. {col}")


In [None]:
# 5. Train/test split
# Stratify on y so both splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# Standardise the features. The scaler is fitted on the training split only
# and then applied to the test split. The original row index is carried over
# so the scaled frames stay aligned with y_train / y_test in any later
# pandas-level operation.
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)

print(f"\n训练集样本数: {X_train.shape[0]}")
print(f"测试集样本数: {X_test.shape[0]}")

In [None]:
# 6. Baseline AdaBoost model
print("训练AdaBoost模型...")

# AdaBoost over depth-1 decision trees (decision stumps); the shallow depth
# limits over-fitting. The base learner is passed positionally because the
# keyword was renamed from `base_estimator` to `estimator` in scikit-learn 1.2,
# while the first positional slot is the base learner under either name.
ada_model = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=50,
    learning_rate=1.0,
    random_state=42,
)

# Fit the baseline model on the scaled training split.
ada_model.fit(X_train_scaled, y_train)

# Initial evaluation: accuracy on both splits as a quick over-fitting check.
ada_train_preds = ada_model.predict(X_train_scaled)
ada_test_preds = ada_model.predict(X_test_scaled)

print(f"初始AdaBoost模型训练集准确率: {accuracy_score(y_train, ada_train_preds):.4f}")
print(f"初始AdaBoost模型测试集准确率: {accuracy_score(y_test, ada_test_preds):.4f}")

In [None]:
# AdaBoost hyper-parameter tuning
print("\n进行AdaBoost超参数调优...")

# scikit-learn 1.2 renamed AdaBoostClassifier's `base_estimator` parameter to
# `estimator`, and 1.4 removed the old name. Pick whichever name the installed
# version exposes so both the constructor call and the nested grid key work.
ada_base_param = (
    'estimator' if 'estimator' in AdaBoostClassifier().get_params() else 'base_estimator'
)

ada_param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.5, 1.0, 1.5],
    f'{ada_base_param}__max_depth': [1, 2, 3],  # depth of each base tree (1 = stump)
}

ada_grid_search = GridSearchCV(
    estimator=AdaBoostClassifier(
        random_state=42,
        **{ada_base_param: DecisionTreeClassifier()},
    ),
    param_grid=ada_param_grid,
    scoring='roc_auc',  # ROC-AUC is used instead of accuracy because the classes are imbalanced
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    verbose=1,
    n_jobs=-1
)

ada_grid_result = ada_grid_search.fit(X_train_scaled, y_train)

print("\nAdaBoost最佳超参数:")
print(ada_grid_result.best_params_)
print(f"AdaBoost最佳交叉验证ROC-AUC: {ada_grid_result.best_score_:.4f}")

# Estimator refitted by GridSearchCV with the best parameter combination.
best_ada_model = ada_grid_result.best_estimator_


In [None]:
# 7. Baseline random forest model
print("训练随机森林模型...")

# Random forest with balanced class weights, matching the estimator used in
# the random-forest grid search so baseline and tuned results are comparable.
rf_model = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
)

# Fit the baseline model on the scaled training split.
rf_model.fit(X_train_scaled, y_train)

# Initial evaluation: accuracy on both splits as a quick over-fitting check.
rf_train_preds = rf_model.predict(X_train_scaled)
rf_test_preds = rf_model.predict(X_test_scaled)

print(f"初始随机森林模型训练集准确率: {accuracy_score(y_train, rf_train_preds):.4f}")
print(f"初始随机森林模型测试集准确率: {accuracy_score(y_test, rf_test_preds):.4f}")

In [None]:
# Random forest hyper-parameter tuning
print("\n进行随机森林超参数调优...")

# Search space: 3 values for each of 4 parameters.
rf_param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Pieces of the search, named individually for readability.
rf_search_estimator = RandomForestClassifier(class_weight='balanced', random_state=42)
rf_search_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

rf_grid_search = GridSearchCV(
    estimator=rf_search_estimator,
    param_grid=rf_param_grid,
    cv=rf_search_cv,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1,
)

# fit() returns the search object itself, so both names refer to the same
# fitted search.
rf_grid_search.fit(X_train_scaled, y_train)
rf_grid_result = rf_grid_search

print("\n随机森林最佳超参数:")
print(rf_grid_result.best_params_)
print(f"随机森林最佳交叉验证ROC-AUC: {rf_grid_result.best_score_:.4f}")

# Estimator refitted by GridSearchCV with the best parameter combination.
best_rf_model = rf_grid_result.best_estimator_

In [None]:
# 8. Side-by-side evaluation of the tuned models on the test split
print("模型对比评估")

models = {
    'AdaBoost': best_ada_model,
    'Random Forest': best_rf_model
}

# Per-model metrics, ROC points and predictions; the plotting cell reads this.
results = {}

for name, model in models.items():
    print(f"\n{name} 模型评估:")
    print("-" * 40)

    # Hard class predictions, plus the predicted probability taken from
    # column 1 of predict_proba (the second entry of model.classes_; assumed
    # to be the positive class for 0/1 labels).
    y_pred = model.predict(X_test_scaled)
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

    # Threshold-based metrics computed from the hard predictions.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # ROC curve and its area, computed from the probabilities.
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(fpr, tpr)

    print(f"准确率: {accuracy:.4f}")
    print(f"精确率: {precision:.4f}")
    print(f"召回率: {recall:.4f}")
    print(f"F1分数: {f1:.4f}")
    print(f"AUC: {roc_auc:.4f}")

    # Store everything needed for the comparison plots.
    results[name] = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'auc': roc_auc,
        'fpr': fpr,
        'tpr': tpr,
        'y_pred': y_pred,
        'y_pred_proba': y_pred_proba
    }

    # Confusion matrix (rows: true class, columns: predicted class).
    cm = confusion_matrix(y_test, y_pred)
    print(f"\n混淆矩阵:")
    print(cm)

    print(f"\n分类报告:")
    print(classification_report(y_test, y_pred))

In [None]:
# 9. Feature importance analysis
print("特征重要性分析")


def _ranked_importances(fitted_model):
    """Return a frame of (feature, importance) rows, most important first.

    Feature names come from the notebook-level ``X``; importances come from
    the model's ``feature_importances_`` attribute.
    """
    table = pd.DataFrame({
        'feature': X.columns,
        'importance': fitted_model.feature_importances_,
    })
    return table.sort_values('importance', ascending=False)


# AdaBoost: ten highest-ranked features.
print("\nAdaBoost前10个最重要特征:")
ada_feature_importance = _ranked_importances(best_ada_model)
print(ada_feature_importance.head(10))

# Random forest: ten highest-ranked features.
print("\n随机森林前10个最重要特征:")
rf_feature_importance = _ranked_importances(best_rf_model)
print(rf_feature_importance.head(10))

In [None]:
# 10. Visualisation: one 2x3 figure comparing the two tuned models.
# Panel order (row-major): ROC curves, metric bars, AdaBoost importances,
# random forest importances, AdaBoost confusion matrix, random forest
# confusion matrix.
fig, axes = plt.subplots(2, 3, figsize=(15, 12))
ax_roc, ax_metrics, ax_ada_imp, ax_rf_imp, ax_ada_cm, ax_rf_cm = axes.ravel()

# ROC curves of both models, with the chance diagonal for reference.
for name, result in results.items():
    ax_roc.plot(result['fpr'], result['tpr'], lw=2,
                label=f'{name} (AUC = {result["auc"]:.4f})')
ax_roc.plot([0, 1], [0, 1], 'k--', lw=2)
ax_roc.set_xlim(0.0, 1.0)
ax_roc.set_ylim(0.0, 1.05)
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('ROC Curves Comparison')
ax_roc.legend(loc="lower right")

# Grouped bars: one pair of bars per metric.
metrics = ['accuracy', 'precision', 'recall', 'f1', 'auc']
ada_scores = [results['AdaBoost'][metric] for metric in metrics]
rf_scores = [results['Random Forest'][metric] for metric in metrics]

x = np.arange(len(metrics))
width = 0.35

ax_metrics.bar(x - width/2, ada_scores, width, label='AdaBoost', alpha=0.8)
ax_metrics.bar(x + width/2, rf_scores, width, label='Random Forest', alpha=0.8)
ax_metrics.set_xlabel('Metrics')
ax_metrics.set_ylabel('Scores')
ax_metrics.set_title('Performance Metrics Comparison')
ax_metrics.set_xticks(x)
ax_metrics.set_xticklabels(metrics, rotation=45)
ax_metrics.legend()
ax_metrics.set_ylim(0, 1)

# Top-10 feature importances, one horizontal bar chart per model. The y axis
# is inverted so the most important feature is drawn at the top.
for ax, importance_table, panel_title in (
    (ax_ada_imp, ada_feature_importance, 'AdaBoost Top 10 Feature Importance'),
    (ax_rf_imp, rf_feature_importance, 'Random Forest Top 10 Feature Importance'),
):
    top_features = importance_table.head(10)
    positions = list(range(len(top_features)))
    ax.barh(positions, top_features['importance'])
    ax.set_yticks(positions)
    ax.set_yticklabels(top_features['feature'])
    ax.set_xlabel('Importance')
    ax.set_title(panel_title)
    ax.invert_yaxis()

# Confusion-matrix heatmaps built from the test-set predictions stored in
# `results` (rows: true class, columns: predicted class).
for ax, model_name in ((ax_ada_cm, 'AdaBoost'), (ax_rf_cm, 'Random Forest')):
    model_cm = confusion_matrix(y_test, results[model_name]['y_pred'])
    sns.heatmap(model_cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')
    ax.set_title(f'{model_name} Confusion Matrix')

fig.tight_layout()
fig.savefig('cervical_cancer_models_comparison.png', dpi=300, bbox_inches='tight')
plt.show()