# 客户转化模型 - 执行流程

这个笔记本提供了执行 `main.py` 中各个功能模块的交互式界面，包括：

1. **检查特征文件** - 检查特征文件中是否存在重复特征或其他问题
2. **合并特征文件** - 合并多个特征文件
3. **训练模型** - 训练客户转化模型
4. **超参数调优** - 优化模型参数
5. **部署模型** - 部署模型并进行预测

In [None]:
# 环境设置
import os
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Dict, Optional, Union, Any

# Put the parent of the current working directory on the import path so the
# `src.*` project imports can be resolved.
# NOTE(review): this assumes the notebook is started from a directory that
# sits directly under the project root - confirm for your setup.
notebook_dir = Path.cwd()
project_dir = notebook_dir.parent
# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if str(project_dir) not in sys.path:
    sys.path.append(str(project_dir))

# 导入项目模块
from src.core.preprocessing import (
    check_feature_files,
    merge_feature_files,
    preprocess_data,
)
from src.models.deployment import batch_prediction, deploy_model, load_feature_list
from src.models.hyperopt_tuning import hyperopt_xgb, plot_optimization_results
from src.models.training import two_stage_modeling_pipeline

# Global plotting defaults: matplotlib style sheet, seaborn theme, then the
# default figure size and font size.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_theme(style="whitegrid")
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 12

In [None]:
# 显示项目数据目录结构
def list_directory(path, indent=0):
    """Print a tree view of the directory at ``path``.

    Directories are printed before files, each group ordered by name, and
    entries whose name starts with a dot are left out. A sub-directory is
    expanded only while ``indent`` is below 2. Files are printed with their
    size, in MB when larger than 1 MB and in KB otherwise.

    Args:
        path: Directory to list (anything ``Path`` accepts).
        indent: Current nesting level; each level adds four spaces of padding.
    """
    root = Path(path)
    if not root.exists():
        print(f"路径不存在: {root}")
        return

    pad = "    " * indent
    visible = (entry for entry in root.iterdir() if not entry.name.startswith('.'))
    for entry in sorted(visible, key=lambda e: (not e.is_dir(), e.name)):
        if entry.is_dir():
            print(f"{pad}📁 {entry.name}/")
            # Only descend while we are less than two levels deep.
            if indent < 2:
                list_directory(entry, indent + 1)
            continue

        byte_count = entry.stat().st_size
        megabytes = byte_count / (1024 * 1024)
        label = f"{megabytes:.1f}MB" if megabytes > 1 else f"{byte_count / 1024:.1f}KB"
        print(f"{pad}📄 {entry.name} ({label})")

# Print the directory tree starting at project_dir.
print("项目目录结构:")
list_directory(project_dir)

## 1. 检查特征文件

检查特征文件是否有重复特征或其他问题。

In [None]:
def run_check_features(feature_files=None):
    """Run ``check_feature_files`` on a set of feature files and report the outcome.

    Args:
        feature_files: Paths of the feature files to check. When ``None``,
            every ``*.csv`` file directly inside ``project_dir / "data"`` is
            used.

    Returns:
        The value returned by ``check_feature_files``, or ``None`` when the
        data directory is missing or there are no files to check.
    """
    if feature_files is None:
        # No explicit list: fall back to every CSV file in the data directory.
        data_dir = project_dir / "data"
        if not data_dir.exists():
            print(f"数据目录不存在: {data_dir}")
            return None

        # glob() yields entries in file-system order; sort so the files are
        # checked (and reported) in the same order on every machine.
        feature_files = sorted(str(csv_path) for csv_path in data_dir.glob("*.csv"))
        print(f"找到 {len(feature_files)} 个特征文件:")
        for file_name in feature_files:
            print(f"  - {file_name}")

    if not feature_files:
        print("没有找到特征文件")
        return None

    print(f"检查 {len(feature_files)} 个特征文件是否有重复...")
    results = check_feature_files(feature_files)

    if results['status'] == 'success':
        print("✅ 没有发现问题")
    else:
        print(f"⚠️ 发现 {len(results['issues'])} 个问题:")
        for issue in results['issues']:
            print(f"  - {issue}")

    return results

In [None]:
# Run the feature-file check.
# To check specific files, replace the None below with a list of paths.
feature_files = None  # e.g. ["data/feature1.csv", "data/feature2.csv"]
check_results = run_check_features(feature_files)

## 2. 合并特征文件

将多个特征文件合并为一个数据集。

In [None]:
def run_merge_features(feature_files=None, sample_file=None, output_file=None):
    """Merge feature files with ``merge_feature_files`` and save the result as CSV.

    Args:
        feature_files: Paths of the feature files to merge. When ``None``,
            every ``*.csv`` file directly inside ``project_dir / "data"`` is
            used.
        sample_file: Passed through unchanged as the second argument of
            ``merge_feature_files``.
        output_file: Destination CSV path. Defaults to
            ``project_dir / "merged_features.csv"``.

    Returns:
        The merged frame returned by ``merge_feature_files``, or ``None`` when
        the data directory is missing or there are no files to merge.
    """
    if feature_files is None:
        # No explicit list: fall back to every CSV file in the data directory.
        data_dir = project_dir / "data"
        if not data_dir.exists():
            print(f"数据目录不存在: {data_dir}")
            return None

        # glob() yields entries in file-system order; sort so the merge order
        # (and therefore the merged output) is reproducible across machines.
        feature_files = sorted(str(csv_path) for csv_path in data_dir.glob("*.csv"))
        print(f"找到 {len(feature_files)} 个特征文件:")
        for file_name in feature_files:
            print(f"  - {file_name}")

    if not feature_files:
        print("没有找到特征文件")
        return None

    print(f"合并 {len(feature_files)} 个特征文件...")
    merged_df = merge_feature_files(feature_files, sample_file)

    # Persist the merged data set.
    if output_file is None:
        output_file = str(project_dir / "merged_features.csv")

    merged_df.to_csv(output_file, index=False)
    print(f"✅ 已将合并后的数据集保存到 {output_file}")
    print(f"数据集大小: {len(merged_df)} 行 x {len(merged_df.columns)} 列")

    # Show a short preview (display is provided by the notebook environment).
    print("\n数据集概览:")
    display(merged_df.head())

    return merged_df

In [None]:
# Run the feature-file merge.
# To use specific files, change the parameters below.
feature_files = None  # e.g. ["data/feature1.csv", "data/feature2.csv"]
sample_file = None    # e.g. "data/sample.csv"
output_file = None    # e.g. "merged_data.csv"

merged_df = run_merge_features(feature_files, sample_file, output_file)

## 3. 训练模型

训练客户转化模型。

In [None]:
def run_train_model(data_file=None, target='label_apply', resume_from=None):
    """Load a data set, preprocess it and run ``two_stage_modeling_pipeline``.

    Args:
        data_file: Path (``str`` or path-like) of a ``.csv`` or ``.parquet``
            file; the extension is matched case-insensitively. When ``None``,
            ``project_dir / "merged_features.csv"`` is used.
        target: Name of the target column; it must be present in the data.
        resume_from: Passed through unchanged to
            ``two_stage_modeling_pipeline``.

    Returns:
        The value returned by ``two_stage_modeling_pipeline``, or ``None``
        when the data file is missing, has an unsupported extension, or does
        not contain ``target``.
    """
    if data_file is None:
        # No explicit file: fall back to the merged feature file.
        default_data = project_dir / "merged_features.csv"
        if not default_data.exists():
            print(f"数据文件不存在: {default_data}")
            print("请先合并特征文件或指定数据文件路径")
            return None
        data_file = default_data

    # Normalise to str so pathlib.Path arguments work with the suffix checks.
    data_file = str(data_file)
    print(f"正在从 {data_file} 加载数据...")

    # Pick the reader from the (case-insensitive) file extension.
    lowered_name = data_file.lower()
    if lowered_name.endswith('.csv'):
        df = pd.read_csv(data_file)
    elif lowered_name.endswith('.parquet'):
        df = pd.read_parquet(data_file)
    else:
        print(f"错误: 不支持的文件格式: {data_file}")
        return None

    print(f"数据加载完成，大小: {df.shape}")

    # The target column has to exist before anything else is attempted.
    if target not in df.columns:
        print(f"错误: 目标变量 '{target}' 不在数据集中")
        print(f"可用列: {', '.join(df.columns[:10])}...")
        return None

    # Preprocess and split the data.
    print("预处理数据...")
    train_df, test_df = preprocess_data(df, target=target)

    print(f"训练集大小: {train_df.shape}")
    print(f"测试集大小: {test_df.shape}")

    # Run the two-stage modelling pipeline.
    print("开始训练模型...")
    results = two_stage_modeling_pipeline(
        train_df,
        test_df,
        target=target,
        resume_from=resume_from,
    )

    print("✅ 训练完成")

    # Print the metrics when the pipeline returned them.
    if isinstance(results, dict) and 'metrics' in results:
        print("\n模型评估指标:")
        for metric, value in results['metrics'].items():
            print(f"  - {metric}: {value}")

    return results

In [None]:
# Run model training.
# The parameters below can be changed.
data_file = None  # e.g. "merged_features.csv"
target = 'label_apply'  # name of the target column
resume_from = None  # optional: 'start', 'initial_model', 'feature_analysis', 'feature_selection', 'final_model'

training_results = run_train_model(data_file, target, resume_from)

## 4. 超参数调优

对模型进行超参数优化。

In [None]:
def run_hyperparameter_tuning(data_file=None, target='label_apply', max_evals=50):
    """Run ``hyperopt_xgb`` on the selected features and report the best parameters.

    The feature list is read from
    ``project_dir / "funnel_models" / "selected_features.txt"``, which must
    already exist.

    Args:
        data_file: Path (``str`` or path-like) of a ``.csv`` or ``.parquet``
            file to load and preprocess; the extension is matched
            case-insensitively. When falsy, ``train.csv`` and ``test.csv``
            from ``project_dir / "funnel_models"`` are loaded instead.
        target: Name of the target column.
        max_evals: Passed to ``hyperopt_xgb`` as ``max_evals``.

    Returns:
        The value returned by ``hyperopt_xgb``, or ``None`` when a required
        input file is missing or ``data_file`` has an unsupported extension.
    """
    model_dir = project_dir / "funnel_models"

    # The selected-feature list has to exist before tuning can start.
    features_file = model_dir / "selected_features.txt"
    if not features_file.exists():
        print(f"错误: 特征列表文件不存在: {features_file}")
        print("请先运行训练模式")
        return None

    if data_file:
        # Normalise to str so pathlib.Path arguments work with the suffix checks.
        data_file = str(data_file)
        print(f"从 {data_file} 加载数据...")

        lowered_name = data_file.lower()
        if lowered_name.endswith('.csv'):
            df = pd.read_csv(data_file)
        elif lowered_name.endswith('.parquet'):
            df = pd.read_parquet(data_file)
        else:
            print(f"错误: 不支持的文件格式: {data_file}")
            return None

        print(f"数据加载完成，大小: {df.shape}")

        # Preprocess and split the data.
        print("预处理数据...")
        train_df, test_df = preprocess_data(df, target=target)
    else:
        print("尝试从 funnel_models 目录加载已处理的数据...")

        train_file = model_dir / "train.csv"
        test_file = model_dir / "test.csv"
        if not train_file.exists() or not test_file.exists():
            print(f"错误: 训练数据文件不存在: {train_file} 或 {test_file}")
            print("请指定数据文件路径或先运行训练模式")
            return None

        train_df = pd.read_csv(train_file)
        test_df = pd.read_csv(test_file)

        print(f"数据加载完成，训练集: {train_df.shape}, 测试集: {test_df.shape}")

    # Load the feature list.
    features = load_feature_list(str(features_file))
    print(f"从 {features_file} 加载了 {len(features)} 个特征")

    # Run the hyper-parameter optimisation.
    print(f"开始超参数优化 (最大评估次数: {max_evals})...")
    results = hyperopt_xgb(
        train_df,
        test_df,
        features,
        target=target,
        max_evals=max_evals,
    )

    # Plot the optimisation results.
    timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
    results_dir = project_dir / "optimization_results"
    results_dir.mkdir(exist_ok=True)

    # NOTE(review): this path is built from a timestamp taken *after* the
    # optimisation and nothing in this function writes the file - confirm
    # that plot_optimization_results is meant to receive this path.
    results_file = results_dir / f"{target}_{timestamp}_results.json"
    print(f"绘制优化结果图表，保存到 {results_file}...")
    try:
        plot_optimization_results(str(results_file))
    except Exception as exc:
        # Plotting is best-effort: a failure here must not throw away the
        # results of the optimisation that has already completed.
        print(f"⚠️ 绘制优化结果图表失败: {exc}")

    print("✅ 超参数调优完成")

    # Print the best parameters when the optimiser returned them.
    if isinstance(results, dict) and 'best_params' in results:
        print("\n最佳参数:")
        for param, value in results['best_params'].items():
            print(f"  - {param}: {value}")

        if 'best_score' in results:
            print(f"\n最佳得分: {results['best_score']}")

    return results

In [None]:
# Run hyper-parameter tuning.
# The parameters below can be changed.
data_file = None  # when not set, the train/test data in funnel_models is used
target = 'label_apply'  # name of the target column
max_evals = 50  # maximum number of evaluations; adjust as needed

tuning_results = run_hyperparameter_tuning(data_file, target, max_evals)

## 5. 部署模型

部署模型并进行预测。

In [None]:
def run_model_deployment(model_file=None, data_file=None, target=None, features_file=None,
                         key_column='input_key', output_file=None, threshold=0.5):
    """Apply a saved model to a data set, either for evaluation or batch prediction.

    When ``target`` is given and present in the data, ``deploy_model`` is
    called; otherwise ``batch_prediction`` is called and its output path is
    reported.

    Args:
        model_file: Path of the model file. When ``None``,
            ``funnel_models/final_model.pkl`` under ``project_dir`` is used if
            it exists, otherwise the most recently modified ``*.pkl`` in
            ``project_dir / "tuned_models"``.
        data_file: Path (``str`` or path-like) of a ``.csv`` or ``.parquet``
            file; the extension is matched case-insensitively. Defaults to
            ``project_dir / "merged_features.csv"``.
        target: Name of the target column, or ``None``.
        features_file: Path of the feature-list file. Defaults to
            ``funnel_models/selected_features.txt`` under ``project_dir``.
        key_column: Passed to ``batch_prediction`` as ``key_column``.
        output_file: Passed to ``batch_prediction`` as ``output_file``.
            Defaults to a timestamped CSV in
            ``project_dir / "deployment_results"``.
        threshold: Passed to ``deploy_model`` / ``batch_prediction``.

    Returns:
        The value returned by ``deploy_model`` or ``batch_prediction``, or
        ``None`` when a required file is missing or ``data_file`` has an
        unsupported extension.
    """
    # Resolve the model file.
    if model_file is None:
        final_model = project_dir / "funnel_models" / "final_model.pkl"
        if final_model.exists():
            model_file = str(final_model)
        else:
            # Fall back to the newest model in the tuned_models directory.
            tuned_dir = project_dir / "tuned_models"
            candidates = list(tuned_dir.glob("*.pkl")) if tuned_dir.exists() else []
            if candidates:
                model_file = str(max(candidates, key=lambda p: p.stat().st_mtime))

    if model_file is None or not os.path.exists(model_file):
        print("错误: 模型文件不存在")
        print("请先训练模型或指定模型文件路径")
        return None

    print(f"使用模型: {model_file}")

    # Resolve the data file.
    if data_file is None:
        default_data = project_dir / "merged_features.csv"
        if not default_data.exists():
            print("错误: 数据文件不存在")
            print("请指定数据文件路径")
            return None
        data_file = default_data

    # Normalise to str so pathlib.Path arguments work with the suffix checks.
    data_file = str(data_file)
    print(f"从 {data_file} 加载数据...")

    lowered_name = data_file.lower()
    if lowered_name.endswith('.csv'):
        df = pd.read_csv(data_file)
    elif lowered_name.endswith('.parquet'):
        df = pd.read_parquet(data_file)
    else:
        print(f"错误: 不支持的文件格式: {data_file}")
        return None

    print(f"数据加载完成，大小: {df.shape}")

    # Resolve the feature-list file.
    if features_file is None:
        default_features = project_dir / "funnel_models" / "selected_features.txt"
        if not default_features.exists():
            print("错误: 特征列表文件不存在")
            print("请指定特征列表文件路径")
            return None
        features_file = str(default_features)

    if target and target in df.columns:
        # The target column is available: evaluate the model against it.
        print(f"部署模型并对 {target} 进行评估...")

        results = deploy_model(
            model_file,
            df,
            target=target,
            features_file=features_file,
            threshold=threshold,
        )

        if isinstance(results, dict) and 'metrics' in results:
            print("\n模型评估指标:")
            for metric, value in results['metrics'].items():
                print(f"  - {metric}: {value}")
    else:
        print("部署模型进行批量预测...")

        if output_file is None:
            # Default to a timestamped file in the deployment_results directory.
            deploy_dir = project_dir / "deployment_results"
            deploy_dir.mkdir(exist_ok=True)
            timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
            output_file = str(deploy_dir / f"predictions_{timestamp}.csv")

        results = batch_prediction(
            model_file,
            df,
            key_column=key_column,
            features_file=features_file,
            output_file=output_file,
            threshold=threshold,
        )

        print(f"✅ 预测完成，结果保存到 {output_file}")

        if isinstance(results, pd.DataFrame):
            print("\n预测结果预览:")
            display(results.head())

            # Share of positive predictions; skipped for an empty frame so
            # the ratio never divides by zero.
            total = len(results)
            if 'prediction' in results.columns and total:
                pos_count = results['prediction'].sum()
                print(f"\n预测为正例的样本: {pos_count} ({pos_count/total:.2%})")

    print("✅ 部署完成")
    return results

In [None]:
# Run model deployment.
# The parameters below can be changed.
model_file = None  # e.g. "funnel_models/final_model.pkl"
data_file = None   # e.g. "merged_features.csv" or "new_data.csv"
target = 'label_apply'  # if the data contains the target column, the model can be evaluated
features_file = None  # e.g. "funnel_models/selected_features.txt"
key_column = 'input_key'  # key column used for batch prediction
output_file = None  # output file for the predictions
threshold = 0.5  # classification threshold

deployment_results = run_model_deployment(
    model_file, data_file, target, features_file, 
    key_column, output_file, threshold
)

## 综合工作流演示

以下展示一个完整的工作流程示例，从特征合并到模型部署。

In [None]:
# 完整流程示例
def run_complete_workflow(target='label_apply'):
    """Run the four workflow steps in sequence.

    Steps: merge the feature files, train the model, tune hyper-parameters
    (with ``max_evals=10``) and deploy the model. The workflow stops and
    returns ``None`` when merging or training returns ``None``; a tuning
    result of ``None`` is reported but does not stop the workflow.

    Args:
        target: Name of the target column, passed to the training, tuning
            and deployment steps.

    Returns:
        A dict with the step results under the keys ``'training'``,
        ``'tuning'`` and ``'deployment'``, or ``None`` when the workflow
        stopped early.
    """
    banner = "======================"

    print(banner)
    print("🚀 启动完整工作流程")
    print(banner)

    # Step 1: merge the feature files.
    print("\n📁 步骤1: 合并特征文件")
    merged = run_merge_features()
    if merged is None:
        print("❌ 特征合并失败，工作流程终止")
        return None

    # Step 2: train the model.
    print("\n🔧 步骤2: 训练模型")
    trained = run_train_model(target=target)
    if trained is None:
        print("❌ 模型训练失败，工作流程终止")
        return None

    # Step 3: hyper-parameter tuning, with a reduced evaluation budget.
    print("\n⚙️ 步骤3: 超参数调优")
    tuned = run_hyperparameter_tuning(target=target, max_evals=10)
    if tuned is None:
        print("❌ 超参数调优失败，工作流程继续但使用原始模型")

    # Step 4: deploy the model.
    print("\n🚀 步骤4: 模型部署")
    deployed = run_model_deployment(target=target)

    print("\n" + banner)
    print("✅ 工作流程完成")
    print(banner)

    return {
        'training': trained,
        'tuning': tuned,
        'deployment': deployed,
    }

# 取消下面的注释来运行完整工作流程
# workflow_results = run_complete_workflow()