In [None]:
# !pip install C:\Users\archi\Downloads\TA_Lib-0.4.28-cp310-cp310-win_amd64.whl
# !pip install pyyaml
# !pip install psutil

In [None]:
# 系統和路徑相關
import os
import sys
import warnings
import gc
from pathlib import Path

# 設置專案路徑
project_path = r'C:\Users\archi\Python\Project\tw_stock_analysis'
sys.path.append(project_path)

# 日期和時間
import time
from datetime import datetime

# 數據處理和分析
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import talib

# 並行處理
from concurrent.futures import ThreadPoolExecutor, as_completed

# 類型提示
from typing import Dict, List, Tuple, Optional, Any

# 數據類
from dataclasses import dataclass, field

# 工具函數
import logging
import traceback
import shutil
import json
import psutil
from tqdm import tqdm

# 專案特定導入
from utils.config_utils import ConfigLoader, config_to_yaml
import concurrent.futures

# 關閉警告
warnings.filterwarnings('ignore')

In [None]:
@dataclass
class FeatureConfig:
    """Configuration for feature generation.

    Bundles the on-disk directory layout, the parameters of every feature
    family, and the validation / logging settings.  Creating an instance
    derives the concrete ``Path`` attributes from ``BASE_DIR`` and creates
    every directory that does not exist yet (see ``__post_init__``).
    """

    # Directory layout: BASE_DIR is the root, the other names are
    # sub-directories of it (BACKUP_DIR lives under META_DATA_DIR).
    BASE_DIR: str = "D:/Min/Python/Project/FA_Data"
    META_DATA_DIR: str = "meta_data"
    BACKUP_DIR: str = "backup"
    LOG_DIR: str = "logs"
    FEATURES_DIR: str = "features"
    INDUSTRY_ANALYSIS_DIR: str = "industry_analysis"
    INDUSTRY_CORRELATION_DIR: str = "industry_correlation"
    INDUSTRY_FILE_FORMAT: str = "{industry}_{start_date}_{end_date}_{report_date}.json"

    # Data-processing parameters
    DATA_PROCESSING: Dict = field(default_factory=lambda: {
        'min_data_points': 30,  # minimum number of data points
        'backup_days': 7,       # days a backup is kept
        'batch_size': 1000      # batch size
    })

    # Per-family feature parameters
    FEATURE_PARAMS: Dict = field(default_factory=lambda: {
        'volume': {
            'short_period': 5,
            'long_period': 20,
            'volume_ma_periods': [5, 10, 20],
            'volume_threshold': 2.0
        },
        'volatility': {
            'short_period': 5,
            'long_period': 20,
            'std_window': 20,
            'atr_period': 14
        },
        'trend': {
            'ma_period': 20,
            'channel_period': 20,
            'momentum_periods': [5, 10, 20],
            'trend_threshold': 0.02
        },
        'technical': {
            'rsi_period': 14,
            'ma_periods': [5, 10, 20, 60],
            'macd_params': {
                'fast_period': 12,
                'slow_period': 26,
                'signal_period': 9
            },
            'kd_params': {
                'k_period': 9,
                'smooth_k': 3,
                'smooth_d': 3
            },
            'bollinger_params': {
                'window': 20,
                'num_std': 2
            }
        }
    })

    # Industry-analysis parameters
    INDUSTRY_PARAMS: Dict = field(default_factory=lambda: {
        'analysis_start_date': '20230101',  # analysis start date (YYYYMMDD)
        'analysis_end_date': '20241213',    # analysis end date (YYYYMMDD)
        'lookback_period': 120,             # look-back period
        'momentum_period': 20,              # momentum period
        'correlation_window': 30,           # correlation window
        'min_periods': 20                   # minimum number of valid points
    })

    # Feature-selection parameters
    FEATURE_SELECTION: Dict = field(default_factory=lambda: {
        'correlation_threshold': 0.85,  # correlation threshold
        'importance_threshold': 0.01    # importance threshold
    })

    # Data-validation parameters
    DATA_VALIDATION: Dict = field(default_factory=lambda: {
        'price_min': 1.0,         # minimum price
        'price_max': 10000.0,     # maximum price
        'volume_min': 1000,       # minimum volume
        'missing_threshold': 0.3  # maximum ratio of missing values
    })

    # Logging settings
    LOGGING: Dict = field(default_factory=lambda: {
        'file_level': 'DEBUG',
        'console_level': 'INFO',
        'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        'file_encoding': 'utf-8'
    })

    def __post_init__(self):
        """Derive the ``Path`` attributes and create the directory tree."""
        self.base_path = Path(self.BASE_DIR)
        self.meta_data_path = self.base_path / self.META_DATA_DIR
        self.backup_path = self.meta_data_path / self.BACKUP_DIR
        self.log_path = self.base_path / self.LOG_DIR
        self.features_path = self.base_path / self.FEATURES_DIR

        # Industry-analysis locations
        self.industry_analysis_path = self.base_path / self.INDUSTRY_ANALYSIS_DIR
        self.industry_correlation_path = self.base_path / self.INDUSTRY_CORRELATION_DIR

        # Make sure every directory exists before anything tries to use it
        self._ensure_directories()

    def _ensure_directories(self):
        """Create every configured directory that is missing."""
        directories = [
            self.base_path,
            self.meta_data_path,
            self.backup_path,
            self.log_path,
            self.features_path,
            self.industry_analysis_path,
            self.industry_correlation_path
        ]
        for directory in directories:
            directory.mkdir(parents=True, exist_ok=True)

    def get_industry_analysis_path(self) -> Path:
        """Return the industry-analysis directory."""
        return self.industry_analysis_path

    def get_industry_correlation_path(self) -> Path:
        """Return the industry-correlation directory."""
        return self.industry_correlation_path

    def get_stock_data_path(self) -> Path:
        """Return the path of the stock data CSV file."""
        return self.meta_data_path / 'stock_data_whole.csv'

    def get_technical_path(self) -> Path:
        """Return the path of the technical-indicator CSV file."""
        return self.meta_data_path / 'all_stocks_data.csv'

    def get_enhanced_features_path(self) -> Path:
        """Return the path of the enhanced-features CSV file."""
        return self.meta_data_path / 'enhanced_features.csv'

    def get_log_path(self) -> Path:
        """Return today's log file path (the file name carries the date)."""
        return self.log_path / f"feature_generator_{datetime.now().strftime('%Y%m%d')}.log"

    def get_backup_path(self, filename: str) -> Path:
        """Return a backup path for ``filename`` suffixed with a timestamp."""
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        return self.backup_path / f"{filename}_{timestamp}"

    def validate_config(self) -> bool:
        """Check the configuration values.

        Returns:
            True when the base directory exists, the numeric limits are
            positive and all four feature families are configured;
            otherwise the reason is printed and False is returned.
        """
        try:
            # Base directory
            if not os.path.exists(self.BASE_DIR):
                raise ValueError(f"基礎路徑不存在: {self.BASE_DIR}")

            # Numeric parameters
            if self.DATA_PROCESSING['min_data_points'] < 1:
                raise ValueError("最小數據點數必須大於0")

            if self.DATA_PROCESSING['backup_days'] < 1:
                raise ValueError("備份保留天數必須大於0")

            # Feature families
            for feature_type in ['volume', 'volatility', 'trend', 'technical']:
                if feature_type not in self.FEATURE_PARAMS:
                    raise ValueError(f"缺少 {feature_type} 特徵參數")

            return True

        except Exception as e:
            print(f"配置驗證失敗: {str(e)}")
            return False

    def to_dict(self) -> Dict:
        """Return the configuration as a plain dictionary.

        The ``*_DIR`` entries hold the resolved paths as strings.  The
        parameter dictionaries are returned by reference, not copied.
        The industry directories and ``INDUSTRY_PARAMS`` are included so
        that an exported configuration describes the industry analysis too.
        """
        return {
            'BASE_DIR': str(self.base_path),
            'META_DATA_DIR': str(self.meta_data_path),
            'BACKUP_DIR': str(self.backup_path),
            'LOG_DIR': str(self.log_path),
            'FEATURES_DIR': str(self.features_path),
            'INDUSTRY_ANALYSIS_DIR': str(self.industry_analysis_path),
            'INDUSTRY_CORRELATION_DIR': str(self.industry_correlation_path),
            'DATA_PROCESSING': self.DATA_PROCESSING,
            'FEATURE_PARAMS': self.FEATURE_PARAMS,
            'INDUSTRY_PARAMS': self.INDUSTRY_PARAMS,
            'FEATURE_SELECTION': self.FEATURE_SELECTION,
            'DATA_VALIDATION': self.DATA_VALIDATION,
            'LOGGING': self.LOGGING
        }

In [None]:
class FeatureLogger:
    """Singleton wrapper around the ``'FeatureGenerator'`` logger.

    Every call to ``FeatureLogger(...)`` returns the same object.  The
    logger gets a console handler (INFO and above) and, when a config with
    a ``log_path`` attribute is supplied, a daily log file handler (DEBUG
    and above).
    """

    _instance = None  # the single shared instance

    def __new__(cls, config=None):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            cls._instance._initialized = False
        return cls._instance

    def __init__(self, config=None):
        """Configure the shared logger.

        Args:
            config: optional object exposing ``log_path`` (a directory
                ``Path``).  It may also be supplied on a later call; the
                file handler is attached the first time it is available.
        """
        if self._initialized:
            # The first instantiation may have happened without a config.
            self._attach_file_handler(config)
            return

        self.logger = logging.getLogger('FeatureGenerator')
        # The logger must let DEBUG records through, otherwise the DEBUG
        # file handler never receives them; each handler then applies its
        # own threshold.
        self.logger.setLevel(logging.DEBUG)

        self._attach_file_handler(config)

        # Console output; skipped when one is already attached to avoid
        # duplicated lines.  FileHandler subclasses StreamHandler, hence
        # the exact type check.
        if not any(type(h) is logging.StreamHandler for h in self.logger.handlers):
            ch = logging.StreamHandler()
            ch.setLevel(logging.INFO)
            ch.setFormatter(logging.Formatter(
                '%(levelname)s: %(message)s'
            ))
            self.logger.addHandler(ch)

        self._initialized = True

    def _attach_file_handler(self, config=None):
        """Attach the daily log file handler once, if ``config`` has ``log_path``."""
        if not (config and hasattr(config, 'log_path')):
            return
        if any(isinstance(h, logging.FileHandler) for h in self.logger.handlers):
            return

        fh = logging.FileHandler(
            config.log_path / f'feature_generator_{datetime.now():%Y%m%d}.log',
            encoding='utf-8'
        )
        fh.setLevel(logging.DEBUG)
        fh.setFormatter(logging.Formatter(
            '%(asctime)s [%(levelname)s] %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'
        ))
        self.logger.addHandler(fh)

    def info(self, msg: str):
        """Log an informational message."""
        self.logger.info(msg)

    def warning(self, msg: str):
        """Log a warning."""
        self.logger.warning(msg)

    def error(self, msg: str):
        """Log an error."""
        self.logger.error(msg)

    def debug(self, msg: str):
        """Log a debug message."""
        self.logger.debug(msg)

In [None]:
class FeatureGenerator:
    """特徵生成器類"""
    
    def __init__(self, config: FeatureConfig):
        self.config = config
        self.logger = logging.getLogger(__name__)
        
        # 加入產業資料快取
        self.industry_cache = {}
        self.technical_cache = {}
        self.validate_data_structure()
        
        # 初始化記憶體監控
        self.memory_tracker = {
            'last_check': time.time(),
            'check_interval': 300,  # 每5分鐘檢查一次
            'warning_threshold': 12,  # GB
            'critical_threshold': 16  # GB
        }
        
        # 初始化效能追蹤
        self.performance_tracker = {
            'processed_stocks': 0,
            'start_time': time.time(),
            'last_gc_time': time.time(),
            'gc_interval': 60  # 每60秒執行一次GC
        }
        
        # 預加載常用的產業對應關係
        self._preload_industry_mappings()

    def _initialize_caches(self):
        """初始化快取"""
        self.industry_cache = {}
        self.technical_cache = {}
        self._industry_mapping_cache = {}
        
    def _check_memory_usage(self):
        """檢查記憶體使用狀況"""
        current_time = time.time()
        
        # 定期檢查
        if current_time - self.memory_tracker['last_check'] > self.memory_tracker['check_interval']:
            current_memory = psutil.Process().memory_info().rss / 1024 / 1024 / 1024  # GB
            
            # 警告級別
            if current_memory > self.memory_tracker['warning_threshold']:
                self.logger.warning(f"記憶體使用量偏高: {current_memory:.2f}GB")
                self._clean_cache()
                gc.collect()
            
            # 緊急級別
            if current_memory > self.memory_tracker['critical_threshold']:
                self.logger.error(f"記憶體使用量嚴重超標: {current_memory:.2f}GB")
                self._emergency_cleanup()
            
            self.memory_tracker['last_check'] = current_time
    
    def _preload_industry_mappings(self):
        """預加載產業分類對應關係"""
        try:
            self.logger.info("開始載入產業分類對應關係...")
            
            mapping_file = self.config.meta_data_path / 'companies_final.csv'
            if not mapping_file.exists():
                self.logger.error("找不到產業分類檔案: companies_final.csv")
                self.industry_mappings = {}
                return
                
            # 讀取並處理資料
            df = pd.read_csv(
                mapping_file,
                dtype={'stock_id': str}
            )
            
            # 資料清理和標準化
            df['stock_id'] = df['stock_id'].str.strip()
            df['industry_category'] = df['industry_category'].fillna('其他類')
            
            # 建立映射字典
            self.industry_mappings = df.set_index('stock_id')['industry_category'].to_dict()
            
            # 驗證結果
            total_stocks = len(self.industry_mappings)
            self.logger.info(f"已載入 {total_stocks} 筆產業分類對應資料")
            
            # 保存一份有效產業列表
            self._valid_industries = set(df['industry_category'].unique())
            self.logger.info(f"識別出 {len(self._valid_industries)} 個產業類別")
            
        except Exception as e:
            self.logger.error(f"載入產業分類對應時發生錯誤: {str(e)}")
            self.industry_mappings = {}
    
    def _clean_cache(self):
        """清理快取機制"""
        try:
            # 清理產業快取
            if hasattr(self, 'industry_cache'):
                self.industry_cache.clear()
                
            # 清理技術指標快取
            if hasattr(self, 'technical_cache'):
                self.technical_cache.clear()
                
            # 清理產業映射快取
            if hasattr(self, '_industry_mapping_cache'):
                self._industry_mapping_cache.clear()
                
            # 清理其他暫存屬性
            for attr in dir(self):
                if isinstance(getattr(self, attr), pd.DataFrame):
                    delattr(self, attr)
            
            # 更新最後清理時間
            self.cache_tracker['last_cleanup'] = time.time()
            
            self.logger.info("快取清理完成")
            
        except Exception as e:
            self.logger.error(f"清理快取時發生錯誤: {str(e)}")

    def _emergency_cleanup(self):
        """緊急清理機制"""
        try:
            # 強制清理所有快取
            self._clean_cache()
            
            # 重置所有追蹤器
            self.memory_tracker['last_check'] = time.time()
            self.cache_tracker['last_cleanup'] = time.time()
            
            # 強制執行垃圾回收
            gc.collect()
            gc.collect()  # 執行兩次以確保徹底清理
            
            # 檢查清理效果
            current_memory = psutil.Process().memory_info().rss / 1024 / 1024 / 1024
            self.logger.info(f"緊急清理後的記憶體使用量: {current_memory:.2f}GB")
            
        except Exception as e:
            self.logger.error(f"緊急清理時發生錯誤: {str(e)}")

    def validate_data_structure(self) -> bool:
        """驗證資料結構完整性"""
        try:
            # 1. 檢查必要檔案
            required_files = {
                'companies_final.csv': ['stock_id', 'industry_category'],
                'industry_index.csv': ['日期', '指數名稱', '收盤指數'],
                'stock_data_whole.csv': ['證券代號', '日期', '收盤價']
            }
            
            for file, columns in required_files.items():
                path = self.config.meta_data_path / file
                if not path.exists():
                    self.logger.error(f"缺少必要檔案: {file}")
                    return False
                    
                df = pd.read_csv(path)
                missing_columns = [col for col in columns if col not in df.columns]
                if missing_columns:
                    self.logger.error(f"檔案 {file} 缺少欄位: {missing_columns}")
                    return False
            
            # 2. 檢查產業分析目錄結構
            required_dirs = [
                self.config.industry_analysis_path / "return_index",
                self.config.industry_analysis_path / "price_index"
            ]
            
            for dir_path in required_dirs:
                if not dir_path.exists():
                    self.logger.error(f"缺少必要目錄: {dir_path}")
                    return False
                    
            # 3. 檢查技術指標目錄
            tech_dir = self.config.base_path / "technical_analysis"
            if not tech_dir.exists():
                self.logger.error("缺少技術指標目錄")
                return False
            
            self.logger.info("資料結構驗證通過")
            return True
            
        except Exception as e:
            self.logger.error(f"資料結構驗證失敗: {str(e)}")
            return False

    def _preprocess_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """數據預處理"""
        try:
            df = df.copy()
            
            # 確保必要欄位存在
            required_columns = [
                '證券代號', '日期', '開盤價', '最高價', '最低價', '收盤價', '成交股數'
            ]
            
            missing_cols = [col for col in required_columns if col not in df.columns]
            if missing_cols:
                self.logger.error(f"缺少必要欄位: {missing_cols}")
                return None
                
            # 處理日期格式
            df['日期'] = pd.to_datetime(df['日期']).dt.strftime('%Y-%m-%d')
            
            # 處理價格欄位
            price_columns = ['開盤價', '最高價', '最低價', '收盤價']
            for col in price_columns:
                if col in df.columns:
                    df[col] = df[col].replace('--', np.nan)
                    df[col] = df[col].apply(lambda x: str(x).replace(',', '') if isinstance(x, str) else x)
                    df[col] = pd.to_numeric(df[col], errors='coerce')
            
            # 處理成交量相關欄位
            volume_columns = ['成交股數', '成交筆數', '成交金額']
            for col in volume_columns:
                if col in df.columns:
                    df[col] = df[col].replace('--', np.nan)
                    df[col] = df[col].apply(lambda x: str(x).replace(',', '') if isinstance(x, str) else x)
                    df[col] = pd.to_numeric(df[col], errors='coerce')
            
            # 檢查數據有效性
            for col in required_columns:
                if df[col].isnull().all():
                    self.logger.error(f"欄位 {col} 全為空值")
                    return None
                    
            return df
            
        except Exception as e:
            self.logger.error(f"數據預處理失敗: {str(e)}")
            return None

    def _add_volume_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """添加量能特徵"""
        try:
            # 先確認必要欄位存在
            if '成交股數' not in df.columns:
                self.logger.error("缺少成交股數欄位")
                return df
                
            # 確認數據類型
            df['成交股數'] = pd.to_numeric(df['成交股數'], errors='coerce')
            
            params = self.config.FEATURE_PARAMS['volume']
            short_period = params['short_period']
            long_period = params['long_period']
            
            # 檢查數據有效性
            if df['成交股數'].isna().all():
                self.logger.error("成交股數欄位全為空值")
                return df
                
            # 計算特徵
            df['量比'] = df['成交股數'] / df['成交股數'].rolling(long_period).mean()
            df['量增率'] = df['成交股數'].pct_change()
            
            short_ma = df['成交股數'].rolling(short_period).mean()
            long_ma = df['成交股數'].rolling(long_period).mean()
            df['量能趨勢'] = short_ma / long_ma
            
            return df
            
        except Exception as e:
            self.logger.error(f"計算量能特徵時發生錯誤: {str(e)}")
            return df

    def _add_volatility_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """添加波動特徵"""
        try:
            params = self.config.FEATURE_PARAMS['volatility']
            short_period = params['short_period']
            long_period = params['long_period']
            
            # 計算日內波動率
            df['日內波動率'] = (df['最高價'] - df['最低價']) / df['開盤價']
            
            # 計算價格振幅
            df['振幅'] = (df['最高價'] - df['最低價']) / df['收盤價'].shift(1)
            
            # 計算漲跌幅
            df['漲跌幅'] = df['收盤價'].pct_change()
            
            # 計算波動率趨勢
            short_vol = df['收盤價'].rolling(short_period).std()
            long_vol = df['收盤價'].rolling(long_period).std()
            df['波動率趨勢'] = short_vol / long_vol
            
            return df
            
        except Exception as e:
            self.logger.error(f"計算波動特徵時發生錯誤: {str(e)}")
            return df

    def _add_trend_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """添加趨勢特徵"""
        try:
            params = self.config.FEATURE_PARAMS['trend']
            ma_period = params['ma_period']
            
            # 計算趨勢強度
            ma = df['收盤價'].rolling(ma_period).mean()
            df['趨勢強度'] = (df['收盤價'] - ma) / ma
            
            # 計算布林通道寬度變化
            std = df['收盤價'].rolling(window=20).std()
            upper = ma + (2 * std)
            lower = ma - (2 * std)
            
            # 通道寬度變化
            df['通道寬度'] = (upper - lower) / ma
            df['通道寬度變化'] = df['通道寬度'].pct_change()
            
            # 趨勢動能
            df['趨勢動能'] = df['收盤價'].diff(ma_period) / df['收盤價'].shift(ma_period)
            
            # 趨勢持續性 (使用趨勢強度的累積效果)
            df['趨勢持續性'] = df['趨勢強度'].rolling(ma_period).sum()
            
            return df
            
        except Exception as e:
            self.logger.error(f"計算趨勢特徵時發生錯誤: {str(e)}")
            return df

    def _get_industry_name(self, stock_id: str) -> List[str]:
        """獲取股票所屬產業
        
        結合兩個版本的優點:
        - 使用懶加載機制
        - 包含完整的錯誤處理
        - 保持原有的快取結構
        """
        try:
            # 確保stock_id為字串
            stock_id = str(stock_id).strip()
            
            # 使用類別層級的快取 
            if not hasattr(self, '_industry_mapping_cache'):
                self._industry_mapping_cache = {}
                self.logger.info("初始化產業映射快取...")
                try:
                    # 一次性載入所有對應關係
                    company_data = pd.read_csv(
                        self.config.meta_data_path / 'companies_final.csv',
                        dtype={'stock_id': str},
                        usecols=['stock_id', 'industry_category'] 
                    )
                    
                    # 清理和標準化資料
                    company_data['stock_id'] = company_data['stock_id'].str.strip()
                    company_data['industry_category'] = company_data['industry_category'].fillna('其他類')
                    
                    self._industry_mapping_cache = dict(zip(
                        company_data['stock_id'],
                        company_data['industry_category']
                    ))
                    
                    # 同時載入產業指數資料
                    industry_index = pd.read_csv(
                        self.config.meta_data_path / 'industry_index.csv'
                    )
                    self._valid_industries = set(
                        industry_index['指數名稱'].str.replace('報酬指數','').replace('指數','')
                    )
                    
                except Exception as e:
                    self.logger.error(f"無法載入產業對應資料: {str(e)}")
                    return []
            
            # 從快取取得產業
            industry = self._industry_mapping_cache.get(stock_id)
            if industry and industry in self._valid_industries:
                return [industry]
                
            return []
                
        except Exception as e:
            self.logger.error(f"獲取產業分類時發生錯誤: {str(e)}")
            return []

    def _standardize_industry_name(self, industry_name: str) -> str:
        """標準化產業名稱
        輸入: '水泥類' 或 '水泥'
        輸出: '水泥'  # 移除「類、業」字以符合檔案命名
        """
        try:
            # 移除所有後綴
            industry_name = (
                industry_name
                .replace('報酬指數', '')
                .replace('指數', '')
                .replace('類', '')  # 關鍵修改：移除「類」字
                .strip()
            )
            
            self.logger.debug(f"產業名稱標準化: 原始={industry_name} -> 標準化後={industry_name}")
            return industry_name
            
        except Exception as e:
            self.logger.error(f"產業名稱標準化失敗: {str(e)}")
            return industry_name

    def _read_industry_analysis(self, industry_name: str) -> Optional[Dict]:
        """讀取產業分析報告
        
        Args:
            industry_name (str): 產業名稱（如：'水泥類'）
            
        Returns:
            Optional[Dict]: 產業分析數據，如果讀取失敗則返回 None
        """
        try:
            # 標準化產業名稱 (保留「類」字)
            standardized_name = self._standardize_industry_name(industry_name)
            self.logger.debug(f"嘗試讀取產業分析 - 原始名稱: {industry_name}, 標準化後: {standardized_name}")
            
            # 取得日期參數
            start_date = self.config.INDUSTRY_PARAMS['analysis_start_date']
            end_date = self.config.INDUSTRY_PARAMS['analysis_end_date']
            report_date = datetime.now().strftime('%Y%m%d')
            
            # 嘗試多種可能的產業名稱格式
            possible_names = [
                standardized_name,
                f"{standardized_name}類",
                standardized_name.replace('類', '')
            ]
            
            # 對每個可能的名稱嘗試搜尋
            for try_name in possible_names:
                # 組合檔案名稱
                filename = f"{try_name}_{start_date}_{end_date}_{report_date}.json"
                
                # 搜尋路徑
                search_paths = [
                    self.config.industry_analysis_path / "return_index" / filename,
                    self.config.industry_analysis_path / "price_index" / filename
                ]
                
                # 檢查當天的檔案
                for path in search_paths:
                    self.logger.debug(f"檢查檔案: {path}")
                    if path.exists():
                        try:
                            with open(path, 'r', encoding='utf-8') as f:
                                data = json.load(f)
                                if self._validate_industry_data(data):
                                    self.logger.info(f"成功讀取產業 {try_name} 的分析資料")
                                    return data
                        except json.JSONDecodeError as je:
                            self.logger.error(f"JSON解析錯誤 {path}: {str(je)}")
                            continue
                        except Exception as e:
                            self.logger.error(f"讀取檔案 {path} 時發生錯誤: {str(e)}")
                            continue
                
                # 如果找不到當天的檔案，嘗試找最近的檔案
                self.logger.debug(f"找不到今天的報告，嘗試搜尋最近的 {try_name} 產業報告")
                
                # 使用模糊匹配搜尋最近的檔案
                for base_path in [self.config.industry_analysis_path / "return_index",
                                self.config.industry_analysis_path / "price_index"]:
                    try:
                        pattern = f"{try_name}_*.json"
                        matching_files = list(base_path.glob(pattern))
                        
                        if matching_files:
                            # 根據檔名日期排序
                            latest_file = max(matching_files, key=lambda x: x.name)
                            self.logger.info(f"使用最近的報告: {latest_file}")
                            
                            try:
                                with open(latest_file, 'r', encoding='utf-8') as f:
                                    data = json.load(f)
                                    if self._validate_industry_data(data):
                                        # 驗證產業名稱一致性
                                        if data.get('basic_info', {}).get('industry_name') == try_name:
                                            return data
                                        else:
                                            self.logger.warning(f"產業名稱不一致: 預期 {try_name}, 實際 {data.get('basic_info', {}).get('industry_name')}")
                            except Exception as e:
                                self.logger.error(f"讀取最近報告時發生錯誤: {str(e)}")
                    except Exception as e:
                        self.logger.error(f"搜尋檔案時發生錯誤: {str(e)}")
                        continue
            
            self.logger.warning(f"無法找到產業 {standardized_name} 的分析報告，已嘗試的名稱: {possible_names}")
            return None
            
        except Exception as e:
            self.logger.error(f"讀取產業 {industry_name} 分析報告時發生錯誤: {str(e)}\n{traceback.format_exc()}")
            return None

    def _validate_industry_data(self, data: Dict) -> bool:
        """驗證產業分析數據的完整性"""
        try:
            # 檢查基本資訊
            if 'basic_info' not in data:
                self.logger.warning("缺少基本資訊區段")
                return False
    
            # 檢查時間序列分析
            if 'time_series_analysis' not in data:
                self.logger.warning("缺少時間序列分析區段")
                return False
    
            # 檢查風險分析
            if 'risk_analysis' not in data:
                self.logger.warning("缺少風險分析區段")
                return False
    
            # 檢查輪動分析
            if 'rotation_analysis' not in data:
                self.logger.warning("缺少輪動分析區段")
                return False
                
            # 所有必要區段都存在，返回 True
            self.logger.info("產業分析資料驗證通過")
            return True
                
        except Exception as e:
            self.logger.error(f"驗證產業分析資料時發生錯誤: {str(e)}")
            return False

    def _verify_data_integrity(self):
        """驗證所需的資料檔案是否齊全"""
        try:
            # 從配置取得分析參數
            start_date = self.config.INDUSTRY_PARAMS['analysis_start_date']
            end_date = self.config.INDUSTRY_PARAMS['analysis_end_date']
            current_date = datetime.now().strftime('%Y%m%d')
            
            # 1. 檢查公司產業分類檔案
            companies_path = self.config.meta_data_path / 'companies_final.csv'
            if not companies_path.exists():
                self.logger.error("缺少公司產業分類檔案: companies_final.csv")
                return False
                
            # 2. 確認目錄結構
            required_dirs = [
                self.config.industry_analysis_path / "return_index",
                self.config.industry_analysis_path / "price_index"
            ]
            
            missing_dirs = [d for d in required_dirs if not d.exists()]
            if missing_dirs:
                self.logger.error(f"缺少以下目錄: {missing_dirs}")
                return False
                
            # 3. 檢查是否有產業分析檔案
            # 放寬搜尋條件，使用多個模式
            file_patterns = [
                f"*_{start_date}_{end_date}_{current_date}.json",  # 今天的檔案
                f"*_{start_date}_{end_date}_*.json",               # 任意日期的檔案
                "*.json"                                           # 所有json檔案
            ]
            
            found_files = []
            for dir_path in required_dirs:
                for pattern in file_patterns:
                    found_files.extend(list(dir_path.glob(pattern)))
                    if found_files:  # 如果找到檔案就停止搜尋
                        break
            
            if not found_files:
                self.logger.error(
                    f"未找到任何產業分析檔案\n"
                    f"分析日期範圍: {start_date} 到 {end_date}\n"
                    f"嘗試的檔案模式:\n"
                    f"  - {file_patterns[0]}\n"
                    f"  - {file_patterns[1]}\n"
                    f"  - {file_patterns[2]}\n"
                    f"搜尋目錄:\n"
                    f"  - {required_dirs[0]}\n"
                    f"  - {required_dirs[1]}"
                )
                
                # 嘗試列出目錄內容
                self.logger.info("目錄內容檢查:")
                for dir_path in required_dirs:
                    if dir_path.exists():
                        files = list(dir_path.glob("*"))
                        self.logger.info(f"\n{dir_path} 內容:")
                        for f in files[:5]:  # 只顯示前5個檔案
                            self.logger.info(f"  - {f.name}")
                    else:
                        self.logger.info(f"\n{dir_path} 目錄不存在")
                
                # 暫時允許繼續執行
                self.logger.warning("未找到產業分析檔案，但允許繼續執行")
                return True  # 改為 True
                
            self.logger.info(f"找到 {len(found_files)} 個產業分析檔案")
            for f in found_files[:5]:  # 只顯示前5個
                self.logger.info(f"- {f.name}")
                
            return True
            
        except Exception as e:
            self.logger.error(f"驗證資料完整性時發生錯誤: {str(e)}")
            # 暫時允許繼續執行
            self.logger.warning("發生錯誤，但允許繼續執行")
            return True  # 改為 True

    def get_industry_index_data(self, industry_name: str) -> pd.DataFrame:
        """Load the index rows of one industry from ``industry_index.csv``.

        The return index (``<industry>類報酬指數``) is preferred; when it has no
        rows, the price index (``<industry>類指數``) is used instead.

        Args:
            industry_name: Industry name without the index suffix.

        Returns:
            The matching rows with ``日期`` converted to datetime, or ``None``
            when neither index has rows or the file cannot be read.
        """
        try:
            index_csv = self.config.meta_data_path / 'industry_index.csv'
            all_indices = pd.read_csv(index_csv, dtype={'指數名稱': str})
            all_indices['日期'] = pd.to_datetime(all_indices['日期'])

            # Return index first, price index as the fallback.
            for suffix in ('類報酬指數', '類指數'):
                matched = all_indices[
                    all_indices['指數名稱'] == f'{industry_name}{suffix}'
                ]
                if not matched.empty:
                    return matched

            self.logger.warning(f"找不到產業 {industry_name} 的指數數據")
            return None

        except Exception as e:
            self.logger.error(f"讀取產業指數數據時發生錯誤: {str(e)}")
            return None

    def _add_industry_risk_features(self, df: pd.DataFrame, industry_data: dict, prefix: str) -> pd.DataFrame:
        """Broadcast the industry's risk metrics onto ``df`` as constant columns.

        Reads ``industry_data['risk_analysis']`` and, for its ``ratios``,
        ``downside`` and ``tail_risk`` sections (in that order), writes every
        non-``None`` value into a column named ``f'{prefix}{key}'``.

        Args:
            df: Frame to extend; columns are assigned on it directly.
            industry_data: Industry analysis report.
            prefix: Text prepended to every generated column name.

        Returns:
            ``df`` with the risk columns added. On any error the problem is
            logged and ``df`` is returned with whatever was added so far.
        """
        try:
            if 'risk_analysis' not in industry_data:
                return df

            risk_block = industry_data['risk_analysis']
            # Basic ratios, downside risk, tail risk -- each section is optional.
            for section in ('ratios', 'downside', 'tail_risk'):
                if section not in risk_block:
                    continue
                for metric, metric_value in risk_block[section].items():
                    if metric_value is None:  # skip missing metrics
                        continue
                    df[f'{prefix}{metric}'] = metric_value

            return df
        except Exception as e:
            self.logger.error(f"添加產業風險特徵錯誤: {str(e)}")
            return df
    
    def _add_industry_rotation_features(self, df: pd.DataFrame, industry_data: dict, prefix: str) -> pd.DataFrame:
        """Broadcast the industry's rotation metrics onto ``df``.

        Reads ``industry_data['rotation_analysis']``. Every non-``None`` entry of
        ``strength_ranking`` and of ``momentum_ranking['scores']`` becomes a
        column ``f'{prefix}{key}'`` holding that value on every row.

        Args:
            df: Frame to extend; columns are assigned on it directly.
            industry_data: Industry analysis report.
            prefix: Text prepended to every generated column name.

        Returns:
            ``df`` with the rotation columns added. On any error the problem is
            logged and ``df`` is returned with whatever was added so far.
        """
        def _spread(mapping):
            # Repeat each scalar once per row, aligned to df's index.
            for metric, metric_value in mapping.items():
                if metric_value is None:
                    continue
                df[f'{prefix}{metric}'] = pd.Series([metric_value] * len(df), index=df.index)

        try:
            if 'rotation_analysis' not in industry_data:
                return df

            rotation = industry_data['rotation_analysis']

            # Strength ranking
            if 'strength_ranking' in rotation:
                _spread(rotation['strength_ranking'])

            # Momentum ranking (only its ``scores`` mapping is used)
            if 'momentum_ranking' in rotation and 'scores' in rotation['momentum_ranking']:
                _spread(rotation['momentum_ranking']['scores'])

            return df
        except Exception as e:
            self.logger.error(f"添加產業輪動特徵錯誤: {str(e)}")
            return df

    def _get_industry_analysis(self, industry_name: str, date: str) -> dict:
        """Collect the daily and monthly analysis values of an industry for a date.

        Reads ``<industry>/daily_analysis.csv`` (required) and
        ``<industry>/monthly_analysis.csv`` (optional) below
        ``self.config.industry_analysis_path``.

        Args:
            industry_name (str): Industry name, used as the sub-directory name.
            date (str): Target date (YYYY-MM-DD).

        Returns:
            dict: Industry name, date, the daily values of the matching row and
            the monthly values of the matching month (``None`` for monthly keys
            that are unavailable). An empty dict when the daily file is
            missing, has no row for ``date``, or any error occurs.
        """
        daily_fields = ('產業指數', '成交金額', '漲跌幅', 'RSI', 'MACD')
        monthly_fields = ('月均指數', '月漲跌幅', '產業地位', '資金流向', '產業週期位置')

        try:
            # Daily analysis is mandatory
            daily_csv = self.config.industry_analysis_path / \
                f"{industry_name}/daily_analysis.csv"
            if not daily_csv.exists():
                self.logger.error(f"找不到產業 {industry_name} 的日分析數據")
                return {}

            daily_frame = pd.read_csv(daily_csv, parse_dates=['日期'])
            day_rows = daily_frame[daily_frame['日期'] == pd.to_datetime(date)]
            if day_rows.empty:
                self.logger.warning(f"日期 {date} 無產業分析數據")
                return {}

            # Monthly analysis (trend information) is optional
            monthly_info = {}
            monthly_csv = self.config.industry_analysis_path / \
                f"{industry_name}/monthly_analysis.csv"
            if monthly_csv.exists():
                monthly_frame = pd.read_csv(monthly_csv, parse_dates=['月份'])
                month_key = pd.to_datetime(date).strftime('%Y-%m')
                month_rows = monthly_frame[
                    monthly_frame['月份'].dt.strftime('%Y-%m') == month_key
                ]
                if not month_rows.empty:
                    monthly_info = month_rows.iloc[0].to_dict()

            # Basic info, then daily values, then monthly trend values
            analysis_result = {'產業名稱': industry_name, '日期': date}
            for column in daily_fields:
                analysis_result[column] = day_rows[column].iloc[0]
            for key in monthly_fields:
                analysis_result[key] = monthly_info.get(key)

            return analysis_result

        except Exception as e:
            self.logger.error(f"獲取產業分析數據時發生錯誤: {str(e)}")
            return {}

    def _verify_industry_files(self, industry_name: str) -> bool:
        """Check whether an industry analysis file dated today exists.

        The expected file name is
        ``{industry}_{start}_{end}_{today:%Y%m%d}.json`` with the start and end
        dates taken from ``self.config.INDUSTRY_PARAMS``. It is looked up in the
        ``return_index`` and ``price_index`` directories below
        ``self.config.industry_analysis_path``.

        Args:
            industry_name: Industry name used as the file-name prefix.

        Returns:
            ``True`` if either file exists, ``False`` otherwise or on any error.
        """
        try:
            params = self.config.INDUSTRY_PARAMS
            start_date = params['analysis_start_date']
            end_date = params['analysis_end_date']
            report_date = datetime.now().strftime('%Y%m%d')
            file_name = f"{industry_name}_{start_date}_{end_date}_{report_date}.json"

            # Same file name under the return-index and price-index directories
            analysis_root = self.config.industry_analysis_path
            return_path = analysis_root / "return_index" / file_name
            price_path = analysis_root / "price_index" / file_name

            self.logger.info(f"檢查產業分析檔案:\n{return_path}\n{price_path}")
            return any(candidate.exists() for candidate in (return_path, price_path))
        except Exception as e:
            self.logger.error(f"檢查產業分析檔案時發生錯誤: {str(e)}")
            return False

    def _get_industry_index_data(self, industry_name: str) -> pd.DataFrame:
        """Look up an industry's index rows, preferring the return index.

        ``industry_index.csv`` is searched for, in order, the names
        ``<industry>報酬指數``, ``<industry>指數`` and the bare industry name; the
        rows of the first name that has any are returned.

        Args:
            industry_name: Industry name to look up.

        Returns:
            The matching rows, or ``None`` when no candidate name matches or
            the file cannot be read.
        """
        try:
            index_table = pd.read_csv(
                self.config.meta_data_path / 'industry_index.csv'
            )

            # Candidate index names, most preferred first
            possible_names = [
                f"{industry_name}報酬指數",
                f"{industry_name}指數",
                industry_name
            ]

            for candidate in possible_names:
                hits = index_table[index_table['指數名稱'] == candidate]
                if hits.empty:
                    continue
                self.logger.info(f"使用產業指數: {candidate}")
                return hits

            self.logger.warning(f"找不到產業 {industry_name} 的指數資料")
            self.logger.debug(f"已嘗試的名稱: {possible_names}")
            return None

        except Exception as e:
            self.logger.error(f"讀取產業指數資料時發生錯誤: {str(e)}")
            return None

    def _get_industry_data(self, industry_name: str) -> Tuple[pd.DataFrame, dict]:
        """Fetch the index data and the analysis report of one industry.

        The name is first passed through ``_standardize_industry_name``; the
        standardized name is then used for both lookups.

        Args:
            industry_name (str): Industry name.

        Returns:
            Tuple[pd.DataFrame, dict]:
                - result of ``_get_industry_index_data``
                - result of ``_read_industry_analysis``
              Either element may be ``None`` (a warning is logged then);
              ``(None, None)`` is returned when an exception occurs.
        """
        try:
            # Rebinding the parameter keeps the standardized name in the error log too
            industry_name = self._standardize_industry_name(industry_name)

            index_data = self._get_industry_index_data(industry_name)
            analysis_data = self._read_industry_analysis(industry_name)

            # Partial results are still returned; only a warning is emitted
            if any(part is None for part in (index_data, analysis_data)):
                self.logger.warning(f"產業 {industry_name} 部分數據獲取失敗")

            return index_data, analysis_data

        except Exception as e:
            self.logger.error(f"獲取產業 {industry_name} 數據時發生錯誤: {str(e)}")
            return None, None

    def _add_industry_features(self, df: pd.DataFrame, stock_id: str) -> pd.DataFrame:
        """Add every industry-level feature group for the stock's industries.

        For each industry returned by ``_get_industry_name`` the analysis
        report is read and the risk, rotation, correlation and
        relative-performance steps are applied in that order, all writing
        columns prefixed with ``"<industry>_"``.

        Args:
            df: Feature frame of the stock.
            stock_id: Stock identifier.

        Returns:
            The extended frame. ``df`` is returned unchanged when the stock has
            no industry; on an exception the frame built so far is returned.
        """
        try:
            industries = self._get_industry_name(stock_id)
            self.logger.debug(f"股票 {stock_id} 的產業分類: {industries}")
            if not industries:
                self.logger.warning(f"股票 {stock_id} 無產業分類信息")
                return df

            for industry in industries:
                self.logger.debug(f"處理產業 {industry} 的特徵")
                report = self._read_industry_analysis(industry)

                # Industries without a report are skipped, not fatal
                if report is None:
                    self.logger.warning(f"股票 {stock_id} 的產業 {industry} 數據獲取失敗")
                    continue

                column_prefix = f"{industry}_"

                # (step, second argument) pairs, applied in order; the
                # correlation step takes the stock id instead of the report
                steps = (
                    (self._add_industry_risk_features, report),
                    (self._add_industry_rotation_features, report),
                    (self._add_industry_correlation_features, stock_id),
                    (self._calculate_performance_metrics, report),
                )
                for step, second_arg in steps:
                    df = step(df, second_arg, column_prefix)

            return df

        except Exception as e:
            self.logger.error(f"添加產業特徵過程中發生錯誤: {str(e)}")
            return df

    def _calculate_performance_metrics(self, df: pd.DataFrame, industry_data: dict, prefix: str = '') -> pd.DataFrame:
        """Compute stock-versus-industry performance columns.

        Columns written (each only when its inputs are available):

        - ``{prefix}相對技術評分``: ``技術綜合評分 / industry_data['industry_score']``
        - ``{prefix}產業技術評分``: ``industry_data['industry_technical_score']``
        - ``{prefix}相對動能``: ``趨勢動能 / industry_data['momentum']``
        - ``{prefix}產業動能``: ``industry_data['industry_momentum']``
        - ``{prefix}風險調整績效``: relative technical score scaled by
          ``(1 + |VaR95|) * (1 + |最大回撤|)``
        - ``{prefix}產業相對強度``: ``1 - (industry_rank - 1) / total_industries``

        Args:
            df: Feature frame; columns are assigned on it directly.
            industry_data: Industry analysis report.
            prefix: Text prepended to every generated column name.

        Returns:
            ``df`` with the columns added. On an exception the error is logged
            and ``df`` is returned with whatever was added so far.
        """
        try:
            relative_score_col = f'{prefix}相對技術評分'

            # Technical score comparison
            if '技術綜合評分' in df.columns:
                # Stock score relative to the industry score
                if 'industry_score' in industry_data:
                    df[relative_score_col] = df['技術綜合評分'] / industry_data['industry_score']

                # Industry-wide technical score
                if 'industry_technical_score' in industry_data:
                    df[f'{prefix}產業技術評分'] = industry_data['industry_technical_score']

            # Momentum comparison
            if '趨勢動能' in df.columns:
                # Stock momentum relative to the industry
                if 'momentum' in industry_data:
                    df[f'{prefix}相對動能'] = df['趨勢動能'] / industry_data['momentum']

                # Industry momentum
                if 'industry_momentum' in industry_data:
                    df[f'{prefix}產業動能'] = industry_data['industry_momentum']

            # Risk-adjusted performance. The relative score is one of its
            # inputs, so it must be present as well; otherwise the lookup
            # would raise and abort the relative-strength step below.
            risk_inputs = [relative_score_col, f'{prefix}VaR95', f'{prefix}最大回撤']
            if all(col in df.columns for col in risk_inputs):
                df[f'{prefix}風險調整績效'] = (
                    df[relative_score_col] *
                    (1 + abs(df[f'{prefix}VaR95'])) *
                    (1 + abs(df[f'{prefix}最大回撤']))
                )

            # Industry relative strength derived from its rank
            if all(x in industry_data for x in ['industry_rank', 'total_industries']):
                df[f'{prefix}產業相對強度'] = (
                    1 - (industry_data['industry_rank'] - 1) / industry_data['total_industries']
                )

            return df

        except Exception as e:
            self.logger.error(f"計算績效指標時發生錯誤: {str(e)}")
            return df

    def _add_industry_correlation_features(self, df: pd.DataFrame, stock_id: str, prefix: str = "") -> pd.DataFrame:
        """Add max/min/mean of an industry's correlation column to ``df``.

        Reads ``monthly/industry_correlation_<today:%Y%m%d>.csv`` below
        ``self.config.industry_correlation_path``. When that file does not
        exist, ``df`` is returned unchanged.

        When ``prefix`` has the form ``"<industry>_"`` and that industry is one
        of the stock's industries, only that industry's column is used, so each
        prefixed column set describes its own industry. Otherwise every
        industry of the stock is processed in order and later ones overwrite
        earlier ones (the previous behaviour).

        Args:
            df: Feature frame; columns are assigned on it directly.
            stock_id: Stock identifier used to look up its industries.
            prefix: Text prepended to every generated column name.

        Returns:
            ``df`` with ``{prefix}相關性_最大值``, ``{prefix}相關性_最小值`` and
            ``{prefix}相關性_平均值`` added when data is available. On an
            exception the error is logged and ``df`` is returned.
        """
        try:
            # The correlation file is looked up by today's date
            current_date = datetime.now().strftime('%Y%m%d')
            correlation_path = self.config.industry_correlation_path / 'monthly' / f'industry_correlation_{current_date}.csv'

            if not correlation_path.exists():
                return df

            corr_df = pd.read_csv(correlation_path)
            industries = self._get_industry_name(stock_id)

            # Restrict to the industry named by the prefix, if there is one
            prefix_industry = prefix[:-1] if prefix.endswith('_') else prefix
            if prefix_industry and prefix_industry in industries:
                industries = [prefix_industry]

            for industry in industries:
                # Correlation of this industry with the others in the file
                if industry in corr_df.columns:
                    industry_corr = corr_df[industry]
                    df[f'{prefix}相關性_最大值'] = industry_corr.max()
                    df[f'{prefix}相關性_最小值'] = industry_corr.min()
                    df[f'{prefix}相關性_平均值'] = industry_corr.mean()

            return df

        except Exception as e:
            self.logger.error(f"添加產業相關性特徵錯誤: {str(e)}")
            return df

    def _add_technical_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add technical-analysis features to ``df``.

        Columns written, each only when its inputs are present:

        - ``KD_差值`` (``slowk - slowd``); ``slowk``/``slowd`` are computed with
          TA-Lib when no usable values exist
        - ``均線糾結度`` from ``SMA30``, ``DEMA30`` and ``EMA30``
        - ``RSI_動能`` and ``MACD_動能``
        - ``波動率`` (always attempted; requires ``收盤價``)
        - ``本益比`` converted to numeric, and ``本益比_相對值``
        - ``技術綜合評分``: row-wise mean of the available component scores

        Args:
            df: Price/indicator frame; columns are assigned on it directly.

        Returns:
            ``df`` with the features added. If any step raises, the error is
            logged and ``df`` is returned with the columns added up to that
            point (the remaining steps are skipped).
        """
        try:
            # Decide where the KD indicator comes from
            kd_sources = []
            if 'slowk' in df.columns and 'slowd' in df.columns:
                # Existing KD columns count as valid only when each has at least
                # one non-null value and neither column is entirely zero
                kd_valid = (
                    df[['slowk', 'slowd']].notna().any().all() and
                    not (df[['slowk', 'slowd']] == 0).all().any()
                )
                if kd_valid:
                    kd_sources.append('direct')
                    self.logger.debug("發現有效的現有KD指標")
                else:
                    # NOTE: recalculation is requested without checking that the
                    # price columns exist; a missing column is caught by the
                    # inner try/except in the 'calculate' branch below
                    self.logger.warning("現有KD指標數據無效，嘗試重新計算")
                    kd_sources.append('calculate')
            elif all(col in df.columns for col in ['最高價', '最低價', '收盤價']):
                kd_sources.append('calculate')

            # Build the KD features
            if 'direct' in kd_sources:
                self.logger.info("使用現有的KD指標")
                df['KD_差值'] = df['slowk'] - df['slowd']

                # Data-quality check: warn when any KD column is more than 10% null
                kd_null_ratio = df[['slowk', 'slowd', 'KD_差值']].isna().mean()
                if (kd_null_ratio > 0.1).any():
                    self.logger.warning(f"KD指標存在較高空值比例: {kd_null_ratio.max():.2%}")

            elif 'calculate' in kd_sources:
                self.logger.info("計算新的KD指標")
                try:
                    # Price data is usable when at least one row has high, low
                    # and close all non-null
                    price_data_valid = (
                        df[['最高價', '最低價', '收盤價']].notna().all(axis=1).any()
                    )

                    if price_data_valid:
                        high = df['最高價'].values
                        low = df['最低價'].values
                        close = df['收盤價'].values

                        # Compute KD with TA-Lib's STOCH (9/3/3 periods;
                        # matype 0 is presumably the simple moving average --
                        # confirm against the TA-Lib documentation)
                        slowk, slowd = talib.STOCH(
                            high, low, close,
                            fastk_period=9,
                            slowk_period=3,
                            slowk_matype=0,
                            slowd_period=3,
                            slowd_matype=0
                        )

                        df['slowk'] = slowk
                        df['slowd'] = slowd
                        df['KD_差值'] = slowk - slowd

                        # Sanity check: warn when any resulting column is entirely null
                        if df[['slowk', 'slowd', 'KD_差值']].isna().all().any():
                            self.logger.warning("KD指標計算結果包含完整空值列")
                    else:
                        self.logger.warning("價格數據無效，無法計算KD指標")

                except Exception as e:
                    # A KD failure is logged only; the remaining features are still computed
                    self.logger.error(f"KD指標計算失敗: {str(e)}")
            else:
                self.logger.warning("無法計算KD指標 - 缺少必要數據")

            # Moving-average convergence ("entanglement") score
            ma_cols = ['SMA30', 'DEMA30', 'EMA30']
            if all(col in df.columns for col in ma_cols):
                # Relative absolute deviation for every pair of moving averages,
                # normalised by the first average of the pair
                deviations = []
                for i in range(len(ma_cols)):
                    for j in range(i+1, len(ma_cols)):
                        deviation = abs(df[ma_cols[i]] - df[ma_cols[j]]) / df[ma_cols[i]]
                        deviations.append(deviation)

                # 1 - mean pairwise deviation, clipped so extreme values stay in [0, 1]
                df['均線糾結度'] = 1 - pd.concat(deviations, axis=1).mean(axis=1)
                df['均線糾結度'] = df['均線糾結度'].clip(0, 1)  # keep within [0, 1]

                # Forward fill: fills NaNs that follow a valid value; NaNs before
                # the first valid value are left as they are
                df['均線糾結度'] = df['均線糾結度'].ffill()

            # RSI momentum: relative change versus the previous row
            if 'RSI' in df.columns:
                df['RSI_動能'] = df['RSI'].diff() / df['RSI'].shift(1)

            # MACD momentum: row-to-row change of the MACD histogram
            if 'MACD_hist' in df.columns:
                df['MACD_動能'] = df['MACD_hist'].diff()

            # Volatility: 20-row rolling std of the close divided by its 20-row
            # rolling mean. Not guarded: a missing 收盤價 column raises and is
            # handled by the outer except
            df['波動率'] = df['收盤價'].rolling(window=20).std() / df['收盤價'].rolling(window=20).mean()

            # Price/earnings ratio relative to its recent range
            if '本益比' in df.columns:
                # Coerce to numeric first; '--' and anything else unparseable becomes NaN
                df['本益比'] = df['本益比'].apply(lambda x: 
                    pd.to_numeric(str(x).replace('--', 'nan'), errors='coerce')
                )

                if df['本益比'].notna().any():
                    # Min-max position within the rolling 30-row window
                    df['本益比_相對值'] = (
                        (df['本益比'] - df['本益比'].rolling(window=30).min()) /
                        (df['本益比'].rolling(window=30).max() - df['本益比'].rolling(window=30).min())
                    )

                    # Sanity check: warn when more than half of the result is null
                    null_ratio = df['本益比_相對值'].isna().mean()
                    if null_ratio > 0.5:
                        self.logger.warning(f"本益比相對值計算結果空值比例過高: {null_ratio:.2%}")
                else:
                    self.logger.warning("本益比數據全為空值")

            # Composite technical score: mean of the component scores below
            technical_scores = []

            if 'RSI' in df.columns:
                # 1 when RSI is 50, falling linearly to 0 at RSI 0 or 100
                rsi_score = 1 - abs(df['RSI'] - 50) / 50
                technical_scores.append(rsi_score)

            if all(col in df.columns for col in ['MACD', 'MACD_signal']):
                # Logistic sigmoid of (MACD - signal)
                macd_score = (df['MACD'] - df['MACD_signal']).apply(lambda x: 1 / (1 + np.exp(-x)))
                technical_scores.append(macd_score)

            if '均線糾結度' in df.columns:
                technical_scores.append(df['均線糾結度'])

            if technical_scores:
                df['技術綜合評分'] = pd.concat(technical_scores, axis=1).mean(axis=1)

            return df

        except Exception as e:
            self.logger.error(f"計算技術特徵時發生錯誤: {str(e)}")
            return df

    def generate_features(self, df: pd.DataFrame, stock_id: str) -> pd.DataFrame:
        """Run the whole feature pipeline for one stock.

        After validating and preprocessing the input, the volume, volatility,
        trend and technical stages run in that order, followed by the industry
        stage. A failing industry stage is only logged; a failure in any other
        stage aborts the pipeline. Infinite values are replaced by NaN at the
        end, and garbage collection is triggered after each stage.

        Args:
            df: Raw price frame of the stock.
            stock_id: Stock identifier, used for logging and industry lookup.

        Returns:
            The feature frame, or ``None`` when the input is empty, required
            columns are missing, preprocessing fails or a stage raises.
        """
        try:
            # Memory check before doing any work
            self._check_memory_usage()

            # Basic validation
            if df is None or df.empty:
                self.logger.warning(f"股票 {stock_id} 無資料")
                return None

            # Required price/volume columns
            missing_cols = [
                col for col in ('開盤價', '最高價', '最低價', '收盤價', '成交股數')
                if col not in df.columns
            ]
            if missing_cols:
                self.logger.warning(f"股票 {stock_id} 缺少必要欄位: {missing_cols}")
                return None

            # Preprocessing
            df = self._preprocess_data(df)
            if df is None:
                return None

            try:
                # Core stages, looked up one at a time and each followed by a
                # garbage-collection pass
                core_stages = (
                    '_add_volume_features',
                    '_add_volatility_features',
                    '_add_trend_features',
                    '_add_technical_features',
                )
                for stage_name in core_stages:
                    df = getattr(self, stage_name)(df)
                    gc.collect()

                # Industry stage is best-effort
                try:
                    df = self._add_industry_features(df, stock_id)
                    gc.collect()
                except Exception as e:
                    self.logger.warning(f"產業特徵處理失敗: {str(e)}")

                # Replace infinities with NaN
                df = df.replace([np.inf, -np.inf], np.nan)

                # Final check: report features that are entirely null
                if df is not None and not df.empty:
                    all_null_mask = df.isnull().all()
                    null_cols = df.columns[all_null_mask].tolist()
                    if null_cols:
                        self.logger.warning(f"股票 {stock_id} 以下特徵全為空值: {null_cols}")

                # Final memory check
                self._check_memory_usage()

                return df

            except Exception as e:
                self.logger.error(f"特徵生成過程發生錯誤: {str(e)}")
                return None

            finally:
                # Always release temporaries
                gc.collect()

        except Exception as e:
            self.logger.error(f"生成特徵時發生錯誤: {str(e)}")
            return None

    def process_stock_data(self, stock_id: str, df: pd.DataFrame) -> Tuple[pd.DataFrame, Optional[str]]:
        """Validate, prepare and featurize the data of a single stock.

        Steps: input validation, base-data preparation, feature generation and
        a final check with ``validate_feature_result``.

        Args:
            stock_id: Stock identifier.
            df: Raw data of the stock.

        Returns:
            ``(features, None)`` on success, otherwise ``(None, reason)`` where
            ``reason`` names the failed step or carries the exception text.
        """
        try:
            # Step 1: basic input validation
            if not self._validate_input_data(df, stock_id):
                return None, "資料驗證失敗"

            # Step 2: base data (preprocessing + technical indicators)
            prepared = self._prepare_base_data(df, stock_id)
            if prepared is None:
                return None, "基礎數據準備失敗"

            # Step 3: feature generation
            featured = self.generate_features(prepared, stock_id)
            if featured is None:
                return None, "特徵生成失敗"

            # Step 4: validation of the generated features
            if validate_feature_result(featured, stock_id):
                return featured, None
            return None, "特徵驗證失敗"

        except Exception as e:
            self.logger.error(f"處理股票 {stock_id} 時發生錯誤: {str(e)}")
            return None, str(e)
            
    def _validate_input_data(self, df: pd.DataFrame, stock_id: str) -> bool:
        """Check that the input frame is non-empty and has the required columns.

        Args:
            df: Raw data of the stock.
            stock_id: Stock identifier, used for logging only.

        Returns:
            ``True`` when the frame has rows and contains the identifier, date,
            OHLC and volume columns; ``False`` otherwise.
        """
        if df.empty:
            self.logger.debug(f"股票 {stock_id} 數據為空")
            return False

        required_columns = ['證券代號', '日期', '開盤價', '最高價', '最低價', '收盤價', '成交股數']
        missing = [col for col in required_columns if col not in df.columns]
        if missing:
            self.logger.debug(f"股票 {stock_id} 缺少必要欄位")
            return False

        return True
    
    def _prepare_base_data(self, df: pd.DataFrame, stock_id: str) -> pd.DataFrame:
        """Preprocess the raw data and merge the stored technical indicators.

        Steps:
            1. ``_preprocess_data``
            2. strip trailing pandas merge suffixes (``_x`` / ``_y``) from the
               column names
            3. ``_merge_technical_indicators``

        Only a *trailing* suffix is removed, so a column that merely contains
        ``_x`` or ``_y`` inside its name (e.g. ``vol_xtra``) keeps its full name.

        Args:
            df: Raw data of the stock.
            stock_id: Stock identifier.

        Returns:
            The prepared frame, or ``None`` when preprocessing returns ``None``
            or an exception occurs.
        """
        def _strip_merge_suffix(col):
            # Remove every trailing '_x' / '_y' left behind by DataFrame.merge
            while col.endswith(('_x', '_y')):
                col = col[:-2]
            return col

        try:
            # 1. Preprocessing
            df = self._preprocess_data(df)
            if df is None:
                return None

            # 2. Clean up column names duplicated by earlier merges
            df.columns = [_strip_merge_suffix(col) for col in df.columns]

            # 3. Merge technical indicator data
            df = self._merge_technical_indicators(df, stock_id)

            return df

        except Exception as e:
            self.logger.error(f"準備基礎數據時發生錯誤: {str(e)}")
            return None
    
    def _merge_technical_indicators(self, df: pd.DataFrame, stock_id: str) -> pd.DataFrame:
        """Left-merge the stock's stored technical indicators into ``df``.

        The indicator file is
        ``<BASE_DIR>/technical_analysis/<stock_id>_indicators.csv`` where
        ``BASE_DIR`` comes from ``self.config`` (falling back to the previous
        hard-coded location when the config has no such attribute). Rows are
        matched on ``證券代號`` and ``日期``; the file's dates are normalised to
        ``YYYY-MM-DD`` strings first. Columns that already exist in ``df`` win
        over same-named columns of the file.

        Args:
            df: Prepared data of the stock.
            stock_id: Stock identifier.

        Returns:
            The merged frame. ``df`` is returned unchanged when the file is
            missing, empty or unreadable; ``None`` when the merge result is
            empty.
        """
        try:
            # Derive the location from the configured base directory instead
            # of a fixed absolute path
            base_dir = getattr(
                getattr(self, 'config', None), 'BASE_DIR', "D:/Min/Python/Project/FA_Data"
            )
            tech_path = Path(base_dir) / "technical_analysis" / f"{stock_id}_indicators.csv"
            if not tech_path.exists():
                self.logger.warning(f"股票 {stock_id} 無技術指標文件，將跳過技術指標合併")
                return df

            try:
                tech_df = pd.read_csv(tech_path, dtype={'證券代號': str})
                tech_df['日期'] = pd.to_datetime(tech_df['日期']).dt.strftime('%Y-%m-%d')

                # Nothing to merge
                if tech_df.empty:
                    self.logger.warning(f"股票 {stock_id} 的技術指標文件為空")
                    return df

                # Left merge keeps every row of df; clashing indicator columns
                # get the '_drop' suffix and are removed right after
                df = pd.merge(
                    df,
                    tech_df,
                    on=['證券代號', '日期'],
                    how='left',
                    suffixes=('', '_drop')
                )
                df = df.loc[:, ~df.columns.str.endswith('_drop')]

                # Check the merge result
                if df.empty:
                    self.logger.warning(f"股票 {stock_id} 合併後數據為空")
                    return None

                return df

            except Exception as e:
                self.logger.error(f"讀取技術指標文件時發生錯誤: {str(e)}")
                return df

        except Exception as e:
            self.logger.error(f"合併技術指標時發生錯誤: {str(e)}")
            return df
    
    def _generate_all_features(self, df: pd.DataFrame, stock_id: str) -> pd.DataFrame:
        """Generate base, industry and technical features in one pass.

        Order: volume, volatility and trend features; then, for every industry
        of the stock that has a non-empty analysis report, the risk, rotation,
        correlation and relative-performance features; finally the technical
        features. All industry columns, including the correlation ones, are
        prefixed with ``"<industry>_"`` so that several industries do not
        overwrite each other (the same convention as
        ``_add_industry_features``).

        Args:
            df: Prepared data of the stock.
            stock_id: Stock identifier.

        Returns:
            The extended frame. On an exception the error is logged and the
            frame built so far is returned.
        """
        try:
            # 1. Base features
            df = self._add_volume_features(df)
            df = self._add_volatility_features(df)
            df = self._add_trend_features(df)

            # 2. Industry features
            industries = self._get_industry_name(stock_id)
            if industries:
                for industry in industries:
                    # Read the industry analysis report
                    industry_data = self._read_industry_analysis(industry)
                    if industry_data:
                        # Every industry column carries the industry name as prefix
                        prefix = f"{industry}_"

                        df = self._add_industry_risk_features(df, industry_data, prefix)
                        df = self._add_industry_rotation_features(df, industry_data, prefix)
                        df = self._add_industry_correlation_features(df, stock_id, prefix)

                        # Relative performance versus the industry
                        df = self._calculate_performance_metrics(df, industry_data, prefix)

            # 3. Technical features
            df = self._add_technical_features(df)

            return df

        except Exception as e:
            self.logger.error(f"生成特徵時發生錯誤: {str(e)}")
            return df
    
    def _post_process_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Clean up the generated features.

        Infinite values are replaced by NaN, then the key score columns that
        exist in ``df`` are forward-filled.

        ``DataFrame.ffill()`` is used instead of ``fillna(method='ffill')``:
        the ``method`` argument of ``fillna`` is deprecated, and on a pandas
        version that rejects it the resulting exception would be swallowed
        below, silently skipping the fill.

        Args:
            df: Feature frame.

        Returns:
            The cleaned frame. On an exception the error is logged and the
            frame as processed so far is returned.
        """
        try:
            # 1. Replace infinities
            df = df.replace([np.inf, -np.inf], np.nan)

            # 2. Forward-fill gaps in the key features
            important_features = [
                '技術綜合評分', '產業綜合評分', '技術產業綜合比較',
                '個股產業強度比較', '個股產業動能比較', '風險調整後產業表現'
            ]

            # Only touch the features that are actually present
            existing_features = [f for f in important_features if f in df.columns]
            if existing_features:
                df[existing_features] = df[existing_features].ffill()

            return df

        except Exception as e:
            self.logger.error(f"特徵後處理時發生錯誤: {str(e)}")
            return df

In [None]:
def main():
    """Run the feature-generation pipeline end to end.

    Steps: build the configuration, logger and generator; validate the
    configuration, the data structure and the industry data; log a few
    stock-to-industry lookups and the available industry analysis files as a
    smoke test; run ``process_all_stocks`` and, when it succeeds,
    ``verify_features``.

    Returns:
        The result of ``process_all_stocks`` when the pipeline ran; ``False``
        when a validation step fails or an unexpected exception occurs.
    """
    # Bound before the try block so that the exception handler can always log,
    # even when FeatureConfig() or FeatureLogger() themselves raise.
    logger = None
    try:
        # Initialisation
        config = FeatureConfig()
        logger = FeatureLogger(config)
        logger.info("開始執行特徵生成主程序")

        # Resident memory (GB) above which a warning is logged
        MEMORY_THRESHOLD = 8

        # Validate configuration and data structure
        generator = FeatureGenerator(config)
        if not config.validate_config():
            logger.error("配置驗證失敗")
            return False

        if not generator.validate_data_structure():
            logger.error("資料結構驗證失敗")
            return False

        # Check industry data integrity
        logger.info("檢查產業資料完整性...")
        if not generator._verify_data_integrity():
            logger.error("產業資料驗證失敗")
            return False

        # Sample stocks with the industry each is expected to map to
        test_stocks = {
            '2330': '半導體',
            '2317': '其他電子',
            '2454': '半導體',
            '1101': '水泥',
            '2308': '電子工業'
        }

        # Check memory usage before the heavy work starts
        current_memory = psutil.Process().memory_info().rss / 1024 / 1024 / 1024
        if current_memory > MEMORY_THRESHOLD:
            logger.warning(f"初始記憶體使用量偏高: {current_memory:.2f}GB")
            gc.collect()

        logger.info("測試股票產業對應:")
        for stock_id, expected_industry in test_stocks.items():
            industries = generator._get_industry_name(stock_id)
            if not industries:
                logger.warning(f"股票 {stock_id} 無法獲取產業分類 (預期: {expected_industry}類)")
            else:
                logger.info(f"股票 {stock_id} 產業分類: {industries}")

                # Exercise name standardisation and index loading per industry
                for industry in industries:
                    original_name = industry
                    standardized_name = generator._standardize_industry_name(industry)
                    logger.info(f"產業名稱標準化: {original_name} -> {standardized_name}")

                    # Check that the industry index can be read
                    index_data = generator._get_industry_index_data(industry)
                    if index_data is None:
                        logger.warning(f"無法讀取產業 {industry} 的指數資料")
                    else:
                        logger.info(f"成功讀取產業 {industry} 的指數資料")

        # List (at most five of) the industry analysis files per directory
        logger.info("\n檢查產業分析檔案:")
        for subdir in ['return_index', 'price_index']:
            path = config.industry_analysis_path / subdir
            if path.exists():
                files = list(path.glob("*.json"))
                logger.info(f"\n{subdir} 目錄中的檔案:")
                for f in files[:5]:
                    logger.info(f"- {f.name}")

        # Process all stocks
        logger.info("\n開始執行特徵生成...")

        start_time = time.time()
        success = process_all_stocks(config)

        if success:
            logger.info("特徵生成完成,開始驗證...")
            if verify_features(config):
                process_time = time.time() - start_time
                logger.info(f"特徵驗證通過, 總處理時間: {process_time:.2f}秒")
            else:
                logger.warning("特徵驗證發現問題")

        # Final cleanup
        gc.collect()
        current_memory = psutil.Process().memory_info().rss / 1024 / 1024 / 1024
        logger.info(f"結束時記憶體使用量: {current_memory:.2f}GB")

        return success

    except Exception as e:
        # Fall back to the standard logging module when the project logger
        # could not be created
        error_logger = logger if logger is not None else logging.getLogger(__name__)
        error_logger.error(f"主程序執行錯誤: {str(e)}")
        error_logger.error(f"錯誤詳情:\n{traceback.format_exc()}")
        return False
        
def validate_stock_data(self, df: pd.DataFrame, stock_id: str) -> bool:
    """Check that a stock's price/volume data is complete enough to use.

    The frame must contain the OHLC and volume columns, must have rows, and
    at least 50% of the values in each of those columns must be convertible
    to numbers.

    Args:
        self: Object providing a ``logger`` attribute.
        df: Data of the stock.
        stock_id: Stock identifier, used for logging only.

    Returns:
        ``True`` when every check passes, ``False`` otherwise or on any error.
    """
    price_cols = ('開盤價', '最高價', '最低價', '收盤價')
    volume_col = '成交股數'

    def _numeric_share(column):
        # Fraction of values that survive numeric coercion
        return pd.to_numeric(df[column], errors='coerce').notna().mean()

    try:
        # Required columns
        if any(col not in df.columns for col in (*price_cols, volume_col)):
            self.logger.warning(f"股票 {stock_id} 缺少必要欄位")
            return False

        # At least one row
        if df.empty:
            self.logger.warning(f"股票 {stock_id} 無交易資料")
            return False

        # Price columns: fewer than 50% usable values is a failure
        for col in price_cols:
            valid_ratio = _numeric_share(col)
            if valid_ratio < 0.5:
                self.logger.warning(f"股票 {stock_id} 的 {col} 有效資料比例過低: {valid_ratio:.2%}")
                return False

        # Volume column, same threshold
        volume_valid_ratio = _numeric_share(volume_col)
        if volume_valid_ratio < 0.5:
            self.logger.warning(f"股票 {stock_id} 的成交量有效資料比例過低: {volume_valid_ratio:.2%}")
            return False

        return True

    except Exception as e:
        self.logger.error(f"驗證股票 {stock_id} 資料時發生錯誤: {str(e)}")
        return False

def safe_convert_numeric(x):
    """Convert a raw cell value to float, returning NaN when it cannot be parsed.

    Missing values, blank strings and the ``'--'`` placeholder become NaN.
    Thousands separators (``,``) are removed before conversion.
    """
    if pd.isna(x):
        return np.nan

    text = str(x)
    if text.strip() in ('', '--'):
        return np.nan

    try:
        return float(text.replace(',', ''))
    except (ValueError, TypeError):
        return np.nan

def validate_numeric_data(df: pd.DataFrame, columns: List[str], logger) -> bool:
    """Check that each listed column has at least 50% non-null values.

    Columns that are not present in ``df`` are skipped. The first column
    that falls below the threshold is logged as a warning and ends the check.

    Returns:
        bool: False as soon as one column fails, True otherwise.
    """
    existing = (name for name in columns if name in df.columns)
    for name in existing:
        filled_share = df[name].notna().mean()
        if filled_share < 0.5:
            logger.warning(f"欄位 {name} 的有效數據比例過低: {filled_share:.2%}")
            return False
    return True

def validate_date_range(df: pd.DataFrame, logger) -> bool:
    """Validate the overall date span and each stock's date continuity.

    Side effect: the '日期' column of ``df`` is converted to datetime in
    place. This is kept as-is because callers may rely on the converted
    column afterwards.

    Continuity problems are reported as warnings only; they never make the
    function return False.

    Args:
        df: Frame with at least the '日期' and '證券代號' columns.
        logger: Object exposing ``debug``, ``warning`` and ``error``.

    Returns:
        bool: False when the overall span is not positive or any step
        raises; True otherwise.
    """
    try:
        # Convert the date column (in place, see docstring)
        df['日期'] = pd.to_datetime(df['日期'])

        # Overall span must cover more than a single day
        first_date = df['日期'].min()
        last_date = df['日期'].max()
        date_range = (last_date - first_date).days
        if date_range <= 0:
            logger.error(f"日期範圍異常: {first_date} 到 {last_date}")
            return False

        # Per-stock continuity. A single groupby replaces one full-frame
        # boolean scan per stock; sort=False keeps first-appearance order so
        # log output and the order of invalid_stocks are unchanged.
        invalid_stocks = []
        for stock_id, stock_dates in df.groupby('證券代號', sort=False)['日期']:
            # Calendar-day gap between consecutive records of this stock
            date_diff = stock_dates.sort_values().diff().dt.days

            # Gaps longer than 5 days are treated as abnormal
            abnormal_gaps = int((date_diff > 5).sum())
            if abnormal_gaps > 0:
                logger.debug(f"股票 {stock_id} 有 {abnormal_gaps} 筆異常間隔")

                # Only flag the stock when abnormal gaps exceed 20% of its rows
                if abnormal_gaps / len(stock_dates) > 0.2:
                    invalid_stocks.append(stock_id)

        if invalid_stocks:
            logger.warning(f"發現 {len(invalid_stocks)} 支股票有嚴重的日期不連續問題")
            logger.debug(f"問題股票: {invalid_stocks[:5]}...")
            # Deliberately still returns True: this is a warning, not an error

        return True

    except Exception as e:
        logger.error(f"日期驗證失敗: {str(e)}")
        return False

def process_all_stocks(config: FeatureConfig, date_range: Optional[Tuple[str, str]] = None) -> bool:
    """Generate features for every stock in the master file and save them.

    Steps: load the master CSV, convert price/volume columns to numbers,
    validate the data, optionally filter by date, run the feature generator
    stock by stock in batches, then write the combined result and a report.

    Args:
        config: Feature configuration; supplies the input and output paths.
        date_range: Optional ``(start_date, end_date)`` pair; when given,
            only rows whose '日期' lies within the inclusive range are used.

    Returns:
        bool: True when the result file was written; False on any failure.
    """
    # Created before the try block: every handler below logs through it, so
    # it must be bound even when an early step fails.
    logger = FeatureLogger(config)

    try:
        logger.info("開始特徵生成處理")

        # Batch settings
        BATCH_SIZE = 200  # stocks per batch
        MEMORY_THRESHOLD = 12  # RSS level (GB) above which gc is forced

        # Load the stock master file
        logger.info("讀取股票主檔...")
        try:
            # Read in chunks of 50000 rows, then combine
            chunks = list(pd.read_csv(
                config.get_stock_data_path(),
                dtype={
                    '證券代號': str,
                    '證券名稱': str,
                    '日期': str
                },
                chunksize=50000
            ))
            main_df = pd.concat(chunks, ignore_index=True)

            # Convert price and volume-related columns to numbers
            price_cols = ['開盤價', '最高價', '最低價', '收盤價']
            volume_cols = ['成交股數', '成交筆數', '成交金額']
            for col in price_cols + volume_cols:
                if col in main_df.columns:
                    main_df[col] = main_df[col].apply(safe_convert_numeric)

            logger.info(f"載入 {len(main_df):,} 筆資料")

            # Data completeness check
            if not validate_numeric_data(main_df, price_cols + volume_cols, logger):
                logger.error("數據驗證失敗")
                return False

            # Date check; the return value is not used here, so a failed
            # check is only logged and does not stop processing
            validate_date_range(main_df, logger)

        except Exception as e:
            logger.error(f"讀取股票主檔失敗: {str(e)}")
            return False

        # Optional date filter
        if date_range:
            start_date, end_date = date_range
            main_df = main_df[
                (main_df['日期'] >= start_date) &
                (main_df['日期'] <= end_date)
            ]

        # Feature generator
        generator = FeatureGenerator(config)

        # Split the stock list into batches
        unique_stocks = sorted(main_df['證券代號'].unique())
        stock_batches = [
            unique_stocks[i:i + BATCH_SIZE]
            for i in range(0, len(unique_stocks), BATCH_SIZE)
        ]

        # Group once so each stock's rows are fetched by index instead of
        # re-scanning the whole frame with a boolean mask per stock
        stock_groups = main_df.groupby('證券代號', sort=False)

        all_results = []
        total_stocks = len(unique_stocks)
        processed_stocks = 0

        # Process each batch under a tqdm progress bar
        with tqdm(stock_batches, desc="特徵生成進度") as pbar:
            for batch in pbar:
                batch_results = []

                for stock_id in batch:
                    try:
                        # Rows of the current stock only (copied so the
                        # generator cannot touch main_df)
                        stock_df = stock_groups.get_group(stock_id).copy()

                        result_df = generator.generate_features(stock_df, stock_id)

                        if result_df is not None and not result_df.empty:
                            batch_results.append(result_df)
                            processed_stocks += 1

                    except Exception as e:
                        logger.error(f"處理股票 {stock_id} 時發生錯誤: {str(e)}")
                        continue

                    finally:
                        # Memory check after every stock, success or not
                        current_memory = psutil.Process().memory_info().rss / (1024 * 1024 * 1024)
                        if current_memory > MEMORY_THRESHOLD:
                            logger.warning(f"記憶體使用量: {current_memory:.2f}GB，執行清理...")
                            gc.collect()

                # Combine this batch
                if batch_results:
                    batch_df = pd.concat(batch_results, ignore_index=True)
                    all_results.append(batch_df)

                # Progress bar status
                pbar.set_postfix({
                    '成功': f"{processed_stocks}/{total_stocks}",
                    '比例': f"{(processed_stocks/total_stocks)*100:.1f}%",
                    '記憶體': f"{psutil.Process().memory_info().rss/1024/1024/1024:.1f}GB"
                })

                # Release the batch before starting the next one
                del batch_results
                gc.collect()

        # Combine and save the final result
        if all_results:
            try:
                logger.info("合併處理結果...")
                final_df = pd.concat(all_results, ignore_index=True)

                # Final sanity check: non-empty and at least as many columns
                # as the input frame
                logger.info("執行最終驗證...")
                if len(final_df) > 0 and len(final_df.columns) >= len(main_df.columns):
                    output_file = config.get_enhanced_features_path()
                    final_df.to_csv(output_file, index=False, encoding='utf-8-sig')

                    generate_report(final_df, total_stocks, processed_stocks, config)

                    logger.info(
                        f"特徵生成完成\n"
                        f"- 處理成功率: {(processed_stocks/total_stocks)*100:.2f}%\n"
                        f"- 總資料筆數: {len(final_df):,}\n"
                        f"- 特徵數量: {len(final_df.columns)}"
                    )
                    return True
                else:
                    logger.error("最終驗證失敗: 結果資料不完整")
                    return False

            except Exception as e:
                logger.error(f"保存結果時發生錯誤: {str(e)}")
                return False

        logger.error("沒有成功處理的數據")
        return False

    except Exception as e:
        logger.error(f"特徵生成過程發生錯誤: {str(e)}")
        # Same detail level as the main routine's handler
        logger.error(f"錯誤詳情:\n{traceback.format_exc()}")
        return False

def validate_feature_result(df: pd.DataFrame, stock_id: str) -> bool:
    """Validate the feature table produced for one stock.

    Args:
        df: Feature result DataFrame.
        stock_id: Stock code (not used by the checks themselves).

    Returns:
        bool: True when every check passes; False otherwise, including
        when a check raises.
    """
    try:
        # 1. Required base columns
        required_columns = [
            '證券代號', '日期', '開盤價', '最高價', '最低價',
            '收盤價', '成交股數', '成交金額'
        ]
        if not all(col in df.columns for col in required_columns):
            return False

        # 2. Every column other than the identifiers is treated as numeric
        numeric_columns = [
            col for col in df.columns
            if col not in ['證券代號', '證券名稱', '日期']
        ]

        for col in numeric_columns:
            # Reject columns that are entirely missing
            if df[col].isna().all():
                return False

            # Reject infinite values. The raw column is tested directly:
            # replacing +/-inf with NaN first would make this check
            # impossible to trigger.
            if np.isinf(df[col]).any():
                return False

        # 3. Date regularity: at most 3 distinct intervals between
        #    consecutive rows are accepted
        dates = pd.to_datetime(df['日期'])
        date_diff = dates.diff().dropna()
        if len(date_diff.unique()) > 3:
            return False

        return True

    except Exception:
        return False

def generate_report(df: pd.DataFrame, total_stocks: int, processed_stocks: int, config: FeatureConfig):
    """Write a plain-text summary of a feature-generation run.

    The report is written to ``feature_generation_report.txt`` under
    ``config.meta_data_path`` and lists processing statistics, the features
    found per category with their non-null percentage, industry coverage,
    the date range and missing-value statistics. Any failure is logged and
    swallowed; nothing is returned.

    Args:
        df: Combined feature table.
        total_stocks: Number of stocks that were attempted.
        processed_stocks: Number of stocks processed successfully.
        config: Feature configuration supplying the output location.
    """
    logger = FeatureLogger(config)

    def columns_matching(keywords):
        # Columns whose name contains at least one of the given keywords.
        return [name for name in df.columns if any(word in name for word in keywords)]

    try:
        report_path = config.meta_data_path / 'feature_generation_report.txt'

        with open(report_path, 'w', encoding='utf-8') as f:
            emit = f.write

            # Header
            emit("特徵生成處理報告\n")
            emit(f"生成時間: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")

            # Processing statistics
            emit(f"總股票數: {total_stocks}\n")
            emit(f"成功處理: {processed_stocks}\n")
            emit(f"處理比率: {(processed_stocks/total_stocks)*100:.2f}%\n\n")

            # Feature count and list heading
            emit(f"特徵數量: {len(df.columns)}\n")
            emit("\n特徵列表:\n")

            # Fixed name lists for the core groups, keyword matches for the rest
            sections = {
                "基本資料": ['證券代號', '證券名稱', '日期', '開盤價', '最高價', '最低價', '收盤價', '成交股數'],
                "量能特徵": ['量比', '量增率', '量能趨勢', '量能波動', '量價背離'],
                "波動特徵": ['日內波動率', '振幅', '漲跌幅', '波動率趨勢'],
                "趨勢特徵": ['趨勢強度', '通道寬度變化', '趨勢動能', '趨勢持續性'],
                "技術特徵": ['RSI', 'MACD', 'KD_差值', '均線糾結度', 'RSI_動能', 'MACD_動能', '技術綜合評分'],
                "產業基本特徵": columns_matching(['產業_']),
                "產業風險特徵": columns_matching(['VaR', '波動率', '回撤', 'risk']),
                "產業動能特徵": columns_matching(['動能', '趨勢', '強度']),
                "相對表現特徵": columns_matching(['相對', '比較', '調整']),
            }

            for title, candidates in sections.items():
                emit(f"\n{title}:\n")
                available = [name for name in candidates if name in df.columns]
                if not available:
                    emit("  (無特徵)\n")
                    continue
                for name in available:
                    # Percentage of non-null values in this feature
                    filled_pct = (1 - df[name].isnull().mean()) * 100
                    emit(f"  - {name} (有效數據: {filled_pct:.1f}%)\n")

            # Industry coverage
            emit("\n產業覆蓋分析:\n")
            try:
                industry_columns = columns_matching(['產業_'])
                if not industry_columns:
                    emit("- 未生成產業特徵\n")
                else:
                    # Second '_'-separated token of each industry column name
                    tokens = {name.split('_')[1] for name in industry_columns}
                    emit(f"- 覆蓋產業數: {len(tokens)}\n")
                    emit("- 產業特徵示例:\n")
                    for token in sorted(tokens)[:5]:
                        emit(f"  * {token}\n")
            except Exception as e:
                emit(f"- 產業分析錯誤: {str(e)}\n")

            # Date range
            emit(f"\n數據時間範圍: {df['日期'].min()} 到 {df['日期'].max()}\n")

            # Missing-value statistics
            emit("\n特徵品質統計:\n")
            null_stats = df.isnull().mean() * 100
            emit(f"- 平均缺失率: {null_stats.mean():.2f}%\n")
            if null_stats.max() > 50:
                emit("- 缺失率超過50%的特徵:\n")
                for name in null_stats[null_stats > 50].index.tolist():
                    emit(f"  * {name}: {null_stats[name]:.1f}%\n")

        logger.info(f"處理報告已保存至: {report_path}")

    except Exception as e:
        logger.error(f"生成報告時發生錯誤: {str(e)}")

def verify_features(config: FeatureConfig):
    """Log a summary of which expected features exist in the saved result.

    Reads the enhanced-features CSV and logs row/stock counts, the present
    and missing features of each expected group with their non-null
    percentage, and every industry column found. Missing features are only
    reported; they do not make the check fail.

    Args:
        config: Feature configuration supplying the features file path.

    Returns:
        bool: True when the file could be read and summarised, False when
        any step raised.
    """
    # Created before the try block so the except handler can always log
    logger = FeatureLogger(config)

    try:
        # Read stock codes as text, as the master-file loader does, so codes
        # with leading zeros are not turned into integers
        df = pd.read_csv(config.get_enhanced_features_path(), dtype={'證券代號': str})

        # 1. Basic information
        logger.info("========== 數據基本信息 ==========")
        logger.info(f"總數據量: {len(df):,}")
        logger.info(f"唯一股票數: {len(df['證券代號'].unique()):,}")

        # 2. Expected features per group
        feature_groups = {
            "基本特徵": ['開盤價', '最高價', '最低價', '收盤價', '成交股數', '成交金額', '成交筆數'],
            "量能特徵": ['量比', '量增率', '量能趨勢'],
            "波動特徵": ['日內波動率', '振幅', '漲跌幅', '波動率趨勢'],
            "趨勢特徵": ['趨勢強度', '通道寬度', '通道寬度變化', '趨勢動能', '趨勢持續性'],
            "技術特徵": ['均線糾結度', 'RSI_動能', 'MACD_動能', '技術綜合評分', 'RSI', 'MACD', 'KD_差值'],
            "產業特徵": ['產業_報酬率', '產業_波動率', '產業_強度排名', '產業_動能得分'],
            "相對表現": ['相對技術評分', '相對動能', '風險調整相對表現']
        }

        logger.info("\n========== 特徵檢查結果 ==========")
        for group_name, features in feature_groups.items():
            present = [f for f in features if f in df.columns]
            missing = [f for f in features if f not in df.columns]

            logger.info(f"\n{group_name}:")
            if present:
                logger.info("已生成特徵:")
                for feat in present:
                    # Percentage of non-null values
                    non_null_ratio = (1 - df[feat].isnull().mean()) * 100
                    logger.info(f"  - {feat} (有效數據: {non_null_ratio:.1f}%)")

            if missing:
                logger.info("缺失特徵:")
                for feat in missing:
                    logger.info(f"  - {feat}")

        # 3. Detail for every industry-related column
        industry_features = [col for col in df.columns if '產業_' in col]
        if industry_features:
            logger.info("\n========== 產業特徵詳情 ==========")
            for feat in industry_features:
                non_null_ratio = (1 - df[feat].isnull().mean()) * 100
                logger.info(f"- {feat} (有效數據: {non_null_ratio:.1f}%)")
        else:
            logger.warning("\n警告: 未發現任何產業相關特徵")

        return True

    except Exception as e:
        logger.error(f"驗證過程出錯: {str(e)}")
        return False

In [None]:
if __name__ == "__main__":
    # Use DEBUG level so the run produces detailed log output
    logging.basicConfig(level=logging.DEBUG)

    # Run the main routine and keep its result flag
    success = main()

    # Report the outcome on stdout
    print("程式執行成功完成" if success else "程式執行失敗，請檢查日誌了解詳細錯誤信息")

In [None]:
import json
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Optional
import logging

class IndustryContentValidator:
    """Validate the content of industry-analysis JSON files.

    Files are looked up under ``<base_dir>/industry_analysis/return_index``
    and ``<base_dir>/industry_analysis/price_index``. Each file is checked
    for required sections/fields, and valid files additionally get a
    data-quality summary and range checks on their risk ratios.
    """

    def __init__(self, base_dir: str = "D:/Min/Python/Project/FA_Data"):
        """Store the base directory and the required section -> fields map."""
        self.base_dir = Path(base_dir)
        self.required_fields = {
            "basic_info": ["industry_name", "period", "stocks"],
            "time_series_analysis": ["trend", "seasonality", "lead_lag"],
            "risk_analysis": ["ratios", "downside", "tail_risk"],
            "rotation_analysis": ["strength_ranking", "momentum_ranking", "flow_analysis"]
        }

    def validate_file_content(self, file_path: Path) -> Dict:
        """Validate a single JSON file.

        Args:
            file_path: Path of the file to check.

        Returns:
            Dict: On a readable file, a dict with ``file_name``, ``is_valid``,
            ``missing_fields``, ``data_quality`` and ``warnings``. When the
            file cannot be read or parsed, a dict with ``file_name``,
            ``is_valid`` (False), ``error`` and ``warnings``.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = json.load(f)

            validation_result = {
                "file_name": file_path.name,
                "is_valid": True,
                "missing_fields": [],
                "data_quality": {},
                "warnings": []
            }

            # Required sections and their fields
            for section, fields in self.required_fields.items():
                if section not in content:
                    validation_result["missing_fields"].append(section)
                    validation_result["is_valid"] = False
                    continue

                for field in fields:
                    if field not in content[section]:
                        validation_result["missing_fields"].append(f"{section}.{field}")
                        validation_result["is_valid"] = False

            # Data quality is only inspected for structurally valid files
            if validation_result["is_valid"]:
                try:
                    # 1. Time-series trend fields
                    if "trend" in content["time_series_analysis"]:
                        trend_data = content["time_series_analysis"]["trend"]
                        validation_result["data_quality"]["trend"] = {
                            "has_direction": "trend_direction" in trend_data,
                            "has_strength": "trend_strength" in trend_data,
                            "has_metrics": all(k in trend_data for k in ["slope", "r2_score"])
                        }

                    # 2. Risk ratio fields
                    if "ratios" in content["risk_analysis"]:
                        ratios = content["risk_analysis"]["ratios"]
                        validation_result["data_quality"]["risk_metrics"] = {
                            "has_returns": "annual_return" in ratios,
                            "has_volatility": "annual_volatility" in ratios,
                            "has_sharpe": "sharpe_ratio" in ratios
                        }

                    # 3. Numeric range checks
                    self._validate_numeric_values(content, validation_result)
                except Exception as e:
                    validation_result["warnings"].append(f"數據品質檢查錯誤: {str(e)}")

            return validation_result
        except Exception as e:
            return {
                "file_name": file_path.name,
                "is_valid": False,
                "error": str(e),
                "warnings": ["檔案讀取或解析失敗"]
            }

    def _check_ratio(self, ratios: Dict, key: str, label: str, in_range, result: Dict) -> None:
        """Append a warning when ``ratios[key]`` is unparsable or out of range.

        ``in_range`` is expressed as a positive condition so that NaN, for
        which every comparison is False, is reported as abnormal instead of
        slipping through.
        """
        if key not in ratios:
            return
        try:
            value = float(ratios[key])
        except (ValueError, TypeError):
            result["warnings"].append(f"{label}格式無效")
            return
        if not in_range(value):
            result["warnings"].append(f"{label}異常: {value}")

    def _validate_numeric_values(self, content: Dict, result: Dict) -> None:
        """Range-check the risk ratios and record warnings in ``result``.

        Accepted ranges: annual volatility within [0, 1], absolute Sharpe
        ratio up to 10, absolute annual return up to 1 (100%).
        """
        try:
            if "risk_analysis" in content and "ratios" in content["risk_analysis"]:
                ratios = content["risk_analysis"]["ratios"]

                self._check_ratio(ratios, "annual_volatility", "年化波動率",
                                  lambda v: 0 <= v <= 1, result)
                self._check_ratio(ratios, "sharpe_ratio", "夏普比率",
                                  lambda v: abs(v) <= 10, result)
                self._check_ratio(ratios, "annual_return", "年化報酬率",
                                  lambda v: abs(v) <= 1, result)
        except Exception as e:
            result["warnings"].append(f"數值驗證過程錯誤: {str(e)}")

    def analyze_all_files(self) -> Dict:
        """Validate every JSON file in the two index directories.

        Returns:
            Dict: Counters ``valid_files``, ``invalid_files``,
            ``total_files``; ``file_details`` (one validation result per
            file); and ``common_issues`` mapping each warning text to the
            number of times it occurred.
        """
        results = {
            "valid_files": 0,
            "invalid_files": 0,
            "total_files": 0,
            "file_details": [],
            "common_issues": {}
        }

        # Scan the return_index and price_index directories
        for subdir in ["return_index", "price_index"]:
            dir_path = self.base_dir / "industry_analysis" / subdir
            if not dir_path.exists():
                continue

            # Sorted so the report order does not depend on the filesystem
            for file_path in sorted(dir_path.glob("*.json")):
                results["total_files"] += 1
                validation_result = self.validate_file_content(file_path)

                if validation_result.get("is_valid", False):
                    results["valid_files"] += 1
                else:
                    results["invalid_files"] += 1

                results["file_details"].append(validation_result)

                # Tally identical warnings
                issues = results["common_issues"]
                for warning in validation_result.get("warnings", []):
                    issues[warning] = issues.get(warning, 0) + 1

        return results

    def print_analysis_report(self):
        """Run :meth:`analyze_all_files` and print a report to stdout."""
        results = self.analyze_all_files()

        print("\n產業分析檔案內容驗證報告")
        print("=" * 50)
        print("檔案統計:")
        print(f"- 總檔案數: {results['total_files']}")
        print(f"- 有效檔案: {results['valid_files']}")
        print(f"- 無效檔案: {results['invalid_files']}")

        # Details of files that failed validation
        if results['invalid_files'] > 0:
            print("\n無效檔案詳情:")
            for detail in results['file_details']:
                if not detail.get('is_valid', False):
                    print(f"\n檔案: {detail['file_name']}")
                    if 'missing_fields' in detail:
                        print(f"缺失欄位: {', '.join(detail['missing_fields'])}")
                    if 'error' in detail:
                        print(f"錯誤訊息: {detail['error']}")

        # Warnings aggregated across files
        if results['common_issues']:
            print("\n常見問題:")
            for issue, count in results['common_issues'].items():
                print(f"- {issue} (發生 {count} 次)")

        # Per-file data-quality flags
        print("\n數據品質檢查:")
        for detail in results['file_details']:
            if 'data_quality' in detail:
                print(f"\n{detail['file_name']}:")
                if 'trend' in detail['data_quality']:
                    trend = detail['data_quality']['trend']
                    print("  趨勢分析:")
                    print(f"  - 趨勢方向: {'有' if trend['has_direction'] else '無'}")
                    print(f"  - 趨勢強度: {'有' if trend['has_strength'] else '無'}")
                    print(f"  - 趨勢指標: {'完整' if trend['has_metrics'] else '不完整'}")

                if 'risk_metrics' in detail['data_quality']:
                    risk = detail['data_quality']['risk_metrics']
                    print("  風險指標:")
                    print(f"  - 報酬率: {'有' if risk['has_returns'] else '無'}")
                    print(f"  - 波動率: {'有' if risk['has_volatility'] else '無'}")
                    print(f"  - 夏普比率: {'有' if risk['has_sharpe'] else '無'}")
# Usage example: build a validator with the default base directory and print its report
if __name__ == "__main__":
    validator = IndustryContentValidator()
    validator.print_analysis_report()