In [23]:
import pandas as pd
import numpy as np
from typing import Dict, List, Tuple
import logging
from pathlib import Path
from datetime import datetime, timedelta
from enum import Enum

In [24]:
class IndexType(Enum):
    """Kind of industry index; each member's value is its Chinese label."""
    PRICE = "價格指數"    # price index: dividends excluded
    RETURN = "報酬指數"   # return index: dividends reinvested

class TimeFrequency(Enum):
    """Sampling frequency used when grouping index data over time.

    The member values are the same strings this file passes as ``freq`` to
    ``pd.Grouper`` ('W' for weekly, 'ME' for monthly).
    """
    WEEKLY = 'W'
    MONTHLY = 'ME'

In [25]:
class IndustryAnalysisSystem:
    """Industry analysis toolkit backed by CSV files under ``base_path``.

    Construction creates the output directories, loads
    ``meta_data/companies.csv``, ``meta_data/industry_index.csv`` and
    ``meta_data/market_index.csv``, and builds ``industry_mapping``: a dict
    keyed by normalized ("base") industry name whose values hold the price
    index names, the return index names and the company industry categories
    that normalize to that base name.
    """

    def __init__(self, base_path: str = "D:/Min/Python/Project/FA_Data"):
        """Load the metadata CSVs found under *base_path* and build the mapping."""
        self.base_path = Path(base_path)
        self.logger = logging.getLogger(self.__class__.__name__)

        # Output folders must exist before any matrix is written.
        self._initialize_directories()

        # Base data sets.
        self.company_data = pd.read_csv(self.base_path / "meta_data/companies.csv")
        self.industry_index = pd.read_csv(self.base_path / "meta_data/industry_index.csv")
        self.market_index = pd.read_csv(self.base_path / "meta_data/market_index.csv")

        # Base industry name -> index names / company categories.
        self.industry_mapping = self._create_industry_mapping()

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _normalize_industry_name(name: str) -> str:
        """Return the base industry name shared by index names and categories.

        Strips the '類報酬指數' / '類指數' suffixes, removes every '業'
        character and trims surrounding whitespace, so that e.g. an index
        name and a company category of the same industry compare equal.
        """
        return (name.replace('類報酬指數', '')
                    .replace('類指數', '')
                    .replace('業', '')
                    .strip())

    @staticmethod
    def _filter_by_date(data: pd.DataFrame,
                        start_date: str = None,
                        end_date: str = None,
                        date_column: str = '日期') -> pd.DataFrame:
        """Keep rows with ``start_date <= date <= end_date`` (both optional).

        *date_column* must already hold datetime values. A falsy bound
        (``None`` or ``""``) leaves that side open.
        """
        if start_date:
            data = data[data[date_column] >= pd.to_datetime(start_date)]
        if end_date:
            data = data[data[date_column] <= pd.to_datetime(end_date)]
        return data

    @staticmethod
    def _annualized_metrics(data: pd.DataFrame) -> Dict:
        """Summarize a performance frame as annualized return and volatility.

        Returns an empty dict for an empty frame; otherwise the mean daily
        return times 252 and the mean rolling volatility times sqrt(252).
        """
        if data.empty:
            return {}
        return {
            'return': data['daily_return'].mean() * 252,
            'volatility': data['volatility'].mean() * np.sqrt(252)
        }

    def _initialize_directories(self):
        """Create the output directory tree under ``base_path`` if missing."""
        for relative in ("industry_correlation/weekly",
                         "industry_correlation/monthly",
                         "industry_analysis/price_index",
                         "industry_analysis/return_index"):
            (self.base_path / relative).mkdir(parents=True, exist_ok=True)

    def _create_industry_mapping(self) -> Dict:
        """Build the base-industry-name -> indices/categories mapping.

        Each value is a dict with three lists: ``IndexType.PRICE`` and
        ``IndexType.RETURN`` (index names, classified by whether the name
        contains '報酬指數') and ``'categories'`` (company industry
        categories whose normalized name matches an index).
        """
        mapping = {}

        # Missing names cannot be normalized, so they are skipped.
        for index_name in self.industry_index['指數名稱'].dropna().unique():
            entry = mapping.setdefault(
                self._normalize_industry_name(index_name),
                {IndexType.PRICE: [], IndexType.RETURN: [], 'categories': []},
            )
            index_type = IndexType.RETURN if '報酬指數' in index_name else IndexType.PRICE
            entry[index_type].append(index_name)

        # Attach company categories only to industries that have an index.
        for category in self.company_data['industry_category'].dropna().unique():
            base_name = self._normalize_industry_name(category)
            if base_name in mapping:
                mapping[base_name]['categories'].append(category)

        return mapping

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def get_industry_stocks(self, industry_name: str) -> List[str]:
        """Return the unique ``stock_id`` values belonging to an industry.

        *industry_name* may be a base name, an index name or a category;
        it is normalized first. Unknown industries yield an empty list.
        """
        base_name = self._normalize_industry_name(industry_name)
        if base_name not in self.industry_mapping:
            return []

        categories = self.industry_mapping[base_name]['categories']
        in_industry = self.company_data['industry_category'].isin(categories)
        return self.company_data[in_industry]['stock_id'].unique().tolist()

    def get_industry_performance(self, industry_name: str,
                               index_type: IndexType = None,
                               start_date: str = None,
                               end_date: str = None,
                               volatility_window: int = 20) -> pd.DataFrame:
        """Return index rows for an industry with return/volatility columns.

        Args:
            industry_name: Industry, index or category name (normalized here).
            index_type: Which index family to use. ``None`` prefers the
                return indices and falls back to the price indices.
            start_date: Optional inclusive lower date bound.
            end_date: Optional inclusive upper date bound.
            volatility_window: Number of observations in the rolling
                standard deviation of daily returns (default 20).

        Returns:
            The matching rows of ``industry_index`` in chronological order,
            with ``daily_return`` (per-index percentage change of
            '收盤指數') and ``volatility`` (per-index rolling std of
            ``daily_return``) added. An empty DataFrame when the industry
            is unknown or has no index of the requested type.
        """
        base_name = self._normalize_industry_name(industry_name)
        entry = self.industry_mapping.get(base_name)
        if entry is None:
            return pd.DataFrame()

        if index_type is None:
            # No explicit type: prefer return indices, else price indices.
            index_names = entry[IndexType.RETURN] or entry[IndexType.PRICE]
        else:
            index_names = entry[index_type]
        if not index_names:
            return pd.DataFrame()

        industry_data = self.industry_index[
            self.industry_index['指數名稱'].isin(index_names)
        ].copy()
        industry_data['日期'] = pd.to_datetime(industry_data['日期'])
        industry_data = self._filter_by_date(industry_data, start_date, end_date)

        # Returns are only meaningful on chronologically ordered rows.
        industry_data = industry_data.sort_values('日期', kind='stable')

        industry_data['daily_return'] = (
            industry_data.groupby('指數名稱')['收盤指數'].pct_change()
        )
        # transform() keeps the result aligned with the original rows; a
        # positional assignment of groupby-rolling output would scramble
        # values whenever several index names are interleaved.
        industry_data['volatility'] = (
            industry_data.groupby('指數名稱')['daily_return']
            .transform(lambda returns: returns.rolling(volatility_window).std())
        )

        return industry_data

    def calculate_correlation_matrix(self,
                                  data: pd.DataFrame,
                                  date_column: str = '日期',
                                  value_column: str = '收盤指數') -> pd.DataFrame:
        """Return the correlation matrix between the indices in *data*.

        *data* is long-format with one row per (date, '指數名稱') pair; it
        is pivoted to one column per index before ``DataFrame.corr`` is
        applied. Row and column labels are the normalized industry names,
        so a price index and a return index of the same industry end up
        with the same label.

        Raises:
            ValueError: If *data* holds duplicate (date, index name) pairs
                (raised by ``DataFrame.pivot``).
        """
        wide = data.pivot(
            index=date_column,
            columns='指數名稱',
            values=value_column
        )

        corr_matrix = wide.corr()

        # Relabel both axes with the normalized industry names.
        corr_matrix.index = [self._normalize_industry_name(name)
                             for name in corr_matrix.index]
        corr_matrix.columns = [self._normalize_industry_name(name)
                               for name in corr_matrix.columns]

        return corr_matrix

    def generate_correlation_series(self,
                                 frequency: TimeFrequency,
                                 start_date: str = None,
                                 end_date: str = None):
        """Write one correlation matrix per week or month to CSV.

        The index data is split into periods of *frequency*; for every
        period containing at least two distinct dates, the correlation of
        the observations inside that period is saved as
        ``industry_correlation/<weekly|monthly>/correlation_<YYYYMMDD>.csv``
        where the date is the period's end label.

        A correlation needs several observations per index, so the matrix
        is computed from all rows of the period rather than from the single
        period-end value (which can only ever produce NaN).
        """
        data = self.industry_index.copy()
        data['日期'] = pd.to_datetime(data['日期'])
        data = self._filter_by_date(data, start_date, end_date)

        # WEEKLY -> "weekly", MONTHLY -> "monthly".
        folder = frequency.name.lower()
        output_dir = self.base_path / "industry_correlation" / folder
        output_dir.mkdir(parents=True, exist_ok=True)

        periods = data.groupby(pd.Grouper(key='日期', freq=frequency.value))
        for period_end, period_data in periods:
            # Fewer than two dates cannot yield a defined correlation
            # (this also skips empty periods).
            if period_data['日期'].nunique() < 2:
                continue

            corr_matrix = self.calculate_correlation_matrix(period_data)

            filename = f"correlation_{period_end.strftime('%Y%m%d')}.csv"
            corr_matrix.to_csv(output_dir / filename)

            self.logger.info("已生成並儲存%s相關性矩陣: %s", folder, filename)

    def analyze_index_difference(self,
                               industry_name: str,
                               start_date: str = None,
                               end_date: str = None) -> Dict:
        """Compare an industry's price index with its return index.

        Returns a dict with keys ``'industry'``, ``'price_index'``,
        ``'return_index'`` and ``'difference'``. The two index entries hold
        annualized ``'return'`` and ``'volatility'`` (empty dict when no
        data is available); ``'difference'`` holds return-minus-price
        values and is only filled when both entries are present.
        """
        price_data = self.get_industry_performance(
            industry_name,
            index_type=IndexType.PRICE,
            start_date=start_date,
            end_date=end_date
        )
        return_data = self.get_industry_performance(
            industry_name,
            index_type=IndexType.RETURN,
            start_date=start_date,
            end_date=end_date
        )

        price_metrics = self._annualized_metrics(price_data)
        return_metrics = self._annualized_metrics(return_data)

        result = {
            'industry': industry_name,
            'price_index': price_metrics,
            'return_index': return_metrics,
            'difference': {}
        }

        if price_metrics and return_metrics:
            result['difference'] = {
                'return_diff': return_metrics['return'] - price_metrics['return'],
                'volatility_diff': return_metrics['volatility'] - price_metrics['volatility']
            }

        return result

    def generate_industry_report(self, industry_name: str,
                               start_date: str = None,
                               end_date: str = None) -> Dict:
        """Build a full report for one industry.

        The report holds the industry name, the requested period, the
        industry's stock ids, the price/return index comparison and
        ``'correlations'``: a dict mapping every other base industry name
        to its correlation with this industry, sorted in descending order.
        ``'correlations'`` stays empty when the industry has no index data
        in the period or is missing from the correlation matrix.
        """
        report = {
            'industry_name': industry_name,
            'period': {'start': start_date, 'end': end_date},
            'stocks': self.get_industry_stocks(industry_name),
            'index_analysis': self.analyze_index_difference(industry_name, start_date, end_date),
            'correlations': {}
        }

        # Correlations are only reported when the industry itself has data.
        performance_data = self.get_industry_performance(
            industry_name, start_date=start_date, end_date=end_date
        )
        if performance_data.empty:
            return report

        all_data = self.industry_index.copy()
        all_data['日期'] = pd.to_datetime(all_data['日期'])

        # One index per base industry: a return index when one exists,
        # otherwise the price index.
        chosen = {}
        for name in all_data['指數名稱'].dropna().unique():
            base = self._normalize_industry_name(name)
            current = chosen.get(base)
            if current is None or ('報酬指數' in name and '報酬指數' not in current):
                chosen[base] = name

        all_data = all_data[all_data['指數名稱'].isin(list(chosen.values()))]
        all_data = self._filter_by_date(all_data, start_date, end_date)

        corr_matrix = self.calculate_correlation_matrix(all_data)
        base_name = self._normalize_industry_name(industry_name)

        if base_name in corr_matrix.index:
            correlations = corr_matrix.loc[base_name]
            if isinstance(correlations, pd.DataFrame):
                # Duplicate labels give several rows; use the first one.
                correlations = correlations.iloc[0]
            # Drop the industry's correlation with itself, then rank.
            correlations = correlations[correlations.index != base_name]
            report['correlations'] = correlations.sort_values(ascending=False).to_dict()

        return report

In [26]:
if __name__ == "__main__":
    # Build the analyzer; this loads the metadata CSVs from the default path.
    analyzer = IndustryAnalysisSystem()

    # Every call below works on the same analysis window.
    date_range = {'start_date': '2023-01-01', 'end_date': '2024-11-12'}

    # Write the weekly matrices first, then the monthly ones.
    for frequency in (TimeFrequency.WEEKLY, TimeFrequency.MONTHLY):
        analyzer.generate_correlation_series(frequency, **date_range)

    # Report for the optoelectronics industry.
    report = analyzer.generate_industry_report('光電', **date_range)
    period = report['period']

    print("\n產業分析報告：")
    print(f"產業：{report['industry_name']}")
    print(f"分析期間：{period['start']} 到 {period['end']}")
    print(f"股票數量：{len(report['stocks'])}")

    print("\n指數分析：")
    for index_type, metrics in report['index_analysis'].items():
        # Skip the plain 'industry' entry and any empty metric group.
        if not (isinstance(metrics, dict) and metrics):
            continue
        print(f"\n{index_type}:")
        for metric, value in metrics.items():
            print(f"  {metric}: {value:.4f}")

    print("\n相關性最高的前5個產業：")
    # Rank by absolute correlation so strong negative links also surface.
    ranked = sorted(
        report['correlations'].items(),
        key=lambda item: abs(item[1]),
        reverse=True,
    )
    top_5_corr = dict(ranked[:5])
    for ind, corr in top_5_corr.items():
        print(f"{ind}: {corr:.4f}")


產業分析報告：
產業：光電
分析期間：2023-01-01 到 2024-11-12
股票數量：146

指數分析：

price_index:
  return: 0.1300
  volatility: 0.1803

return_index:
  return: 0.1603
  volatility: 0.1792

difference:
  return_diff: 0.0303
  volatility_diff: -0.0010

相關性最高的前5個產業：
綠能環保: 0.9263
化學生技醫療: 0.9236
其他電子: 0.8958
航運: 0.8911
電子零組件: 0.8666
