In [1]:
from smartmoneyconcepts import smc
import pandas as pd
import statsmodels.api as sm
from datetime import datetime, timedelta, time
from sklearn.preprocessing import MinMaxScaler, StandardScaler, QuantileTransformer
from scipy.fft import fft, fftfreq
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import mplfinance as mpf
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from statsmodels.tsa.stattools import adfuller, coint
from statsmodels.tsa.vector_ar.vecm import coint_johansen
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import acf, pacf
from collections import defaultdict
import math
from scipy import stats
from arch.unitroot import KPSS
from hurst import compute_Hc
from statsmodels.tsa.vector_ar.vecm import VECM
import seaborn as sns
from scipy.integrate import quad
from statsmodels.stats.diagnostic import acorr_ljungbox
from scipy.spatial.distance import cityblock, euclidean, cosine
from scipy.stats import gaussian_kde, shapiro, probplot, skew, kurtosis, norm, jarque_bera, anderson, normaltest
from scipy.special import logit, expit

def prepare_data(type_of_data, data_name):
    """Load ``../index_data/{type_of_data}/{data_name}.csv`` with a datetime ``ts`` column.

    The text before the first ``/`` in ``type_of_data`` selects the timestamp
    source: for ``'shioaji'`` the existing ``ts`` column is parsed, otherwise
    ``ts`` is built from the ``datetime`` column.  Column names are lower-cased
    after the timestamp column has been set.
    """
    provider = type_of_data.split('/')[0]
    frame = pd.read_csv(f'../index_data/{type_of_data}/{data_name}.csv')

    # Only the column the timestamp is read from differs between providers.
    timestamp_source = 'ts' if provider == 'shioaji' else 'datetime'
    frame['ts'] = pd.to_datetime(frame[timestamp_source])

    return frame.rename(columns=str.lower)

def aggregate_pv_list(pv_lists):
    """Merge a Series of price/volume lists into a single price-sorted list.

    Each non-null element is expected to be a list of ``{price: volume}``
    dicts; elements that are not lists, and list items that are not dicts,
    are ignored.  Volumes are summed per price (prices and volumes are
    converted to ``float``).

    Returns:
        ``np.nan`` when every element of ``pv_lists`` is null, otherwise a
        list of single-entry ``{price: total_volume}`` dicts ordered from the
        highest price to the lowest.
    """
    if pv_lists.isna().all():
        return np.nan

    volume_at_price = defaultdict(float)
    usable_entries = (entry for entry in pv_lists.dropna() if isinstance(entry, list))
    for entry in usable_entries:
        for level in entry:
            if not isinstance(level, dict):
                continue
            for price, vol in level.items():
                # Cast both sides to float so mixed key/value types collapse together.
                volume_at_price[float(price)] += float(vol)

    descending_prices = sorted(volume_at_price, reverse=True)
    return [{price: volume_at_price[price]} for price in descending_prices]

def analyze_tick_types(tick_type_series, volume_series):
    """Return the net signed volume of a group of ticks.

    Each tick's direction (as passed in ``tick_type_series``) is multiplied
    by its volume.  Positive products are summed as the buy-side total,
    negative products as the sell-side total (kept negative), and the two
    totals are added together.
    """
    signed = [direction * size for direction, size in zip(tick_type_series, volume_series)]

    buy_side = [value for value in signed if value > 0]
    sell_side = [value for value in signed if value < 0]

    # The sell-side total stays negative, so adding gives buy minus |sell|.
    return sum(buy_side) + sum(sell_side)

def calculate_liquidity_factors(bid_list, ask_list, bid_vol_list, ask_vol_list):
    """Compute the mean bid-ask spread and mean order-book imbalance of a tick group.

    Returns:
        A ``pd.Series`` with
        ``avg_spread`` - mean of ``ask - bid`` over the paired quotes, and
        ``avg_obi`` - mean of ``bid_vol / (bid_vol + ask_vol)`` per tick, using
        0.5 for ticks whose combined volume is not positive.
        Both are NaN when the corresponding inputs are empty, or when a
        ``ValueError``/``TypeError`` is raised during the calculation.
    """
    try:
        quote_gaps = [ask - bid for ask, bid in zip(ask_list, bid_list)]
        mean_gap = np.mean(quote_gaps) if quote_gaps else np.nan

        imbalances = []
        for bid_vol, ask_vol in zip(bid_vol_list, ask_vol_list):
            # 0.5 is the neutral value used when there is no resting volume.
            imbalances.append(bid_vol / (bid_vol + ask_vol) if (bid_vol + ask_vol) > 0 else 0.5)
        mean_imbalance = np.mean(imbalances) if imbalances else np.nan

        factors = {'avg_spread': mean_gap, 'avg_obi': mean_imbalance}
    except (ValueError, TypeError):
        factors = {'avg_spread': np.nan, 'avg_obi': np.nan}

    return pd.Series(factors)

def calculate_price_factors(close_list, volume_list):
    """Compute volume-weighted price sums and a price/volume profile for a tick group.

    Returns:
        A ``pd.Series`` with
        ``sum_price_volume`` - sum of ``price * volume``,
        ``sum_price_sq_volume`` - sum of ``price**2 * volume``, and
        ``pv_list`` - list of single-entry ``{price: volume}`` dicts, one per
        distinct close price, ordered from highest price to lowest.
        All three are NaN if the calculation raises one of the caught
        exception types (for example a value that cannot be cast to float).
    """
    try:
        first_moment = sum(float(price) * float(size) for price, size in zip(close_list, volume_list))
        second_moment = sum((float(price) ** 2) * float(size) for price, size in zip(close_list, volume_list))

        # Accumulate traded volume per distinct close price (keys stay as given).
        traded_at = defaultdict(float)
        for price, size in zip(close_list, volume_list):
            traded_at[price] += float(size)

        profile = [{price: traded_at[price]} for price in sorted(traded_at, reverse=True)]

        factors = {
            'sum_price_volume': first_moment,
            'sum_price_sq_volume': second_moment,
            'pv_list': profile
        }
    except (ValueError, ZeroDivisionError, TypeError, IndexError):
        factors = {
            'sum_price_volume': np.nan,
            'sum_price_sq_volume': np.nan,
            'pv_list': np.nan
        }

    return pd.Series(factors)

def calculate_capital_factors(bid_list, ask_list, bid_vol_list, ask_vol_list):
    """Compute notional and volume deltas between the bid and ask side of a tick group.

    Returns:
        A ``pd.Series`` with
        ``monetary_delta`` - sum of ``bid * bid_vol`` minus sum of ``ask * ask_vol``, and
        ``volume_delta`` - sum of bid volumes minus sum of ask volumes.
        Both are NaN if the calculation raises one of the caught exception types.
    """
    try:
        bid_notional = [price * size for price, size in zip(bid_list, bid_vol_list)]
        ask_notional = [price * size for price, size in zip(ask_list, ask_vol_list)]

        factors = {
            'monetary_delta': sum(bid_notional) - sum(ask_notional),
            'volume_delta': sum(bid_vol_list) - sum(ask_vol_list),
        }
    except (ValueError, ZeroDivisionError, TypeError):
        factors = {
            'monetary_delta': np.nan,
            'volume_delta': np.nan,
        }

    return pd.Series(factors)

def pre_process(df1, data_type, date_start, date_end):
    """Aggregate tick rows to one row per second and derive 1-minute factor bars.

    Args:
        df1: Tick DataFrame with at least the columns ``ts``, ``close``,
            ``volume``, ``bid_price``, ``ask_price``, ``bid_volume``,
            ``ask_volume`` and ``tick_type``.  Its ``ts`` column is converted
            to datetime in place.
        data_type: Session filter applied to the per-second time grid:
            ``'s_day'`` keeps 09:00:00-13:30:00, ``'f_day'`` keeps
            08:45:00-13:45:00, ``'f_night'`` keeps 08:45:00-13:45:00 plus
            everything from 15:00:00 through 05:00:00.
        date_start: Start of the per-second grid (passed to ``pd.date_range``).
        date_end: End of the per-second grid (passed to ``pd.date_range``).

    Returns:
        ``(per_second, per_minute)`` where ``per_second`` is indexed by ``ts``
        and holds the per-second aggregates and factors (rows with NaN or with
        no non-zero bid/ask price removed), and ``per_minute`` holds the
        factors resampled to 1 minute with the index moved forward by one
        minute (rows with NaN removed).

    Raises:
        ValueError: If ``data_type`` is not one of the supported values.
    """
    # Fail fast: an unsupported session type used to surface only after all
    # of the aggregation below, as a NameError on ``valid_time``.
    supported_types = ('s_day', 'f_day', 'f_night')
    if data_type not in supported_types:
        raise ValueError(
            f"Unsupported data_type {data_type!r}; expected one of {supported_types}"
        )

    df1['ts'] = pd.to_datetime(df1['ts'])  # make sure ts is a datetime column

    # Group ticks by second using named aggregation.
    df1 = df1.groupby(df1['ts'].dt.floor('s')).agg(
        close=('close', 'last'),
        volume=('volume', 'sum'),  # total traded volume in the second
        close_list=('close', list),
        volume_list=('volume', list),  # per-tick volumes
        bid_price=('bid_price', lambda x: tuple(sorted(filter(lambda price: price != 0, x)))),
        ask_price=('ask_price', lambda x: tuple(sorted(filter(lambda price: price != 0, x)))),
        bid_list=('bid_price', list),
        ask_list=('ask_price', list),
        bid_vol_list=('bid_volume', list),
        ask_vol_list=('ask_volume', list),
        # tick_type 1 -> +1, 2 -> -1, anything else -> 0
        tick_type=('tick_type', lambda x: [1 if t == 1 else -1 if t == 2 else 0 for t in x])
    ).reset_index()

    # Per-second factors derived from the tick lists.
    df1['flow_imbalance'] = df1.apply(lambda row: analyze_tick_types(row['tick_type'], row['volume_list']), axis=1)
    liquidity_features = df1.apply(lambda row: calculate_liquidity_factors(row['bid_list'], row['ask_list'], row['bid_vol_list'], row['ask_vol_list']), axis=1)
    df1 = pd.concat([df1, liquidity_features], axis=1)

    price_features = df1.apply(lambda row: calculate_price_factors(row['close_list'], row['volume_list']), axis=1)
    df1 = pd.concat([df1, price_features], axis=1)

    capital_features = df1.apply(lambda row: calculate_capital_factors(row['bid_list'], row['ask_list'], row['bid_vol_list'], row['ask_vol_list']), axis=1)
    df1 = pd.concat([df1, capital_features], axis=1)

    # The raw per-tick lists are no longer needed once the factors exist.
    df1.drop(columns=['tick_type', 'volume_list', 'bid_list', 'ask_list', 'bid_vol_list', 'ask_vol_list', 'close_list'], inplace=True)

    # Full per-second grid between date_start and date_end.
    time_range = pd.date_range(date_start, date_end, freq='s')
    full_time_df = pd.DataFrame(time_range, columns=['ts'])

    if data_type == 's_day':
        # Keep only 09:00:00-13:30:00.
        valid_time_range = full_time_df['ts'].dt.time.between(pd.to_datetime('09:00:00').time(), pd.to_datetime('13:30:00').time())
        valid_time = full_time_df[valid_time_range]

    elif data_type == 'f_day':
        # Keep only 08:45:00-13:45:00.
        valid_time_range = full_time_df['ts'].dt.time.between(pd.to_datetime('08:45:00').time(), pd.to_datetime('13:45:00').time())
        valid_time = full_time_df[valid_time_range]

    else:  # 'f_night' (data_type was validated above)
        t1 = pd.to_datetime('08:45:00').time()
        t2 = pd.to_datetime('13:45:00').time()
        t3 = pd.to_datetime('15:00:00').time()
        t4 = pd.to_datetime('05:00:00').time()

        # Element-wise comparison on the time-of-day part.
        time_series = full_time_df['ts'].dt.time
        valid_time = full_time_df[
            ((time_series >= t1) & (time_series <= t2)) |  # day session
            ((time_series >= t3) | (time_series <= t4))   # night session (wraps past midnight)
        ]

    # Left-join the per-second data onto the valid time grid so that only
    # seconds inside the selected sessions remain.
    mer_ori_data = pd.merge(valid_time, df1, on='ts', how='left')

    mer_ori_data.set_index('ts', inplace=True)
    mer_ori_data = mer_ori_data.dropna()

    extra_df = mer_ori_data.resample('1min').agg({
        'flow_imbalance': 'sum',
        'avg_spread': 'mean',
        'avg_obi': 'mean',
        'sum_price_volume': 'sum',
        'sum_price_sq_volume': 'sum',
        'monetary_delta': 'sum',
        'volume_delta': 'sum',
        'pv_list': aggregate_pv_list
    })

    # Move the minute label forward by one minute so it lines up with the
    # 1-minute K bars these factors are merged with later.
    extra_df.index = extra_df.index + pd.Timedelta(minutes=1)

    # Drop seconds whose bid or ask tuple is empty (limit-up / limit-down).
    mer_ori_data = mer_ori_data[(mer_ori_data['bid_price'].map(len) > 0) & (mer_ori_data['ask_price'].map(len) > 0)]

    return mer_ori_data.dropna(), extra_df.dropna()

def analyze_relationship(series1, series2, significance_level=0.05):
    """Summarise the correlation and cointegration between two series.

    Returns:
        ``(correlation, correlation_strength, p_value, cointegration_status)``:
        the Pearson correlation from ``np.corrcoef``, a label chosen from the
        absolute correlation (> 0.8, > 0.5, > 0.3, otherwise the weakest
        label), the p-value returned by ``coint``, and a label stating
        whether that p-value is at or below ``significance_level``.
    """
    correlation = np.corrcoef(series1, series2)[0, 1]
    magnitude = abs(correlation)

    # Walk the thresholds from strongest to weakest; the first hit wins.
    strength_bands = (
        (0.8, "高度相關"),
        (0.5, "中度相關"),
        (0.3, "低度相關"),
    )
    correlation_strength = "幾乎無相關"
    for lower_bound, label in strength_bands:
        if magnitude > lower_bound:
            correlation_strength = label
            break

    # Only the p-value of the cointegration test is used.
    p_value = coint(series1, series2)[1]
    cointegration_status = (
        "存在共整合關係" if p_value <= significance_level else "不存在共整合關係"
    )

    return correlation, correlation_strength, p_value, cointegration_status

def convert_ohlcv(df, freq=60):
    """Aggregate 1-minute bars into ``freq``-minute bars, session by session.

    ``df`` must be indexed by timestamp and provide the columns ``open``,
    ``high``, ``low``, ``close``, ``volume`` and the factor columns
    ``flow_imbalance``, ``avg_spread``, ``avg_obi``, ``sum_price_volume``,
    ``sum_price_sq_volume``, ``monetary_delta``, ``volume_delta`` and
    ``pv_list``.  ``session_type`` / ``session_start`` columns are written
    onto the passed-in frame.

    Steps: classify each row into a day or night session, fill missing
    minutes inside each session with flat zero-volume bars, shift the index
    back by one minute, aggregate each session into ``freq``-minute windows
    starting at ``session_start``, add per-session cumulative columns, and
    finally shift every column down by one row so each timestamp carries the
    values of the previous bar (rows containing NaN are dropped).

    Returns:
        DataFrame indexed by ``ts`` (also kept as a column) with the
        aggregated bars.
    """
    # Session boundaries used for session_type and session_start.
    # NOTE(review): t3 is 05:00 and t4 is 15:00 here, so night sessions are
    # keyed by a 05:00 ``session_start`` and windows are anchored there;
    # confirm this is intended for ``freq`` values other than 60.
    t1 = datetime.strptime("08:45", "%H:%M").time()
    t2 = datetime.strptime("13:45", "%H:%M").time()
    t3 = datetime.strptime("05:00", "%H:%M").time()
    t4 = datetime.strptime("15:00", "%H:%M").time()
    
    def classify_session(timestamps):
        # Returns a DataFrame with session_type ("day"/"night"/"other") and
        # session_start for every timestamp; on any exception it prints the
        # error and returns both columns filled with None.
        try:
            # 1. Drop timezone information when the series is tz-aware.
            timestamps = timestamps.dt.tz_localize(None) if timestamps.dt.tz is not None else timestamps
            
            # 2. Split into time-of-day and calendar date.
            times = timestamps.dt.time
            dates = timestamps.dt.date

            # Defaults: every row is "other" with no session start.
            session_type = pd.Series("other", index=timestamps.index)
            session_start = pd.Series(pd.NaT, index=timestamps.index)
        
            # Day session: t1 <= time <= t2, starting at t1 on the same date.
            day_mask = (times >= t1) & (times <= t2)
            session_type.loc[day_mask] = "day"
            session_start.loc[day_mask] = pd.to_datetime(
                dates[day_mask].astype(str) + " " + t1.strftime("%H:%M:%S")
            )
    
            # Night session, same calendar date: time >= t3 and not in the day session.
            night_mask1 = (times >= t3) & ~day_mask
            session_type.loc[night_mask1] = "night"
            session_start.loc[night_mask1] = pd.to_datetime(
                dates[night_mask1].astype(str) + " " + t3.strftime("%H:%M:%S")
            )
    
            # Night session belonging to the previous date: remaining rows with time <= t4.
            night_mask2 = (times <= t4) & ~day_mask & ~night_mask1
            session_type.loc[night_mask2] = "night"
            prev_dates = (timestamps[night_mask2] - timedelta(days=1)).dt.date
            session_start.loc[night_mask2] = pd.to_datetime(
                prev_dates.astype(str) + " " + t3.strftime("%H:%M:%S")
            )

            return pd.DataFrame({"session_type": session_type, "session_start": session_start})

        except Exception as e:
            print(f"時間轉換出現錯誤: {e}")
            return pd.DataFrame({"session_type": None, "session_start": None}, index=timestamps.index)
    
    result = classify_session(df.index.to_series())
    df.loc[:, ["session_type", "session_start"]] = result[["session_type", "session_start"]]
    # Rows outside both sessions are discarded.
    df = df[df["session_type"].isin(["day", "night"])]

    # Fill in missing 1-minute bars within a session.
    def fill_missing_minutes(df_session, session_start, session_type):
        # Trading range of the session.
        if session_type == "day":
            start_time = datetime.combine(session_start.date(), t1)
            end_time = datetime.combine(session_start.date(), t2)
        else:  # night: t4 on the session date through t3 on the following date
            start_time = datetime.combine(session_start.date(), t4)
            end_time = datetime.combine(session_start.date() + timedelta(days=1), t3)

        # Complete 1-minute grid for the session.
        full_time_index = pd.date_range(start=start_time, end=end_time, freq="1min")
        existing_times = df_session.index

        # Grid points with no bar.
        missing_times = [t for t in full_time_index if t not in existing_times]
        
        if missing_times:
            # Build a flat filler bar for each missing minute.
            missing_data = []
            last_valid_row = None
            for t in missing_times:
                # Use the bar one minute earlier when it exists; otherwise keep
                # carrying the most recent one found (gaps before the first
                # real bar are left unfilled).
                prev_time = t - timedelta(minutes=1)
                if prev_time in df_session.index:
                    last_valid_row = df_session.loc[prev_time]
                if last_valid_row is not None:
                    missing_data.append({
                        "ts": t,
                        "open": last_valid_row["close"],
                        "high": last_valid_row["close"],
                        "low": last_valid_row["close"],
                        "close": last_valid_row["close"],
                        "volume": 0,
                        "amount": 0,
                        "complete": True,
                        "session_type": session_type,
                        "session_start": session_start
                    })

            # Merge the filler bars into the session; columns not listed above
            # (the factor columns) are NaN on filler rows after the concat.
            if missing_data:
                missing_df = pd.DataFrame(missing_data).set_index("ts")
                df_session = pd.concat([df_session, missing_df]).sort_index()

        return df_session

    # Fill gaps session by session.
    df_filled = []
    for session_start, session_data in df.groupby("session_start"):
        session_type = session_data["session_type"].iloc[0]
        session_data = fill_missing_minutes(session_data, session_start, session_type)
        df_filled.append(session_data)

    if df_filled:
        df = pd.concat(df_filled).sort_index()

    # Shift every timestamp back by one minute before windowing
    # (presumably the input stamps each bar with its end time; confirm).
    df.index = df.index - pd.Timedelta(minutes=1)

    # Length of one output bar.
    window = timedelta(minutes=freq)

    # Aggregate each session separately.
    result = []

    for session_start, session_data in df.groupby("session_start"):
        current_time = session_start
        max_time = session_data.index.max()
        
        session_result = []  # bars of the current session

        while current_time < max_time:
            next_time = current_time + window
            window_data = session_data[(session_data.index >= current_time) & (session_data.index < next_time)]

            if not window_data.empty:
                o = window_data["open"].iloc[0]
                h = window_data["high"].max()
                l = window_data["low"].min()
                c = window_data["close"].iloc[-1]
                v = window_data["volume"].sum()
                a = c * v
                # Complete when the window's last row reaches its final minute.
                complete = window_data.index[-1] >= next_time - timedelta(minutes=1)
                
                # Aggregate the extra factor columns over the window.
                flow_imbalance_agg = window_data['flow_imbalance'].sum()
                avg_spread_agg = window_data['avg_spread'].mean()
                avg_obi_agg = window_data['avg_obi'].mean()
                sum_price_volume = window_data['sum_price_volume'].sum()
                sum_price_sq_volume = window_data['sum_price_sq_volume'].sum()
                monetary_delta_agg = window_data['monetary_delta'].sum()
                volume_delta_agg = window_data['volume_delta'].sum()
                pv_list_agg = aggregate_pv_list(window_data['pv_list'])

                session_result.append({
                    "ts": current_time,
                    "open": o,
                    "high": h,
                    "low": l,
                    "close": c,
                    "volume": v,
                    "amount": a,
                    "flow_imbalance": flow_imbalance_agg,
                    "avg_spread": avg_spread_agg,
                    "avg_obi": avg_obi_agg,
                    "sum_price_volume": sum_price_volume,
                    "sum_price_sq_volume": sum_price_sq_volume,
                    "monetary_delta": monetary_delta_agg,
                    "volume_delta": volume_delta_agg,
                    "pv_list": pv_list_agg,
                    "complete": complete
                })

            current_time = next_time
            
        # Cumulative columns are computed within the session only.
        if session_result:
            session_df = pd.DataFrame(session_result)
            session_df["acc_vol"] = session_df["volume"].cumsum()  # cumulative volume within the session
            session_df["acc_price_volume"] = session_df["sum_price_volume"].cumsum()  # cumulative sum_price_volume
            session_df["acc_price_sq_volume"] = session_df["sum_price_sq_volume"].cumsum()  # cumulative sum_price_sq_volume
            result.extend(session_df.to_dict('records'))

    # Assemble the output frame.
    agg_df = pd.DataFrame(result)
    agg_df.set_index("ts", inplace=True, drop=False)
    # Shift all columns (including the retained ``ts`` column) down one row,
    # so each index label holds the previous bar's values; the first row and
    # any row containing NaN are dropped.
    agg_df = agg_df.shift(1).dropna()

    return agg_df

def combine_daily_k_bars(df):
    """Collapse intraday bars into one bar per calendar date, lagged by one day.

    The input is copied, its index is converted to datetimes, and rows are
    grouped by the date part of the index.  Each daily bar takes the first
    ``open``, max ``high``, min ``low``, last ``close`` and summed ``volume``;
    ``complete`` is forced to ``True`` and ``amount`` is ``close * volume``.
    The result is indexed by ``ts`` (midnight of each date) and shifted down
    one row, so each date carries the previous date's bar; rows with NaN
    (including the first date) are dropped.
    """
    bars = df.copy()
    bars.index = pd.to_datetime(bars.index)
    bars['date'] = bars.index.date

    daily_rules = {
        'open': 'first',            # first bar's open of the day
        'high': 'max',              # highest price of the day
        'low': 'min',               # lowest price of the day
        'close': 'last',            # last bar's close of the day
        'volume': 'sum',            # total volume of the day
        'complete': lambda x: True  # always flagged complete
    }
    daily = bars.groupby('date').agg(daily_rules)
    daily['amount'] = daily['close'] * daily['volume']

    # Swap the date index for a datetime ``ts`` index.
    daily = (
        daily.reset_index()
        .assign(ts=lambda frame: pd.to_datetime(frame['date']))
        .set_index('ts')
        .drop('date', axis=1)
    )

    return daily.shift(1).dropna()

def process_multiple_datasets(dataset1, dataset2, expensive_commodity, cheap_commodity):
    """Load tick and 1-minute bar files for two commodities and merge them.

    Args:
        dataset1: Iterable of ``(type_of_data, data_name, date_start,
            date_end, data_type)`` tuples describing tick files; each file is
            loaded with ``prepare_data`` and processed with ``pre_process``.
        dataset2: Iterable of ``(type_of_data, data_name)`` tuples describing
            1-minute bar files; these are matched on the commodity prefix
            followed by ``'k'``.
        expensive_commodity: File-name prefix of the first commodity.
        cheap_commodity: File-name prefix of the second commodity.

    Returns:
        ``(df, df3, df4)``: the per-second data of both commodities inner-joined
        on timestamp (columns suffixed ``_df1`` / ``_df2``), and the 1-minute
        bars of the first and second commodity, each inner-joined with that
        commodity's 1-minute factor frame.
    """
    def _stack(frames):
        # Concatenate the collected frames, or fall back to an empty frame.
        return pd.concat(frames) if frames else pd.DataFrame()

    def _last_per_timestamp(frame):
        # Resolve duplicate timestamps by keeping the last row for each.
        return frame if frame.empty else frame.groupby(frame.index).last()

    def _attach_factors(bars, factors):
        # Sort, drop incomplete rows and join the minute factors onto the bars.
        if bars.empty:
            return bars
        return pd.merge(
            bars.sort_index().dropna(),
            factors,
            left_index=True,
            right_index=True,
            how="inner"
        )

    expensive_ticks, cheap_ticks = [], []
    expensive_factors, cheap_factors = [], []
    expensive_bars, cheap_bars = [], []

    # Tick files.
    for type_of_data, data_name, date_start, date_end, data_type in dataset1:
        raw = prepare_data(type_of_data, data_name)
        per_second, per_minute = pre_process(raw, data_type, date_start, date_end)
        if data_name.startswith(expensive_commodity):
            expensive_ticks.append(per_second)
            expensive_factors.append(per_minute)
        elif data_name.startswith(cheap_commodity):
            cheap_ticks.append(per_second)
            cheap_factors.append(per_minute)

    # 1-minute K-bar files (shioaji).
    for type_of_data, data_name in dataset2:
        bars = prepare_data(type_of_data, data_name).set_index('ts')
        if data_name.startswith(expensive_commodity + 'k'):
            expensive_bars.append(bars)
        elif data_name.startswith(cheap_commodity + 'k'):
            cheap_bars.append(bars)

    df1 = _last_per_timestamp(_stack(expensive_ticks))
    df2 = _last_per_timestamp(_stack(cheap_ticks))
    df3 = _last_per_timestamp(_stack(expensive_bars))
    df4 = _last_per_timestamp(_stack(cheap_bars))
    extra_data1 = _stack(expensive_factors)
    extra_data2 = _stack(cheap_factors)

    # Per-second data of both commodities, aligned on shared timestamps.
    df = pd.merge(df1, df2, left_index=True, right_index=True, how='inner', suffixes=('_df1', '_df2'))
    df = df.sort_index().dropna()

    df3 = _attach_factors(df3, extra_data1)
    df4 = _attach_factors(df4, extra_data2)

    return df, df3, df4

# Tick-level inputs: (type_of_data, data_name, date_start, date_end, data_type).
dataset1 = [
    ('shioaji/2025_0108', 'SOFR1', '2024-12-31 15:00:00', '2025-08-04 13:45:00', 'f_night'),
    ('shioaji/2025_0108', 'ZEFR1', '2024-12-31 15:00:00', '2025-08-04 13:45:00', 'f_night')
]

# 1-minute K-bar inputs: (type_of_data, data_name).
dataset2 = [
    ('shioaji/2025_0108', 'SOFR1k'),
    ('shioaji/2025_0108', 'ZEFR1k')
]

df, df3, df4 = process_multiple_datasets(dataset1, dataset2, 'ZEFR1', 'SOFR1') # pass the expensive commodity, then the cheap one; the larger value goes in the denominator so the ratio falls within 0~1

# Quick sanity check of the merged per-second frame.
print("Index:", df.head(3).index.tolist())
print("Columns:", df.columns.tolist())

[1;33mThank you for using SmartMoneyConcepts! ⭐ Please show your support by giving a star on the GitHub repository: [4;34mhttps://github.com/joshyattridge/smart-money-concepts[0m
Index: [Timestamp('2024-12-31 17:29:06'), Timestamp('2024-12-31 19:51:07'), Timestamp('2024-12-31 20:00:01')]
Columns: ['close_df1', 'volume_df1', 'bid_price_df1', 'ask_price_df1', 'flow_imbalance_df1', 'avg_spread_df1', 'avg_obi_df1', 'sum_price_volume_df1', 'sum_price_sq_volume_df1', 'pv_list_df1', 'monetary_delta_df1', 'volume_delta_df1', 'close_df2', 'volume_df2', 'bid_price_df2', 'ask_price_df2', 'flow_imbalance_df2', 'avg_spread_df2', 'avg_obi_df2', 'sum_price_volume_df2', 'sum_price_sq_volume_df2', 'pv_list_df2', 'monetary_delta_df2', 'volume_delta_df2']


In [2]:
def calculate_macd(price_series, fast_period=12, slow_period=26, signal_period=9):
    """Compute MACD, its signal line, the histogram and crossover markers.

    Args:
        price_series: ``pd.Series`` of prices.
        fast_period: Span of the fast EMA.
        slow_period: Span of the slow EMA.
        signal_period: Span of the EMA applied to the MACD line.

    Returns:
        DataFrame on ``price_series.index`` with columns ``MACD`` (fast EMA
        minus slow EMA), ``Signal`` (EMA of MACD), ``Histogram`` (MACD minus
        Signal) and ``Cross`` (1 where MACD moves above Signal after being at
        or below it on the previous row, -1 for the opposite move, else 0).
    """
    def _ema(series, span):
        return series.ewm(span=span, adjust=False).mean()

    macd_line = _ema(price_series, fast_period) - _ema(price_series, slow_period)
    signal_line = _ema(macd_line, signal_period)

    # Relative position on the previous row; the first row compares against
    # NaN, so it can never be flagged as a cross.
    was_at_or_below = macd_line.shift(1) <= signal_line.shift(1)
    was_at_or_above = macd_line.shift(1) >= signal_line.shift(1)

    crossed_up = (macd_line > signal_line) & was_at_or_below
    crossed_down = (macd_line < signal_line) & was_at_or_above

    cross = np.where(crossed_up, 1, np.where(crossed_down, -1, 0))

    result_df = pd.DataFrame({
        'MACD': macd_line,
        'Signal': signal_line,
        'Histogram': macd_line - signal_line,
        'Cross': cross
    })
    result_df.index = price_series.index
    return result_df

def align_time_series(df1, df2):
    """Restrict two time-indexed DataFrames to the timestamps they share.

    An index that is not already a ``DatetimeIndex`` is converted with
    ``pd.to_datetime`` and assigned back onto the frame that was passed in.

    Args:
        df1 (pd.DataFrame): First time-series frame.
        df2 (pd.DataFrame): Second time-series frame.

    Returns:
        tuple: ``(df1_aligned, df2_aligned)``, both selected on the
        intersection of the two indexes.

    Raises:
        ValueError: If the two indexes have no timestamp in common.
    """
    for frame in (df1, df2):
        if not isinstance(frame.index, pd.DatetimeIndex):
            frame.index = pd.to_datetime(frame.index)

    shared_times = df1.index.intersection(df2.index)
    if shared_times.empty:
        raise ValueError("No common timestamps found between df1 and df2")

    return df1.loc[shared_times], df2.loc[shared_times]


In [3]:
K_time = 60
CODE = ['ZEFR', 'SOFR']

# Build K_time-minute bars for each instrument and keep the shared timestamps.
df1_k, df2_k = align_time_series(convert_ohlcv(df3, K_time), convert_ohlcv(df4, K_time))

# Suffix every column with its instrument code, then put both side by side.
df1_k = df1_k.add_suffix(f'_{CODE[0]}')
df2_k = df2_k.add_suffix(f'_{CODE[1]}')
df_all = df1_k.join(df2_k, how='inner')
cols_to_drop = [col for col in df_all.columns if col.startswith('ts_')]

# Price relationship: log(1 + close ratio) of the first over the second instrument.
df_all['spread'] = np.log1p(df_all[f'close_{CODE[0]}'] / df2_k[f'close_{CODE[1]}'])

# Factor spreads: first instrument minus second, output column -> source factor.
_factor_spreads = {
    'flow_spread': 'flow_imbalance',
    'avg_spread': 'avg_spread',
    'avg_obi_spread': 'avg_obi',
    'sum_price_volume_spread': 'sum_price_volume',
    'sum_price_sq_volume_spread': 'sum_price_sq_volume',
    'monetary_delta_spread': 'monetary_delta',
    'volume_delta_spread': 'volume_delta',
}
for _target, _factor in _factor_spreads.items():
    df_all[_target] = df_all[f"{_factor}_{CODE[0]}"] - df_all[f"{_factor}_{CODE[1]}"]

df_all = df_all.drop(columns=cols_to_drop).dropna()
print(len(df_all))

1636


In [4]:
def fft_decompose(signal, fs):
    """Return the first-half frequency, amplitude and phase spectra of ``signal``.

    Args:
        signal: 1-D sequence of samples.
        fs: Sampling frequency; the sample spacing passed to ``fftfreq`` is ``1/fs``.

    Returns:
        ``(freqs, amplitude, phase, spectrum)`` where the first three hold the
        first ``N // 2`` bins (index 0 is the DC bin) and ``spectrum`` is the
        full complex FFT output.  Amplitude is ``2 / N * |spectrum|``.
    """
    n_samples = len(signal)
    spectrum = fft(signal)
    freqs = fftfreq(n_samples, 1/fs)          # length N, positive and negative frequencies
    amplitude = 2.0 / n_samples * np.abs(spectrum)
    phase = np.angle(spectrum)

    keep = slice(None, n_samples // 2)         # bins 0 .. N//2 - 1
    return freqs[keep], amplitude[keep], phase[keep], spectrum

def circular_weighted_mean(phases, weights):
    """Weighted circular mean of ``phases`` (radians).

    Returns:
        ``(mean_phase, R)``: the angle and the length of the weighted mean of
        the unit vectors ``exp(1j * phase)``.  When all weights are zero the
        unweighted mean is used instead.  Empty input yields ``(nan, 0.0)``.
    """
    if len(phases) == 0:
        return np.nan, 0.0

    w = np.array(weights, dtype=float)
    if np.all(w == 0):
        # No usable weights: fall back to a plain average of the unit vectors.
        resultant = np.mean(np.exp(1j * phases))
    else:
        resultant = np.sum(w * np.exp(1j * phases)) / np.sum(w)

    return np.angle(resultant), np.abs(resultant)

def lookback_transform(df_all, **params):
    """Rolling FFT-based divergence between the MACD histograms of two instruments.

    For every full window of ``k_lookback`` consecutive rows, the MACD
    histogram (8, 12, 5) of ``close_{CODE[0]}`` and ``close_{CODE[1]}`` is
    transformed with ``fft_decompose``; the per-frequency phase difference
    and amplitudes are combined into a divergence score.  Each result is
    stamped with the timestamp of the window's last row, so a row's value
    only uses that row and the ``k_lookback - 1`` rows before it.

    Args:
        df_all: DataFrame containing ``close_{CODE[0]}`` and ``close_{CODE[1]}``.
        **params: ``k_lookback`` (window length in rows) and ``k_time``
            (bar size, used as the sample spacing for the FFT).

    Returns:
        ``(result_df, per_freq_summary_df)``, both indexed by ``ts``.
        ``result_df`` has the column ``total_divergence``;
        ``per_freq_summary_df`` holds the per-frequency arrays and the
        weighted phase statistics of each window.  Both are empty (with
        their columns present) when ``df_all`` has fewer than ``k_lookback`` rows.
    """
    k_lookback = params.get('k_lookback')
    delta_t = params.get('k_time')
    fs = 1 / delta_t

    result_columns = ['ts', 'total_divergence']
    summary_columns = [
        'ts', 'xf', 'amp_a', 'amp_b', 'phase_a', 'phase_b', 'phase_diff',
        'divergence_per_freq', 'weighted_phase_diff', 'phase_consistency',
        'time_lag_seconds', 'time_lag_in_K',
    ]
    results = []
    per_freq_summary = []

    # Only slide over complete windows.  Iterating over every start index
    # produced truncated windows at the tail that were all stamped with the
    # final timestamp, creating duplicate ``ts`` entries (and therefore
    # duplicated rows once the result is joined back onto ``df_all``).
    for i in range(len(df_all) - k_lookback + 1):
        df_subset = df_all.iloc[i:i + k_lookback]
        window_end = df_subset.index[-1]

        a = calculate_macd(df_subset[f'close_{CODE[0]}'], 8, 12, 5)['Histogram'].dropna()
        b = calculate_macd(df_subset[f'close_{CODE[1]}'], 8, 12, 5)['Histogram'].dropna()

        xf_a, amp_a, phase_a, _ = fft_decompose(a.values, fs)
        _, amp_b, phase_b, _ = fft_decompose(b.values, fs)

        # Per-frequency phase difference wrapped into [-pi, pi).
        phase_diff = (phase_a - phase_b + np.pi) % (2*np.pi) - np.pi

        # Cosine-based divergence: 0 when in phase, largest when in anti-phase,
        # scaled by the combined amplitude at that frequency.
        divergence_per_freq = (amp_a + amp_b) * (1 - np.cos(phase_diff)) / 2

        # Scalar summary: total divergence over all frequencies.
        total_divergence = np.sum(divergence_per_freq)

        # Amplitude-weighted mean phase difference (lead/lag indicator).
        weights = amp_a + amp_b
        weighted_phase_diff, R = circular_weighted_mean(phase_diff, weights)

        # Convert the weighted phase difference into a time lag.
        # NOTE(review): the lag is expressed in the unit of ``k_time``; the
        # "seconds" in the key name is only correct if k_time is in seconds.
        if np.sum(weights) == 0:
            mean_freq = 0.0
        else:
            mean_freq = np.sum(weights * xf_a) / np.sum(weights)
        if mean_freq == 0:
            time_lag_seconds = np.nan
            time_lag_in_K = np.nan
        else:
            time_lag_seconds = weighted_phase_diff / (2 * np.pi * mean_freq)
            time_lag_in_K = time_lag_seconds / delta_t

        # Record the result on the window's last bar.
        results.append({'ts': window_end, 'total_divergence': total_divergence})

        # Keep the per-frequency divergence and phase details.
        per_freq_summary.append({
            'ts': window_end,
            'xf': xf_a,
            'amp_a': amp_a,
            'amp_b': amp_b,
            'phase_a': phase_a,
            'phase_b': phase_b,
            'phase_diff': phase_diff,
            'divergence_per_freq': divergence_per_freq,
            'weighted_phase_diff': weighted_phase_diff,
            'phase_consistency': R,
            'time_lag_seconds': time_lag_seconds,
            'time_lag_in_K': time_lag_in_K
        })

    # Explicit columns keep ``set_index('ts')`` valid when no window was produced.
    per_freq_summary_df = pd.DataFrame(per_freq_summary, columns=summary_columns).set_index('ts')
    result_df = pd.DataFrame(results, columns=result_columns).set_index('ts')
    return result_df, per_freq_summary_df

# Window length (in bars) and bar size handed to lookback_transform.
params = {
    'k_lookback': 15,
    'k_time': K_time
}

ftt, per_freq = lookback_transform(df_all, **params)
# Inner join: only timestamps that received a divergence value are kept.
df_all = df_all.join(ftt, how='inner')

In [5]:
def map_to_normal_distribution(similarity, scale=1):
    """Map a similarity value onto standard-normal quantiles.

    ``1 - similarity`` is squeezed from [0, 1] into [1e-6, 1] and passed
    through the inverse normal CDF, then multiplied by ``scale``.  A
    similarity of 0.5 therefore lands near 0, a similarity close to 1 gives
    the most negative value (about -4.75 * scale), and lower similarities
    give larger values.
    """
    eps = 1e-6
    dissimilarity = 1.0 - similarity
    return norm.ppf(dissimilarity * (1 - eps) + eps) * scale

def map_manhattan_to_normal(distance, max_distance=3, scale=1):
    """Map a Manhattan distance onto standard-normal quantiles.

    Args:
        distance: Manhattan distance value.
        max_distance: Distance that maps to the top of the [0, 1] range;
            the ratio ``distance / max_distance`` is clipped to [0, 1].
        scale: Multiplier applied to the resulting z-score.

    Returns:
        ``scale`` times the inverse normal CDF of the clipped ratio after it
        has been squeezed into [1e-6, 1].
    """
    eps = 1e-6
    fraction = np.clip(distance / max_distance, 0.0, 1.0)
    return norm.ppf(fraction * (1 - eps) + eps) * scale

def map_euclidean_to_normal(distance, max_distance=3, scale=1):
    """Map a Euclidean distance onto standard-normal quantiles.

    Args:
        distance: Euclidean distance value.
        max_distance: Distance that maps to the top of the [0, 1] range;
            the ratio ``distance / max_distance`` is clipped to [0, 1].
        scale: Multiplier applied to the resulting z-score.

    Returns:
        ``scale`` times the inverse normal CDF of the clipped ratio after it
        has been squeezed into [1e-6, 1].
    """
    eps = 1e-6
    clipped_ratio = np.clip(distance / max_distance, 0.0, 1.0)
    quantile = clipped_ratio * (1 - eps) + eps
    return norm.ppf(quantile) * scale

def simulate_realtime_vector(df_all, **params):
    """Score the candle-shape similarity of two instruments over consecutive windows.

    ``df_all`` is walked in non-overlapping steps of ``k_lookback`` rows
    (default 3); windows shorter than ``k_lookback`` are skipped.  For each
    instrument in ``params['code']`` a shape vector is built from the
    non-overlapping share of the window's price range plus each bar's upper
    and lower shadow ratios.  Tiny Gaussian noise is added, ``log1p`` is
    applied, and the cosine similarity of the two vectors is mapped through
    ``map_to_normal_distribution``.

    Returns:
        DataFrame indexed by ``ts`` (timestamp of each window's last row)
        with a single ``similarity`` column.
    """
    k_lookback = params.get('k_lookback', 3)  # defaults to 3 bars
    code = params.get('code')
    fields = ['open', 'close', 'low', 'high']

    def _window_bars(symbol, start, stop):
        # OHLC columns of one instrument, renamed to plain field names.
        bars = df_all[[f"{field}_{symbol}" for field in fields]].iloc[start:stop].copy()
        bars.columns = fields
        return bars

    def _shape_vector(bars):
        # Share of the window's total range not covered by every bar at once.
        total_range = bars['high'].max() - bars['low'].min()
        shared_range = max(0.0, bars['high'].min() - bars['low'].max())
        non_overlap = max(0.0, total_range - shared_range) / total_range

        # Per-bar range; the epsilon avoids dividing by zero on flat bars.
        bar_range = bars['high'] - bars['low'] + 1e-9

        # Upper / lower shadows as a fraction of each bar's range.
        upper_shadow = (bars['high'] - bars[['open', 'close']].max(axis=1)) / bar_range
        lower_shadow = (bars[['open', 'close']].min(axis=1) - bars['low']) / bar_range

        return np.concatenate([[non_overlap], upper_shadow.values, lower_shadow.values])

    rows = []
    for start in range(0, len(df_all), k_lookback):
        stop = min(start + k_lookback, len(df_all))
        bars_a = _window_bars(code[0], start, stop)
        bars_b = _window_bars(code[1], start, stop)

        # Skip incomplete trailing windows.
        if len(bars_a) < k_lookback or len(bars_b) < k_lookback:
            continue

        vec_a = _shape_vector(bars_a)
        vec_b = _shape_vector(bars_b)

        # Small perturbation so the two vectors are never exactly identical.
        vec_a += np.random.normal(0, 1e-6, size=vec_a.shape)
        vec_b += np.random.normal(0, 1e-6, size=vec_b.shape)

        vec_a = np.log1p(vec_a)
        vec_b = np.log1p(vec_b)

        # Distances; only the cosine-based score is written to the output.
        manhattan_dist = map_manhattan_to_normal(cityblock(vec_a, vec_b))
        euclidean_dist = map_euclidean_to_normal(euclidean(vec_a, vec_b))
        cosine_dist = map_to_normal_distribution(1 - cosine(vec_a, vec_b))

        rows.append({'ts': bars_a.index[-1], 'similarity': cosine_dist})

    return pd.DataFrame(rows).set_index('ts')

# Window length (in bars) and instrument codes for simulate_realtime_vector.
bt_params = {
    'k_lookback': 3,
    'code': CODE
}

data = simulate_realtime_vector(df_all, **bt_params)
# Infinite scores are replaced with 0 before dropping NaN rows.
data = data.replace([np.inf, -np.inf], 0).dropna()
# Left join: rows of df_all without a similarity score keep NaN in that column.
df_all = df_all.join(data, how='left')

In [6]:
qt = QuantileTransformer(output_distribution='normal')

# Refit the transformer on each spread column in turn and store the result
# under "<column>_trans".
for _spread_col in (
    'flow_spread',
    'avg_spread',
    'avg_obi_spread',
    'sum_price_volume_spread',
    'sum_price_sq_volume_spread',
    'monetary_delta_spread',
    'volume_delta_spread',
):
    df_all[f'{_spread_col}_trans'] = qt.fit_transform(df_all[[_spread_col]])

# The divergence score is squashed with arctan(sqrt(x)) instead.
df_all['ftt_trans'] = np.arctan(np.sqrt(df_all['total_divergence']))

In [8]:
import numpy as np
import json, os, random, re
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

# File that receives the per-window analysis records.
JSON_FILE = './history.json'

# Chat client talking to a locally served model through ChatOpenAI.
llm = ChatOpenAI(
    model="meta-llama/Llama-3.3-70B-Instruct",  # Hugging Face model name
    base_url="http://localhost:8000/v1",  # address of the local server
    api_key="sk-no-key-required",  # placeholder for servers that need no API key
    temperature=0,  # adjust as needed
)
def clean_think_content(text: str) -> str:
    """Return *text* with every ``<think>...</think>`` span removed and the ends trimmed.

    Matching is non-greedy and spans newlines, so several separate think
    sections are each removed individually. A ``<think>`` without a closing
    tag is left untouched.
    """
    think_span = re.compile(r"<think>.*?</think>", re.DOTALL)
    without_reasoning = think_span.sub("", text)
    return without_reasoning.strip()

def pv_list_profile(pv_list_df, bins=5):
    """Aggregate traded volume into equal-width price bins.

    Every cell of ``pv_list_df`` that is a list/tuple is scanned; each dict
    inside it is read as ``{price: volume}``. Cells and entries of any other
    type are ignored.

    Args:
        pv_list_df: DataFrame whose cells may hold lists of ``{price: volume}``
            dicts.
        bins: Number of equal-width price bins spanning the observed
            minimum..maximum price.

    Returns:
        ``None`` when no price/volume pair is found; otherwise a list of
        records ``{'bin': pandas.Interval, 'vol': float}``, one per bin
        (empty bins included), as produced by ``to_dict(orient='records')``.

    Raises:
        ValueError / TypeError: if a price or volume cannot be converted to
            float.
    """
    all_prices = []
    all_vols = []

    # Collect every (price, volume) pair from every column.
    for col in pv_list_df.columns:
        for pv_list in pv_list_df[col]:
            if not isinstance(pv_list, (list, tuple)):
                continue
            for d in pv_list:
                if not isinstance(d, dict):
                    continue  # skip malformed entries instead of crashing on .items()
                for price, vol in d.items():
                    # Coerce to float (same as aggregate_pv_list): keys may be
                    # strings, which would break min/max and np.linspace below.
                    all_prices.append(float(price))
                    all_vols.append(float(vol))

    if not all_prices:
        return None

    df = pd.DataFrame({'price': all_prices, 'vol': all_vols})

    # Build the bin edges.
    min_p, max_p = df['price'].min(), df['price'].max()
    if min_p == max_p:
        # A single distinct price would give identical edges and pd.cut would
        # raise "Bin edges must be unique"; widen the range slightly instead.
        pad = 0.001 * abs(min_p) if min_p != 0 else 0.001
        min_p, max_p = min_p - pad, max_p + pad
    bins_edges = np.linspace(min_p, max_p, bins + 1)
    df['bin'] = pd.cut(df['price'], bins=bins_edges, include_lowest=True)
    profile = df.groupby('bin', observed=False)['vol'].sum().reset_index()
    return profile.to_dict(orient='records')

# Wraps the per-window LLM analysis loop in a single function
def process_windows(df_all, llm, json_file, batch_size=10, factor_split_size=30, code=None):
    """Run the LLM attribution analysis over consecutive windows of ``df_all``.

    The frame is walked in non-overlapping steps of ``batch_size`` rows. For
    each step the previous ``batch_size`` rows form the analysis window and the
    following (up to) ``batch_size`` rows form the prediction window. For every
    factor group one prompt is built from the analysis-window spread, the
    analysis-window factors of that group and the prediction-window spread,
    sent to ``llm``, and the cleaned reply is recorded.

    Args:
        df_all: DataFrame holding a ``spread`` column and the factor columns.
            Its index entries must provide ``strftime`` (assumes a
            DatetimeIndex - TODO confirm against the caller).
        llm: Chat model object that can be piped after a ChatPromptTemplate.
        json_file: Path of the JSON file receiving the results. An existing
            file at this path is deleted first.
        batch_size: Number of rows per window and step between windows.
        factor_split_size: Maximum number of factor columns per prompt.
        code: Two-element sequence naming instruments A and B in the prompt.
            Defaults to the module-level ``CODE``.

    Returns:
        List of dicts with keys ``time_window``, ``factor_group`` and
        ``analysis``. When the LLM call raises, ``analysis`` holds the error
        text instead of a reply.

    Side effects:
        Shuffles the factor order with ``random.shuffle``, prints progress, and
        rewrites ``json_file`` after every LLM call.
    """
    if code is None:
        code = CODE  # fall back to the notebook-level instrument pair

    # Start from scratch: drop any result file left by a previous run.
    if os.path.exists(json_file):
        print(f"找到舊檔 {json_file}，將其刪除並重新開始分析。")
        os.remove(json_file)

    # Always begin with an empty result list.
    summaries = []
    
    # System prompt: defines the analyst role, the input layout and the required output.
    system_prompt = """
        你是一位頂尖的量化交易分析師，其務是對單一時間點的市場「橫截面快照」進行**法務鑑識等級的歸因分析**。
        你的目標是利用分析期(time_window)的因子和spread數據，**直接、明確地解釋**, 預測期(next_window)的未來報酬 (return) 是如何產生的。
        交易目標是「發散型配對交易」, 並最終形成一個具體的統計套利關係結果。

        【限制】:
        1. **僅使用輸入數據**：
            - 只能使用 `factor_group` 中提供的因子名稱。
            - **禁止引入任何未提供的指標或策略方法**（如 KD, MACD, Bollinger Bands等）。
            - 分析語言必須完全基於數據，不得包含策略、理論或技術分析方法描述。
        2. **你的所有分析都必須明確引用輸入數據中的「時間範圍」，將你的發現錨定在具體的時間上下文中, 並且在執行分析的時候, 必須使用最左欄的 ts 作為時間軸, 依照ts時間由舊到新, 審視時間上的變化。**
        
        【數據結構定義】:
        你收到的數據分為兩部分：
        1.  **分析期 (time_window)**: 時間範圍 (time_window), 當前spread走勢及其包含的所有因子時間序列數據。你的所有分析都基於這個時期的數據變化。
        2.  **預測期 (next_window)**: 時間範圍 (next_window), return的數值是未來的時間。這是分析期 (time_window) 行為所導致的「結果」。
        
        【任務】：
        1.**你的任務就是連接 time_window spread走勢和factor_group的「因」vs next_window 的return走勢「果」。**
        2. 使用「分析期走勢」去預測「預測期走勢」
        
      
        【請遵守以下規則】：
        **第一步事實陳述:趨勢與事件 (Factual Statement: Trends & Events)**
        -   **任務**: 在進行任何分析前，首先對輸入的因子進行事實分類和陳述。
        -   **輸出格式**:
            -   **1a. 連續型因子趨勢**: 對於**連續變化**的因子，按以下格式逐行列出：
                -   **因子 [因子名稱]**: 在分析期time_window, (`[時間範圍起始]` 到 `[時間範圍結束]`)，其數值從 `[起始數值]` **變化為** `[結束數值]`。
                -   **中間變化包括**： 簡要描述關鍵轉折點或子趨勢，例如 '在[時間點]達到峰值[數值]後下降' 或 '多次正負交叉, 整體盤勢為 `[上升/下降/震盪]`。
                -   **盤勢判定範例**: 
                    - 上升: [10, 11, 10.5, 11.5, 12, 11, 13]
                    - 下降: [10, 9.5, 9, 8.5, 8, 8, 7]
                    - 震盪: [10, 11, 10, 9.5, 9, 10, 10.5]
                    
            -   **1b. 事件型因子出現**: 對於**通常為 N/A 的稀有因子**（如 similarity），按以下格式陳述：
                -   **因子 [因子名稱]**: 在分析期time_window, 的 `[出現的具體時間點]`，出現`[數值]`。
                -   **因子變化**: 紀錄再因子出現後, 和spread之間的變化
                    [紀錄範例]
                        similarity數據: 2025-01-01 11:00 => 0.7
                        本次分析期 2025-01-01 9:00 ~ 2025-01-01 11:00
                        在11:00時, 因子similarity出現0.7

        **第二步核心歸因：趨勢或事件驅動 (Core Attribution: Trend or Event-Driven)**
        -   **任務**: 
            - 建立一條從「因子變化」或「市場事件」到「未來報酬」的直接因果鏈。
            - **如果第一步中識別出「市場事件」，你必須優先使用「事件驅動歸因」範本來構建你的解釋。**
            - 把因子的整體走勢, 或者事件出現後因子的變化彼此的關係進行總結。
            
        -   **輸出格式**: 請參考以下範本，為分別生成歸因陳述：
            -   **範本一：因子全為連續型因子**
                -   **歸因**:
                    -   **預測目標**: 預測期(next_window) 的 `return` 為 `[填入 return 的值]`。
                    -   **核心歸因**: 
                        - 此未來報酬**直接歸因於**分析期(time_window) (`[時間範圍]`) 內 **[因子名稱]** 的趨勢。**具體來說**，該因子從 `[起始時間]` 到 `[結束時間]`，其數值從 `[起始數值]` **變化為** `[結束數值]`，顯示出強烈的 `[看漲/看跌]` 信號。此信號因當時價格處於 `[盤勢情境]` 而被 `[增強/減弱]`。
                        - 分析此段時間內因子的盤勢:
                        [紀錄範例]
                            本次分析期 2025-01-01 9:00 ~ 2025-01-01 12:00
                            flow_spread_trans 和 avg_spread_trans盤勢皆為震盪
                            avg_obi_spread_trans盤勢為上升 和 flow_spread_trans盤勢為下降, 兩因子走勢整體出現背離
                            
            -   **範本二：因子由事件因子與連續因子組成**
                -   **歸因**:
                    -   **預測目標**: 預測期(next_window) 的 `return` 為 `[填入 return 的值]`。
                    -   **核心歸因**: 此未來報酬被認為是**對分析期 (time_window) 內一個關鍵市場事件的直接反應**：
                        -   **觸發事件**: 在 `[事件發生的具體時間]`，**`[事件因子名稱]`** 出現，出現`[數值]` 信號 (數值為 `[因子值]`)。
                        -   **衝擊分析**: 這個高強度事件的出現，其信號意義和其他連續型因子的影響, 在時間範圍內的因子在時間段內同步(異步)盤勢, 直接對未來報酬走勢有影響。
                           [範例]
                                本次分析期 2025-01-01 9:00 ~ 2025-01-01 12:00
                                在similarity出現0.7的時刻(11:00)
                                flow_spread_trans的數值由`[前一個時刻(10:00)]`到`[後續時刻(11:00後)]`, 數值由`[數值]`變換為`[數值]`, 因子similarity出現後flow_spread_trans數值上升
                                avg_obi_spread_trans的數值由`[前一個時刻(10:00)]`到`[後續時刻(11:00後)]`, 數值由`[數值]`變換為`[數值]`, 因子similarity出現後avg_obi_spread_trans數值下降
                                avg_spread_trans的數值由`[前一個時刻(10:00)]`到`[後續時刻(11:00後)]`, 數值由`[數值]`變換為`[數值]`, 因子similarity出現後avg_spread_trans數值先上升(到11:30), 後又下降(到12:00)

        **第三步因果關係分析: 統計套利綜合研判 (Statistical Arbitrage Synthesis)**
        -   **任務**: 綜合第二步對因子的歸因（無論是趨勢還是事件驅動），形成一個最終的、可操作的「發散型配對交易」建議。
        -   **輸出格式**: 你必須輸出以下內容, 包含驅動原因, 信心強度. 貢獻排名：
            -   **綜合交易建議**:
                -   **核心邏輯**: 當前spread的走勢主要由 **`[趨勢/事件：描述spread主要驅動，例如：因子flow_spread_trans的下降趨勢 + similarity變高]`** 驅動而導致未來的return走勢呈現 `[看漲/看跌]`。
                -   **信心強度**: `[高 / 中 / 低]` (如果由「市場事件」驅動，信心強度通常更高)
                -   **關鍵觀察點**: 觀察(time_window)的spread走勢和因子, 對於未來return的關係, 使得未來return走勢會上漲或下跌。
                -   **對於本次factor_group中有預測貢獻的因子進行排名輸出**
                    [貢獻輸出輸出格式舉例如下:]
                        本次分析期 2025-01-01 9:00 ~ 2025-01-01 12:00 (spread)
                        輸入因子組: ['flow_spread_trans', 'avg_spread_trans', 'avg_obi_spread_trans']
                        對於未來走勢預測貢獻排名:
                        1. flow_spread_trans
                        2. avg_spread_trans
                        3. avg_obi_spread_trans

        請根據以上框架，對接下來的數據進行分析，確保所有結論都包含明確的時間參考, 需要再輸出時, 將因子變化的時間段和未來報酬的時間段, 也要一併輸出便於後續檢驗。
      """
      
    # Step 1: name the price column(s) and the candidate factor columns.
    ohlc_cols = ['spread']
    factor_cols = [
        'flow_spread_trans', 'avg_spread_trans', 'avg_obi_spread_trans', 
        'sum_price_volume_spread_trans', 'sum_price_sq_volume_spread_trans',
        'monetary_delta_spread_trans', 'volume_delta_spread_trans', 'ftt_trans', 'similarity'
    ]
    
    # Step 2: keep only the columns that actually exist in df_all.
    available_cols = set(df_all.columns)
    k_cols = [col for col in ohlc_cols if col in available_cols]
    selected_factor_cols = [col for col in factor_cols if col in available_cols]

    # Step 3: shuffle the factors, then cut them into groups of factor_split_size.
    random.shuffle(selected_factor_cols)
    factor_groups = [selected_factor_cols[j:j + factor_split_size] for j in range(0, len(selected_factor_cols), factor_split_size)]

    # The prompt template and chain do not depend on the window, so build them once.
    chat_prompt = ChatPromptTemplate.from_messages([
        ("system", system_prompt),
        ("human", "{input_data}")  # placeholder filled per call
    ])
    chain = chat_prompt | llm

    for i in range(batch_size, len(df_all), batch_size):
        # window_prev: the batch_size rows immediately before position i
        window_prev = df_all.iloc[i - batch_size:i]

        # window_next: up to batch_size rows starting at position i.
        # range() stops before len(df_all), so this slice always has >= 1 row.
        window_next = df_all.iloc[i:i + batch_size]

        start_time = window_prev.index[0].strftime('%Y-%m-%d %H:%M:%S')
        end_time = window_prev.index[-1].strftime('%Y-%m-%d %H:%M:%S')
        time_window = f"{start_time} to {end_time}"
        # to_string() renders every row regardless of pandas display options,
        # matching how the factor table is rendered below.
        prev_spread_str = window_prev[k_cols].to_string(index=True, max_rows=None)
        
        start_next_time = window_next.index[0].strftime('%Y-%m-%d %H:%M:%S')
        end_next_time = window_next.index[-1].strftime('%Y-%m-%d %H:%M:%S')
        next_window = f"{start_next_time} to {end_next_time}"
        next_spread_str = window_next[k_cols].to_string(index=True, max_rows=None)

        for g_idx, factor_group in enumerate(factor_groups):
            partial_factors_str = window_prev[factor_group].to_string(index=True, max_rows=None)

            # Render the window as a plain-text block. The label is
            # `factor_group`, the key the system prompt refers to.
            human_prompt_content = f"""
                請分析以下時間窗口的數據：
                **商品組**: A={code[0]}, B={code[1]}

                **分析期時間範圍**: {time_window}
                    - K棒
                    spread={prev_spread_str}
                    - 因子 (factor)
                    factor_group={partial_factors_str}
                
                **預測期時間範圍**: {next_window}
                    - Return (未來報酬參考)
                    return={next_spread_str}

                請根據以上數據，使用「分析時間段走勢」去預測「預測期時間走勢」。
            """

            # Call the LLM; a failure is recorded in place of the analysis so
            # the remaining windows are still processed.
            print(f"正在分析窗口：{time_window}...")
            try:
                response = chain.invoke({"input_data": human_prompt_content})
                summary = clean_think_content(response.content.strip())
            except Exception as e:
                summary = f"錯誤：{e}"
                print(f"呼叫 LLM 時發生錯誤: {e}")

            # Store the result as a dict.
            summaries.append({
                "time_window": time_window,
                "factor_group": f"{g_idx} - {factor_group}",
                "analysis": summary
            })

            # Rewrite the whole JSON file after every call so progress
            # survives an interruption.
            with open(json_file, 'w', encoding='utf-8') as f:
                json.dump(summaries, f, ensure_ascii=False, indent=4)
            
        print(f"完成窗口：{time_window}")

    print("\n預處理完成，所有總結存於", json_file)
    return summaries

# Run the windowed LLM analysis over df_all: 10-row windows, at most 3 factors
# per prompt. Results are written to JSON_FILE and also kept in `summaries`.
summaries = process_windows(df_all, llm, JSON_FILE, batch_size=10, factor_split_size=3)

NameError: name 'SystemMessage' is not defined