# In [2]:
from smartmoneyconcepts import smc
# from renko import Renko
import pandas as pd
import statsmodels.api as sm
from datetime import datetime, timedelta, time
from sklearn.preprocessing import MinMaxScaler, StandardScaler, QuantileTransformer
from scipy.fft import fft, fftfreq
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import mplfinance as mpf
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from statsmodels.tsa.stattools import adfuller, coint
from statsmodels.tsa.vector_ar.vecm import coint_johansen
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import acf, pacf
from collections import defaultdict
import math, re, itertools
from functools import reduce
from scipy import stats
from arch.unitroot import KPSS
from hurst import compute_Hc
from statsmodels.tsa.vector_ar.vecm import VECM
import seaborn as sns
from scipy.integrate import quad
from statsmodels.stats.diagnostic import acorr_ljungbox
from scipy.signal import hilbert
from scipy.spatial.distance import cityblock, euclidean, cosine
from scipy.stats import gaussian_kde, shapiro, probplot, skew, kurtosis, norm, jarque_bera, anderson, normaltest, entropy, variation
from scipy.special import logit, expit
# from finmlkit.bar.base import TradesData
# from finmlkit.bar.kit import VolumeBarKit, TickBarKit, DollarBarKit, TimeBarKit, CUSUMBarKit
# from finmlkit.feature.kit import Feature, FeatureKit, Compose
# from finmlkit.feature import transforms as ts
# from finmlkit.feature.base import SISOTransform, SIMOTransform, MISOTransform, BaseTransform, MIMOTransform
# from finmlkit.feature.core.volatility import parkinson_range
# from finmlkit.feature.core.momentum import roc
import talib

def prepare_data(type_of_data, data_name):
    """Load one CSV from ``../index_data`` and normalise its timestamp column.

    The first path component of ``type_of_data`` selects where the timestamp
    comes from: files under ``shioaji`` are read from their ``ts`` column,
    everything else from a ``datetime`` column. In both cases the parsed
    timestamps are stored in ``ts`` and all column names are lower-cased
    afterwards.

    Args:
        type_of_data: Sub-directory of ``../index_data`` (may contain ``/``).
        data_name: File name without the ``.csv`` extension.

    Returns:
        The loaded ``DataFrame`` with a datetime ``ts`` column.
    """
    source = type_of_data.split('/')[0]
    frame = pd.read_csv(f'../index_data/{type_of_data}/{data_name}.csv')

    # Only the name of the raw timestamp column depends on the data source.
    raw_ts_column = 'ts' if source == 'shioaji' else 'datetime'
    frame['ts'] = pd.to_datetime(frame[raw_ts_column])

    return frame.rename(columns=lambda col: col.lower())

def aggregate_pv_list(pv_lists):
    """Merge several price/volume profiles into one.

    Each non-null element of ``pv_lists`` is expected to be a list of
    single-entry ``{price: volume}`` dicts; elements (or entries) of any other
    type are ignored.

    Args:
        pv_lists: A pandas Series of price/volume profiles (NaN allowed).

    Returns:
        ``np.nan`` when every element is null, otherwise a list of
        ``{price: total_volume}`` dicts ordered by price, highest first.
        Prices and volumes are converted to ``float``.
    """
    if pv_lists.isna().all():
        return np.nan

    volume_at_price = defaultdict(float)
    for profile in pv_lists.dropna():
        if not isinstance(profile, list):
            continue
        for entry in profile:
            if not isinstance(entry, dict):
                continue
            for price, vol in entry.items():
                # Cast both sides to float so mixed key/value types collapse
                # onto the same price level.
                volume_at_price[float(price)] += float(vol)

    highest_first = sorted(volume_at_price.items(), key=lambda kv: kv[0], reverse=True)
    return [{price: total} for price, total in highest_first]

def filter_market_data(bid_list, ask_list, bid_vol_list, ask_vol_list, check_abnormal=False):
    """Drop quote rows that are obviously invalid.

    The four inputs are walked in lockstep (``zip`` stops at the shortest). A
    row is rejected when the ask is below the bid, or when any of the two
    prices or two sizes is zero or negative.

    Args:
        bid_list: Bid prices.
        ask_list: Ask prices.
        bid_vol_list: Bid sizes.
        ask_vol_list: Ask sizes.
        check_abnormal: When exactly ``True``, print every rejected row.

    Returns:
        Four lists ``(bids, asks, bid_vols, ask_vols)`` holding only the rows
        that passed; four empty lists when nothing passed.
    """
    kept = []
    rejected = []

    for quote in zip(bid_list, ask_list, bid_vol_list, ask_vol_list):
        bid, ask, bid_vol, ask_vol = quote
        # Crossed book, or any non-positive price/size, marks the row as bad.
        is_abnormal = ask < bid or bid <= 0 or ask <= 0 or bid_vol <= 0 or ask_vol <= 0
        target = rejected if is_abnormal else kept
        target.append(quote)

    # Optional diagnostic dump of everything that was thrown away.
    if rejected and check_abnormal is True:
        print(f"发现 {len(rejected)} 笔异常数据，已将其过滤：")
        for i, (b, a, bv, av) in enumerate(rejected):
            print(f"  第{i+1}笔异常: 买价={b}, 卖价={a}, 买量={bv}, 卖量={av}")

    if not kept:
        return [], [], [], []

    # Transpose the surviving rows back into four column lists.
    bids, asks, bid_vols, ask_vols = (list(column) for column in zip(*kept))
    return bids, asks, bid_vols, ask_vols

def analyze_tick_types(tick_type_series, volume_series):
    """Return the net signed volume of a group of trades.

    Every trade's direction flag is multiplied by its volume; positive
    products are accumulated as outer (buy-side) volume, negative products as
    inner (sell-side) volume, and the two totals are added together. The
    inner total is kept negative, so the result is the net flow.

    Args:
        tick_type_series: Direction flags, one per trade (the caller in this
            file passes 1, -1 or 0).
        volume_series: Traded volumes, one per trade.

    Returns:
        Outer volume plus (negative) inner volume.
    """
    outer_vol = 0
    inner_vol = 0
    for direction, size in zip(tick_type_series, volume_series):
        signed = direction * size
        if signed > 0:
            outer_vol += signed
        elif signed < 0:
            inner_vol += signed

    return outer_vol + inner_vol

def calculate_liquidity_factors(bid_list, ask_list, bid_vol_list, ask_vol_list, vol_list):
    """Compute trade-volume-weighted spread and order-book-imbalance sums.

    For every tick with a valid quote the spread (``ask - bid``) and the order
    book imbalance (``bid_vol / (bid_vol + ask_vol)``) are multiplied by the
    traded volume of that same tick and accumulated. Despite the ``avg_``
    prefix of the result keys, the values are weighted *sums*; the bar
    aggregation elsewhere in this file divides them by the total volume.

    A quote is considered invalid (and its tick skipped) when the ask is below
    the bid or when any price or quoted size is zero or negative. The traded
    volumes are filtered together with the quotes so that each weight always
    belongs to the tick it is multiplied with.

    Args:
        bid_list: Bid price per tick.
        ask_list: Ask price per tick.
        bid_vol_list: Quoted bid size per tick.
        ask_vol_list: Quoted ask size per tick.
        vol_list: Traded volume per tick, used as the weight.

    Returns:
        A Series with ``avg_spread`` and ``avg_obi``. When no tick has a valid
        quote the placeholders ``0.0`` / ``0.5`` are returned; when the
        arithmetic raises ``ValueError``/``TypeError`` both are NaN.
    """
    # Keep quote and traded volume in the same tuple: filtering the quotes
    # alone and then indexing vol_list by position would pair surviving
    # quotes with the volumes of other ticks.
    valid_ticks = [
        (bid, ask, bid_vol, ask_vol, traded)
        for bid, ask, bid_vol, ask_vol, traded in zip(
            bid_list, ask_list, bid_vol_list, ask_vol_list, vol_list
        )
        if not (ask < bid or bid <= 0 or ask <= 0 or bid_vol <= 0 or ask_vol <= 0)
    ]

    if not valid_ticks:
        # NOTE(review): 0.5 is a neutral OBI, but this value is later summed
        # as a weighted numerator — confirm the placeholder is intended.
        return pd.Series({
            'avg_spread': 0.0,
            'avg_obi': 0.5
        })

    try:
        weighted_spread = 0.0
        weighted_obi = 0.0

        for bid, ask, bid_vol, ask_vol, traded in valid_ticks:
            spread = ask - bid

            # OBI = Vb / (Vb + Va); fall back to the neutral 0.5 when the
            # depth is not a positive number (e.g. NaN sizes).
            depth = bid_vol + ask_vol
            obi = bid_vol / depth if depth > 0 else 0.5

            weighted_spread += spread * traded
            weighted_obi += obi * traded

        return pd.Series({
            'avg_spread': weighted_spread,
            'avg_obi': weighted_obi
        })

    except (ValueError, TypeError):
        return pd.Series({
            'avg_spread': np.nan,
            'avg_obi': np.nan
        })

def calculate_price_factors(close_list, volume_list):
    """Compute per-group price moments and a price/volume profile.

    Args:
        close_list: Trade prices.
        volume_list: Traded volumes, paired with ``close_list`` by position.

    Returns:
        A Series with
        ``sum_price_volume`` (sum of price * volume),
        ``sum_price_sq_volume`` (sum of price squared * volume) and
        ``pv_list`` (list of ``{price: volume}`` dicts, highest price first,
        keyed by the price values exactly as they were passed in).
        All three are NaN when the computation raises ``ValueError``,
        ``ZeroDivisionError``, ``TypeError`` or ``IndexError``.
    """
    try:
        ticks = list(zip(close_list, volume_list))

        price_volume_total = 0
        price_sq_volume_total = 0
        for price, size in ticks:
            price_volume_total += float(price) * float(size)
        for price, size in ticks:
            price_sq_volume_total += (float(price) ** 2) * float(size)

        # Traded volume per distinct price level.
        volume_at_price = defaultdict(float)
        for price, size in ticks:
            volume_at_price[price] += float(size)

        profile = [
            {price: volume_at_price[price]}
            for price in sorted(volume_at_price, reverse=True)
        ]
    except (ValueError, ZeroDivisionError, TypeError, IndexError):
        return pd.Series({
            'sum_price_volume': np.nan,
            'sum_price_sq_volume': np.nan,
            'pv_list': np.nan
        })

    return pd.Series({
        'sum_price_volume': price_volume_total,
        'sum_price_sq_volume': price_sq_volume_total,
        'pv_list': profile
    })

def calculate_capital_factors(bid_list, ask_list, bid_vol_list, ask_vol_list):
    """Compute quote-based notional and size deltas for a group of ticks.

    Invalid quotes are removed with ``filter_market_data`` first. From the
    remaining rows the function returns

    * ``monetary_delta``: sum(bid * bid_vol) - sum(ask * ask_vol)
    * ``volume_delta``:   sum(bid_vol) - sum(ask_vol)

    Returns:
        A Series with both values; zeros when no valid quote remains, NaN when
        the arithmetic raises ``ValueError``, ``ZeroDivisionError`` or
        ``TypeError``.
    """
    bids, asks, bid_sizes, ask_sizes = filter_market_data(
        bid_list, ask_list, bid_vol_list, ask_vol_list
    )

    if not bids:
        return pd.Series({
            'monetary_delta': 0.0,
            'volume_delta': 0.0,
        })

    try:
        # Notional resting on each side of the book.
        bid_notional = sum([price * size for price, size in zip(bids, bid_sizes)])
        ask_notional = sum([price * size for price, size in zip(asks, ask_sizes)])

        net_notional = bid_notional - ask_notional
        net_size = sum(bid_sizes) - sum(ask_sizes)
    except (ValueError, ZeroDivisionError, TypeError):
        return pd.Series({
            'monetary_delta': np.nan,
            'volume_delta': np.nan,
        })

    return pd.Series({
        'monetary_delta': net_notional,  # summed into a cumulative delta downstream
        'volume_delta': net_size,        # size-only counterpart
    })

def build_finmlkit_trade(df):
    """Wrap a tick DataFrame in a ``TradesData`` object.

    Works on a copy of ``df``. ``tick_type`` is recoded with
    ``{1: -1, 2: 1}`` (anything else becomes 0) and passed as ``side``; the
    ``ts`` column becomes the index and is also passed as int64 timestamps.

    NOTE(review): other functions in this file treat ``tick_type == 1`` as the
    buy side, whereas this mapping turns 1 into -1 — confirm the sign
    convention ``TradesData`` expects.

    Args:
        df: Tick data with ``ts``, ``close``, ``volume`` and ``tick_type``.

    Returns:
        The constructed ``TradesData`` instance.
    """
    ticks = df.copy()

    side_codes = {1: -1, 2: 1}
    ticks['tick_type'] = ticks['tick_type'].map(side_codes).fillna(0).astype(np.int8)

    ticks.index = pd.to_datetime(ticks['ts'])
    stamp_index = ticks.index

    return TradesData(
        ts=stamp_index.view(np.int64),
        px=ticks['close'].values,
        qty=ticks['volume'].values,
        dt_index=stamp_index,
        side=ticks['tick_type'].values,
        timestamp_unit='ns'   # declare the integer timestamps as nanoseconds
    )

def pre_process_by_finmlkit_volbar(df, volume_threshold):
    """Build a ``VolumeBarKit`` over the ticks in ``df``.

    Args:
        df: Tick data accepted by ``build_finmlkit_trade``.
        volume_threshold: Forwarded to ``VolumeBarKit`` as ``volume_ths``.

    Returns:
        The constructed ``VolumeBarKit``.
    """
    return VolumeBarKit(
        trades=build_finmlkit_trade(df),
        volume_ths=volume_threshold
    )

def pre_process_by_finmlkit_tickbar(df, n):
    """Build a ``TickBarKit`` over the ticks in ``df``.

    Args:
        df: Tick data accepted by ``build_finmlkit_trade``.
        n: Forwarded to ``TickBarKit`` as ``tick_count_thrs``.

    Returns:
        The constructed ``TickBarKit``.
    """
    return TickBarKit(
        trades=build_finmlkit_trade(df),
        tick_count_thrs=n
    )

def pre_process_by_finmlkit_dollarbar(df, dollar_threshold):
    """Build a ``DollarBarKit`` over the ticks in ``df``.

    Args:
        df: Tick data accepted by ``build_finmlkit_trade``.
        dollar_threshold: Forwarded to ``DollarBarKit`` as ``dollar_thrs``
            (e.g. ``1_000_000``).

    Returns:
        The constructed ``DollarBarKit``.
    """
    return DollarBarKit(
        trades=build_finmlkit_trade(df),
        dollar_thrs=dollar_threshold
    )

def pre_process_by_finmlkit_timebar(df, time):
    """Build a ``TimeBarKit`` over the ticks in ``df``.

    Args:
        df: Tick data accepted by ``build_finmlkit_trade``.
        time: Bar length in any form ``pd.Timedelta`` accepts, e.g.
            ``'1min'``, ``'5s'``, ``'10s'`` or ``'1h'``. Inside this function
            the name shadows ``datetime.time`` imported at module level.

    Returns:
        The constructed ``TimeBarKit``.
    """
    return TimeBarKit(
        trades=build_finmlkit_trade(df),
        period=pd.Timedelta(time)
    )

def pre_process_by_finmlkit_sumbar(df, window, min_sigma, sigma_mult):
    """Build a ``CUSUMBarKit`` driven by a rolling volatility estimate.

    The per-tick sigma vector is the rolling standard deviation of the
    log-returns of ``df['close']`` over ``window`` ticks
    (``min_periods=1``; the first return is filled with 0.0).

    Args:
        df: Tick data accepted by ``build_finmlkit_trade``; must have ``close``.
        window: Rolling window length in ticks (e.g. 200).
        min_sigma: Forwarded as ``sigma_floor``.
        sigma_mult: Forwarded as ``sigma_mult``.

    Returns:
        The constructed ``CUSUMBarKit``.
    """
    trades = build_finmlkit_trade(df)

    log_close = np.log(df['close'].astype(float))
    log_returns = log_close.diff().fillna(0.0)

    # One sigma per tick, aligned with the tick order of df.
    # NOTE(review): tiny/zero/NaN values are presumably handled by
    # sigma_floor inside CUSUMBarKit — confirm.
    rolling_sigma = log_returns.rolling(window=window, min_periods=1, center=False).std()
    sigma_vector = rolling_sigma.to_numpy(dtype=np.float64)

    return CUSUMBarKit(
        trades=trades,
        sigma=sigma_vector,
        sigma_floor=min_sigma,
        sigma_mult=sigma_mult
    )

def concate_finmlkit(list_of_obj_dict, name):
    """Collect bar-kit outputs per instrument and align them across instruments.

    For every kit object the OHLCV frame, the directional features and the
    trade-size features (with a constant ``theta`` of 10 per bar) are built.
    Per instrument the pieces are concatenated, duplicate index values are
    collapsed to their last row, and every column receives the suffix
    ``_{name}_{code}``. Finally the instruments are joined side by side,
    forward-filled, and rows that still contain NaN are dropped.

    Args:
        list_of_obj_dict: Mapping ``code -> list of bar-kit objects``.
        name: Bar-type label used in the column suffix.

    Returns:
        ``(df_kbar, df_direction, df_size)``; each is an empty DataFrame when
        there was nothing to merge.
    """

    def _stack_and_dedupe(frames):
        # Concatenate segments; keep the last row for each duplicated index value.
        if not frames:
            return pd.DataFrame()
        stacked = pd.concat(frames)
        if stacked.empty:
            return stacked
        return stacked.groupby(stacked.index).last().sort_index()

    def _align_across_codes(per_code):
        # Side-by-side join of all non-empty instruments on the index.
        frames = [frame for frame in per_code.values() if not frame.empty]
        if not frames:
            return pd.DataFrame()
        return pd.concat(frames, axis=1).sort_index().ffill().dropna()

    ohlcv_by_code = {}
    direction_by_code = {}
    size_by_code = {}

    for code, kits in list_of_obj_dict.items():
        ohlcv_parts = []
        direction_parts = []
        size_parts = []

        for kit in kits:
            ohlcv_parts.append(kit.build_ohlcv())
            direction_parts.append(kit.build_directional_features())
            # One theta value per bar; the bar count is derived from the
            # kit's close indices.
            bar_count = len(kit._close_indices) - 1
            theta = np.full(bar_count, 10, dtype=np.float64)
            size_parts.append(kit.build_trade_size_features(theta=theta))

        ohlcv_by_code[code] = _stack_and_dedupe(ohlcv_parts).add_suffix(f"_{name}_{code}")
        direction_by_code[code] = _stack_and_dedupe(direction_parts).add_suffix(f"_{name}_{code}")
        size_by_code[code] = _stack_and_dedupe(size_parts).add_suffix(f"_{name}_{code}")

    return (
        _align_across_codes(ohlcv_by_code),
        _align_across_codes(direction_by_code),
        _align_across_codes(size_by_code),
    )

def pre_process_by_range(df, range_size=0.005):
    """Aggregate ticks into range bars.

    A bar starts at its first tick price ``p``. The first later tick whose
    price is at or beyond ``p * (1 + range_size)`` or ``p * (1 - range_size)``
    opens the next bar (that tick belongs to the new bar).

    Args:
        df: Tick data with ``ts``, ``close``, ``volume`` and ``tick_type``
            columns. The frame is not modified.
        range_size: Relative half-width of a bar, e.g. ``0.005`` for 0.5 %.

    Returns:
        A DataFrame indexed by ``ts_range`` (timestamp of the first tick of
        each bar) with the columns ``bar_id``, ``open_range``, ``high_range``,
        ``low_range``, ``close_range``, ``volume_range``,
        ``buy_volume_range``, ``sell_volume_range`` and
        ``order_imbalance_range``. An empty input yields an empty frame with
        the same columns.
    """
    final_columns = [
        'bar_id',
        'open_range', 'high_range', 'low_range', 'close_range',
        'volume_range', 'buy_volume_range', 'sell_volume_range',
        'order_imbalance_range'
    ]

    df = df.copy()
    df['ts'] = pd.to_datetime(df['ts'])
    # A stable sort keeps ticks that share a timestamp in their original
    # order; otherwise open/close of a bar could depend on the sort algorithm.
    df = df.sort_values('ts', kind='mergesort').reset_index(drop=True)
    df = df.rename(columns={'close': 'tick_close'})

    if df.empty:
        # Nothing to seed the first bar with; return the expected schema.
        return pd.DataFrame(
            columns=final_columns,
            index=pd.DatetimeIndex([], name='ts_range'),
        )

    # tick_type 1 is counted as buy volume, 2 as sell volume; any other value
    # contributes to neither.
    df['buy_volume'] = np.where(df['tick_type'] == 1, df['volume'], 0)
    df['sell_volume'] = np.where(df['tick_type'] == 2, df['volume'], 0)

    prices = df['tick_close'].values
    n = len(df)
    bar_ids = np.zeros(n, dtype=int)
    current_bar_id = 0
    upper_bound = prices[0] * (1 + range_size)
    lower_bound = prices[0] * (1 - range_size)

    # Sequential scan: every bar's bounds depend on where the previous bar
    # ended, so this cannot be expressed as a single vectorised operation.
    for i in range(1, n):
        price = prices[i]
        if price >= upper_bound or price <= lower_bound:
            current_bar_id += 1
            upper_bound = price * (1 + range_size)
            lower_bound = price * (1 - range_size)
        bar_ids[i] = current_bar_id

    df['bar_id'] = bar_ids

    bar_stats = df.groupby('bar_id').agg(
        ts_range=('ts', 'first'),                    # bar start time
        open_range=('tick_close', 'first'),
        high_range=('tick_close', 'max'),
        low_range=('tick_close', 'min'),
        close_range=('tick_close', 'last'),
        volume_range=('volume', 'sum'),              # total traded volume
        buy_volume_range=('buy_volume', 'sum'),
        sell_volume_range=('sell_volume', 'sum')
    ).reset_index()

    # (buy - sell) / (buy + sell); 0 when the bar has no classified volume.
    total_vol = bar_stats['buy_volume_range'] + bar_stats['sell_volume_range']
    bar_stats['order_imbalance_range'] = np.where(
        total_vol > 0,
        (bar_stats['buy_volume_range'] - bar_stats['sell_volume_range']) / total_vol,
        0
    )

    bar_stats = bar_stats.set_index('ts_range')

    return bar_stats[final_columns]

def pre_process_by_renko(df, brick_size=0.5, renko_type='normal'):
    """Build Renko bars from ``df`` and tag the OHLCV columns with ``_renko``.

    Args:
        df: Tick or bar data with a ``ts`` column, which becomes the index.
        brick_size: Brick size passed to ``Renko``.
        renko_type: Mode passed to ``Renko.renko_df``.

    Returns:
        The Renko DataFrame with ``open``/``high``/``low``/``close``/``volume``
        renamed to ``*_renko``; other columns keep their names.
    """
    indexed = df.set_index('ts')
    bricks = Renko(indexed, brick_size=brick_size).renko_df(renko_type)

    renamed = {
        column: f'{column}_renko'
        for column in ('open', 'high', 'low', 'close', 'volume')
    }
    return bricks.rename(columns=renamed)

def pre_process_by_second(df1, data_type, date_start, date_end):
    """Aggregate ticks to one-second rows and derive per-minute extras.

    Steps:
      1. Group ticks by the second they fall in and compute close, volume,
         the sorted non-zero bid/ask prices, flow imbalance, liquidity,
         price and capital factors.
      2. Left-join the result onto every second between ``date_start`` and
         ``date_end`` that lies inside the trading hours selected by
         ``data_type`` and drop seconds without data.
      3. Resample the per-second factors to one-minute sums. The minute index
         is shifted forward by one minute so it lines up with 1-minute bars.

    Args:
        df1: Tick data with ``ts``, ``close``, ``volume``, ``bid_price``,
            ``ask_price``, ``bid_volume``, ``ask_volume`` and ``tick_type``.
            The frame is not modified.
        data_type: ``'s_day'`` (09:00-13:30), ``'f_day'`` (08:45-13:45) or
            ``'f_night'`` (08:45-13:45 plus 15:00-05:00).
        date_start: Start of the second-level grid (``pd.date_range`` input).
        date_end: End of the second-level grid.

    Returns:
        ``(per_second, per_minute_extra)``. ``per_second`` is indexed by
        ``ts`` and excludes seconds whose bid or ask tuple is empty;
        ``per_minute_extra`` holds the minute sums and the merged ``pv_list``.

    Raises:
        ValueError: If ``data_type`` is not one of the three supported values.
    """
    valid_types = ('s_day', 'f_day', 'f_night')
    if data_type not in valid_types:
        # Fail before the expensive aggregation instead of hitting an unbound
        # local variable further down.
        raise ValueError(f"unknown data_type {data_type!r}; expected one of {valid_types}")

    def _clock(text):
        # Parse 'HH:MM:SS' into a datetime.time.
        return pd.to_datetime(text).time()

    # Parse into a separate Series so the caller's frame is left untouched.
    tick_ts = pd.to_datetime(df1['ts'])

    # One row per second, using named aggregation.
    df1 = df1.groupby(tick_ts.dt.floor('s')).agg(
        close=('close', 'last'),
        volume=('volume', 'sum'),  # total traded volume
        close_list=('close', list),
        volume_list=('volume', list),  # per-tick volumes
        bid_price=('bid_price', lambda x: tuple(sorted(filter(lambda price: price != 0, x)))),
        ask_price=('ask_price', lambda x: tuple(sorted(filter(lambda price: price != 0, x)))),
        bid_list=('bid_price', list),
        ask_list=('ask_price', list),
        bid_vol_list=('bid_volume', list),
        ask_vol_list=('ask_volume', list),
        tick_type=('tick_type', lambda x: [1 if t == 1 else -1 if t == 2 else 0 for t in x])
    ).reset_index()

    # Net signed volume of each second.
    df1['flow_imbalance'] = df1.apply(lambda row: analyze_tick_types(row['tick_type'], row['volume_list']), axis=1)
    liquidity_features = df1.apply(lambda row: calculate_liquidity_factors(row['bid_list'], row['ask_list'], row['bid_vol_list'], row['ask_vol_list'], row['volume_list']), axis=1)
    df1 = pd.concat([df1, liquidity_features], axis=1)

    price_features = df1.apply(lambda row: calculate_price_factors(row['close_list'], row['volume_list']), axis=1)
    df1 = pd.concat([df1, price_features], axis=1)

    capital_features = df1.apply(lambda row: calculate_capital_factors(row['bid_list'], row['ask_list'], row['bid_vol_list'], row['ask_vol_list']), axis=1)
    df1 = pd.concat([df1, capital_features], axis=1)

    # The raw per-tick lists are only needed to compute the factors above.
    df1.drop(columns=['tick_type', 'volume_list', 'bid_list', 'ask_list', 'bid_vol_list', 'ask_vol_list', 'close_list'], inplace=True)

    # Every second between start and end, then restricted to trading hours.
    time_range = pd.date_range(date_start, date_end, freq='s')
    full_time_df = pd.DataFrame(time_range, columns=['ts'])
    clock = full_time_df['ts'].dt.time

    if data_type == 's_day':
        in_session = clock.between(_clock('09:00:00'), _clock('13:30:00'))
    elif data_type == 'f_day':
        in_session = clock.between(_clock('08:45:00'), _clock('13:45:00'))
    else:  # 'f_night': day session plus the overnight session
        in_session = (
            clock.between(_clock('08:45:00'), _clock('13:45:00'))
            | (clock >= _clock('15:00:00'))
            | (clock <= _clock('05:00:00'))
        )
    valid_time = full_time_df[in_session]

    # Left join keeps the session grid; seconds without ticks become NaN rows
    # and are dropped right after.
    mer_ori_data = pd.merge(valid_time, df1, on='ts', how='left')
    mer_ori_data.set_index('ts', inplace=True)
    mer_ori_data = mer_ori_data.dropna()

    extra_df = mer_ori_data.resample('1min').agg({
        'flow_imbalance': 'sum',
        'avg_spread': 'sum',
        'avg_obi': 'sum',
        'sum_price_volume': 'sum',
        'sum_price_sq_volume': 'sum',
        'monetary_delta': 'sum',
        'volume_delta': 'sum',
        'pv_list': aggregate_pv_list
    })

    # Shift by one minute so the extras can be merged with 1-minute bars.
    extra_df.index = extra_df.index + pd.Timedelta(minutes=1)

    # Drop seconds whose bid or ask tuple is empty (all quotes were 0, e.g.
    # at limit-up / limit-down).
    mer_ori_data = mer_ori_data[(mer_ori_data['bid_price'].map(len) > 0) & (mer_ori_data['ask_price'].map(len) > 0)]

    return mer_ori_data.dropna(), extra_df.dropna()

def convert_ohlcv(df, freq=60):
    """Resample 1-minute bars into ``freq``-minute bars, session by session.

    Every row is assigned to a trading session: timestamps with a clock time
    between 08:45 and 13:45 belong to the day session that starts at 08:45 of
    the same date; everything else belongs to a night session that starts at
    15:00 (of the same date when the clock time is 15:00 or later, otherwise
    of the previous date). Missing minutes inside a session are filled with
    flat zero-volume bars, the index is moved back by one minute, and bars
    are then built in consecutive ``freq``-minute windows starting at the
    session start.

    Args:
        df: 1-minute bars indexed by timestamp with ``open``, ``high``,
            ``low``, ``close``, ``volume``, ``flow_imbalance``,
            ``avg_spread``, ``avg_obi``, ``sum_price_volume``,
            ``sum_price_sq_volume``, ``monetary_delta``, ``volume_delta`` and
            ``pv_list``. The frame is not modified.
        freq: Bar length in minutes.

    Returns:
        The aggregated bars indexed by ``ts`` (also kept as a column), with
        all columns shifted down by one row and rows containing NaN removed.
        ``acc_volume``, ``acc_price_volume`` and ``acc_price_sq_volume`` are
        the session-cumulative totals at the end of each bar.
    """
    # Work on a copy: session columns are added below and must not leak into
    # the caller's frame.
    df = df.copy()

    # Session boundaries.
    t1 = datetime.strptime("08:45", "%H:%M").time()  # day session open
    t2 = datetime.strptime("13:45", "%H:%M").time()  # day session close
    t3 = datetime.strptime("05:00", "%H:%M").time()  # night session close (next day)
    t4 = datetime.strptime("15:00", "%H:%M").time()  # night session open

    def classify_session(timestamps):
        # Label each timestamp with its session type and session start time.
        try:
            # Strip the timezone, if any, so all comparisons are naive.
            timestamps = timestamps.dt.tz_localize(None) if timestamps.dt.tz is not None else timestamps

            times = timestamps.dt.time
            dates = timestamps.dt.date

            session_type = pd.Series("other", index=timestamps.index)
            session_start = pd.Series(pd.NaT, index=timestamps.index)

            # Day session.
            day_mask = (times >= t1) & (times <= t2)
            session_type.loc[day_mask] = "day"
            session_start.loc[day_mask] = pd.to_datetime(
                dates[day_mask].astype(str) + " " + t1.strftime("%H:%M:%S")
            )

            # Night session that started on the same calendar day.
            night_mask1 = (times >= t4) & ~day_mask
            session_type.loc[night_mask1] = "night"
            session_start.loc[night_mask1] = pd.to_datetime(
                dates[night_mask1].astype(str) + " " + t4.strftime("%H:%M:%S")
            )

            # Night session that started on the previous calendar day
            # (clock time before 15:00 and outside the day window).
            night_mask2 = (times < t4) & ~day_mask
            session_type.loc[night_mask2] = "night"
            prev_dates = (timestamps[night_mask2] - timedelta(days=1)).dt.date
            session_start.loc[night_mask2] = pd.to_datetime(
                prev_dates.astype(str) + " " + t4.strftime("%H:%M:%S")
            )

            return pd.DataFrame({"session_type": session_type, "session_start": session_start})

        except Exception as e:
            # Best effort: report and return all-None labels, which makes the
            # session filter below discard every row.
            print(f"時間轉換出現錯誤: {e}")
            return pd.DataFrame({"session_type": None, "session_start": None}, index=timestamps.index)

    result = classify_session(df.index.to_series())
    df.loc[:, ["session_type", "session_start"]] = result[["session_type", "session_start"]]
    df = df[df["session_type"].isin(["day", "night"])]

    def fill_missing_minutes(df_session, session_start, session_type):
        # Insert flat, zero-volume bars for minutes missing from a session.
        if session_type == "day":
            start_time = datetime.combine(session_start.date(), t1)
            end_time = datetime.combine(session_start.date(), t2)
        else:  # night
            start_time = datetime.combine(session_start.date(), t4)
            end_time = datetime.combine(session_start.date() + timedelta(days=1), t3)

        # Complete 1-minute grid of the session.
        full_time_index = pd.date_range(start=start_time, end=end_time, freq="1min")
        existing_times = df_session.index

        missing_times = [t for t in full_time_index if t not in existing_times]

        if missing_times:
            missing_data = []
            last_valid_row = None
            for t in missing_times:
                # Use the real bar right before the gap; it is carried
                # forward across consecutive missing minutes. Gaps at the
                # very start of a session have no predecessor and stay empty.
                prev_time = t - timedelta(minutes=1)
                if prev_time in df_session.index:
                    last_valid_row = df_session.loc[prev_time]
                if last_valid_row is not None:
                    missing_data.append({
                        "ts": t,
                        "open": last_valid_row["close"],
                        "high": last_valid_row["close"],
                        "low": last_valid_row["close"],
                        "close": last_valid_row["close"],
                        "volume": 0,
                        "amount": 0,
                        "complete": True,
                        "session_type": session_type,
                        "session_start": session_start
                    })

            if missing_data:
                missing_df = pd.DataFrame(missing_data).set_index("ts")
                df_session = pd.concat([df_session, missing_df]).sort_index()

        return df_session

    # Fill the gaps session by session.
    df_filled = []
    for session_start, session_data in df.groupby("session_start"):
        session_type = session_data["session_type"].iloc[0]
        session_data = fill_missing_minutes(session_data, session_start, session_type)
        df_filled.append(session_data)

    if df_filled:
        df = pd.concat(df_filled).sort_index()

    # NOTE(review): presumably the 1-minute bars are labelled with their end
    # time, so moving the index back one minute yields start-time labels —
    # confirm against the data source.
    df.index = df.index - pd.Timedelta(minutes=1)

    # Length of one output bar.
    window = timedelta(minutes=freq)

    result = []

    for session_start, session_data in df.groupby("session_start"):
        # Copy the group so the cumulative columns are not written into a
        # slice of df.
        session_data = session_data.copy()
        current_time = session_start
        max_time = session_data.index.max()

        # Session-cumulative totals. Gap-filled rows have NaN price moments;
        # treat them as 0 so the running total is defined on every row.
        if not session_data.empty:
            session_data["acc_vol"] = session_data["volume"].fillna(0).cumsum()
            session_data["acc_price_volume"] = session_data["sum_price_volume"].fillna(0).cumsum()
            session_data["acc_price_sq_volume"] = session_data["sum_price_sq_volume"].fillna(0).cumsum()

        while current_time < max_time:
            next_time = current_time + window
            window_data = session_data[(session_data.index >= current_time) & (session_data.index < next_time)]

            if not window_data.empty:
                o = window_data["open"].iloc[0]
                h = window_data["high"].max()
                l = window_data["low"].min()
                c = window_data["close"].iloc[-1]
                v = window_data["volume"].sum()
                a = c * v
                # A bar is complete once its last minute is present.
                complete = window_data.index[-1] >= next_time - timedelta(minutes=1)

                flow_imbalance_agg = window_data['flow_imbalance'].sum()
                # The minute values are volume-weighted sums; dividing by the
                # bar volume turns them into weighted averages.
                avg_spread_agg = window_data['avg_spread'].sum() / v if v != 0 else 0
                avg_obi_agg = window_data['avg_obi'].sum() / v if v != 0 else 0.5
                sum_price_volume = window_data['sum_price_volume'].sum()
                sum_price_sq_volume = window_data['sum_price_sq_volume'].sum()
                monetary_delta_agg = window_data['monetary_delta'].sum()
                volume_delta_agg = window_data['volume_delta'].sum()
                pv_list_agg = aggregate_pv_list(window_data['pv_list'])
                # The running totals are already cumulative, so the value for
                # the bar is the last one in the window; summing them would
                # count earlier minutes repeatedly.
                acc_vol = window_data["acc_vol"].iloc[-1]
                acc_price_vol = window_data["acc_price_volume"].iloc[-1]
                acc_price_sq_volume = window_data["acc_price_sq_volume"].iloc[-1]

                result.append({
                    "ts": current_time,
                    "open": o,
                    "high": h,
                    "low": l,
                    "close": c,
                    "volume": v,
                    "amount": a,
                    "flow_imbalance": flow_imbalance_agg,
                    "avg_spread": avg_spread_agg,
                    "avg_obi": avg_obi_agg,
                    "sum_price_volume": sum_price_volume,
                    "sum_price_sq_volume": sum_price_sq_volume,
                    "monetary_delta": monetary_delta_agg,
                    "volume_delta": volume_delta_agg,
                    "pv_list": pv_list_agg,
                    "acc_volume": acc_vol,
                    "acc_price_volume": acc_price_vol,
                    "acc_price_sq_volume": acc_price_sq_volume,
                    "complete": complete
                })

            current_time = next_time

    agg_df = pd.DataFrame(result)
    agg_df.set_index("ts", inplace=True, drop=False)
    # Each index label receives the values of the previous bar (the first row
    # becomes NaN and is dropped).
    agg_df = agg_df.shift(1).dropna()

    return agg_df

def combine_daily_k_bars(df):
    """Collapse intraday bars into one bar per calendar date.

    Per date: first ``open``, max ``high``, min ``low``, last ``close``;
    ``volume``, ``flow_imbalance``, ``monetary_delta``, ``volume_delta`` and
    ``amount`` are summed; the ``acc_*`` columns take the last value of the
    day; ``complete`` is always ``True``.

    Args:
        df: Intraday bars indexed by timestamp. The frame is not modified.

    Returns:
        Daily bars indexed by ``ts`` (midnight of each date), with every row
        shifted down by one and rows containing NaN dropped, so each date
        carries the values of the previous date in the frame.
    """
    bars = df.copy()
    bars.index = pd.to_datetime(bars.index)
    bars['date'] = bars.index.date

    def _summarise(day, day_bars):
        # One output row for a single calendar date.
        return {
            'date': day,
            'open': day_bars['open'].iloc[0],
            'high': day_bars['high'].max(),
            'low': day_bars['low'].min(),
            'close': day_bars['close'].iloc[-1],
            'volume': day_bars['volume'].sum(),
            'complete': True,
            'flow_imbalance': day_bars['flow_imbalance'].sum(),
            'monetary_delta': day_bars['monetary_delta'].sum(),
            'volume_delta': day_bars['volume_delta'].sum(),
            'acc_price_volume': day_bars['acc_price_volume'].iloc[-1],
            'acc_price_sq_volume': day_bars['acc_price_sq_volume'].iloc[-1],
            'acc_volume': day_bars['acc_volume'].iloc[-1],
            'amount': day_bars['amount'].sum()
        }

    daily = pd.DataFrame([_summarise(day, day_bars) for day, day_bars in bars.groupby('date')])
    daily['ts'] = pd.to_datetime(daily['date'])
    daily = daily.set_index('ts').drop('date', axis=1)

    return daily.shift(1).dropna()

def process_multiple_datasets_by_second(dataset1, dataset2, codes):
    """Load tick and 1-minute datasets and assemble them per instrument.

    Args:
        dataset1: Iterable of tick dataset descriptors
            ``(type_of_data, data_name, date_start, date_end, data_type, ...)``;
            trailing items are ignored here.
        dataset2: Iterable of ``(type_of_data, data_name)`` pairs for 1-minute
            bar files.
        codes: Instrument codes. A tick file belongs to the first code its
            name starts with; a bar file to the first code for which the name
            starts with ``code + 'k'``.

    Returns:
        ``(merged_df, final_dfks)``. ``merged_df`` is the inner join on the
        index of every instrument's per-second frame (columns suffixed with
        ``_{code}``), or ``None`` if no instrument has data. ``final_dfks``
        maps each code to its 1-minute bars, inner-joined with the per-minute
        extras when both exist.
    """

    def _first_match(data_name, marker=''):
        # First code whose (code + marker) prefixes the file name, else None.
        return next((code for code in codes if data_name.startswith(code + marker)), None)

    def _concat(frames):
        return pd.concat(frames) if frames else pd.DataFrame()

    def _last_per_index(frame):
        # Collapse duplicated index values to their last row; drop NaN rows.
        if frame.empty:
            return frame
        return frame.groupby(frame.index).last().sort_index().dropna()

    second_frames = {code: [] for code in codes}
    minute_frames = {code: [] for code in codes}
    extra_frames = {code: [] for code in codes}

    # Tick files -> per-second frames plus per-minute extras.
    for type_of_data, data_name, date_start, date_end, data_type, *_unused in dataset1:
        raw_ticks = prepare_data(type_of_data, data_name)
        per_second, per_minute_extra = pre_process_by_second(raw_ticks, data_type, date_start, date_end)

        owner = _first_match(data_name)
        if owner is not None:
            second_frames[owner].append(per_second)
            extra_frames[owner].append(per_minute_extra)

    # 1-minute bar files.
    for type_of_data, data_name in dataset2:
        minute_bars = prepare_data(type_of_data, data_name).set_index('ts')

        owner = _first_match(data_name, 'k')
        if owner is not None:
            minute_frames[owner].append(minute_bars)

    # Stitch the segments of each instrument together.
    final_dfs = {}
    final_dfks = {}
    for code in codes:
        seconds = _last_per_index(_concat(second_frames[code]))
        minutes = _last_per_index(_concat(minute_frames[code]))
        extras = _concat(extra_frames[code])

        # Attach the per-minute extras to the 1-minute bars when both exist.
        if not minutes.empty and not extras.empty:
            minutes = pd.merge(minutes, extras, left_index=True, right_index=True, how="inner")

        final_dfs[code] = seconds
        final_dfks[code] = minutes

    # Inner-join all instruments on the timestamp index.
    merged_df = None
    for code in codes:
        seconds = final_dfs[code]
        if seconds.empty:
            continue
        if merged_df is None:
            merged_df = seconds.copy().add_suffix(f'_{code}')
            continue
        merged_df = pd.merge(
            merged_df,
            seconds.add_suffix(f'_{code}'),
            left_index=True,
            right_index=True,
            how='inner',
        )

    return merged_df, final_dfks

def process_multiple_datasets_by_custom(dataset1, codes):
    """Build every custom bar type for each tick dataset and merge range/renko bars.

    Args:
        dataset1: Iterable of 12-tuples
            ``(type_of_data, data_name, _, _, _, tick_bar, volume_bar,
            time_bar, sum_bar, dollar_bar, range_bar, renko_bar)``. The three
            ignored slots are not used here. Every ``*_bar`` entry is a dict of
            builder options; missing keys fall back to the defaults below.
        codes: Product codes. A dataset is filed under the first code that
            ``data_name`` starts with; a dataset matching no code is dropped.

    Returns:
        ``(df_rangebar, df_renkobar, final_bars)``. ``final_bars`` maps
        bar type -> code -> list of per-dataset results (left un-concatenated).
        The two DataFrames hold the range / renko bars of all codes, outer
        joined on the index, forward filled, with columns suffixed ``_<code>``.
        They are empty DataFrames when no code has any non-empty bars.
    """
    # One list of results per bar type and per product code.
    bar_types = ["tick", "vol", "time", "sum", "dollar", "range", "renko"]
    bar_dict = {btype: {code: [] for code in codes} for btype in bar_types}

    # Build all bar types from the raw tick data of each dataset.
    for type_of_data, data_name, _, _, _, tick_bar, volume_bar, time_bar, sum_bar, dollar_bar, range_bar, renko_bar in dataset1:
        origin_df = prepare_data(type_of_data, data_name)

        tick_data   = pre_process_by_finmlkit_tickbar(origin_df, tick_bar.get('n', 50))
        volume_data = pre_process_by_finmlkit_volbar(origin_df, volume_bar.get('volume_threshold', 1000))
        time_data   = pre_process_by_finmlkit_timebar(origin_df, time_bar.get('time', '1min'))
        sum_data    = pre_process_by_finmlkit_sumbar(origin_df, sum_bar.get('window', 200), sum_bar.get('min_sigma', 5e-4), sum_bar.get('sigma_mult', 2))
        dollar_data = pre_process_by_finmlkit_dollarbar(origin_df, dollar_bar.get('dollar_threshold', 1_000_000))
        range_data  = pre_process_by_range(origin_df, range_bar.get('range_size', 0.005))
        renko_data  = pre_process_by_renko(origin_df, renko_bar.get('brick_size', 0.5), renko_bar.get('type', 'normal'))

        for code in codes:
            if data_name.startswith(code):
                bar_dict["tick"][code].append(tick_data)
                bar_dict["vol"][code].append(volume_data)
                bar_dict["time"][code].append(time_data)
                bar_dict["sum"][code].append(sum_data)
                bar_dict["dollar"][code].append(dollar_data)
                bar_dict["range"][code].append(range_data)
                bar_dict["renko"][code].append(renko_data)
                break

    # The per-type results are returned as collected (no concatenation here).
    final_bars = bar_dict

    def merge_multi_bars(bar_data_dict):
        """Join the DataFrame bars of every code into one wide DataFrame."""
        # bar_data_dict maps code -> list of DataFrames (one per dataset). The
        # previous filter rejected every list, so the merge never ran.
        frames = []
        for code, bars in bar_data_dict.items():
            segments = [bars] if isinstance(bars, pd.DataFrame) else list(bars)
            segments = [
                seg for seg in segments
                if isinstance(seg, pd.DataFrame) and not seg.empty
            ]
            if not segments:
                continue
            # NOTE(review): segments of one code are stacked as-is; if they can
            # overlap in time, de-duplicate the index before merging.
            frames.append(pd.concat(segments).add_suffix(f"_{code}"))

        if not frames:
            return pd.DataFrame()

        # Suffixes are applied up front so reduce() only ever sees DataFrames.
        merged = reduce(
            lambda left, right: pd.merge(
                left,
                right,
                left_index=True,
                right_index=True,
                how='outer'
            ),
            frames
        )
        return merged.sort_index().ffill().dropna()

    # range / renko bars are plain DataFrames (not finmlkit objects).
    df_rangebar = merge_multi_bars(final_bars["range"])
    df_renkobar = merge_multi_bars(final_bars["renko"])

    return df_rangebar, df_renkobar, final_bars

# Notebook configuration: product codes, trading session and per-product bar
# parameters, followed by the data load for the per-second pipeline.
CODE = ['2349', '3050', '8104']
session = 's_day'

# Bar-builder options for CODE[0]. The values are unpacked positionally into
# the dataset tuples below (``*params1.values()``), so the key order
# tick, volume, time, sum, dollar, range, renko must not change.
params1 = {
    'tick_bar': {'n': 120},
    'volume_bar': {'volume_threshold': 150},
    'time_bar': {'time': '60min'},
    'sum_bar': {'window': 200, 'min_sigma': 7.656013e-05, 'sigma_mult': 3.791},
    'dollar_bar': {'dollar_threshold': 100000},
    'range_bar': {'range_size': 0.007},
    'renko_bar': {'brick_size': 1}
}

# Bar-builder options for CODE[1].
params2 = {
    'tick_bar': {'n': 5},
    'volume_bar': {'volume_threshold': 10},
    'time_bar': {'time': '60min'},
    'sum_bar': {'window': 200, 'min_sigma': 4.003475e-04, 'sigma_mult': 4.061},
    'dollar_bar': {'dollar_threshold': 35000},
    'range_bar': {'range_size': 0.005},
    'renko_bar': {'brick_size': 2.5}
}

# Bar-builder options for CODE[2].
# NOTE(review): currently identical to params2 - confirm this is intended.
params3 = {
    'tick_bar': {'n': 5},
    'volume_bar': {'volume_threshold': 10},
    'time_bar': {'time': '60min'},
    'sum_bar': {'window': 200, 'min_sigma': 4.003475e-04, 'sigma_mult': 4.061},
    'dollar_bar': {'dollar_threshold': 35000},
    'range_bar': {'range_size': 0.005},
    'renko_bar': {'brick_size': 2.5}
}

# Tick datasets: (data folder, file name, start, end, session, *bar params).
dataset1 = [
    ('shioaji/2024_0111', f'{CODE[0]}', '2024-01-02 9:00:00', '2025-11-10 13:30:00', f'{session}', *params1.values()),
    ('shioaji/2024_0111', f'{CODE[1]}', '2024-01-02 9:00:00', '2025-11-10 13:30:00', f'{session}', *params2.values()),
    ('shioaji/2024_0111', f'{CODE[2]}', '2024-01-02 9:00:00', '2025-11-10 13:30:00', f'{session}', *params3.values()),
]

# K-bar datasets: the file name is the product code followed by 'k'.
dataset2 = [
    ('shioaji/2024_0111', f'{CODE[0]}k'),
    ('shioaji/2024_0111', f'{CODE[1]}k'),
    ('shioaji/2024_0111', f'{CODE[2]}k'),
]

# Put in (expensive, cheap): the larger value goes in the ratio denominator so
# that the quotient stays within 0~1.
# df: all products merged on the index; dfk: dict of per-code K-bar frames.
df, dfk = process_multiple_datasets_by_second(dataset1, dataset2, CODE)
#df_range, df_renko, final_bars = process_multiple_datasets_by_custom(dataset1, CODE)
#df_tick_k, df_tick_direct, df_tick_size = concate_finmlkit(final_bars["tick"], 'tick')
#df_vol_k, df_vol_direct, df_vol_size = concate_finmlkit(final_bars["vol"], 'volume')
#df_time_k, df_time_direct, df_time_size = concate_finmlkit(final_bars["time"], 'time')
#df_sum_k, df_sum_direct, df_sum_size = concate_finmlkit(final_bars["sum"], 'sum')
#df_dollar_k, df_dollar_direct, df_dollar_size = concate_finmlkit(final_bars["dollar"], 'dollar')

# Quick sanity check of the merged frame.
print("Index:", df.head(3).index.tolist())
print("Columns:", df.columns.tolist())

[1;33mThank you for using SmartMoneyConcepts! ⭐ Please show your support by giving a star on the GitHub repository: [4;34mhttps://github.com/joshyattridge/smart-money-concepts[0m
Index: [Timestamp('2024-01-02 13:30:00'), Timestamp('2024-01-03 13:30:00'), Timestamp('2024-01-04 13:30:00')]
Columns: ['close_2349', 'volume_2349', 'bid_price_2349', 'ask_price_2349', 'flow_imbalance_2349', 'avg_spread_2349', 'avg_obi_2349', 'sum_price_volume_2349', 'sum_price_sq_volume_2349', 'pv_list_2349', 'monetary_delta_2349', 'volume_delta_2349', 'close_3050', 'volume_3050', 'bid_price_3050', 'ask_price_3050', 'flow_imbalance_3050', 'avg_spread_3050', 'avg_obi_3050', 'sum_price_volume_3050', 'sum_price_sq_volume_3050', 'pv_list_3050', 'monetary_delta_3050', 'volume_delta_3050', 'close_8104', 'volume_8104', 'bid_price_8104', 'ask_price_8104', 'flow_imbalance_8104', 'avg_spread_8104', 'avg_obi_8104', 'sum_price_volume_8104', 'sum_price_sq_volume_8104', 'pv_list_8104', 'monetary_delta_8104', 'volume_del

In [3]:
# Build per-product K bars, add cumulative VWAP statistics, and join all
# products side by side into df_all.
# presumably K_time is in minutes (1440 = one day) - confirm against convert_ohlcv
K_time = 1440
k = {code: combine_daily_k_bars(convert_ohlcv(dfk[code], K_time)) for code in CODE}

dfs = []
# NOTE: the loop variable rebinds the notebook-level name `df` that the
# previous cell assigned.
for code, df in k.items():
    df_renamed = df.copy()
    # Suffix every column with the product code so the frames can be joined.
    df_renamed.columns = [f"{col}_{code}" for col in df_renamed.columns]
    
    # Pull out the raw series that are needed.
    close = df_renamed[f'close_{code}']
    volume = df_renamed[f'volume_{code}']
    
    # Running (cumulative) sums over the whole history.
    acc_vol = volume.cumsum() # ΣV
    acc_pv = (close * volume).cumsum() # Σ(P×V)
    acc_p2v = (close**2 * volume).cumsum() # Σ(P²×V)

    # Volume-weighted mean and variance: E[P²] - E[P]², weighted by volume.
    vwap = acc_pv / acc_vol
    vwap_variance = acc_p2v / acc_vol - vwap**2
    # Clip at 0 before the square root so a negative variance never reaches sqrt.
    vwap_std = np.sqrt(vwap_variance.clip(lower=0))
    
    df_renamed[f'volume_acc_{code}'] = acc_vol
    df_renamed[f'vwap_{code}'] = vwap
    df_renamed[f'vwap_std_{code}'] = vwap_std
    
    dfs.append(df_renamed)

# Column-wise concatenation: one block of columns per product.
df_all = pd.concat(dfs, axis=1)

In [4]:
def circle_omega(series, period=None):
    """Describe the latest point of ``series`` on a displacement/velocity plane.

    The EMA of ``series`` is mapped to X (EMA minus its mean, "displacement")
    and Y (first difference of the EMA, "velocity"). Reading guide:

    - Y: is the quantity currently rising or falling
    - omega: how fast the phase is changing
    - curvature: whether a turn may be coming
    - theta: position inside the cycle

    Args:
        series: pandas Series to analyse.
        period: EMA period. When omitted, the module-level
            ``params['indicator_param']`` is used, which is what the original
            implementation always did.

    Returns:
        dict with the latest ``X``, ``Y``, ``theta`` (phase angle), ``omega``
        (last phase increment) and ``curvature``.

    Raises:
        IndexError: if ``series`` has fewer than two points.
    """
    if period is None:
        # Backward compatible fallback to the notebook-level configuration.
        period = params.get('indicator_param')

    # NOTE(review): NaN values of the EMA are replaced by 0 and therefore take
    # part in the mean and the first difference below - confirm this is intended.
    ma = talib.EMA(series, period).fillna(0)
    X = ma - ma.mean()  # displacement
    Y = ma.diff().fillna(0)  # velocity

    # Phase angle theta(t).
    theta = np.arctan2(Y, X)

    # Angular velocity omega(t) = d(theta)/dt. The phase must be unwrapped
    # BEFORE differencing; unwrapping the differences (as before) does not
    # remove the +-2*pi jumps and distorts the result.
    unwrapped_theta = np.unwrap(np.asarray(theta, dtype=float))
    omega = pd.Series(np.diff(unwrapped_theta))

    # Curvature (reflects the change in acceleration).
    dX = X.diff().fillna(0)
    dY = Y.diff().fillna(0)

    # Planar curvature formula: k = |x'y'' - y'x''| / ((x'^2 + y'^2)^(3/2)).
    # NOTE(review): the expression below is signed (no absolute value) and
    # combines the second-to-last X/Y with the last dX/dY - confirm against
    # the formula above.
    curvature_num = (Y.iloc[-2] * dX.iloc[-1] - X.iloc[-2] * dY.iloc[-1])
    # The small epsilon keeps the division finite when X and Y are both 0.
    curvature_den = ((Y.iloc[-2]**2 + X.iloc[-2]**2)**1.5 + 1e-9)
    curvature = curvature_num / curvature_den

    return {
        "X": X.iloc[-1],
        "Y": Y.iloc[-1],
        "theta": theta.iloc[-1],  # current phase angle
        "omega": omega.iloc[-1],  # angular velocity
        "curvature": curvature  # curvature (change in acceleration)
    }

def bias_ratio(series, period):
    """Return the latest percentage deviation of ``series`` from its EMA.

    Args:
        series: pandas Series of values.
        period: EMA period passed to ``talib.EMA``.

    Returns:
        ``(value - EMA) / EMA * 100`` evaluated at the last element.
    """
    ema = talib.EMA(series, period)
    deviation_pct = series.sub(ema).div(ema).mul(100)
    return deviation_pct.iloc[-1]

def corr_signal(df_all, **params):
    """Compute one indicator value per product over a rolling window.

    Keyword Args:
        k_lookback: Window length in rows.
        indicator: ``'bias'`` (EMA bias ratio) or ``'ov'`` (phase angle from
            ``circle_omega``). Any other value yields an empty result.
        col: Column prefix; the series read is ``f'{col}_{code}'``.
        codes: Product codes to evaluate.
        indicator_param: EMA period used by the ``'bias'`` indicator.

    Returns:
        DataFrame indexed by ``ts`` (the last timestamp of each window) with
        one column per code. Rows with a NaN for any code are dropped. When
        ``df_all`` has fewer than ``k_lookback`` rows the result is empty
        instead of raising ``IndexError``.
    """
    k_lookback = params.get('k_lookback')
    indicator = params.get('indicator')
    col = params.get('col')
    codes = params.get('codes')
    indicator_param = params.get('indicator_param')

    # The leading NaN padding rows were always removed by dropna() below, so
    # only real windows are generated; range() also makes the old bounds
    # check unnecessary.
    results = []
    for end in range(k_lookback, len(df_all) + 1):
        df_subset = df_all.iloc[end - k_lookback:end]

        score_row = {'ts': df_subset.index[-1]}
        for code in codes:
            if indicator == 'bias':
                series = df_subset[f'{col}_{code}'].dropna()
                score_row[code] = bias_ratio(series, indicator_param)
            elif indicator == 'ov':
                series = df_subset[f'{col}_{code}']
                score_row[code] = circle_omega(series)['theta']
        results.append(score_row)

    # Fixing the columns keeps the schema stable even when there are no rows.
    result_df = pd.DataFrame(results, columns=['ts', *codes]).set_index('ts')
    return result_df.dropna()

# Rolling bias ratio of the VWAP for every product.
params = {
    'k_lookback': 20,
    'col': 'vwap',
    'indicator_param': 5,
    'indicator': 'bias',
    'codes': CODE
}
bias = corr_signal(df_all, **params)

# Rolling phase angle of the accumulated volume for every product.
# NOTE: circle_omega reads the module-level `params`, so this reassignment
# must happen before the call below.
params = {
    'k_lookback': 20,
    'col': 'volume_acc',
    'indicator_param': 5,
    'indicator': 'ov',
    'codes': CODE
}
ov = corr_signal(df_all, **params)

# 1. Add suffixes so the two indicator sets can be told apart.
bias_renamed = bias.add_suffix('_bias')
ov_renamed = ov.add_suffix('_ov')

# 2. Merge: indicators side by side, then inner join with the K-bar frame so
#    only timestamps present on both sides are kept.
df_merged = pd.concat([bias_renamed, ov_renamed], axis=1)
df_final = pd.concat([df_all, df_merged], axis=1, join='inner')

In [8]:
import json, os, random, re, textwrap
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

# File that process_windows rewrites with the accumulated window summaries.
JSON_FILE = f'./history.json'

# Chat model client pointed at a locally served OpenAI-compatible endpoint.
llm = ChatOpenAI(
    base_url="http://localhost:8000/v1",  # address of your local server
    api_key="sk-no-key-required",  # placeholder when no API key is needed
    model="meta-llama/Llama-3.3-70B-Instruct",  # your Hugging Face model name
    temperature=0  # adjust as needed
)

# System prompt for step 1 (market-regime classification); used by
# market_trend. The text is dedented and stripped before use.
# NOTE(review): the regime rules inside the prompt are numbered 1, 2, 4, 5, 10
# - confirm whether the missing numbers are intentional.
system_prompt_step1 = textwrap.dedent("""
    你是一位專業的量化分析師，任務是分析多個商品的時間序列數據，判斷每個商品在分析期內的盤勢狀況。
    對每一個輸入的商品，**完全獨立、逐筆進行深度分析**，嚴禁將任何一筆商品的走勢套用到其他商品，即使形態類似，也必須指出差異。

    【數據說明】
    - 每個商品的數據是時間序列(由左到右表示從舊到新)
    - 指標包含：Bias(乖離率), OV(成交量偏離), VWAP_STD(VWAP標準差)
    
    【盤勢判斷規則】
    以下分類需根據Bias, OV, VWAP_STD 的數值變化判斷，而不是只看單一點
    1. **弱勢震盪**
       數值小幅度變動, Bias, OV, VWAP_STD數值幾乎沒變化。
       舉例如下：
        - Bias: [0.02,0.05,0.10,0.10,0.12,0.10,0.09,0.08,0.08,0.08,0.08,0.06,0.13,0.14,0.10,0.08,0.06,0.04,0.03,0.02,0.05]
        - OV: [0.02,0.03,0.03,0.03,0.03,0.03,0.02,0.02,0.02,0.02,0.03,0.03,0.04,0.04,0.03,0.03,0.02,0.02,0.01,0.01,0.02]
        - VWAP_STD: [2.53,2.52,2.52,2.52,2.51,2.50,2.50,2.50,2.50,2.49,2.48,2.48,2.47,2.46,2.46,2.45,2.45,2.45,2.45,2.44,2.43]
    
    2. **無量上漲 / 無量下跌**
       Bias和OV數值會從大數值變小後, Bias沒什麼變化但OV持續變小, 但VWAP_STD數值有持續的再上升沒回落。
       舉例如下：
        - Bias: [0.92,0.49,0.31,0.26,0.36,0.47,0.37,0.34,0.29,0.23,0.18,0.45]
        - OV: [0.23,0.18,0.12,0.11,0.11,0.11,0.08,0.07,0.07,0.05,0.05,0.08]
        - VWAP_STD: [0.59,0.59,0.59,0.58,0.60,0.63,0.63,0.65,0.65,0.65,0.65,0.73]
    
    4. **多頭趨勢**
       Bias和VWAP_STD數值持續上升, OV可能會有些許的回落但也會比相對初期的數值還大。
       舉例如下：
        - Bias: [0.13,0.14,0.35,2.77,2.86,2.76,4.47,5.45,8.97,8.87]
        - OV: [0.07,0.07,0.09,0.31,0.28,0.23,0.27,0.24,0.28,0.25]
        - VWAP_STD: [1.44,1.44,1.54,2.80,3.01,3.23,4.31,5.62,8.09,8.89]
    
    5. **多頭趨勢後期**
       Bias數值出現最大後會開始縮小, OV也開始轉小, VWAP_STD可能還持續維持在高數值。
       舉例如下：
        - Bias: [2.76,4.47,5.45,8.97,8.87,6.09]
        - OV: [0.23,0.27,0.24,0.28,0.25,0.17]
        - VWAP_STD: [3.23,4.31,5.62,8.09,8.89,8.89]

    10. **震盪整理**
       當Bias, OV, VWAP_STD曾經經歷過數值大幅度上升後, 數值都開始變小。
       舉例如下：
        - Bias: [0.66,0.68,0.72,1.48,3.19,2.86,2.27,1.68,1.18,0.85]
        - OV: [0.08,0.08,0.08,0.10,0.16,0.14,0.12,0.10,0.08,0.08]
        - VWAP_STD: [1.00,1.06,1.15,1.62,2.43,2.57,2.59,2.57,2.55,2.51]
    
    【輸出格式】
    使用上述規定的盤勢種類, 依照對每個商品的盤勢分析結果, 嚴格按照以下格式純文字輸出, 不要帶有json或其他格式（所有商品都要回覆）：
        [商品分析]
        商品xxx: 分析結果
        商品xxx: 分析結果
        商品xxx: 分析結果
        ...

        [總結]
        簡述整體市場狀況，並把相同走勢的商品彙整，例如：
        「商品xxx與商品yyy皆為震盪」

    【重要要求】
    1. 必須分析所有輸入的商品，不得遺漏。
    2. 並且分析到最新時刻(最右邊)。
    3. 必須綜合考慮 Bias, OV, VWAP_STD 三個指標。
    4. 判斷需依照整段時間序列的整體結構，不可只根據單點。
    5. 不需要對於指標數值的分析, 明確表示截至當前時刻商品屬於什麼盤勢。
""").strip()

# System prompt for step 2 (relative-strength ranking and pair-trade type);
# used by relative_strength. The text is dedented and stripped before use.
# NOTE(review): (1) the third bullet under 【限制】 opens `**` without closing
# it; (2) the regime names listed under 交易類型 (e.g. 多頭假趨勢, 無量盤整) are
# not among the categories defined in system_prompt_step1; (3) 範例1/範例2
# point to example data ("資料同你原本舉例") that is not included in this prompt.
system_prompt_step2 = textwrap.dedent("""
    你是一位頂尖的量化配對交易員，你的任務是從當前給予的時間序列資料和當前給予的盤勢變化, 以及給予整個族群的商品組, 進行分析並判斷誰相對領漲誰相對弱後。
    
    【限制】:
    1. **僅使用輸入數據**：
        - 只能使用輸入數據中提供的資料如K, Volume等。
        - **禁止引入任何非輸入數據中提供的指標或策略方法**。
        - 分析語言必須完全基於數據，不得包含策略、理論或技術分析方法描述。
    2. **你的所有分析都必須明確引用輸入數據中的「時間範圍」，將你的發現錨定在具體的時間上下文中, 並且在執行分析的時候, 必須使用分析期時間範圍作為時間軸, 審視時間上的輸入數據的變化。**
    3. **每組商品的資料都是由左至右表示時間由舊到新
    
    【數據結構定義】:
    數據格式如下:
    **分析期 (time_window)**: 時間範圍 (time_window), 當前所有數據的時間序列都基於此時間段, 你的所有分析都基於這個時期的數據變化。
    **舉例**:
        分析時間段為: 2024-01-31 00:00:00 to 2024-02-05 00:00:00
        - K (Close): [7.86, 7.83, 7.8, 7.81, 7.82, 7.78]
        - Volume: [619.0,704.0,779.0,499.0,539.0,687]
        
        則代表K的時間序列為從01/31至02/05為
                K       Volume
        01/31   7.86    619
        02/01   7.83    704
        02/02   7.8     779
        02/03   7.81    499
        02/04   7.82    539
        02/05   7.78    687
    **分析期時間範圍就會對應到時間序列陣列中逗號隔開的每個數值
    
    【任務】：
    **分析當前盤勢以及判斷當前的所有商品組內, 哪個商品有相對領漲的情況, 並由高到低排序出當前輸入的商品組的走勢誰高誰低**
    **尤其特別關注Volume的變化, 他是整體動能的關鍵**
    
    【請遵守以下規則】：
    **能量排序**
        **任務**: 先對當前的時間範圍內依照盤勢結果, 判斷當前是要使用哪種類型的配對交易, 再接著進行價格能量判斷, 誰領漲或誰領跌, 給予出一個商品之間的能量排序, 流程如下。
        a. **交易類型**:
            **觀察期**:
            - 使用時機: 當分析的盤勢是「多頭假趨勢，空頭假趨勢, 多頭趨勢初期, 空頭趨勢初期」, 截至目前還無法判定後續盤勢。
            - 結束時機: 當分析的盤勢是「多頭趨勢中期，空頭趨勢中期, 無量盤整價跌，無量盤整價增, 無量盤整」, 已確定當前的盤勢情形，依照後續結果決定當前行為。
        
            **動量反轉收斂型配對交易**:
            - 使用時機: 當分析的盤勢是「多頭趨勢後期，空頭趨勢後期」, 預計後續趨勢會進入後期。
            - 結束時機: 當分析的盤勢是「無量盤整價增，無量盤整價跌, 無量盤整」, 預計後續量縮將會盤整, 將在後續持續觀察, 並依照盤勢情況決定是否替換為「均值回歸發散型配對交易」。
        
            **均值回歸發散型配對交易**:
            - 使用時機: 當分析的盤勢是「無量盤整價跌，無量盤整價增, 無量盤整」, 預期價格會在一個區間內來回穿梭。
            - 結束時機: 當分析的盤勢是「多頭假趨勢，空頭假趨勢, 多頭趨勢初期, 空頭趨勢初期」, 預計後續可能會出量變趨勢, 將在後續持續觀察, 並依照盤勢情況決定是否替換為「動量反轉收斂型配對交易」。
            
        b. **商品組相對強弱排序（量價效率法，適用多頭、空頭、盤整）**
           核心原則：誰「每1張成交量換到最多價格變化」（漲最多或抗跌最多）＝當前最強。
           必須嚴格一步一步照以下格式計算，未來所有判斷都必須100%模仿下面4個完整範例。
           Step 1（決勝負，權重90%）：量價效率 = |期間漲跌幅%| ÷ 期間總成交量（保留8位小數）
           Step 2（僅差距 < 15%時看）：總漲跌幅%
           Step 3（僅差距 < 8%時看）：最近3根K的量價表現 + Bias最後值 + OV最後值
           最終排序永遠只輸出：當前商品量能排序為: [最強, 次強, 最弱]
           
           【範例1：多頭暴漲期】  
           （略）資料同你原本舉例1  
           Step 1：A 0.00018370　B 0.00008420　C 0.00028610 → C領先A 55.7%  
           最終：當前商品量能排序為: [C, A, B]
           
           【範例2：無量震盪盤整期】  
           （略）資料同你原本舉例2  
           Step 1：A 0.00001469　B 0.00019876　C 0.00006995 → B領先C 184.2%  
           最終：當前商品量能排序為: [B, C, A]
           
           【範例3：空頭暴跌期（真正抗跌者為最強）】
           商品A: K [58.0 → 42.5] 期間-26.72%　總量 1,280,000 → 效率 0.00020869  
           商品B: K [32.0 → 28.5] 期間-10.94%　總量 185,000   → 效率 0.00005914  
           商品C: K [95.0 → 68.0] 期間-28.42%　總量 980,000   → 效率 0.00028982  
           Step 1：C (0.00028982) > A (0.00020869) > B (0.00005914) （C最抗跌）  
           最終：當前商品量能排序為: [C, A, B]
           
           【範例4：空頭末段無量續跌（最不跌者為最強）】
           商品A: K [45.0 → 43.8] 期間-2.67%　總量 42,000 → 效率 0.00006357  
           商品B: K [28.0 → 24.5] 期間-12.50%　總量 68,000 → 效率 0.00018382  
           商品C: K [72.0 → 71.0] 期間-1.39%　總量 18,000 → 效率 0.00007722  
           Step 1：B用最多量才跌最多＝最弱；A與C差距小但A量更大 → A略強  
           最終：當前商品量能排序為: [A, C, B]
           
           重要提醒：
           - 多頭看誰漲最多最省力 → 最強
           - 空頭看誰跌最少最省力（或用同樣量跌最少）→ 最強
           - 盤整期同上，永遠只看「每1張換到多少價格位移」
           - 只要Step 1差距 ≥ 15%，直接按Step 1排序，Step 2/3可省略
    
    【輸出格式】：
    請針對【規則】的商品排序結果以及輸入的盤勢情形進行輸出, 格式如下:
    1. 當前時間段: (time_window)。
    2. 交易型態: 依照盤勢過程, 來輸出判斷過往的交易類型, 以及對最新時刻的盤勢判斷是否需要替換交易類型。
        - (0)觀察期不操作
        - (1)動量反轉收斂型配對交易
        - (2)均值回歸發散型配對交易
        輸出舉例: 0 -> 2 -> 0 -> 1
    
    3.能量排序: 依照【規則】的能量排序進行輸出。
        - 輸出舉例: [商品A, 商品B, ...等]
        
    請根據以上框架，對過往的數據進行盤勢分析和量能排序，確保所有結論都包含明確的時間參考, 依照輸出格式進行輸出。
    """).strip()

def clean_think_content(text: str) -> str:
    """Strip every ``<think>...</think>`` section and the surrounding whitespace.

    Args:
        text: Raw model output.

    Returns:
        ``text`` without any complete think section, trimmed on both ends.
        An opening tag with no closing tag is left in place.
    """
    # DOTALL lets a think section span several lines; the lazy quantifier
    # keeps separate sections from being merged into one match.
    think_section = re.compile(r"<think>.*?</think>", re.DOTALL)
    return think_section.sub("", text).strip()

def market_trend(input_text):
    """Ask the LLM to classify the market regime for the given window text.

    Args:
        input_text: Human message describing the window and its products.

    Returns:
        The model reply with think sections removed and whitespace trimmed.
    """
    # System message is the step-1 prompt; the human message is filled in
    # from the ``input_data`` variable at invoke time.
    messages = [
        ("system", system_prompt_step1),
        ("human", "{input_data}"),
    ]
    pipeline = ChatPromptTemplate.from_messages(messages) | llm
    reply = pipeline.invoke({"input_data": input_text})
    return clean_think_content(reply.content.strip())

def relative_strength(input_text, trend):
    """Ask the LLM to rank the products by relative strength.

    Args:
        input_text: Human message describing the window and its products.
        trend: Regime analysis text for the same window (for example the
            output of ``market_trend``). It is appended to the human message
            when non-empty. Previously it was accepted but never sent, although
            the step-2 system prompt asks the model to use the regime result.

    Returns:
        The model reply with think sections removed and whitespace trimmed.
    """
    chat_prompt = ChatPromptTemplate.from_messages([
        ("system", system_prompt_step2),
        ("human", "{input_data}")  # filled from the variable at invoke time
    ])
    chain = chat_prompt | llm

    # The text is passed as a variable value, not as part of the template, so
    # braces inside it need no escaping.
    human_input = input_text
    if trend:
        human_input = f"{input_text}\n\n**盤勢分析**:\n{trend}"

    response = chain.invoke({"input_data": human_input})
    return clean_think_content(response.content.strip())

def _format_product_block(window_df, code):
    """Render one product's VWAP_STD / Bias / OV series as a text block."""
    def compact(column):
        # Round to 2 decimals and drop spaces to keep the prompt short.
        return str(window_df[column].round(2).tolist()).replace(" ", "")

    vwap_std_str = compact(f'vwap_std_{code}')
    bias_str = compact(f'{code}_bias')
    ov_str = compact(f'{code}_ov')

    # Explicit "\n" joins keep the product line flush left and the data lines
    # indented by exactly four spaces.
    return (
        f"- 商品 {code}:\n"
        f"    - VWAP_STD: {vwap_std_str}\n"
        f"    - Bias: {bias_str}\n"
        f"    - OV: {ov_str}"
    )


# Sliding-window LLM analysis wrapped in a function.
def process_windows(df_all, CODE, json_file, k_lookback=10, N=5):
    """Run the step-1 trend analysis on every rolling window of ``df_all``.

    Args:
        df_all: DataFrame with a datetime-like index and, for each code, the
            columns ``vwap_std_<code>``, ``<code>_bias`` and ``<code>_ov``.
        CODE: Product codes to include in each prompt.
        json_file: Output path. An existing file is deleted first, and the
            file is rewritten in full after every analysed window so an
            interruption loses at most the window in progress.
        k_lookback: Number of rows per window.
        N: Unused; kept for backward compatibility.

    Returns:
        List of ``{"time_window", "trend_analysis"}`` dicts, one per window
        that was analysed successfully. A window whose LLM call fails is
        reported on stdout and skipped, so a failure can no longer reuse the
        previous window's result or raise ``NameError``.
    """
    if os.path.exists(json_file):  # start over when an old result exists
        print(f"找到舊檔 {json_file}，將其刪除並重新開始分析。")
        os.remove(json_file)

    # Always start from an empty list.
    summaries = []

    for i in range(0, len(df_all) - k_lookback + 1):
        window_df = df_all.iloc[i:i + k_lookback]

        # The time span belongs to the window, not to a product, so it is
        # computed once per window (and exists even when CODE is empty).
        start_time = window_df.index[0].strftime('%Y-%m-%d %H:%M:%S')
        end_time = window_df.index[-1].strftime('%Y-%m-%d %H:%M:%S')
        time_window = f"{start_time} to {end_time}"

        # Product blocks separated by one blank line.
        dynamic_products_content = "\n\n".join(
            _format_product_block(window_df, code) for code in CODE
        )

        # Built line by line: textwrap.dedent() on the already interpolated
        # text was a no-op because the product blocks start at column 0, which
        # left 16 spaces of indentation on every template line.
        input_prompt = "\n".join([
            "請分析以下時間窗口的數據：",
            f"**分析期**： {start_time} to {end_time}",
            "",
            "**商品組**:",
            dynamic_products_content,
            "",
            "請根據以上數據，使用「分析時間段分析出合適的資料」並依照規定進行輸出。",
        ])

        print(f"正在分析窗口：{time_window}...")
        try:
            trend_raw = market_trend(input_prompt)
        except Exception as e:
            # Best effort: report the failure and move on to the next window.
            print(f"Trend tool error: {e}")
            continue

        summaries.append({
            "time_window": time_window,
            "trend_analysis": clean_think_content(trend_raw),
        })

        # Persist everything collected so far after each window.
        with open(json_file, 'w', encoding='utf-8') as f:
            json.dump(summaries, f, ensure_ascii=False, indent=4)

        print(f"完成窗口：{time_window}")

    print("\n預處理完成，所有總結存於", json_file)
    return summaries

# Run the sliding-window trend analysis over df_final with 8-row windows and
# store the results in JSON_FILE. N is passed through but process_windows does
# not use it.
summaries = process_windows(df_final, CODE, JSON_FILE, k_lookback=8, N=5)

正在分析窗口：2024-01-31 00:00:00 to 2024-02-20 00:00:00...
完成窗口：2024-01-31 00:00:00 to 2024-02-20 00:00:00
正在分析窗口：2024-02-01 00:00:00 to 2024-02-21 00:00:00...
完成窗口：2024-02-01 00:00:00 to 2024-02-21 00:00:00
正在分析窗口：2024-02-02 00:00:00 to 2024-02-22 00:00:00...
完成窗口：2024-02-02 00:00:00 to 2024-02-22 00:00:00
正在分析窗口：2024-02-05 00:00:00 to 2024-02-23 00:00:00...
完成窗口：2024-02-05 00:00:00 to 2024-02-23 00:00:00
正在分析窗口：2024-02-15 00:00:00 to 2024-02-26 00:00:00...
完成窗口：2024-02-15 00:00:00 to 2024-02-26 00:00:00
正在分析窗口：2024-02-16 00:00:00 to 2024-02-27 00:00:00...
完成窗口：2024-02-16 00:00:00 to 2024-02-27 00:00:00
正在分析窗口：2024-02-19 00:00:00 to 2024-02-29 00:00:00...
完成窗口：2024-02-19 00:00:00 to 2024-02-29 00:00:00
正在分析窗口：2024-02-20 00:00:00 to 2024-03-01 00:00:00...
完成窗口：2024-02-20 00:00:00 to 2024-03-01 00:00:00
正在分析窗口：2024-02-21 00:00:00 to 2024-03-04 00:00:00...
完成窗口：2024-02-21 00:00:00 to 2024-03-04 00:00:00
正在分析窗口：2024-02-22 00:00:00 to 2024-03-05 00:00:00...
完成窗口：2024-02-22 00:00:00 to 2024-03-05

KeyboardInterrupt: 