In [2]:
import pandas as pd
import numpy as np
import os
import sys

# --- 0. Paths and configuration ---
# The project root is the parent of the current working directory; all inputs
# and the output live in its "processed" sub-directory.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
DATA_DIR = os.path.join(PROJECT_ROOT, "processed")

# [Input 1] Phase 1-C result (Price + Fin + Macro)
MASTER_FILE = os.path.join(DATA_DIR, "master_c_result.csv")
# [Input 2] IMO event file
IMO_FILE = os.path.join(DATA_DIR, "imo_all_with_regstage.csv")
# [Input 3] Plan B orders file (new input)
ORDERS_FILE = os.path.join(DATA_DIR, "orders_processed_plan_b.csv")

# [Output] final master table with all 12 channels (saved under a new name)
FINAL_MASTER_FILE = os.path.join(DATA_DIR, "final_master_table_v2.csv")

# Announce which files are about to be combined, one per line.
print(
    f"Loading base file: {MASTER_FILE}",
    f"Merging IMO file: {IMO_FILE}",
    f"Merging Orders (Plan B) file: {ORDERS_FILE}",
    sep="\n",
)

# --- 1. Load the [Phase 1-C] result ---
# Only the read itself is guarded. A missing file aborts with a non-zero exit
# status; a bare sys.exit() would report success (status 0) to the caller.
try:
    master_df = pd.read_csv(MASTER_FILE, parse_dates=['date'])
except FileNotFoundError:
    print(f"\n[!!!] Error: {MASTER_FILE}을 찾을 수 없습니다.")
    sys.exit(1)

# Force tickers to str so they are not kept as integers parsed from the CSV.
master_df['ticker'] = master_df['ticker'].astype(str)

# --- 2. [Phase 1-B] Merge IMO events ---
# Only the read is guarded. A missing file aborts with a non-zero exit status;
# a bare sys.exit() would report success (status 0) to the caller.
try:
    imo_df = pd.read_csv(IMO_FILE, parse_dates=['date_iso'])
except FileNotFoundError:
    print(f"\n[!!!] Error: {IMO_FILE}을 찾을 수 없습니다.")
    sys.exit(1)

# Keep only finalized regulations and flag each of them with an impulse of 1.
imo_df_finalized = imo_df[imo_df['reg_stage'] == 'finalized_regulation'].copy()
imo_df_finalized['imo_event_impulse'] = 1
# Collapse multiple events on the same day into a single row per date so the
# left merge below cannot duplicate master rows.
imo_df_finalized = imo_df_finalized.groupby('date_iso')['imo_event_impulse'].max().reset_index()
imo_df_finalized = imo_df_finalized.rename(columns={'date_iso': 'date'})
# The merge key is the date only, so every ticker row on an event date gets
# the same flag; dates without an event become NaN and are filled with 0.
master_df = pd.merge(master_df, imo_df_finalized, on='date', how='left')
master_df['imo_event_impulse'] = master_df['imo_event_impulse'].fillna(0)
print("  > Merged IMO event data (impulse)")

# --- 3. [Phase 1-B] Merge order (Plan B) events ---
# Only the read is guarded. A missing file aborts with a non-zero exit status;
# a bare sys.exit() would report success (status 0) to the caller.
try:
    orders_df = pd.read_csv(ORDERS_FILE, parse_dates=['date'])
except FileNotFoundError:
    print(f"\n[!!!] Error: {ORDERS_FILE}을 찾을 수 없습니다.")
    sys.exit(1)

# Match the key dtype used on the master side of the merge.
orders_df['ticker'] = orders_df['ticker'].astype(str)

# Left merge on (date, ticker) keeps every master row.
# NOTE(review): if the orders file can hold several rows for the same
# (date, ticker), this merge duplicates the matching master rows - confirm the
# file is unique on that key.
master_df = pd.merge(master_df, orders_df, on=['date', 'ticker'], how='left')

# Days without an order event come out of the merge as NaN; treat them as 0.
master_df['new_order_event_impulse'] = master_df['new_order_event_impulse'].fillna(0)
print("  > Merged Orders (Plan B) event data (impulse)")

# --- 4. [Phase 1-D] Final derived variables ---
print("  > Creating final derived variables...")

# 4-1. Log transforms and daily return.
# Both logs are log(1 + x); ret_1d is the per-ticker first difference of
# close_log, so the frame is sorted by (ticker, date) before differencing.
master_df['close_log'] = np.log1p(master_df['close'])
master_df['trading_volume_log'] = np.log1p(master_df['trading_volume'])
master_df = master_df.sort_values(by=['ticker', 'date'])
master_df['ret_1d'] = master_df.groupby('ticker')['close_log'].diff(1)

# 4-2. Order-count "stair" channel: for each ticker, the number of order
# impulses in the calendar month, written onto every row of that month.
# The month key is a monthly Period instead of pd.Grouper(freq='M'), because
# the 'M' offset alias is deprecated in pandas 2.2+, while the 'M' period
# frequency groups by the same calendar month on old and new versions.
# NOTE(review): every day of a month carries the full-month total, so rows
# early in the month include orders that happen later in it - confirm this
# look-ahead is intended for downstream modelling.
order_month = master_df['date'].dt.to_period('M').rename('order_month')
master_df['new_order_count_stair'] = master_df.groupby(
    ['ticker', order_month]
)['new_order_event_impulse'].transform('sum')

# Forward-fill within each ticker, then zero-fill whatever is still missing.
master_df['new_order_count_stair'] = master_df.groupby('ticker')['new_order_count_stair'].ffill()
master_df['new_order_count_stair'] = master_df['new_order_count_stair'].fillna(0)
print("  > Created 'new_order_count_stair' (Plan B)")


# 4-3. IMO event decay channel
def calculate_decay(group, impulse_col, decay_factor=0.95):
    """Return an exponentially decaying trace of an impulse column.

    Rows are processed in their existing order. The running value starts at
    0.0; at each row it is multiplied by ``decay_factor`` and then replaced by
    the row's impulse when the impulse is larger.

    Args:
        group: DataFrame holding the rows of one group, already in the order
            the decay should follow.
        impulse_col: Name of the column in ``group`` with the impulse values.
        decay_factor: Multiplier applied to the running value at every row.

    Returns:
        A float Series indexed like ``group`` with the decayed values.
    """
    decayed_values = []
    last_decay_value = 0.0
    # Read the single column directly; iterrows() would build a full Series
    # per row just to look up one value.
    for impulse in group[impulse_col].tolist():
        # Argument order matters: with a NaN impulse, max() keeps the decayed
        # value, so a missing impulse behaves like "no event".
        last_decay_value = max(last_decay_value * decay_factor, impulse)
        decayed_values.append(last_decay_value)
    return pd.Series(decayed_values, index=group.index, dtype=float)

# Build the decay channel one ticker at a time and stitch the pieces together.
# An explicit loop is used instead of DataFrameGroupBy.apply because apply
# infers the output shape from what the function returns: with a single ticker
# it can hand back a wide DataFrame rather than a Series, and newer pandas
# versions warn about apply operating on the grouping column.
imo_decay_parts = [
    calculate_decay(ticker_rows, 'imo_event_impulse')
    for _, ticker_rows in master_df.groupby('ticker', sort=False)
]
# Each part keeps its rows' original index labels, so the assignment aligns on
# the index regardless of the order in which the tickers were processed.
master_df['imo_event_decay'] = (
    pd.concat(imo_decay_parts) if imo_decay_parts else pd.Series(dtype=float)
)
print("  > Calculated IMO event decay channel")

# --- 5. Select the final columns and save ---
# Final channel list (revised for Plan B): the two key columns followed by the
# 12 numbered channels.
FINAL_CHANNELS = ['date', 'ticker'] + [
    'close_log',                # 1
    'ret_1d',                   # 2
    'trading_volume_log',       # 3
    'roe',                      # 4
    'real_debt_ratio',          # 5
    'new_order_event_impulse',  # 6 (Plan B)
    'new_order_count_stair',    # 7 (Plan B)
    'bdi_proxy',                # 8
    'wti',                      # 9
    'newbuild_proxy_2015_100',  # 10
    'imo_event_impulse',        # 11
    'imo_event_decay',          # 12
]

# Take just the final channels and drop every row that still has a NaN in any
# of them; the result is a new frame, independent of master_df.
final_master_table = master_df.loc[:, FINAL_CHANNELS].dropna()

# Persist without the index column.
final_master_table.to_csv(FINAL_MASTER_FILE, index=False)

print(
    "\n[SUCCESS] Phase 1 (Data Layer) 'True' Final Version 완료!",
    "12채널이 모두 포함된 'final_master_table_v2.csv'을 저장했습니다.",
    f"  -> {FINAL_MASTER_FILE}",
    sep="\n",
)
print("\n--- Final Table Info ---")
final_master_table.info()

Loading base file: /workspace/ship-ai/data/processed/master_c_result.csv
Merging IMO file: /workspace/ship-ai/data/processed/imo_all_with_regstage.csv
Merging Orders (Plan B) file: /workspace/ship-ai/data/processed/orders_processed_plan_b.csv
  > Merged IMO event data (impulse)
  > Merged Orders (Plan B) event data (impulse)
  > Creating final derived variables...
  > Created 'new_order_count_stair' (Plan B)
  > Calculated IMO event decay channel

[SUCCESS] Phase 1 (Data Layer) 'True' Final Version 완료!
12채널이 모두 포함된 'final_master_table_v2.csv'을 저장했습니다.
  -> /workspace/ship-ai/data/processed/final_master_table_v2.csv

--- Final Table Info ---
<class 'pandas.core.frame.DataFrame'>
Int64Index: 5336 entries, 1566 to 1476
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   date                     5336 non-null   datetime64[ns]
 1   ticker                   5336 non-null   object      