In [None]:
import pandas as pd
import numpy as np

# Number of consecutive events that make up one sequence window.
WINDOW_SIZE = 3
# Input event log; it is parsed below as tab-separated text.
FILE_PATH = 'icn500개.csv'

try:
    df = pd.read_csv(FILE_PATH, sep='\t')
except FileNotFoundError:
    print(f"오류: '{FILE_PATH}' 파일을 찾을 수 없습니다.")
    # Stop with a non-zero status so callers can detect the failure.
    # The bare ``exit()`` helper is injected by ``site`` for interactive use
    # and, called without arguments, reports success (status 0).
    raise SystemExit(1) from None

# Parse timestamps so events can be ordered chronologically and their
# time-of-day components read later on.
df['event_time'] = pd.to_datetime(df['event_time'])

# Distinct event types in order of first appearance. Missing values are
# dropped so they do not become a vocabulary entry of their own; anything
# absent from the vocabulary is mapped to '<UNK>' when sequences are encoded.
unique_event_types = df['event_type'].dropna().unique().tolist()

# Index 0 is reserved for padding and index 1 for unknown event types.
event_type_vocab = ['<PAD>', '<UNK>'] + unique_event_types
event_type_to_idx = {etype: i for i, etype in enumerate(event_type_vocab)}
print("--- 생성된 단어장 (사전) ---")
print(f"총 단어 수: {len(event_type_to_idx)}")
print(event_type_to_idx)
print("-" * 35)

# Build fixed-length windows of consecutive events for every EPC code.
# Each element of ``sequence_set_text`` is a DataFrame with WINDOW_SIZE rows
# and the columns location_id / event_type / event_time.
sequence_set_text = []
grouped = df.groupby('epc_code')

for epc, group_df in grouped:
    # Order the group's events chronologically. A stable sort (mergesort) is
    # requested so that events sharing the same timestamp keep their original
    # file order instead of being ordered arbitrarily by the default quicksort.
    sorted_group = group_df.sort_values(by='event_time', kind='mergesort').reset_index(drop=True)
    processed_group = sorted_group[['location_id', 'event_type', 'event_time']]
    if len(processed_group) < WINDOW_SIZE:
        # Too few events for a full window: left-pad with placeholder rows
        # (location 0, '<PAD>' event type, NaT timestamp) up to WINDOW_SIZE.
        pad_len = WINDOW_SIZE - len(processed_group)
        padding_df = pd.DataFrame({
            'location_id': [0] * pad_len,
            'event_type': ['<PAD>'] * pad_len,
            'event_time': pd.to_datetime([pd.NaT] * pad_len),
        })
        padded_sequence = pd.concat([padding_df, processed_group], ignore_index=True)
        sequence_set_text.append(padded_sequence)
    else:
        # Slide a window of WINDOW_SIZE rows over the group with stride 1.
        num_windows = len(processed_group) - WINDOW_SIZE + 1
        for i in range(num_windows):
            window = processed_group.iloc[i : i + WINDOW_SIZE].reset_index(drop=True)
            sequence_set_text.append(window)

def time_to_tan(t):
    """Encode a timestamp's time of day as ``tan((fraction_of_day - 0.5) * pi)``.

    The time of day is taken from the hour, minute and second fields and
    expressed as a fraction of 24 hours. Missing timestamps (NaT/NaN, as used
    for padding rows) return 0.0.

    NOTE(review): noon also maps to 0.0, so padding rows are indistinguishable
    from noon events in this feature. At exactly midnight the argument is
    -pi/2, where the floating-point tangent has a magnitude on the order of
    1e16 -- confirm the downstream model tolerates such values.
    """
    if pd.isna(t):
        return 0.0
    seconds_in_day = 24 * 60 * 60
    seconds_from_midnight = t.hour * 3600 + t.minute * 60 + t.second
    normalized_time = seconds_from_midnight / seconds_in_day
    return np.tan((normalized_time - 0.5) * np.pi)


# Numeric version of every window: event types become vocabulary indices and
# timestamps become the ``time_tan`` feature.
sequence_set_processed = []

unk_idx = event_type_to_idx['<UNK>']

for seq_df in sequence_set_text:
    # Work on a copy so the text-form windows stay untouched.
    df_copy = seq_df.copy()

    # Event types missing from the vocabulary map to NaN and are then
    # replaced by the '<UNK>' index before casting to int.
    mapped_series = df_copy['event_type'].map(event_type_to_idx)
    df_copy['event_type'] = mapped_series.fillna(unk_idx).astype(int)

    df_copy['time_tan'] = df_copy['event_time'].apply(time_to_tan)

    final_df = df_copy[['location_id', 'event_type', 'time_tan']]
    sequence_set_processed.append(final_df)

# Report how many sequences were produced and show the first few of them.
print(f"윈도우 크기 (WINDOW_SIZE): {WINDOW_SIZE}")
print(f"총 {len(sequence_set_processed)}개의 시퀀스(DataFrame)가 최종 변환되었습니다.\n")

print("--- 모델 입력용 최종 데이터 예시 ---")

# Preview at most the first five sequences.
preview_sequences = sequence_set_processed[:5]
last_preview_index = len(preview_sequences) - 1

for i, seq_df in enumerate(preview_sequences):
    print(f"\n시퀀스 {i+1}:")
    print(seq_df)
    # Separator goes between previews only; deriving the bound from the
    # preview length avoids a trailing separator when fewer than five
    # sequences exist.
    if i < last_preview_index:
        print("-" * 35)

--- 생성된 단어장 (사전) ---
총 단어 수: 12
{'<PAD>': 0, '<UNK>': 1, 'HUB_Outbound': 2, 'R_Stock_Inbound': 3, 'R_Stock_Outbound': 4, 'POS_Sell': 5, 'W_Stock_Inbound': 6, 'HUB_Inbound': 7, 'Aggregation': 8, 'WMS_Inbound': 9, 'WMS_Outbound': 10, 'W_Stock_Outbound': 11}
-----------------------------------
윈도우 크기 (WINDOW_SIZE): 3
총 2825개의 시퀀스(DataFrame)가 최종 변환되었습니다.

--- 모델 입력용 최종 데이터 예시 ---

시퀀스 1:
   location_id  event_type  time_tan
0            1           8 -0.188726
1            5           9 -0.125812
2            5          10  0.759156
-----------------------------------

시퀀스 2:
   location_id  event_type  time_tan
0            5           9 -0.125812
1            5          10  0.759156
2           13           7 -3.441089
-----------------------------------

시퀀스 3:
   location_id  event_type  time_tan
0            5          10  0.759156
1           13           7 -3.441089
2           13           2  0.177752
-----------------------------------

시퀀스 4:
   location_id  event_type  time_tan
