In [9]:
import pandas as pd

In [10]:
# Load the interim dataset CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell
# only runs where this exact directory layout exists — consider a path relative
# to the project root (e.g. via a configurable DATA_DIR).
data = pd.read_csv("C:\\Users\\solba\\KERI-ERICA-2025-winter-project\\data\\interim\\dataset.csv")

In [11]:
# Display the loaded frame to inspect its columns and values.
data

Unnamed: 0,일자,충전방식,0시,1시,2시,3시,4시,5시,6시,7시,...,14시,15시,16시,17시,18시,19시,20시,21시,22시,23시
0,2020-01-01,급속,2800,3200,2320,1600,1520,2240,3120,3800,...,12040,11080,12320,11800,9800,8160,6880,6080,4360,3800
1,2020-01-01,완속,700,805,791,259,84,91,168,189,...,595,784,952,1085,1372,903,1218,1197,1148,1078
2,2020-01-02,급속,1400,1240,1000,680,880,1280,2080,4000,...,10720,11960,11280,11080,9840,10040,7800,5600,4800,3840
3,2020-01-02,완속,413,413,133,133,28,70,105,147,...,483,945,1064,1099,1554,1449,1253,1358,1386,1680
4,2020-01-03,급속,2240,1480,1240,640,920,1320,2320,4240,...,11120,11600,11720,11080,9560,9160,7640,6080,5520,4880
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3465,2024-09-28,완속,735,714,322,245,147,126,245,434,...,1232,1113,1484,1505,1946,1911,1792,1736,1694,1645
3466,2024-09-29,급속,2440,2240,1560,1520,2120,3320,5400,6920,...,16440,14640,15320,15520,13840,11160,10280,6400,5720,3520
3467,2024-09-29,완속,749,434,315,259,147,182,196,350,...,1253,1337,1498,1715,2107,1904,1820,1953,1351,924
3468,2024-09-30,급속,1960,1800,1200,1240,2640,4080,7280,9840,...,17960,20080,20480,20160,17280,14400,12240,8800,6840,4280


In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from scipy.stats import entropy

In [13]:
def preprocess_data(data):
    """Return a cleaned copy of the raw frame with proper column types.

    The '일자' column is parsed to datetimes and each of the 24 hourly
    columns ('0시' .. '23시') is converted to a numeric dtype. Values that
    cannot be parsed as numbers, as well as missing values, become 0.
    The input frame itself is left untouched.
    """
    cleaned = data.copy()
    cleaned['일자'] = pd.to_datetime(cleaned['일자'])

    # Build every converted hourly column first, then write them back.
    hour_labels = [f'{hour}시' for hour in range(24)]
    numeric_hours = {
        label: pd.to_numeric(cleaned[label], errors='coerce').fillna(0)
        for label in hour_labels
    }
    for label, converted in numeric_hours.items():
        cleaned[label] = converted

    return cleaned

In [14]:
# Step 1: parse dates and coerce the hourly columns to numeric values.
df_step1 = preprocess_data(data)
df_step1

Unnamed: 0,일자,충전방식,0시,1시,2시,3시,4시,5시,6시,7시,...,14시,15시,16시,17시,18시,19시,20시,21시,22시,23시
0,2020-01-01,급속,2800,3200,2320,1600,1520,2240,3120,3800,...,12040,11080,12320,11800,9800,8160,6880,6080,4360,3800
1,2020-01-01,완속,700,805,791,259,84,91,168,189,...,595,784,952,1085,1372,903,1218,1197,1148,1078
2,2020-01-02,급속,1400,1240,1000,680,880,1280,2080,4000,...,10720,11960,11280,11080,9840,10040,7800,5600,4800,3840
3,2020-01-02,완속,413,413,133,133,28,70,105,147,...,483,945,1064,1099,1554,1449,1253,1358,1386,1680
4,2020-01-03,급속,2240,1480,1240,640,920,1320,2320,4240,...,11120,11600,11720,11080,9560,9160,7640,6080,5520,4880
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3465,2024-09-28,완속,735,714,322,245,147,126,245,434,...,1232,1113,1484,1505,1946,1911,1792,1736,1694,1645
3466,2024-09-29,급속,2440,2240,1560,1520,2120,3320,5400,6920,...,16440,14640,15320,15520,13840,11160,10280,6400,5720,3520
3467,2024-09-29,완속,749,434,315,259,147,182,196,350,...,1253,1337,1498,1715,2107,1904,1820,1953,1351,924
3468,2024-09-30,급속,1960,1800,1200,1240,2640,4080,7280,9840,...,17960,20080,20480,20160,17280,14400,12240,8800,6840,4280


In [15]:
def add_daily_features(df):
    """Return a copy of ``df`` with per-row daily aggregate features added.

    Added columns:
        daily_total: sum of the 24 hourly columns ('0시' .. '23시').
        peak_ratio:  share of ``daily_total`` that falls in the columns
                     '8시' .. '19시' (presumably the 08:00-20:00 interval,
                     assuming each column holds the hour starting at that
                     time — TODO confirm against the data source).

    The input frame is not modified; earlier notebook variables that refer to
    it keep showing what they showed when they were displayed.
    """
    # Work on a copy so the caller's frame is not mutated as a side effect.
    out = df.copy()

    hourly_cols = [f'{i}시' for i in range(24)]
    out['daily_total'] = out[hourly_cols].sum(axis=1)

    # range(8, 20) selects hours 8..19 inclusive (12 columns).
    peak_cols = [f'{i}시' for i in range(8, 20)]
    # The small epsilon keeps the ratio finite (0) on days with zero total usage.
    out['peak_ratio'] = out[peak_cols].sum(axis=1) / (out['daily_total'] + 1e-9)

    return out

In [16]:
# Step 2: add the daily_total and peak_ratio columns.
df_step2 = add_daily_features(df_step1)
df_step2

Unnamed: 0,일자,충전방식,0시,1시,2시,3시,4시,5시,6시,7시,...,16시,17시,18시,19시,20시,21시,22시,23시,daily_total,peak_ratio
0,2020-01-01,급속,2800,3200,2320,1600,1520,2240,3120,3800,...,12320,11800,9800,8160,6880,6080,4360,3800,169120,0.753311
1,2020-01-01,완속,700,805,791,259,84,91,168,189,...,952,1085,1372,903,1218,1197,1148,1078,16387,0.528407
2,2020-01-02,급속,1400,1240,1000,680,880,1280,2080,4000,...,11280,11080,9840,10040,7800,5600,4800,3840,155960,0.778148
3,2020-01-02,완속,413,413,133,133,28,70,105,147,...,1064,1099,1554,1449,1253,1358,1386,1680,16240,0.561638
4,2020-01-03,급속,2240,1480,1240,640,920,1320,2320,4240,...,11720,11080,9560,9160,7640,6080,5520,4880,160400,0.759850
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3465,2024-09-28,완속,735,714,322,245,147,126,245,434,...,1484,1505,1946,1911,1792,1736,1694,1645,24003,0.590260
3466,2024-09-29,급속,2440,2240,1560,1520,2120,3320,5400,6920,...,15320,15520,13840,11160,10280,6400,5720,3520,221880,0.768163
3467,2024-09-29,완속,749,434,315,259,147,182,196,350,...,1498,1715,2107,1904,1820,1953,1351,924,23149,0.625038
3468,2024-09-30,급속,1960,1800,1200,1240,2640,4080,7280,9840,...,20480,20160,17280,14400,12240,8800,6840,4280,273000,0.772161


In [None]:
def extract_fft_features(daily_seq, top_k=3):
    """Extract frequency-domain features from a 1-D sequence of daily values.

    Args:
        daily_seq: 1-D array-like of numbers (the caller in this notebook
            passes a 28-day window of daily totals).
        top_k: number of strongest spectral components to report. Values
            <= 0 report none.

    Returns:
        dict with, in order:
            fft_amp_j / fft_freq_j for j in 0..top_k-1: amplitude and
                frequency (cycles per sample) of the j-th largest bin. If the
                spectrum has fewer than ``top_k`` bins, the remaining slots
                are filled with 0.0 so the feature schema does not depend on
                the window length.
            total_power: sum of squared amplitudes.
            spectral_entropy: Shannon entropy of the normalized power
                spectrum; 0.0 when the spectrum carries no power (constant
                input) instead of NaN.
            low_freq_power / high_freq_power: power in the lower / upper half
                of the rFFT bins.
    """
    seq = np.asarray(daily_seq, dtype=float)
    n = len(seq)

    # Remove the mean so the DC bin does not dominate the spectrum.
    detrended_seq = seq - np.mean(seq)
    fft_vals = np.fft.rfft(detrended_seq)
    amplitudes = np.abs(fft_vals)
    # Frequencies in cycles per sample, from 0 up to the Nyquist limit 0.5
    # (i.e. periods from n samples down to 2 samples).
    freqs = np.fft.rfftfreq(n)

    features = {}

    # Strongest components first. Slicing the reversed ordering (rather than
    # using [-top_k:]) avoids the [-0:] pitfall that would return every bin.
    k = max(int(top_k), 0)
    ranked = np.argsort(amplitudes)[::-1][:k]
    for j in range(k):
        if j < len(ranked):
            idx = ranked[j]
            features[f'fft_amp_{j}'] = amplitudes[idx]
            features[f'fft_freq_{j}'] = freqs[idx]
        else:
            # Fewer bins than requested: placeholder for a non-existent component.
            features[f'fft_amp_{j}'] = 0.0
            features[f'fft_freq_{j}'] = 0.0

    # Power spectrum and its total.
    psd = amplitudes ** 2
    total_power = np.sum(psd)
    features['total_power'] = total_power

    # Spectral entropy: how evenly the power is spread across bins.
    if total_power == 0:
        # A constant window has an all-zero spectrum; scipy's entropy would
        # normalize 0/0 and return NaN, so report "no uncertainty" instead.
        features['spectral_entropy'] = 0.0
    else:
        psd_norm = psd / (total_power + 1e-9)
        features['spectral_entropy'] = entropy(psd_norm)

    # Band power: split the bins in half by index (lower half includes bin 0).
    mid_idx = len(psd) // 2
    features['low_freq_power'] = np.sum(psd[:mid_idx])
    features['high_freq_power'] = np.sum(psd[mid_idx:])

    return features

def build_sliding_window_samples(type_df, lookback=28, horizon=7, top_k=3):
    """Build one feature/target record per sliding window for a single charging type.

    Rows are sorted by '일자'. For every position t that has ``lookback`` rows
    of history (t included) and ``horizon`` rows after it, a dict is produced
    holding time-domain statistics of the history window, FFT features of its
    'daily_total' values, and the target 'y_next7_total' (sum of 'daily_total'
    over the next ``horizon`` rows). Features only use rows up to and
    including t; the target only uses rows after t.

    Returns a list of dicts (empty when there are too few rows).
    """
    ordered = type_df.sort_values('일자').reset_index(drop=True)
    records = []

    first_end = lookback - 1                  # earliest t with a full history window
    end_stop = len(ordered) - horizon         # exclusive bound: t needs `horizon` rows after it

    for end_pos in range(first_end, end_stop):
        # History covers positions [t - lookback + 1, t]; future covers (t, t + horizon].
        history = ordered.iloc[end_pos - lookback + 1 : end_pos + 1]
        future = ordered.iloc[end_pos + 1 : end_pos + 1 + horizon]
        totals = history['daily_total'].values

        # Row at the window end supplies the date and charging-type labels.
        anchor = ordered.iloc[end_pos]
        end_date = anchor['일자']

        record = {
            'window_end_date': end_date,
            'charging_type': anchor['충전방식'],
            'y_next7_total': future['daily_total'].sum(),
            'month': end_date.month,
            'dayofweek': end_date.dayofweek,
            'mean_28d': np.mean(totals),
            'std_28d': np.std(totals),
            'last_day_usage': totals[-1],
            'peak_ratio_mean': history['peak_ratio'].mean()
        }

        # Append the frequency-domain features after the time-domain ones.
        record.update(extract_fft_features(totals, top_k=top_k))
        records.append(record)

    return records

def build_train_df(df, lookback=28, horizon=7, top_k=3):
    """Build the training table by windowing each charging type separately.

    Sliding-window samples are generated independently for every distinct
    value of '충전방식' (so windows never mix charging types), concatenated,
    and returned as a DataFrame sorted by window end date and charging type
    with a fresh 0..N-1 index. Chronological ordering matters for
    time-ordered validation splits downstream.
    """
    rows = [
        sample
        for kind in df['충전방식'].unique()
        for sample in build_sliding_window_samples(
            df[df['충전방식'] == kind], lookback, horizon, top_k
        )
    ]

    return (
        pd.DataFrame(rows)
        .sort_values(['window_end_date', 'charging_type'])
        .reset_index(drop=True)
    )

In [48]:
# Build the supervised table: 28-row lookback windows, 7-row-ahead target sum,
# and the 3 strongest FFT components per window.
train_df = build_train_df(df_step2, lookback=28, horizon=7, top_k=3)


{'window_end_date': Timestamp('2020-01-28 00:00:00'), 'charging_type': '급속', 'y_next7_total': np.int64(990920), 'month': 1, 'dayofweek': 1, 'mean_28d': np.float64(155388.57142857142), 'std_28d': np.float64(14572.043124295458), 'last_day_usage': np.int64(150280), 'peak_ratio_mean': np.float64(0.7630127995921322), 'fft_amp_0': np.float64(186271.4651363041), 'fft_freq_0': np.float64(0.14285714285714285), 'fft_amp_1': np.float64(130537.5917536912), 'fft_freq_1': np.float64(0.03571428571428571), 'fft_amp_2': np.float64(110619.24316352539), 'fft_freq_2': np.float64(0.10714285714285714), 'total_power': np.float64(83239945600.0), 'spectral_entropy': np.float64(1.7825133873906565), 'low_freq_power': np.float64(69936722353.58743), 'high_freq_power': np.float64(13303223246.41257)}
{'window_end_date': Timestamp('2020-01-29 00:00:00'), 'charging_type': '급속', 'y_next7_total': np.int64(1011120), 'month': 1, 'dayofweek': 2, 'mean_28d': np.float64(154417.14285714287), 'std_28d': np.float64(14530.843761

In [19]:
# Display the generated training table.
train_df

Unnamed: 0,window_end_date,charging_type,y_next7_total,month,dayofweek,mean_28d,std_28d,last_day_usage,peak_ratio_mean,fft_amp_0,fft_freq_0,fft_amp_1,fft_freq_1,fft_amp_2,fft_freq_2,total_power,spectral_entropy,low_freq_power,high_freq_power
0,2020-01-28,급속,990920,1,1,155388.571429,14572.043124,150280,0.763013,186271.465136,0.142857,130537.591754,0.035714,110619.243164,0.107143,8.323995e+10,1.782513,6.993672e+10,1.330322e+10
1,2020-01-28,완속,117663,1,1,16744.750000,2044.146689,16856,0.570670,26775.368308,0.142857,17723.322530,0.035714,12523.750723,0.107143,1.640356e+09,1.800538,1.343571e+09,2.967851e+08
2,2020-01-29,급속,1011120,1,2,154417.142857,14530.843761,141920,0.763327,166505.216155,0.142857,157512.250348,0.035714,99766.614180,0.107143,8.310286e+10,1.833672,6.898599e+10,1.411686e+10
3,2020-01-29,완속,119497,1,2,16771.750000,2044.235844,17143,0.571938,27525.878572,0.142857,17028.333516,0.035714,12775.817843,0.107143,1.642430e+09,1.773266,1.353154e+09,2.892762e+08
4,2020-01-30,급속,1038720,1,3,153880.000000,14740.354716,140920,0.762532,172473.420486,0.035714,151525.945685,0.142857,87855.156519,0.107143,8.523132e+10,1.889306,7.034628e+10,1.488504e+10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3397,2024-09-21,완속,173964,9,5,26173.500000,1679.780120,22722,0.624278,17915.737722,0.071429,15080.888750,0.142857,13538.730993,0.035714,1.117420e+09,2.051529,9.002299e+08,2.171901e+08
3398,2024-09-22,급속,1872200,9,6,277685.714286,31020.313555,215280,0.764605,297602.396448,0.071429,286365.140430,0.142857,265007.623917,0.035714,3.814305e+11,2.046286,2.755320e+11,1.058985e+11
3399,2024-09-22,완속,173803,9,6,26112.250000,1750.327023,23310,0.625054,17890.042702,0.071429,16733.344380,0.142857,15180.022092,0.035714,1.221912e+09,2.057880,1.000746e+09,2.211658e+08
3400,2024-09-23,급속,1875440,9,0,276635.714286,30772.249724,269760,0.764421,306873.189880,0.142857,292763.319662,0.071429,289371.585779,0.035714,3.731515e+11,1.944911,2.924825e+11,8.066901e+10


In [20]:
def make_time_series_folds(train_df, n_splits=5, gap=0):
    """Create date-based time-series cross-validation folds.

    Splitting is done on the sorted unique values of 'window_end_date' rather
    than on row positions: several rows can share one date (one per charging
    type), and a row-position split could put rows of the same date on both
    sides of the train/validation boundary.

    Args:
        train_df: frame with a 'window_end_date' column.
        n_splits: number of folds passed to ``TimeSeriesSplit``.
        gap: number of unique dates dropped from the end of each training set
            before the validation set starts (forwarded to
            ``TimeSeriesSplit``). The default 0 keeps the folds contiguous.
            NOTE(review): when each row's target spans several days after
            'window_end_date' (e.g. a 7-day-ahead sum), the last training
            targets overlap the validation period unless ``gap`` is at least
            horizon - 1 (6 for a 7-day horizon, assuming consecutive daily
            dates) — consider passing it explicitly.

    Returns:
        list of ``(train_indices, val_indices)`` tuples, each a list of index
        labels of ``train_df``.
    """
    date_col = train_df['window_end_date']
    unique_dates = np.sort(date_col.unique())

    tscv = TimeSeriesSplit(n_splits=n_splits, gap=gap)
    folds = []

    for train_date_idx, val_date_idx in tscv.split(unique_dates):
        # Map the selected dates back to every row carrying one of them.
        train_mask = date_col.isin(unique_dates[train_date_idx]).to_numpy()
        val_mask = date_col.isin(unique_dates[val_date_idx]).to_numpy()

        folds.append((
            train_df.index[train_mask].tolist(),
            train_df.index[val_mask].tolist(),
        ))

    return folds

# Build 5 date-based folds and report how many rows land in each split.
cv_folds = make_time_series_folds(train_df, n_splits=5)
for i, fold in enumerate(cv_folds):
    f_train, f_val = fold
    print(f"Fold {i+1}: Train size={len(f_train)}, Val size={len(f_val)}")

Fold 1: Train size=572, Val size=566
Fold 2: Train size=1138, Val size=566
Fold 3: Train size=1704, Val size=566
Fold 4: Train size=2270, Val size=566
Fold 5: Train size=2836, Val size=566
