In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import ta

In [None]:
# sMAPE 계산 함수
def smape(a, f):
    return 100 / len(a) * np.sum(2 * np.abs(f - a) / (np.abs(a) + np.abs(f)))

# 결과 저장용 딕셔너리
results = {}

# 종목별 학습 및 평가
folder_path = 'sp500_stocks'
for filename in os.listdir(folder_path):
    if not filename.endswith('.csv'):
        continue

    symbol = filename.replace('.csv', '')
    path = os.path.join(folder_path, filename)

    try:
        df = pd.read_csv(path).dropna()

        # 기술적 지표 추가
        df['MA20'] = ta.trend.sma_indicator(df['Close'], window=20)
        bb = ta.volatility.BollingerBands(df['Close'], window=20, window_dev=2)
        df['Upper'] = bb.bollinger_hband()
        df['Lower'] = bb.bollinger_lband()
        df['RSI'] = ta.momentum.RSIIndicator(df['Close'], window=14).rsi()
        df.dropna(inplace=True)

        # 데이터 준비
        features = ['Open', 'High', 'Low', 'Close', 'Volume', 'MA20', 'Upper', 'Lower', 'RSI']
        scaler = MinMaxScaler()
        scaled_data = scaler.fit_transform(df[features])

        sequence_length = 50
        X, y = [], []
        for i in range(len(scaled_data) - sequence_length):
            X.append(scaled_data[i:i+sequence_length])
            y.append(scaled_data[i+sequence_length][features.index('Close')])
        X = np.array(X)
        y = np.array(y)

        # 훈련/테스트 분리
        split = int(0.8 * len(X))
        X_train, X_test = X[:split], X[split:]
        y_train, y_test = y[:split], y[split:]

        # 모델 구성
        model = Sequential([
            LSTM(64, return_sequences=True, input_shape=(X.shape[1], X.shape[2])),
            Dropout(0.3),
            LSTM(32),
            Dropout(0.3),
            Dense(1)
        ])
        model.compile(optimizer='adam', loss='mse')
        early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

        model.fit(X_train, y_train, epochs=50, batch_size=32,
                  validation_split=0.2, callbacks=[early_stop], verbose=0)

        # 예측 및 역변환
        pred = model.predict(X_test)
        close_index = features.index('Close')
        y_full = np.zeros((len(y_test), len(features)))
        pred_full = np.zeros((len(pred), len(features)))
        y_full[:, close_index] = y_test
        pred_full[:, close_index] = pred.flatten()
        true_rescaled = scaler.inverse_transform(y_full)[:, close_index]
        pred_rescaled = scaler.inverse_transform(pred_full)[:, close_index]

        # sMAPE 저장
        smape_val = smape(true_rescaled, pred_rescaled)
        results[symbol] = smape_val
        print(f"{symbol} 완료 - sMAPE: {smape_val:.2f}%")

    except Exception as e:
        print(f"{symbol} 실패: {e}")


In [None]:
# 기존 결과 정리
results_df = pd.DataFrame(sorted(results.items(), key=lambda x: x[1]), columns=['Symbol', 'sMAPE'])

# 5% 단위로 구간 설정 (최대 100%까지)
bins = np.arange(0, 105, 5)  
labels = [f'{i}~{i+5}%' for i in bins[:-1]]


results_df['sMAPE_Group'] = pd.cut(results_df['sMAPE'], bins=bins, labels=labels, right=False)

print("\n--- Top 10 종목 (sMAPE 낮은 순) ---")
print(results_df.sort_values('sMAPE').head(10))

print("\n--- sMAPE 오차 범위별 종목 개수 (5% 단위) ---")
print(results_df['sMAPE_Group'].value_counts().sort_index())

# 1~5%  : 72.5%
# 5~10% : 20.8% 


--- Top 10 종목 (sMAPE 낮은 순) ---
  Symbol     sMAPE sMAPE_Group
0   GILD  1.076772        0~5%
1     MO  1.226468        0~5%
2    CPB  1.253405        0~5%
3    KMB  1.372075        0~5%
4   FOXA  1.563924        0~5%
5    FOX  1.567218        0~5%
6    LYB  1.599551        0~5%
7      K  1.638793        0~5%
8     PM  1.648253        0~5%
9      O  1.671763        0~5%

--- sMAPE 오차 범위별 종목 개수 (5% 단위) ---
sMAPE_Group
0~5%       359
5~10%      103
10~15%      24
15~20%       7
20~25%       0
25~30%       0
30~35%       1
35~40%       0
40~45%       0
45~50%       0
50~55%       0
55~60%       0
60~65%       0
65~70%       0
70~75%       0
75~80%       0
80~85%       0
85~90%       0
90~95%       0
95~100%      0
Name: count, dtype: int64
