# 02 — Feature Engineering & Labeling
สร้างฟีเจอร์พื้นฐานจาก SET50 (returns + lags) และกำหนด Label สำหรับการพยากรณ์วันถัดไป

In [1]:

# --- Setup ---
%pip -q install yfinance pandas numpy matplotlib ta scikit-learn


  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for ta (setup.py) ... [?25l[?25hdone


In [3]:

import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

INDEX_TICKER = "^SET.BK"
START = "2015-01-01"
END   = "2025-08-22"

# Option A) Load from previous step if available
try:
    idx = pd.read_csv("idx_clean.csv", parse_dates=["Date"], index_col="Date")
except Exception as e:
    print("idx_clean.csv not found, downloading via yfinance ...")
    idx = yf.download(INDEX_TICKER, start=START, end=END)
    idx = idx[['Open','High','Low','Close','Close','Volume']].ffill()

print(idx.head())


  idx = yf.download(INDEX_TICKER, start=START, end=END)
[*********************100%***********************]  1 of 1 completed

idx_clean.csv not found, downloading via yfinance ...
Price              Open         High          Low        Close               \
Ticker          ^SET.BK      ^SET.BK      ^SET.BK      ^SET.BK      ^SET.BK   
Date                                                                          
2015-01-05  1499.699951  1502.099976  1482.869995  1483.250000  1483.250000   
2015-01-06  1467.969971  1478.349976  1459.219971  1477.579956  1477.579956   
2015-01-07  1478.839966  1500.829956  1478.260010  1500.750000  1500.750000   
2015-01-08  1514.900024  1523.319946  1514.180054  1521.619995  1521.619995   
2015-01-09  1529.500000  1531.209961  1522.750000  1529.420044  1529.420044   

Price         Volume  
Ticker       ^SET.BK  
Date                  
2015-01-05   7129200  
2015-01-06  31337800  
2015-01-07  12400600  
2015-01-08  18000300  
2015-01-09  32278600  





In [7]:
# --- Build features (robust to MultiIndex) ---
import pandas as pd

df = idx.copy()

# เลือกราคาปิดที่เป็นซีรีส์เดียว (ถ้ามี 'Adj Close' ใช้ตัวนั้นก่อน, ไม่มีก็ใช้ 'Close')
if "Adj Close" in df.columns:
    close = df["Adj Close"]
elif "Close" in df.columns:
    close = df["Close"]
else:
    # กรณีเป็น MultiIndex (เช่นจากการโหลดหลายสัญลักษณ์)
    # พยายามดึงเลเวลแรกชื่อ 'Adj Close' หรือ 'Close'
    if isinstance(df.columns, pd.MultiIndex):
        if "Adj Close" in df.columns.get_level_values(0):
            close = df["Adj Close"]
        else:
            close = df["Close"]
    else:
        raise ValueError("No 'Close' or 'Adj Close' column found")

# ถ้า close ยังเป็น DataFrame หลายคอลัมน์ (เช่นหลาย ticker) ให้เลือกคอลัมน์แรก หรือเลือกชื่อที่ต้องการ
if isinstance(close, pd.DataFrame):
    # เลือกคอลัมน์แรก; ถ้าต้องการสัญลักษณ์เฉพาะ ให้ใช้: close = close['^SET50'] หรือ close['PTT.BK']
    close = close.iloc[:, 0]

close.name = "Close"

# คำนวณผลตอบแทน
df = pd.DataFrame({"Close": close})
df["Return"] = df["Close"].pct_change()

# Lag features (ใช้ข้อมูลอดีตเท่านั้น)
for k in [1, 2, 3, 5, 10]:
    df[f"Lag{k}"] = df["Return"].shift(k)

# Rolling stats
df["Vol_5"] = df["Return"].rolling(5).std()
df["EMA_10"] = df["Close"].ewm(span=10, adjust=False).mean()
df["EMA_20"] = df["Close"].ewm(span=20, adjust=False).mean()
df["EMA_gap"] = (df["EMA_10"] - df["EMA_20"]) / df["EMA_20"]

# Label: next-day return
df["Target"] = df["Return"].shift(-1)

# ลบ NaN จากการ shift/rolling
df = df.dropna().copy()
print("Columns used:", df.columns.tolist())
df.tail()


Columns used: ['Close', 'Return', 'Lag1', 'Lag2', 'Lag3', 'Lag5', 'Lag10', 'Vol_5', 'EMA_10', 'EMA_20', 'EMA_gap', 'Target']


Unnamed: 0_level_0,Close,Return,Lag1,Lag2,Lag3,Lag5,Lag10,Vol_5,EMA_10,EMA_20,EMA_gap,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2025-08-14,1266.670044,-0.008423,0.014582,-0.004806,0.000538,0.014283,0.013581,0.010646,1250.811978,1225.293954,0.020826,-0.005724
2025-08-15,1259.420044,-0.005724,-0.008423,0.014582,-0.004806,0.014042,0.008479,0.009176,1252.377081,1228.544058,0.019399,-0.013586
2025-08-18,1242.310059,-0.013586,-0.005724,-0.008423,0.014582,0.000538,-0.001439,0.010718,1250.546713,1229.855106,0.016824,-0.005272
2025-08-19,1235.76001,-0.005272,-0.013586,-0.005724,-0.008423,-0.004806,-0.019334,0.010734,1247.858221,1230.417478,0.014175,0.01001
2025-08-20,1248.130005,0.01001,-0.005272,-0.013586,-0.005724,0.014582,0.009086,0.008811,1247.907637,1232.104385,0.012826,-0.002676


In [8]:

# --- Train/Val/Test split (time-based, no shuffle) ---
features = [c for c in df.columns if c not in ["Open","High","Low","Close","Adj Close","Volume","Target"]]
X = df[features]
y = df["Target"]

# 70% train, 15% val, 15% test (sequential split)
n = len(df)
n_train = int(n*0.7)
n_val   = int(n*0.85)

X_train, y_train = X.iloc[:n_train], y.iloc[:n_train]
X_val,   y_val   = X.iloc[n_train:n_val], y.iloc[n_train:n_val]
X_test,  y_test  = X.iloc[n_val:], y.iloc[n_val:]

print("Shapes:", X_train.shape, X_val.shape, X_test.shape)

# Save processed dataset
proc = pd.concat([X, y], axis=1)
proc.to_csv("dataset_features_labels.csv")
print("Saved dataset_features_labels.csv")


Shapes: (1795, 10) (385, 10) (385, 10)
Saved dataset_features_labels.csv


### Note: Scaling without leakage (ถ้าจำเป็น)
ถ้าจะใช้สเกลเลอร์ (เช่น MinMax/Standard) ให้ `fit` เฉพาะ **train** แล้ว `transform` val/test เท่านั้น:

In [9]:

# Example template (commented out)
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler().fit(X_train)   # fit on train only
# X_train_s = scaler.transform(X_train)
# X_val_s   = scaler.transform(X_val)
# X_test_s  = scaler.transform(X_test)


> ✅ ผลลัพธ์ที่ควรได้: dataset_features_labels.csv + ชุด train/val/test ที่แบ่งตามเวลาอย่างถูกต้อง