In [6]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os

# Register tqdm's progress_apply helpers on pandas objects.
tqdm.pandas()
# Small constant added to denominators / log arguments to avoid division by
# zero and log(0).
EPS = 1e-8

# ------------------------
# 1) Load raw CSV
# ------------------------
data_path = r"C:\Users\ns243\Documents\Academic\AI Master\Internship\Data\df_sp500.csv"
# Read the file, parse the 'date' column to datetimes, and order the rows by
# stock and then by date.
# NOTE(review): the recorded notebook output shows a warning attributed to the
# to_datetime call -- presumably about date-format inference; consider passing
# an explicit format= once the CSV's date format is confirmed.
df = (
    pd.read_csv(data_path)
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values(['stock_code', 'date'])
)

# ------------------------
# 2) Set MultiIndex for group operations
# ------------------------
# Index by (date, stock_code) and sort on that index, so rows end up ordered
# by date first and by stock within each date.
df = df.set_index(['date', 'stock_code']).sort_index()

# ------------------------
# 3) Prepare feature container
# ------------------------
# Empty frame that shares df's index; every feature column is assigned into it
# so that feature rows stay aligned with the raw rows.
features = pd.DataFrame(index=df.index)

# Convenience handles on the raw price / volume / turnover columns.
close_p, high_p, low_p, open_p, volume_p, money_p = (
    df[name] for name in ('close', 'high', 'low', 'open', 'volume', 'money')
)

# ------------------------
# 4) Returns and log returns
# ------------------------
# RET_n:    percentage change of close over n rows, within each stock.
# LOGRET_n: log(close + EPS) minus log of the same stock's close n rows
#           earlier (+ EPS).
close_by_stock = df.groupby('stock_code')['close']
log_close = np.log(df['close'] + EPS)
for n in [1, 2, 3, 5, 10, 20, 60, 120]:
    features[f'RET_{n}'] = close_by_stock.transform(lambda s: s.pct_change(n))
    lagged_log_close = np.log(close_by_stock.shift(n) + EPS)
    features[f'LOGRET_{n}'] = log_close - lagged_log_close

# ------------------------
# 5) Rolling statistics
# ------------------------
windows = [5, 10, 20, 60, 120]

# (feature prefix, source column, rolling aggregation), listed in the order
# the columns are added to `features` for each window.
rolling_specs = (
    ('ROLLMEAN', 'close', 'mean'),
    ('ROLLSTD', 'close', 'std'),
    ('ROLLMAX', 'high', 'max'),
    ('ROLLMIN', 'low', 'min'),
)
for w in tqdm(windows, desc="Rolling stats"):
    for prefix, column, stat in rolling_specs:
        # Per-stock rolling aggregate over the last w rows.
        features[f'{prefix}_{w}'] = df.groupby('stock_code')[column].transform(
            lambda x: getattr(x.rolling(w), stat)()
        )

# ------------------------
# 6) RSI indicators
# ------------------------
rsi_windows = [6, 12, 24]
for w in tqdm(rsi_windows, desc="RSI calc"):
    def rsi_calc(x):
        """Return 100 - 100 / (1 + RS) for one stock's close series.

        RS is the w-row rolling mean of positive moves divided by the w-row
        rolling mean of negative moves (as positive numbers) plus EPS. `w` is
        read from the enclosing loop at call time.
        """
        change = x.diff()
        up_avg = change.clip(lower=0).rolling(w).mean()
        down_avg = (-change.clip(upper=0)).rolling(w).mean()
        strength = up_avg / (down_avg + EPS)
        return 100 - (100 / (1 + strength))

    # Applied immediately, so the closure sees the current window length.
    features[f'RSI_{w}'] = df.groupby('stock_code')['close'].transform(rsi_calc)

# ------------------------
# 7) Bollinger Bands
# ------------------------
# Bands sit two rolling standard deviations either side of the rolling mean of
# close, both computed per stock; the width is upper minus lower.
for w in tqdm(windows, desc="Bollinger Bands"):
    band_source = df.groupby('stock_code')['close']
    centre = band_source.transform(lambda s: s.rolling(w).mean())
    offset = 2 * band_source.transform(lambda s: s.rolling(w).std())
    upper, lower = centre + offset, centre - offset
    features[f'BOLL_UP_{w}'] = upper
    features[f'BOLL_DOWN_{w}'] = lower
    features[f'BOLL_WIDTH_{w}'] = upper - lower

# ------------------------
# 8) Volume indicators
# ------------------------
# Per-stock rolling mean / std of 'volume' and rolling mean of 'money'.
volume_by_stock = df.groupby('stock_code')['volume']
money_by_stock = df.groupby('stock_code')['money']
for w in tqdm(windows, desc="Volume stats"):
    features[f'VOL_MEAN_{w}'] = volume_by_stock.transform(lambda s: s.rolling(w).mean())
    features[f'VOL_STD_{w}'] = volume_by_stock.transform(lambda s: s.rolling(w).std())
    features[f'VOL_MONEY_MEAN_{w}'] = money_by_stock.transform(lambda s: s.rolling(w).mean())

# ------------------------
# 9) Momentum indicators
# ------------------------
# MOM_n is the raw difference between close and the same stock's close n rows
# earlier.
momentum_lags = [1, 2, 3, 5, 10, 20, 60, 120]
momentum_source = df.groupby('stock_code')['close']
for n in tqdm(momentum_lags, desc="Momentum"):
    features[f'MOM_{n}'] = momentum_source.transform(lambda s: s.diff(n))

# ------------------------
# 10) High-low, open-close, and range-based cross-sectional features
# ------------------------
# Every rolling quantity in this section is computed per stock. The frame is
# indexed and sorted by (date, stock_code), so a plain Series.rolling() would
# slide over neighbouring tickers on the same date instead of over one
# ticker's history -- which is what the ungrouped VWAP sums used to do.
by_stock = df.groupby('stock_code')
# Close minus open for each row; it does not depend on the window length.
oc_diff = df['close'] - df['open']
for w in tqdm(windows, desc="Cross-sectional features"):
    high_roll = by_stock['high'].transform(lambda x: x.rolling(w).max())
    low_roll = by_stock['low'].transform(lambda x: x.rolling(w).min())
    features[f'HL_RANGE_{w}'] = high_roll - low_roll
    # NOTE(review): OC_DIFF_{w} is the same series for every w; the column
    # names are kept so the set of output files does not change. Confirm
    # whether a windowed statistic was intended.
    features[f'OC_DIFF_{w}'] = oc_diff
    features[f'CLOSE_STD_{w}'] = by_stock['close'].transform(lambda x: x.rolling(w).std())
    # Rolling traded value over rolling traded volume, per stock.
    money_sum = by_stock['money'].transform(lambda x: x.rolling(w).sum())
    volume_sum = by_stock['volume'].transform(lambda x: x.rolling(w).sum())
    features[f'VWAP_PROXY_{w}'] = money_sum / (volume_sum + EPS)

# ------------------------
# 11) Additional alpha-style features (Amihud, turnover proxies, etc.)
# ------------------------
# |change| divided by (money + EPS), then averaged over all rows of each
# stock, so every row of a stock carries the same AMIHUD value.
# NOTE(review): the mean spans the stock's whole history, including later
# dates -- confirm that this is intended for the downstream use.
illiquidity = df['change'].abs() / (df['money'] + EPS)
stock_labels = df.index.get_level_values(1)
features['AMIHUD'] = illiquidity.groupby(stock_labels).transform('mean')

# Per-stock extremes over the trailing 252 rows.
year_window = 252
features['52W_HIGH'] = df.groupby('stock_code')['high'].transform(
    lambda s: s.rolling(year_window).max()
)
features['52W_LOW'] = df.groupby('stock_code')['low'].transform(
    lambda s: s.rolling(year_window).min()
)

# ------------------------
# 12) Drop constant or all-NaN columns
# ------------------------
# First discard columns with no values at all, then keep only the columns
# that take more than one distinct non-NaN value.
features = features.dropna(axis='columns', how='all')
has_variation = features.nunique() > 1
features = features.loc[:, has_variation]



# ------------------------
# 13) Save each feature to separate CSV
# ------------------------
out_path = r"C:\Users\ns243\Documents\Academic\AI Master\Internship\Data\alpha158"

# create folder if it does not exist
os.makedirs(out_path, exist_ok=True)

for col in tqdm(features.columns, desc="Saving features"):
    # Pivot the innermost index level (stock_code) into columns, leaving one
    # row per date.
    df_col = features[col].unstack()
    # Carry the last available value forward down each stock's column.
    # DataFrame.ffill() replaces the deprecated fillna(method='ffill'), which
    # emitted a warning on every iteration.
    df_col = df_col.ffill()
    df_col.to_csv(os.path.join(out_path, f'{col}.csv'))

print("All Alpha158 features generated and saved.")



  df['date'] = pd.to_datetime(df['date'])
Rolling stats: 100%|██████████| 5/5 [00:07<00:00,  1.42s/it]
RSI calc: 100%|██████████| 3/3 [00:02<00:00,  1.15it/s]
Bollinger Bands: 100%|██████████| 5/5 [00:03<00:00,  1.31it/s]
Volume stats: 100%|██████████| 5/5 [00:05<00:00,  1.17s/it]
Momentum: 100%|██████████| 8/8 [00:02<00:00,  3.15it/s]
Cross-sectional features: 100%|██████████| 5/5 [00:07<00:00,  1.57s/it]
  df_col = df_col.fillna(method='ffill')
Saving features: 100%|██████████| 100/100 [02:01<00:00,  1.21s/it]

All Alpha158 features generated and saved.



