In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")

# === Step 1. Load data ===
# NOTE: machine-specific absolute path to the zipped consolidated CSV.
df = pd.read_csv(
    r"C:\Users\ns243\Documents\Academic\AI Master\Internship\Data\SP500_Consolidated.zip",
    compression="zip"
)

# === Step 2. Standardize column names ===
# Only "Ticker" needs renaming; "date" already carries the expected name.
df = df.rename(columns={"Ticker": "code"})
# Unparseable dates become NaT (errors="coerce"); all timestamps are UTC-aware.
df["date"] = pd.to_datetime(df["date"], utc=True, errors="coerce")

# === Step 3. Compute daily return ===
# pct_change() compares each row with the previous row of the same ticker, so
# rows must be in chronological order within each ticker. Sort explicitly
# instead of relying on the order of the input file (the label step sorts by
# ticker and date in the same way before shifting).
df = df.sort_values(["code", "date"]).reset_index(drop=True)
df["change"] = df.groupby("code")["Adj Close"].pct_change()

# === Step 4. Helper functions ===
def extreme_MAD(df: pd.DataFrame, n: float = 3) -> pd.DataFrame:
    """Clip every column of *df* to ``median ± n * MAD``.

    MAD is the (unscaled) median of the absolute deviations from the column
    median. Values outside the band are replaced by the nearest bound; the
    input frame is not modified.

    Args:
        df: Numeric frame; each column is treated independently.
        n: Half-width of the accepted band, in multiples of the MAD.

    Returns:
        A new frame with the same shape and labels as *df*.
    """
    center = df.quantile(0.5)
    abs_deviation = df.sub(center, axis=1).abs()
    half_width = n * abs_deviation.quantile(0.5)
    # axis=1 aligns the per-column bounds with the column labels.
    return df.clip(lower=center - half_width, upper=center + half_width, axis=1)

def Na_Fill(df: pd.DataFrame, method='ffill') -> pd.DataFrame:
    """Fill missing values by propagating neighbouring observations.

    Uses ``DataFrame.ffill`` / ``DataFrame.bfill`` rather than
    ``fillna(method=...)``, which is deprecated in recent pandas releases.

    Args:
        df: Frame whose NaN values should be filled column by column.
        method: ``'ffill'``/``'pad'`` to carry the last valid value forward,
            or ``'bfill'``/``'backfill'`` to carry the next valid value back.

    Returns:
        A new frame; values with no valid neighbour in the fill direction
        (e.g. leading NaN under forward fill) stay NaN.

    Raises:
        ValueError: If *method* is not one of the supported names.
    """
    if method in ("ffill", "pad"):
        return df.ffill()
    if method in ("bfill", "backfill"):
        return df.bfill()
    raise ValueError(
        f"Invalid fill method. Expecting pad (ffill) or backfill (bfill). Got {method}"
    )

def Stock_Pool_split(df: pd.DataFrame, min_days: int = 252) -> pd.DataFrame:
    """Keep only tickers with at least *min_days* complete OHLCV rows.

    Rows missing any of Open/High/Low/Close/Adj Close/Volume are dropped
    first; the remaining rows are counted per ``code``.

    Args:
        df: Frame with a ``code`` column and the six price/volume columns.
        min_days: Minimum number of complete rows a ticker must have.

    Returns:
        The surviving rows, grouped by ticker (in sorted ticker order) with a
        fresh 0..n-1 index. If no ticker qualifies, an empty frame with the
        same columns is returned instead of raising.
    """
    required_cols = ["Open", "High", "Low", "Close", "Adj Close", "Volume"]
    complete = df.dropna(subset=required_cols)
    valid_groups = [
        group for _, group in complete.groupby("code") if len(group) >= min_days
    ]
    if not valid_groups:
        # pd.concat([]) raises ValueError; an empty result is the natural
        # outcome of a filter that nothing passes.
        return complete.iloc[0:0].reset_index(drop=True)
    return pd.concat(valid_groups, ignore_index=True)

def Standard_process(df: pd.DataFrame) -> pd.DataFrame:
    """Winsorize, forward-fill and z-score the feature columns per stock.

    For each ``stock_code`` group the feature columns are, in order:
    clipped to ``median ± 3 * MAD``, forward-filled, and standardized with a
    ``StandardScaler`` fitted on that stock alone. Columns outside
    ``feature_cols`` are passed through unchanged.

    NOTE: the clipping bounds and scaler statistics are computed over each
    stock's entire history present in *df*, i.e. later rows influence how
    earlier rows are scaled.

    Args:
        df: Frame with a ``stock_code`` column and the eight feature columns.
            It is not modified.

    Returns:
        A new frame with rows grouped by stock and a fresh 0..n-1 index.
        Forward fill cannot fill NaN at the start of a group, so such values
        may remain NaN in the result. An empty frame with the same columns
        is returned when *df* has no groups.
    """
    feature_cols = ["open", "high", "low", "close", "factor",
                    "change", "volume", "money"]
    out = []
    for _, group in df.groupby("stock_code"):
        # Work on an explicit copy so the assignments below never write into
        # (or warn about writing into) a slice of the caller's frame.
        group = group.copy()
        group[feature_cols] = extreme_MAD(group[feature_cols], n=3)
        group[feature_cols] = Na_Fill(group[feature_cols], method='ffill')
        group[feature_cols] = StandardScaler().fit_transform(group[feature_cols])
        out.append(group)
    if not out:
        # pd.concat([]) raises ValueError; return an empty frame instead.
        return df.iloc[0:0].reset_index(drop=True)
    return pd.concat(out, ignore_index=True)

# === Step 5. Apply filtering ===
# Drop incomplete rows and tickers with too little history.
df_filtered = Stock_Pool_split(df)

# === Step 6. Select and rename columns ===
# Source column -> output column, listed in output order.
column_map = {
    "date": "date",
    "code": "stock_code",
    "Open": "open",
    "High": "high",
    "Low": "low",
    "Close": "close",
    "Adj Close": "factor",              # factor proxy
    "change": "change",                 # daily return
    "Volume": "volume",                 # trading volume
    "MarketCap": "money",               # proxy for trading amount
    "SharesOutstanding": "shares_out",  # outstanding shares
}
columns = list(column_map)
# Selecting with a list yields a new frame, so df_filtered is left untouched.
df_sel = df_filtered[columns].rename(columns=column_map)

# === Step 7. Standardize modeling features only ===
df_final = Standard_process(df_sel)

# === Step 8. Save normalized feature dataset ===
output_dir = r"C:\Users\ns243\Documents\Academic\AI Master\Internship\Data"
feature_path = os.path.join(output_dir, "df_sp500.csv")
df_final.to_csv(feature_path, index=False)

# Short summary of what was written.
print("✅ Features saved:", feature_path)
print("Final dataset shape:", df_final.shape)
print("Unique stocks:", df_final["stock_code"].nunique())

# === Step 9. Generate LABEL0 (1-day forward return, correctly aligned) ===
print("Generating LABEL0 (1-day future return)...")

# Labels come from df_sel (the selected, un-standardized columns), ordered
# chronologically within each ticker. sort_values returns a new frame, so
# df_sel itself is not modified.
price_df = df_sel.sort_values(['stock_code', 'date'])

# LABEL0[t] = close[t+1] / close[t] - 1, computed within each ticker.
# groupby(...).shift() keeps the original row index, so the result aligns with
# price_df directly; no apply()/reset_index() juggling is needed and the
# outcome does not depend on how a given pandas version indexes apply() output.
# NOTE(review): this uses the unadjusted 'close', while the 'change' feature
# is derived from 'Adj Close' — confirm that is intended.
next_close = price_df.groupby('stock_code')['close'].shift(-1)
price_df['LABEL0'] = next_close / price_df['close'] - 1

# Pivot to (dates × tickers)
label_df = price_df.pivot(index='date', columns='stock_code', values='LABEL0')

# Align LABEL0 with feature date range (optional)
feature_dates = pd.to_datetime(sorted(df_final['date'].unique()))
label_df = label_df.loc[label_df.index.isin(feature_dates)]

# Important: do NOT forward-fill the label. The last available day of each
# ticker has no next-day close, so its label must stay NaN.

# Save LABEL0
label_path = os.path.join(output_dir, "LABEL0.csv")
label_df.to_csv(label_path)

print("✅ LABEL0 saved:", label_path)
print("LABEL0 shape:", label_df.shape)




✅ Features saved: C:\Users\ns243\Documents\Academic\AI Master\Internship\Data\df_sp500.csv
Final dataset shape: (1231178, 11)
Unique stocks: 503
Generating LABEL0 (1-day future return)...
✅ LABEL0 saved: C:\Users\ns243\Documents\Academic\AI Master\Internship\Data\LABEL0.csv
LABEL0 shape: (2515, 503)
