<a href="https://colab.research.google.com/github/maitry-ml/ml-indian-equity-portfolio/blob/main/DSAI_FIN_ASSGN2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install yfinance

# 1. MARKET DATA

In [None]:
import yfinance as yf
import pandas as pd

# Yahoo Finance symbols of the six stocks in the study.
tickers = [
    "RELIANCE.NS",
    "HDFCBANK.NS",
    "INFY.NS",
    "BHARTIARTL.NS",
    "HINDUNILVR.NS",
    "M&M.NS",
]

# One batched request; each ticker's OHLCV frame is then read as raw[ticker].
raw = yf.download(
    tickers,
    start="2020-01-01",
    end="2025-12-31",
    auto_adjust=True,
    group_by="ticker",
)

# Wide → long: one frame per ticker, tagged with its symbol, fixed column order.
OHLCV_COLUMNS = ["Date", "Stock", "Open", "High", "Low", "Close", "Volume"]
all_data = [
    raw[symbol].reset_index().assign(Stock=symbol)[OHLCV_COLUMNS]
    for symbol in tickers
]

panel = pd.concat(all_data, ignore_index=True)
panel.columns.name = None

# Report how many rows lack a Close price before they are removed.
close_is_missing = panel["Close"].isna()
missing_rows = close_is_missing.sum()
total_rows = len(panel)
print(f"Total rows        : {total_rows}")
print(f"Rows with no data : {missing_rows}")
print("Per stock breakdown:")
print(close_is_missing.groupby(panel["Stock"]).sum().rename("Missing Rows"))
print()

# Keep only rows that actually carry a Close price.
market_data = panel.dropna(subset=["Close"])

print(market_data.head(10))
print("Shape after dropping:", market_data.shape)

In [None]:
market_data

# 2. MACRO INDICATORS


In [None]:
# ── STEP 2: Macro Indicators  ─────────────────────────────────
import io
import requests
import yfinance as yf
import pandas as pd
import requests

# The calendar starts one day before 2020-01-01 so the 1-day lag applied at
# the end still leaves a value for the first day of the study window.
START = "2019-12-31"
END   = "2025-12-31"
daily_idx = pd.date_range(START, END, freq="D")

# ── 2a. USD-INR and Crude Oil via yfinance ────────────────────────────
macro_tickers = {
    "USDINR"  : "INR=X",
    "CrudeOil": "CL=F",
}

macro_raw = yf.download(
    list(macro_tickers.values()),
    start=START, end=END,
    auto_adjust=True,
    group_by="ticker"
)

# One Close series per indicator, renamed to its friendly column name.
macro_frames = []
for col_name, ticker in macro_tickers.items():
    s = macro_raw[ticker]["Close"].copy()
    s.name = col_name
    macro_frames.append(s)

macro_df = pd.concat(macro_frames, axis=1)
macro_df.index = pd.to_datetime(macro_df.index)
# Expand to every calendar day; non-trading days carry the last observation.
macro_df = macro_df.reindex(daily_idx).ffill()

# ── 2b. India 10Y Bond Yield via FRED ───────
fred_url = (
    "https://fred.stlouisfed.org/graph/fredgraph.csv"
    "?id=INDIRLTLT01STM"
)
try:
    # A timeout stops a stalled connection from hanging the notebook, and
    # raise_for_status() reports HTTP errors directly instead of letting an
    # error page be parsed as CSV.
    resp = requests.get(fred_url, timeout=30)
    resp.raise_for_status()
    in10y = pd.read_csv(io.StringIO(resp.text),
                        parse_dates=["observation_date"],
                        index_col="observation_date")
    in10y.columns = ["IN10Y"]
    # "." is replaced by NaN before the float conversion.
    in10y = in10y.replace(".", float("nan")).astype(float)
    in10y = in10y.reindex(daily_idx).ffill()
    # Same daily index as macro_df, so a positional assignment lines up.
    macro_df["IN10Y"] = in10y["IN10Y"].values
    print("India 10Y loaded from FRED")
except (requests.RequestException, ValueError, KeyError) as e:
    # Best effort: the notebook continues, but macro_df then has no "IN10Y"
    # column.
    print(f"⚠️  FRED fetch failed: {e}")

# ── 2c. CPI (monthly → daily) ─────────────────────────────────────────
# NOTE(review): each value is stamped on the first day of its reference month
# and the table ends at 2025-06 (later days are forward-filled). Presumably the
# figure is published some weeks after that date, so the 1-day lag below may
# not fully remove look-ahead for this series — confirm against release dates.
cpi_data = {
    "2020-01": 7.59, "2020-02": 6.58, "2020-03": 5.84, "2020-04": 7.22,
    "2020-05": 5.84, "2020-06": 6.09, "2020-07": 6.93, "2020-08": 6.69,
    "2020-09": 7.27, "2020-10": 7.61, "2020-11": 6.93, "2020-12": 4.59,
    "2021-01": 4.06, "2021-02": 5.03, "2021-03": 5.52, "2021-04": 4.23,
    "2021-05": 6.30, "2021-06": 6.26, "2021-07": 5.59, "2021-08": 5.30,
    "2021-09": 4.35, "2021-10": 4.48, "2021-11": 4.91, "2021-12": 5.59,
    "2022-01": 6.01, "2022-02": 6.07, "2022-03": 6.95, "2022-04": 7.79,
    "2022-05": 7.04, "2022-06": 7.01, "2022-07": 6.71, "2022-08": 7.00,
    "2022-09": 7.41, "2022-10": 6.77, "2022-11": 5.88, "2022-12": 5.72,
    "2023-01": 6.52, "2023-02": 6.44, "2023-03": 5.66, "2023-04": 4.70,
    "2023-05": 4.25, "2023-06": 4.81, "2023-07": 7.44, "2023-08": 6.83,
    "2023-09": 5.02, "2023-10": 4.87, "2023-11": 5.55, "2023-12": 5.69,
    "2024-01": 5.10, "2024-02": 5.09, "2024-03": 4.85, "2024-04": 4.83,
    "2024-05": 4.75, "2024-06": 5.08, "2024-07": 3.54, "2024-08": 3.65,
    "2024-09": 5.49, "2024-10": 6.21, "2024-11": 5.48, "2024-12": 5.22,
    "2025-01": 4.26, "2025-02": 3.61, "2025-03": 3.34, "2025-04": 3.16,
    "2025-05": 2.82, "2025-06": 2.10,
}
cpi_series = pd.Series(cpi_data)
cpi_series.index = pd.to_datetime(cpi_series.index)
cpi_daily = cpi_series.reindex(daily_idx).ffill()
macro_df["CPI_YoY"] = cpi_daily.values


# ── 2d. Lagging everything by 1 day (no look-ahead bias) ──────────────────
# After the shift, the row for day t holds the values observed on day t-1.
macro_lagged = macro_df.shift(1)
macro_lagged.index.name = "Date"

macro_lagged = macro_lagged.loc["2020-01-01":]  # dropping 31st DEC

print("\nFinal macro shape:", macro_lagged.shape)


In [None]:
# Missing-value audit for the raw and the lagged macro tables.
frames_to_audit = {"macro_df": macro_df, "macro_lagged": macro_lagged}
for name, df in frames_to_audit.items():
    nan_per_column = df.isna().sum()
    print(name)
    print(nan_per_column.to_frame("  Missing"))
    print(f"Total: {nan_per_column.sum()} / {df.size} cells\n")

In [None]:
print(macro_df.head())

In [None]:
print(macro_lagged.head())

# 3. FUNDAMENTAL DATA

In [None]:
import pandas as pd
import numpy as np

def load_from_drive(file_id):
    """Read a CSV from Google Drive into a DataFrame.

    Parameters
    ----------
    file_id : str
        Drive file identifier; it is appended to the ``uc?id=`` URL.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pd.read_csv`` parses from that URL.
    """
    download_url = "https://drive.google.com/uc?id={}".format(file_id)
    return pd.read_csv(download_url)



# Google Drive file ids of the two fundamentals CSVs.
EPS_FILE_ID = "1B1KvYlMUsFyUZeDhQGza759UWh7yUjY4"
RATIOS_FILE_ID = "12yY0yP97bWWJ1FQLV1zFGaU88-7fqZg5"

eps_raw = load_from_drive(EPS_FILE_ID)
ratios_raw = load_from_drive(RATIOS_FILE_ID)

print(eps_raw.head())

In [None]:
# ── STEP 3a. Align quarterly EPS → daily ─────────────────────────────
# The calendar starts in 2019-09 so records dated before 2020 can seed the
# forward fill; it is trimmed back to the study window afterwards.
daily_idx_extended = pd.date_range("2019-09-01", "2025-12-31", freq="D")


def _eps_to_daily(stock_rows, symbol):
    """Turn one stock's quarterly EPS rows into a daily EPS frame.

    Each quarter is dated at its month end plus 45 days, then spread over the
    daily calendar with a forward fill followed by a backward fill.
    """
    quarter_end = (
        pd.to_datetime(stock_rows["Quarter"], format="%b %Y")
        + pd.offsets.MonthEnd(0)
    )
    dated = stock_rows.assign(Date=quarter_end + pd.DateOffset(days=45))

    eps = dated.set_index("Date")[["EPS"]].sort_index()
    eps = pd.to_numeric(eps["EPS"], errors="coerce").to_frame()
    # If two rows share a date, keep the last value for that date.
    eps = eps.groupby(level=0).last()

    daily = eps.reindex(daily_idx_extended).ffill().bfill()
    return daily.loc["2020-01-01":"2025-12-31"].assign(Stock=symbol)


all_eps = [
    _eps_to_daily(eps_raw[eps_raw["Stock"] == symbol], symbol)
    for symbol in eps_raw["Stock"].unique()
]

# The reindexed calendar has no name, so reset_index() yields an "index" column.
eps_panel = pd.concat(all_eps).reset_index().rename(columns={"index": "Date"})
eps_panel.columns.name = None
print("✅ EPS aligned:", eps_panel.shape)
print(eps_panel.head())

In [None]:
# ── 3b. Align annual ROE & D/E → daily ───────────────────────────────
daily_idx = pd.date_range("2020-01-01", "2025-12-31", freq="D")
RATIO_COLUMNS = ["ROE", "DebtEquity"]


def _ratios_to_daily(stock_rows, symbol):
    """Turn one stock's annual ROE / DebtEquity rows into a daily frame.

    Each record is dated 31 March of its ``Year`` and spread over the daily
    calendar with a forward fill followed by a backward fill.
    """
    # NOTE(review): unlike the EPS step, no reporting delay is added here;
    # confirm whether the annual figures are really known on 31 March.
    fiscal_year_end = pd.to_datetime(stock_rows["Year"].astype(str) + "-03-31")
    dated = stock_rows.assign(Date=fiscal_year_end)

    ratios = dated.set_index("Date")[RATIO_COLUMNS].sort_index()
    ratios = ratios.apply(pd.to_numeric, errors="coerce")
    # If two rows share a date, keep the last value for that date.
    ratios = ratios.groupby(level=0).last()

    return ratios.reindex(daily_idx).ffill().bfill().assign(Stock=symbol)


all_ratios = [
    _ratios_to_daily(ratios_raw[ratios_raw["Stock"] == symbol], symbol)
    for symbol in ratios_raw["Stock"].unique()
]

ratios_panel = pd.concat(all_ratios).reset_index().rename(columns={"index": "Date"})
ratios_panel.columns.name = None
print("✅ ROE & D/E aligned:", ratios_panel.shape)

# ── 3c. Merge EPS + ROE + D/E ─────────────────────────────────────────
# Left join keeps every EPS row; ratios attach where (Date, Stock) matches.
fundamentals = eps_panel.merge(ratios_panel, how="left", on=["Date", "Stock"])
fundamentals.columns.name = None

print("\nShape", fundamentals.shape)
print("\nSample")
print(fundamentals.head())


**Note on approximated ratios:** for Jan 2020 → Mar 2021 (15 of the 72 months, about 21% of the dataset) the ROE and D/E values are approximations rather than reported figures.

# 4. SENTIMENT DATA

Real-time sentiment is demonstrated using the GNews API + FinBERT. Due to historical news API limitations, sector-based Nifty index returns were used as a sentiment proxy for the 2020-2025 backtesting period. All sentiment features were lagged by 1 trading day to prevent look-ahead bias.

In [None]:
# ── Installing required libraries ────────────────────────────────────────
!pip install transformers torch gnews -q

import pandas as pd
import numpy as np
from gnews import GNews
from transformers import pipeline
import time

print("✅ Libraries installed")

In [None]:
# ── Loading FinBERT model ────────────────────────────────────────────────
# NOTE(review): return_all_scores=True presumably makes each call return a
# score for every label rather than only the top one — confirm the scoring
# code expects that shape.
FINBERT_PIPELINE_ARGS = {
    "task": "text-classification",
    "model": "ProsusAI/finbert",
    "return_all_scores": True,
}

print("Loading FinBERT... (takes 1-2 minutes first time)")
finbert = pipeline(**FINBERT_PIPELINE_ARGS)
print("✅ FinBERT loaded")

In [None]:
# ── Stock search terms ────────────────────────────────────────────────
# (ticker, news search phrase) pairs; the order here is the order in which
# the tickers are iterated later.
_QUERY_PAIRS = (
    ("RELIANCE.NS", "Reliance Industries stock"),
    ("HDFCBANK.NS", "HDFC Bank stock"),
    ("INFY.NS", "Infosys stock"),
    ("M&M.NS", "Mahindra Mahindra stock"),
    ("BHARTIARTL.NS", "Bharti Airtel stock"),
    ("HINDUNILVR.NS", "Hindustan Unilever HUL stock"),
)
stock_queries = dict(_QUERY_PAIRS)

In [None]:
# ──scoring function ────────────────────────────────────────────
def _top_prediction(result):
    """Return the highest-scoring ``{"label", "score"}`` dict of one result.

    Accepts a single dict, a flat list ``[{...}]``, or a nested list
    ``[[{...}, {...}, {...}]]`` holding one entry per label.
    """
    candidates = result
    # Peel off outer list layers until we reach the list of label dicts.
    while (
        isinstance(candidates, (list, tuple))
        and candidates
        and isinstance(candidates[0], (list, tuple))
    ):
        candidates = candidates[0]
    if isinstance(candidates, dict):
        return candidates
    return max(candidates, key=lambda item: item["score"])


def get_finbert_score(headlines, classifier=None):
    """Average signed sentiment of a batch of headlines.

    Each headline is classified; the winning label maps to ``+score``
    (positive), ``-score`` (negative) or ``0.0`` (anything else), and the
    mean over all successfully scored headlines is returned.

    Parameters
    ----------
    headlines : sequence of str
        Headline texts. Each is truncated to its first 512 characters.
    classifier : callable, optional
        Function mapping a string to a classification result. Defaults to
        the module-level ``finbert`` pipeline.

    Returns
    -------
    float
        Mean signed score rounded to 4 decimals; ``0.0`` when ``headlines``
        is empty or no headline could be scored.
    """
    if not headlines:
        return 0.0

    # Resolved only after the empty check, so an empty batch never needs
    # the model to be loaded.
    classify = finbert if classifier is None else classifier

    scores = []
    for headline in headlines:
        try:
            # The result may be flat or nested depending on how the
            # pipeline was configured; _top_prediction handles both, where
            # indexing result[0]["label"] directly fails on the nested form.
            top = _top_prediction(classify(headline[:512]))
            label = str(top["label"]).lower()
            score = float(top["score"])
        except Exception as e:
            # Best effort per headline: report it and keep going.
            print(f"  ⚠️ Skipping: {e}")
            continue

        if label == "positive":
            scores.append(score)
        elif label == "negative":
            scores.append(-score)
        else:  # neutral
            scores.append(0.0)

    return round(float(np.mean(scores)), 4) if scores else 0.0

In [None]:
# ── fetching news WITH dates ────────────────────────────────────
from collections import defaultdict

# Reuse a GNews client created earlier in the session if one exists;
# otherwise build an English/India client so the loop below does not fail
# with NameError on `google_news`.
if "google_news" not in globals():
    google_news = GNews(language="en", country="IN")

SENTIMENT_COLUMNS = ["Date", "Stock", "Sentiment", "Headlines"]
real_sentiment = []

for ticker, query in stock_queries.items():
    print(f"\nFetching news for {ticker}...")

    try:
        articles = google_news.get_news(query)

        if not articles:
            print(f"  No news found for {ticker}")
            continue

        # ── Extract headline AND date per article, grouped by date ────
        date_groups = defaultdict(list)
        for article in articles:
            try:
                title    = article["title"]
                pub_date = pd.to_datetime(article["published date"]).normalize()
                preview  = title[:60]
            except (KeyError, TypeError, ValueError) as e:
                # Missing field or unparseable date: skip this article only.
                print(f"  ⚠️ Skipping malformed article: {e}")
                continue
            date_groups[pub_date].append(title)
            print(f"  [{pub_date.date()}] {preview}...")

        # ── Scoring each date separately ────────────────────────────────
        for pub_date, headlines in date_groups.items():
            score = get_finbert_score(headlines)
            real_sentiment.append({
                "Date"      : pub_date,
                "Stock"     : ticker,
                "Sentiment" : score,
                "Headlines" : len(headlines)
            })
            print(f"  {pub_date.date()} → {len(headlines)} headlines → score: {score}")

        # Pause between tickers.
        time.sleep(2)

    except Exception as e:
        # Best effort per ticker: report the failure and move on.
        print(f"  Error: {e}")

# Explicit columns keep the sort below valid even when nothing was fetched.
real_sentiment_df = pd.DataFrame(real_sentiment, columns=SENTIMENT_COLUMNS)
real_sentiment_df = real_sentiment_df.sort_values(["Stock","Date"])



In [None]:
# Order rows by stock, then date, and renumber from zero before display.
sort_keys = ["Stock", "Date"]
real_sentiment_df = real_sentiment_df.sort_values(by=sort_keys, ignore_index=True)
print(real_sentiment_df)

## **Proxy sentiment for Historical Data**

 Due to historical news API limitations, a sector-index return-based sentiment proxy is used for the 2020-2025 backtesting period. All sentiment features are lagged by 1 trading day to prevent look-ahead bias.

In [None]:
# ── Stock-Specific Sentiment using Sector Indices ─────────────────────
import yfinance as yf
import pandas as pd
import numpy as np

# Calendar-day index covering the whole study window.
daily_idx = pd.date_range(start="2020-01-01", end="2025-12-31", freq="D")

# ── Sector index for each stock ───────────────────────────────────────
# (stock ticker, sector index ticker) pairs.
_SECTOR_PAIRS = (
    ("RELIANCE.NS", "^CNXENERGY"),
    ("HDFCBANK.NS", "^NSEBANK"),
    ("INFY.NS", "^CNXIT"),
    ("M&M.NS", "^CNXAUTO"),
    ("BHARTIARTL.NS", "^CNXMEDIA"),
    ("HINDUNILVR.NS", "^CNXFMCG"),
)
sector_indices = dict(_SECTOR_PAIRS)

def compute_sentiment(ticker):
    """Build a daily sentiment proxy in [-1, 1] from an index's returns.

    Daily percentage changes of the Close price are clipped at ±3 standard
    deviations and divided by that band, then placed on the calendar in
    ``daily_idx``. Days without a value take the previous one; anything
    still missing becomes 0.

    Parameters
    ----------
    ticker : str
        Yahoo Finance symbol to download.

    Returns
    -------
    pandas.Series or None
        Proxy indexed by ``daily_idx``, or ``None`` if any step raised.
    """
    try:
        prices = yf.download(ticker, start="2019-12-31",
                             end="2025-12-31", auto_adjust=True)
        # Keep only the first column level so "Close" can be selected by name.
        prices.columns = prices.columns.get_level_values(0)

        daily_ret = prices["Close"].pct_change().dropna()

        # Clip at three standard deviations, then rescale to -1 … +1.
        band = 3 * float(daily_ret.std())
        scaled = (daily_ret.clip(-band, band) / band).fillna(0)

        # Move onto the calendar-day index.
        scaled.index = pd.to_datetime(scaled.index)
        return scaled.reindex(daily_idx).ffill().fillna(0)

    except Exception as err:
        print(f" Failed for {ticker}: {err} ")
        return None


# ── Computing sentiment per stock ───────────────────────────────────────
proxy_frames = []

for stock_ticker, sector_ticker in sector_indices.items():
    print(f"\nFetching sector index {sector_ticker} for {stock_ticker}...")

    proxy = compute_sentiment(sector_ticker)
    if proxy is None:
        # compute_sentiment returns None on failure; fall back to a neutral
        # series so one unavailable index does not abort the whole panel.
        print(f" No sector data for {stock_ticker}; using neutral sentiment (0.0)")
        proxy = pd.Series(0.0, index=daily_idx)

    df = pd.DataFrame({
        "Date"      : daily_idx,
        "Stock"     : stock_ticker,
        "Sentiment" : proxy.values
    })
    proxy_frames.append(df)


sentiment_panel = pd.concat(proxy_frames, ignore_index=True)

# ── Lag by 1 day per stock ────────────────────────────────────────────
# The shift is done within each stock so one stock's values never slide into
# another's; the first day of each stock has no predecessor and is dropped.
sentiment_panel = sentiment_panel.sort_values(["Stock","Date"])
sentiment_panel["Sentiment"] = sentiment_panel.groupby("Stock")["Sentiment"].shift(1)
sentiment_panel = sentiment_panel.dropna(subset=["Sentiment"])
sentiment_panel["Date"] = pd.to_datetime(sentiment_panel["Date"])
sentiment_panel = sentiment_panel.reset_index(drop=True)






In [None]:
# ── Verify different scores per stock ────────────────────────────────
spot_check_date = "2020-01-03"
on_spot_date = sentiment_panel["Date"] == spot_check_date
per_stock_summary = sentiment_panel.groupby("Stock")["Sentiment"].describe()

print("\nSector-Specific Sentiment")
print("\nShape:", sentiment_panel.shape)
print("\nSame date different stocks should have DIFFERENT values:")
print(sentiment_panel.loc[on_spot_date, ["Date", "Stock", "Sentiment"]])
print("\nSentiment range:")
print(per_stock_summary)

In [None]:
# Export table: three columns, ordered by stock then date, fresh 0..n-1 index.
export_columns = ["Date", "Stock", "Sentiment"]
sentiment_csv = sentiment_panel.loc[:, export_columns].sort_values(
    by=["Stock", "Date"], ignore_index=True
)

print("=== Sentiment CSV Sample ===")
print(sentiment_csv.head(10))
print("\nShape:", sentiment_csv.shape)

# Write the table to disk without the index column.
output_path = "sentiment_scores.csv"
sentiment_csv.to_csv(output_path, index=False)
print("✅ Saved as sentiment_scores.csv")

In [None]:
# Colab-only step: requires the `google.colab` package, so it fails outside
# Colab. Downloads the "sentiment_scores.csv" file written by the export cell.
from google.colab import files
files.download("sentiment_scores.csv")


# 5. Feature Engineering

In [None]:
# ── STEP 5: Feature Engineering ───────────────────────────────────────
!pip install ta -q

import pandas as pd
import numpy as np
import ta
from sklearn.preprocessing import RobustScaler

# ── 5a. Start with market OHLCV panel ────────────────────────────────
# 'market_data' is the OHLCV dataframe from Step 1 (rows without a Close
# price already dropped). It must have: Date, Stock, Open, High, Low, Close,
# Volume.
print("Market panel shape:", market_data.shape)
# Preview the same frame whose shape is reported above, not the pre-drop
# `panel`.
print(market_data.head())

In [None]:
# ── STEP 5b: Essential Features Only ─────────────────────────────────
def _technical_features(stock_rows):
    """Add return, momentum, volatility, volume and target columns.

    ``stock_rows`` holds the OHLCV rows of a single stock. The result is
    sorted by Date with a fresh 0..n-1 index and these extra columns:
    LogReturn, Return_lag1, RSI14, Price_SMA20_ratio, Volatility20,
    LogVolume, Target.
    """
    out = stock_rows.sort_values("Date").reset_index(drop=True)
    close = out["Close"]
    # Zero volume is treated as missing so the log ratio is never log(0).
    traded_volume = out["Volume"].replace(0, np.nan)

    log_ret = np.log(close / close.shift(1))
    out["LogReturn"] = log_ret
    out["Return_lag1"] = log_ret.shift(1)
    out["RSI14"] = ta.momentum.RSIIndicator(close=close, window=14).rsi()
    # Price relative to its 20-row moving average; the average is not kept.
    out["Price_SMA20_ratio"] = close / close.rolling(20).mean()
    out["Volatility20"] = log_ret.rolling(20).std()
    # Despite the name, this is the log of the change in volume.
    out["LogVolume"] = np.log(traded_volume / traded_volume.shift(1))
    # Target: the log return of the next row.
    out["Target"] = log_ret.shift(-1)
    return out


feature_frames = []
for ticker in market_data["Stock"].unique():
    print(f"Processing {ticker}...")
    one_stock = market_data[market_data["Stock"] == ticker]
    feature_frames.append(_technical_features(one_stock))

features = pd.concat(feature_frames, ignore_index=True)
print("\n✅ Technical features shape:", features.shape)
print(features.columns.tolist())

# MERGING

In [None]:
# ── 5c. Merge all data sources ────────────────────────────────────────
# Make every join key a datetime so the merges match on equal types.
for frame in (features, fundamentals, sentiment_panel):
    frame["Date"] = pd.to_datetime(frame["Date"])
macro_lagged.index = pd.to_datetime(macro_lagged.index)

# Macro data is indexed by Date; expose it as a column for merging.
macro_reset = macro_lagged.reset_index()

# ── Merge step by step ────────────────────────────────────────────────
# Left joins throughout, so every feature row is kept: macro values are shared
# by all stocks (joined on Date), the rest joins on (Date, Stock).
stock_keys = ["Date", "Stock"]
master = (
    features
    .merge(macro_reset, how="left", on="Date")
    .merge(fundamentals[stock_keys + ["EPS", "ROE", "DebtEquity"]],
           how="left", on=stock_keys)
    .merge(sentiment_panel[stock_keys + ["Sentiment"]],
           how="left", on=stock_keys)
)

# ── Calculate P/E ratio ───────────────────────────────────────────────
# Defined only where EPS is strictly positive; NaN elsewhere.
has_positive_eps = master["EPS"] > 0
master["PE_ratio"] = (master["Close"] / master["EPS"]).where(
    has_positive_eps, other=np.nan
)

print("=== Master Panel ===")
print("Shape:", master.shape)
print("\nColumns:", master.columns.tolist())
print("\nMissing values:")
print(master.isna().sum())


In [None]:
master.head()

In [None]:
# ── 5d. Handle Missing Values ─────────────────────────────────────────

# PE_ratio is NaN where EPS is not positive → fill within each stock.
# Both fills run inside transform(): chaining .bfill() after the grouped
# .ffill() would run it on the whole column and could pull values across
# stock boundaries.
master["PE_ratio"] = master.groupby("Stock")["PE_ratio"].transform(
    lambda s: s.ffill().bfill()
)

# Drop rows where technical indicators are NaN
# (first ~20 rows per stock due to rolling windows; the last row per stock
# has no next-day Target)
master = master.dropna(subset=["LogReturn","RSI14","Volatility20",
                                "Price_SMA20_ratio","Target"])

print("Shape after cleaning:", master.shape)
print("\nMissing values:")
print(master.isna().sum())

In [None]:
# ── Find days with zero or missing volume ─────────────────────────────
log_volume_missing = master["LogVolume"].isna()
zero_vol = master.loc[log_volume_missing, ["Date", "Stock", "Volume", "LogVolume"]]
print("Days with missing LogVolume:")
print(zero_vol.loc[:, ["Date", "Stock", "Volume"]])

In [None]:
# Fill LogVolume gaps within each stock: forward fill first, then backward
# fill for rows that have no earlier value. Both fills run inside transform();
# chaining .bfill() after the grouped .ffill() would run it on the whole
# column and could pull values across stock boundaries.
master["LogVolume"] = master.groupby("Stock")["LogVolume"].transform(
    lambda s: s.ffill().bfill()
)

# Verify
print("Missing values after fix:")
print(master[["LogVolume"]].isna().sum())
print("\nFinal master shape:", master.shape)

In [None]:
# ── 5e. Robust Scaling ────────────────────────────────────────────────
from sklearn.preprocessing import RobustScaler

# Every model input is scaled; identifiers and the Target are left untouched.
cols_to_scale = [
    "LogReturn", "Return_lag1", "RSI14", "Price_SMA20_ratio",
    "Volatility20", "LogVolume", "USDINR", "CrudeOil",
    "IN10Y", "CPI_YoY", "EPS", "ROE", "DebtEquity",
    "Sentiment", "PE_ratio",
]

# ── 5f. Train / Test Split BEFORE scaling (no look-ahead bias) ────────
TEST_START = "2025-10-01"
before_test = master["Date"] < TEST_START
from_test_on = master["Date"] >= TEST_START
train = master.loc[before_test].copy()
test = master.loc[from_test_on].copy()

print(f"Train: {train.shape} — {train['Date'].min().date()} to {train['Date'].max().date()}")
print(f"Test:  {test.shape}  — {test['Date'].min().date()} to {test['Date'].max().date()}")

# ── Fit scaler on TRAIN only, transform both ──────────────────────────
scaler = RobustScaler().fit(train[cols_to_scale])
train[cols_to_scale] = scaler.transform(train[cols_to_scale])
test[cols_to_scale] = scaler.transform(test[cols_to_scale])

print("\n✅ Scaling done")

train[cols_to_scale].head()

#  6.MODEL TRAINING + VALIDATION

In [None]:
# ── STEP 6: ML Model + Walk-Forward Validation ────────────────────────
!pip install lightgbm -q

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.feature_selection import RFE
from sklearn.linear_model import Ridge
import warnings
warnings.filterwarnings("ignore")

# ── Feature and target columns ────────────────────────────────────────
# Grouped by data source; the concatenation order is the model's input order.
_technical_cols = ["LogReturn", "Return_lag1", "RSI14", "Price_SMA20_ratio",
                   "Volatility20", "LogVolume"]
_macro_cols = ["USDINR", "CrudeOil", "IN10Y", "CPI_YoY"]
_fundamental_cols = ["EPS", "ROE", "DebtEquity"]
_derived_cols = ["Sentiment", "PE_ratio"]

feature_cols = _technical_cols + _macro_cols + _fundamental_cols + _derived_cols

target_col = "Target"

# ── 6a. Define Walk-Forward Folds ─────────────────────────────────────
# Fold boundaries are Timestamps, so make sure Date is datetime as well.
train = train.assign(Date=pd.to_datetime(train["Date"]))

# Back-to-back 6-month validation windows. Each fold trains on everything
# before its window; the first window opens after 12 months of data.
fold_span = pd.DateOffset(months=6)
folds = []
val_start = pd.Timestamp("2021-01-01")  # start after 12 months training
fold_end = pd.Timestamp("2025-09-30")

while True:
    window_close = val_start + fold_span
    # Stop as soon as a full window would run past the last usable date.
    if window_close > fold_end:
        break
    val_end = window_close
    folds.append(dict(train_end=val_start, val_start=val_start, val_end=val_end))
    val_start = val_end

print(f"Total folds: {len(folds)}")
for i, f in enumerate(folds):
    fold_no = i + 1
    train_until = f["train_end"].date()
    val_from = f["val_start"].date()
    val_until = f["val_end"].date()
    print(f"Fold {fold_no}: Train → {train_until} | "
          f"Val {val_from} → {val_until}")

In [None]:
# ── 6b. RFE Feature Selection ─────────────────────────────────────────
print("\nRunning RFE to select best features...")

# Ridge regression is the ranking model; one feature is removed per round
# until 10 remain.
X_rfe = train[feature_cols].to_numpy()
y_rfe = train[target_col].to_numpy()

ridge = Ridge(alpha=1.0)
rfe = RFE(estimator=ridge, n_features_to_select=10, step=1)
rfe.fit(X_rfe, y_rfe)

# Split the candidate columns by the selector's keep/drop mask.
selected_features, dropped_features = [], []
for column, keep in zip(feature_cols, rfe.support_):
    (selected_features if keep else dropped_features).append(column)

print(f"\n✅ Selected features ({len(selected_features)}):")
print(selected_features)
print(f"\n❌ Dropped features ({len(dropped_features)}):")
print(dropped_features)


In [None]:
# ── 6c. Walk-Forward Training Loop ───────────────────────────────────
# For every fold: train LightGBM on all rows dated before the fold's
# validation window, evaluate on the window, and collect the metrics.
fold_results = []

# Hyper-parameters shared by every fold.
lgb_params = {
    "objective"        : "regression",
    "metric"           : "rmse",
    "learning_rate"    : 0.05,
    "num_leaves"       : 31,
    "min_child_samples": 20,
    "feature_fraction" : 0.8,
    "bagging_fraction" : 0.8,
    "bagging_freq"     : 5,
    "reg_alpha"        : 0.1,   # L1 regularization
    "reg_lambda"       : 0.1,   # L2 regularization
    "verbose"          : -1
}

for i, fold in enumerate(folds):
    # ── Split data ────────────────────────────────────────────────────
    # Expanding window: training uses every row strictly before train_end,
    # validation uses the half-open interval [val_start, val_end).
    fold_train = train[train["Date"] <  fold["train_end"]]
    fold_val   = train[(train["Date"] >= fold["val_start"]) &
                       (train["Date"] <  fold["val_end"])]

    # Skip folds with too few rows to train or to evaluate.
    if len(fold_train) < 100 or len(fold_val) < 10:
        continue

    X_train = fold_train[selected_features].values
    y_train = fold_train[target_col].values
    X_val   = fold_val[selected_features].values
    y_val   = fold_val[target_col].values

    # ── Train LightGBM ────────────────────────────────────────────────
    dtrain = lgb.Dataset(X_train, label=y_train)
    dval   = lgb.Dataset(X_val,   label=y_val, reference=dtrain)

    # NOTE(review): early stopping monitors the same validation window that
    # is scored below, so the reported fold metrics are presumably somewhat
    # optimistic — confirm whether a separate early-stopping split is wanted.
    model = lgb.train(
        lgb_params,
        dtrain,
        num_boost_round      = 500,
        valid_sets           = [dval],
        callbacks            = [lgb.early_stopping(50, verbose=False),
                                 lgb.log_evaluation(period=-1)]
    )

    # ── Predict and evaluate ──────────────────────────────────────────
    y_pred = model.predict(X_val)

    rmse    = np.sqrt(mean_squared_error(y_val, y_pred))
    mae     = mean_absolute_error(y_val, y_pred)
    # Directional accuracy: percentage of rows where the predicted and the
    # actual return have the same sign.
    dir_acc = np.mean(np.sign(y_pred) == np.sign(y_val)) * 100

    # ── Per stock directional accuracy ────────────────────────────────
    # Copy first so adding the prediction column does not write into a slice
    # of `train`.
    fold_val         = fold_val.copy()
    fold_val["Pred"] = y_pred
    per_stock_acc    = fold_val.groupby("Stock").apply(
        lambda x: np.mean(np.sign(x["Pred"]) == np.sign(x["Target"])) * 100
    ).round(2)

    fold_results.append({
        "Fold"      : i + 1,
        "Val_Start" : fold["val_start"].date(),
        "Val_End"   : fold["val_end"].date(),
        "RMSE"      : round(rmse, 6),
        "MAE"       : round(mae, 6),
        "DirAcc"    : round(dir_acc, 2),
        "PerStock"  : per_stock_acc
    })

    print(f"Fold {i+1} | {fold['val_start'].date()} → {fold['val_end'].date()} | "
          f"RMSE={rmse:.6f} | MAE={mae:.6f} | DirAcc={dir_acc:.2f}%")

In [None]:
# ── 6d. Summary of Walk-Forward Results ──────────────────────────────
# One row per fold with the scalar metrics; the per-stock Series is handled
# separately below.
summary_columns = ["Fold", "Val_Start", "Val_End", "RMSE", "MAE", "DirAcc"]
results_df = pd.DataFrame(
    [{column: r[column] for column in summary_columns} for r in fold_results]
)

mean_rmse = results_df["RMSE"].mean()
mean_mae = results_df["MAE"].mean()
mean_dir_acc = results_df["DirAcc"].mean()

print("\n=== Walk-Forward Validation Results ===")
print(results_df.to_string(index=False))
print(f"\nAverage RMSE : {mean_rmse:.6f}")
print(f"Average MAE  : {mean_mae:.6f}")
print(f"Average DirAcc: {mean_dir_acc:.2f}%")

# ── Per stock average directional accuracy ────────────────────────────
print("\n=== Per Stock Directional Accuracy (avg across folds) ===")
fold_labels = [f"Fold{r['Fold']}" for r in fold_results]
per_stock_series = [r["PerStock"] for r in fold_results]
# One column per fold, one row per stock.
all_stock_acc = pd.concat(per_stock_series, axis=1, keys=fold_labels)
all_stock_acc["Average"] = all_stock_acc.mean(axis=1).round(2)
print(all_stock_acc)

In [None]:
# ── 6e. Train Final Model on Full Training Data ───────────────────────
# Refit on every training row using the RFE-selected features, then score the
# held-out test frame.
print("\nTraining final model on full training data...")

X_train_full = train[selected_features].values
y_train_full = train[target_col].values

dtrain_full = lgb.Dataset(X_train_full, label=y_train_full)

# No validation set is passed here, so training runs the full 500 rounds
# (the walk-forward folds used early stopping instead).
final_model = lgb.train(
    lgb_params,
    dtrain_full,
    num_boost_round = 500,
    callbacks       = [lgb.log_evaluation(period=-1)]
)

# ── Predict on test set (Oct-Dec 2025) ────────────────────────────────
X_test  = test[selected_features].values
y_test  = test[target_col].values
y_test_pred = final_model.predict(X_test)

# Work on a copy so the prediction column is added to a fresh frame.
test = test.copy()
test["Predicted_Return"] = y_test_pred

# ── Test set metrics ──────────────────────────────────────────────────
test_rmse    = np.sqrt(mean_squared_error(y_test, y_test_pred))
test_mae     = mean_absolute_error(y_test, y_test_pred)
# Percentage of rows where the predicted and the actual return share a sign.
test_dir_acc = np.mean(np.sign(y_test_pred) == np.sign(y_test)) * 100

print("\n=== Final Test Set Results (Oct-Dec 2025) ===")
print(f"RMSE            : {test_rmse:.6f}")
print(f"MAE             : {test_mae:.6f}")
print(f"Directional Acc : {test_dir_acc:.2f}%")

print("\n=== Per Stock Test Accuracy ===")
# Same sign-agreement measure, computed separately for each stock.
test_stock_acc = test.groupby("Stock").apply(
    lambda x: np.mean(np.sign(x["Predicted_Return"]) == np.sign(x["Target"])) * 100
).round(2)
print(test_stock_acc)

# STEP 7. PORTFOLIO

In [None]:
# ── STEP 7: Complete Portfolio Construction ───────────────────────────
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import yfinance as yf

# ── 7a. Pivot to wide format ──────────────────────────────────────────
# One row per Date, one column per Stock, for predictions and realised targets
pred_wide   = test.pivot(index="Date", columns="Stock",
                          values="Predicted_Return")
actual_wide = test.pivot(index="Date", columns="Stock",
                          values="Target")

stocks_in_test = actual_wide.columns.tolist()
print("Stocks in portfolio:", stocks_in_test)
print("Trading days:", len(pred_wide))

# ── 7b. Dynamic weights from predicted returns ────────────────────────
# Long-only: negative predictions get zero weight; the remaining stocks
# share the capital in proportion to their predicted return.
pred_clipped  = pred_wide.clip(lower=0)
row_total     = pred_clipped.sum(axis=1)
daily_weights = pred_clipped.div(row_total.replace(0, np.nan), axis=0)

# Days with no positive prediction fall back to equal weights. The count is
# taken from the pivot rather than hard-coded, so the weights still sum to 1
# if the universe changes.
n_stocks  = pred_wide.shape[1]
no_signal = row_total.eq(0)
if no_signal.any():
    daily_weights.loc[no_signal] = 1.0 / n_stocks

# A stock with no prediction on an otherwise valid day gets zero weight, so
# the other weights on that day keep summing to 1.
daily_weights = daily_weights.fillna(0.0)

print("\n=== Sample Daily Weights ===")
print(daily_weights.head(5).round(4))
print("\nWeight sum per day (should all be 1.0):")
print(daily_weights.sum(axis=1).head(5).round(4))





In [None]:
# ── 7c. Portfolio returns ─────────────────────────────────────────────
# Weighted sum of realised returns across stocks for each date
portfolio_returns = actual_wide.mul(daily_weights).sum(axis=1)

mean_daily = portfolio_returns.mean()
std_daily  = portfolio_returns.std()

print("\n=== Portfolio Daily Returns ===")
print(portfolio_returns.head(10))
print(f"\nMean daily return : {mean_daily:.6f}")
print(f"Std daily return  : {std_daily:.6f}")

# ── 7d. Performance metrics ───────────────────────────────────────────
trading_days   = 252
risk_free_rate = 0.065 / trading_days   # 6.5% annual rate spread over trading days

# Sharpe Ratio: daily excess return over daily volatility, annualized
annualizer = np.sqrt(trading_days)
sharpe = ((mean_daily - risk_free_rate) / std_daily) * annualizer

# Equity curve: compounded growth of one unit of capital
cum_returns = portfolio_returns.add(1).cumprod()

# Maximum Drawdown
# NOTE: the running peak is taken from the curve itself, so the starting
# capital of 1.0 is not counted as a peak.
rolling_max = cum_returns.cummax()
drawdown    = (cum_returns - rolling_max) / rolling_max
mdd         = drawdown.min()

# Hit Ratio: share of days with a strictly positive portfolio return
hit_ratio = portfolio_returns.gt(0).mean() * 100

# Annualized metrics (arithmetic scaling of the daily figures)
ann_return = mean_daily * trading_days * 100
ann_vol    = std_daily * annualizer * 100

print("\n=== Portfolio Performance Metrics ===")
print(f"Annualized Return     : {ann_return:.2f}%")
print(f"Annualized Volatility : {ann_vol:.2f}%")
print(f"Sharpe Ratio          : {sharpe:.4f}")
print(f"Maximum Drawdown      : {mdd*100:.2f}%")
print(f"Hit Ratio             : {hit_ratio:.2f}%")

In [None]:
# ── Accuracy-Based Weights (HUL = 0) ─────────────────────────────────
# Per-stock directional accuracy on the test frame, highest first.
# Computed as a column-wise groupby mean instead of DataFrameGroupBy.apply
# on the whole frame, which pandas deprecates when the grouping column is
# included.
sign_hit = (np.sign(test["Predicted_Return"]) == np.sign(test["Target"])).to_numpy()
test_accuracy = (
    pd.DataFrame({"Stock": test["Stock"].to_numpy(), "Hit": sign_hit})
      .groupby("Stock")["Hit"]
      .mean()
      .mul(100)
      .rename(None)
      .sort_values(ascending=False)
)

print("=== Per Stock Test Accuracy ===")
print(test_accuracy)

# ── Assign weights in order of accuracy, HUL = 0 ─────────────────────
# NOTE(review): these weights are hand-set from accuracy measured on the same
# test period the portfolio is then evaluated on, so the resulting metrics
# are in-sample for this allocation (look-ahead). The percentages in the
# comments are from an earlier run and are not recomputed here.
custom_acc_weights = pd.Series({
    "BHARTIARTL.NS" : 0.30,   # 60.00% — best
    "M&M.NS"        : 0.25,   # 55.00% — second
    "RELIANCE.NS"   : 0.20,   # 53.33% — third
    "HDFCBANK.NS"   : 0.13,   # 51.67% — fourth
    "INFY.NS"       : 0.12,   # 51.67% — fifth
    "HINDUNILVR.NS" : 0.00,   # 36.67% — zero
})

print("\n=== Accuracy-Based Weights (HUL=0) ===")
for stock, w in custom_acc_weights.items():
    # .get avoids a KeyError when a weighted stock is absent from the test frame
    acc = test_accuracy.get(stock, float("nan"))
    print(f"  {stock:20s} → {w*100:.0f}%  (accuracy: {acc:.2f}%)")
print(f"\nTotal: {custom_acc_weights.sum():.2f}")

# ── Portfolio returns ─────────────────────────────────────────────────
# Align to the columns of actual_wide; a stock without a hand-set weight gets
# zero instead of raising KeyError.
w_acc = custom_acc_weights.reindex(stocks_in_test).fillna(0.0)
w_acc_total = w_acc.sum()
if w_acc_total <= 0:
    raise ValueError("Accuracy-based weights sum to zero for the stocks in the test set")
w_acc = w_acc / w_acc_total  # renormalize to 1

portfolio_returns_acc = actual_wide.dot(w_acc)

# ── Metrics ───────────────────────────────────────────────────────────
sharpe_acc  = ((portfolio_returns_acc.mean() - risk_free_rate) /
                portfolio_returns_acc.std()) * np.sqrt(trading_days)
cum_acc     = (1 + portfolio_returns_acc).cumprod()
mdd_acc     = ((cum_acc - cum_acc.cummax()) / cum_acc.cummax()).min()
hit_acc     = (portfolio_returns_acc > 0).mean() * 100
ann_ret_acc = portfolio_returns_acc.mean() * trading_days * 100

print("\n=== Accuracy-Based Portfolio Metrics ===")
print(f"Annualized Return     : {ann_ret_acc:.2f}%")
print(f"Sharpe Ratio          : {sharpe_acc:.4f}")
print(f"Max Drawdown          : {mdd_acc*100:.2f}%")
print(f"Hit Ratio             : {hit_acc:.2f}%")

# ── Compare both methods ──────────────────────────────────────────────
print("\n=== Dynamic vs Accuracy-Based ===")
print(f"{'Metric':<25} {'Dynamic':>12} {'Acc-Based':>12}")
print("-" * 50)
print(f"{'Annualized Return':<25} {ann_return:>11.2f}% {ann_ret_acc:>11.2f}%")
print(f"{'Sharpe Ratio':<25} {sharpe:>12.4f} {sharpe_acc:>12.4f}")
print(f"{'Max Drawdown':<25} {mdd*100:>11.2f}% {mdd_acc*100:>11.2f}%")
print(f"{'Hit Ratio':<25} {hit_ratio:>11.2f}% {hit_acc:>11.2f}%")

In [None]:
# ── Static Portfolio — Average Predicted Return ───────────────────────
# One fixed weight vector for the whole test period, proportional to each
# stock's mean predicted return (long-only).
# NOTE(review): the average is taken over the same period the weights are
# then evaluated on — confirm this is intended.

# Step 1: Calculate average predicted return per stock over test period
avg_predicted = pred_wide.mean()
print("=== Average Predicted Return per Stock ===")
print(avg_predicted.sort_values(ascending=False))

# Step 2: Clip negative predictions to 0
avg_predicted_clipped = avg_predicted.clip(lower=0)

# Step 3: Normalize to sum to 1
# If no stock has a positive average prediction the sum is 0 and the division
# would give all-NaN weights; fall back to equal weights in that case.
positive_total = avg_predicted_clipped.sum()
if positive_total > 0:
    static_weights = avg_predicted_clipped / positive_total
else:
    print("No positive average prediction — falling back to equal weights")
    static_weights = pd.Series(1.0 / len(avg_predicted_clipped),
                               index=avg_predicted_clipped.index)

print("\n=== Static Weights (from avg predicted return) ===")
for stock, w in static_weights.sort_values(ascending=False).items():
    print(f"  {stock:20s} → {w*100:.2f}%")
print(f"\nTotal: {static_weights.sum():.4f}")

# Step 4: Apply fixed weights to actual returns every day
portfolio_returns_static = actual_wide.dot(static_weights)

# Step 5: Metrics (annualized with the shared trading_days constant, matching
# the other portfolio cells)
sharpe_static  = ((portfolio_returns_static.mean() - risk_free_rate) /
                   portfolio_returns_static.std()) * np.sqrt(trading_days)
cum_static     = (1 + portfolio_returns_static).cumprod()
mdd_static     = ((cum_static - cum_static.cummax()) /
                   cum_static.cummax()).min()
hit_static     = (portfolio_returns_static > 0).mean() * 100
ann_ret_static = portfolio_returns_static.mean() * trading_days * 100

print("\n=== Static Portfolio Metrics ===")
print(f"Annualized Return : {ann_ret_static:.2f}%")
print(f"Sharpe Ratio      : {sharpe_static:.4f}")
print(f"Max Drawdown      : {mdd_static*100:.2f}%")
print(f"Hit Ratio         : {hit_static:.2f}%")
print(f"Ending Value      : ₹{cum_static.iloc[-1]:.4f}")


In [None]:
# ── Static approach-2 (Hybrid)───────────────────────────────────────────────────
# Static weights from the mean predicted return, with hand-picked stocks
# forced to zero before normalizing.
avg_predicted = pred_wide.mean()

# Zero out stocks with accuracy below 50%
# NOTE(review): the list is hard-coded from an earlier run's test accuracy, so
# it is not recomputed here and is chosen using test-period results.
bad_stocks = ["HINDUNILVR.NS"]  # accuracy 36.67%
# Only touch labels that exist, so a changed universe does not raise KeyError
present_bad = avg_predicted.index.intersection(bad_stocks)
avg_predicted.loc[present_bad] = 0

# Clip remaining negatives and normalize
avg_clipped = avg_predicted.clip(lower=0)
clipped_total = avg_clipped.sum()
if clipped_total > 0:
    hybrid_weights = avg_clipped / clipped_total
else:
    # Nothing left with a positive average prediction: dividing by 0 would
    # give all-NaN weights, so spread equally over the non-excluded stocks.
    print("No positive average prediction — falling back to equal weights")
    eligible = avg_clipped.index.difference(present_bad)
    hybrid_weights = pd.Series(0.0, index=avg_clipped.index)
    if len(eligible) == 0:
        raise ValueError("Hybrid portfolio has no eligible stocks")
    hybrid_weights.loc[eligible] = 1.0 / len(eligible)

print("=== Hybrid Weights ===")
for stock, w in hybrid_weights.sort_values(ascending=False).items():
    print(f"  {stock:20s} → {w*100:.2f}%")

# Portfolio returns
portfolio_returns_hybrid = actual_wide.dot(hybrid_weights)

# Metrics (annualized with the shared trading_days constant, matching the
# other portfolio cells)
sharpe_hybrid  = ((portfolio_returns_hybrid.mean() - risk_free_rate) /
                   portfolio_returns_hybrid.std()) * np.sqrt(trading_days)
cum_hybrid     = (1 + portfolio_returns_hybrid).cumprod()
mdd_hybrid     = ((cum_hybrid - cum_hybrid.cummax()) /
                   cum_hybrid.cummax()).min()
hit_hybrid     = (portfolio_returns_hybrid > 0).mean() * 100
ann_ret_hybrid = portfolio_returns_hybrid.mean() * trading_days * 100

print("\n=== Hybrid Portfolio Metrics ===")
print(f"Annualized Return : {ann_ret_hybrid:.2f}%")
print(f"Sharpe Ratio      : {sharpe_hybrid:.4f}")
print(f"Max Drawdown      : {mdd_hybrid*100:.2f}%")
print(f"Hit Ratio         : {hit_hybrid:.2f}%")

In [None]:
# ── 7e. Benchmark — Nifty50 ──────────────────────────────────────────
# Start the download before the test window: pct_change() needs the previous
# close, so starting on the first test day leaves that day with no benchmark
# return and drops it from the comparison. The extra days are removed by the
# date intersection below.
nifty_test = yf.download("^NSEI", start="2025-09-15",
                          end="2025-12-31", auto_adjust=True)
# Keep only the first column level.
# NOTE(review): assumes level 0 holds the price-field names such as "Close" —
# confirm for the installed yfinance version.
nifty_test.columns = nifty_test.columns.get_level_values(0)
nifty_ret  = nifty_test["Close"].pct_change().dropna()
nifty_ret.index = pd.to_datetime(nifty_ret.index)

# Align dates
# NOTE(review): nifty_ret on date t is the close-to-close return ending at t.
# How "Target" is dated is not visible in this cell; if it is a forward return
# the two series are offset by one session — confirm.
common_dates  = portfolio_returns.index.intersection(nifty_ret.index)
if len(common_dates) == 0:
    raise ValueError("No overlapping dates between portfolio returns and "
                     "Nifty50 returns — check the benchmark download")
port_aligned  = portfolio_returns.loc[common_dates]
nifty_aligned = nifty_ret.loc[common_dates]

# Nifty metrics
nifty_sharpe  = ((nifty_aligned.mean() - risk_free_rate) /
                  nifty_aligned.std()) * np.sqrt(trading_days)
nifty_cum     = (1 + nifty_aligned).cumprod()
nifty_mdd     = ((nifty_cum - nifty_cum.cummax()) /
                  nifty_cum.cummax()).min()
nifty_hit     = (nifty_aligned > 0).mean() * 100
nifty_ann_ret = nifty_aligned.mean() * trading_days * 100

# ── Full comparison including Accuracy-Based ──────────────────────────
# Portfolio columns use metrics computed over each portfolio's full return
# series; the Nifty50 column uses the date-aligned benchmark series.
print("\n=== All Methods Comparison ===")
print(f"{'Metric':<25} {'Dynamic':>10} {'Static':>10} {'Acc-Based':>10} {'Hybrid':>10} {'Nifty50':>10}")
print("-" * 78)
print(f"{'Annualized Return':<25} {ann_return:>9.2f}% {ann_ret_static:>9.2f}% {ann_ret_acc:>9.2f}% {ann_ret_hybrid:>9.2f}% {nifty_ann_ret:>9.2f}%")
print(f"{'Sharpe Ratio':<25} {sharpe:>10.4f} {sharpe_static:>10.4f} {sharpe_acc:>10.4f} {sharpe_hybrid:>10.4f} {nifty_sharpe:>10.4f}")
print(f"{'Max Drawdown':<25} {mdd*100:>9.2f}% {mdd_static*100:>9.2f}% {mdd_acc*100:>9.2f}% {mdd_hybrid*100:>9.2f}% {nifty_mdd*100:>9.2f}%")
print(f"{'Hit Ratio':<25} {hit_ratio:>9.2f}% {hit_static:>9.2f}% {hit_acc:>9.2f}% {hit_hybrid:>9.2f}% {nifty_hit:>9.2f}%")

Going ahead with the dynamic allocation (daily weights proportional to predicted returns) for the benchmark comparison below.

# Comparing to NIFTY

In [None]:
# Side-by-side table for the dynamic portfolio and the Nifty50 benchmark.
# Portfolio figures come from the full return series; benchmark figures from
# the date-aligned series.
print("\n=== Benchmark Comparison ===")
print(f"{'Metric':<25} {'Portfolio':>12} {'Nifty50':>12}")
print("-" * 50)
print(f"{'Annualized Return':<25} {ann_return:>11.2f}% {nifty_ann_ret:>11.2f}%")
print(f"{'Sharpe Ratio':<25} {sharpe:>12.4f} {nifty_sharpe:>12.4f}")
print(f"{'Max Drawdown':<25} {mdd*100:>11.2f}% {nifty_mdd*100:>11.2f}%")
print(f"{'Hit Ratio':<25} {hit_ratio:>11.2f}% {nifty_hit:>11.2f}%")

# ── 7f. Equity Curve ─────────────────────────────────────────────────
# Everything plotted below uses the date-aligned series so portfolio and
# benchmark cover exactly the same sessions.
port_cum_aligned  = (1 + port_aligned).cumprod()
nifty_cum_aligned = (1 + nifty_aligned).cumprod()
drawdown_aligned  = (port_cum_aligned -
                     port_cum_aligned.cummax()) / port_cum_aligned.cummax()
# The reference line must come from the series that is actually plotted; the
# full-sample `mdd` can differ when alignment drops dates.
mdd_aligned       = drawdown_aligned.min()

fig, axes = plt.subplots(3, 1, figsize=(14, 12))
fig.suptitle("Portfolio Performance: Oct - Dec 2025",
             fontsize=16, fontweight="bold")

# Plot 1: Equity Curve
axes[0].plot(port_cum_aligned.index, port_cum_aligned.values,
             color="blue", linewidth=2, label="ML Portfolio")
axes[0].plot(nifty_cum_aligned.index, nifty_cum_aligned.values,
             color="orange", linewidth=2, linestyle="--", label="Nifty50")
# Shade the gap between the curves: green where the portfolio leads, red otherwise
axes[0].fill_between(port_cum_aligned.index,
                     port_cum_aligned.values,
                     nifty_cum_aligned.values,
                     where=port_cum_aligned.values >= nifty_cum_aligned.values,
                     alpha=0.2, color="green", label="Outperforming")
axes[0].fill_between(port_cum_aligned.index,
                     port_cum_aligned.values,
                     nifty_cum_aligned.values,
                     where=port_cum_aligned.values < nifty_cum_aligned.values,
                     alpha=0.2, color="red", label="Underperforming")
axes[0].set_title("Equity Curve (₹1 invested at start)", fontsize=13)
axes[0].set_ylabel("Portfolio Value (₹)")
axes[0].legend(loc="upper left")
axes[0].grid(True, alpha=0.3)
axes[0].xaxis.set_major_formatter(mdates.DateFormatter("%b %Y"))

# Plot 2: Drawdown
axes[1].fill_between(drawdown_aligned.index,
                     drawdown_aligned.values * 100,
                     0, color="red", alpha=0.5)
axes[1].plot(drawdown_aligned.index,
             drawdown_aligned.values * 100,
             color="darkred", linewidth=1)
axes[1].axhline(y=mdd_aligned * 100, color="black",
                linestyle="--", linewidth=0.8,
                label=f"Max DD: {mdd_aligned*100:.2f}%")
axes[1].set_title("Portfolio Drawdown", fontsize=13)
axes[1].set_ylabel("Drawdown (%)")
axes[1].legend()
axes[1].grid(True, alpha=0.3)
axes[1].xaxis.set_major_formatter(mdates.DateFormatter("%b %Y"))

# Plot 3: Daily Returns
# Green bars for strictly positive days, red for zero or negative days
colors_bar = ["#2ecc71" if r > 0 else "#e74c3c"
              for r in port_aligned.values]
axes[2].bar(port_aligned.index, port_aligned.values * 100,
            color=colors_bar, alpha=0.8, width=1)
axes[2].axhline(y=0, color="black", linewidth=0.8)
axes[2].axhline(y=port_aligned.mean() * 100,
                color="blue", linestyle="--", linewidth=1,
                label=f"Mean: {port_aligned.mean()*100:.3f}%")
axes[2].set_title("Daily Portfolio Returns", fontsize=13)
axes[2].set_ylabel("Daily Return (%)")
axes[2].legend()
axes[2].grid(True, alpha=0.3)
axes[2].xaxis.set_major_formatter(mdates.DateFormatter("%b %Y"))

plt.tight_layout()
plt.savefig("equity_curve.png", dpi=150, bbox_inches="tight")
plt.show()
print("Equity curve saved!")

# ── Final Summary ───────────────────────────────────────────────────────────
# "Alpha" here is the simple difference in ending value versus the benchmark,
# in percentage points — not a regression alpha.
print("\n=== Final Summary ===")
print("Starting Value  : ₹1.0000")
print(f"Ending Value    : ₹{port_cum_aligned.iloc[-1]:.4f}")
print(f"Total Return    : {(port_cum_aligned.iloc[-1]-1)*100:.2f}%")
print(f"Nifty Return    : {(nifty_cum_aligned.iloc[-1]-1)*100:.2f}%")
print(f"Alpha vs Nifty  : {((port_cum_aligned.iloc[-1]-nifty_cum_aligned.iloc[-1])*100):.2f}%")

In [None]:
# ── Feature Importance ────────────────────────────────────────────────
# Gain-based importances from the fitted booster, paired with selected_features
importance = final_model.feature_importance(importance_type="gain")
importance_df = pd.DataFrame({"Feature": selected_features,
                              "Importance": importance})
# Ascending sort, so the largest value is the last bar drawn
importance_df = importance_df.sort_values(by="Importance", ascending=True)

bar_labels = importance_df["Feature"]
bar_widths = importance_df["Importance"]

plt.figure(figsize=(10, 6))
plt.barh(bar_labels, bar_widths, color="steelblue")
plt.title("Feature Importance (LightGBM — Gain)", fontsize=14)
plt.xlabel("Importance Score")
plt.tight_layout()
plt.savefig("feature_importance.png", dpi=150, bbox_inches="tight")
plt.show()
print("✅ Saved as feature_importance.png")