In [None]:
pip install yfinance pandas numpy scipy statsmodels tqdm newsapi gnews

Collecting gnews
  Downloading gnews-0.4.2-py3-none-any.whl.metadata (19 kB)
Collecting feedparser~=6.0.2 (from gnews)
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting dnspython (from gnews)
  Downloading dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Collecting sgmllib3k (from feedparser~=6.0.2->gnews)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading gnews-0.4.2-py3-none-any.whl (18 kB)
Downloading feedparser-6.0.11-py3-none-any.whl (81 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dnspython-2.7.0-py3-none-any.whl (313 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.6/313.6 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
  Created wheel for sgmllib3k: filename=s

In [None]:
import numpy as np
import pandas as pd
import yfinance as yf
from tqdm import tqdm

# ------------------------------ Fundamentals ------------------------------- #
def fetch_fundamentals(ticker: str) -> dict:
    """Look up a handful of fundamental ratios for ``ticker`` on Yahoo Finance.

    Returns a dict keyed by ``de_ratio``, ``current_ratio``, ``quick_ratio``,
    ``roa``, ``roe`` and ``profit_margin``. A ratio missing from the Yahoo
    payload is ``None``; if the lookup raises, the error is printed and every
    ratio is ``None``.
    """
    # Output key -> field name inside the Yahoo ``info`` payload.
    yahoo_fields = {
        "de_ratio": "debtToEquity",         # Debt-to-Equity
        "current_ratio": "currentRatio",    # Current Ratio
        "quick_ratio": "quickRatio",        # Quick Ratio
        "roa": "returnOnAssets",            # Return on Assets
        "roe": "returnOnEquity",            # Return on Equity
        "profit_margin": "profitMargins",   # Profit Margin
    }
    try:
        payload = yf.Ticker(ticker).info
        return {name: payload.get(field) for name, field in yahoo_fields.items()}
    except Exception as e:
        print(f"Error fetching fundamentals for {ticker}: {e}")
        return dict.fromkeys(yahoo_fields)

# ---------------------------- Feature Engineering --------------------------- #
def compute_simple_features(df: pd.DataFrame, ticker: str, fundamentals: dict) -> pd.DataFrame:
    """Compute volatility, drawdowns, returns, and attach fundamentals.

    Parameters
    ----------
    df : price history indexed by date; must contain an ``"Adj Close"`` column.
    ticker : symbol written into the ``ticker`` column of every row.
    fundamentals : mapping of column name -> scalar, broadcast to every row.
        ``None`` values are stored as NaN.

    Returns
    -------
    DataFrame with a fresh 0..n-1 index and the columns ``date``, ``ticker``,
    ``close``, ``vol_{5,20,60}d``, ``drawdown_60d``, one column per fundamental,
    and ``prev_return_{5,20,60}d``.
    """
    df = df.copy().sort_index()
    close = df["Adj Close"]
    # With (field, ticker) MultiIndex columns -- which some yfinance versions
    # return even for a single symbol -- the selection above is a one-column
    # DataFrame; reduce it to a Series so every feature below is a plain column.
    if isinstance(close, pd.DataFrame):
        close = close.iloc[:, 0]

    feats = pd.DataFrame(index=df.index)
    feats["date"] = df.index
    feats["ticker"] = ticker
    feats["close"] = close

    # Volatility: rolling std of daily returns (daily returns computed once).
    daily_return = close.pct_change()
    for window in (5, 20, 60):
        feats[f"vol_{window}d"] = daily_return.rolling(window).std()

    # Drawdown relative to the trailing 60-row high (0 at a new high).
    rolling_max = close.rolling(60, min_periods=1).max()
    feats["drawdown_60d"] = close / rolling_max - 1.0

    # Add fundamentals (constant across rows for this ticker). None becomes NaN
    # so the column is numeric rather than an object column of None.
    for k, v in fundamentals.items():
        feats[k] = float("nan") if v is None else v

    # Previous returns (trend features)
    for window in (5, 20, 60):
        feats[f"prev_return_{window}d"] = close.pct_change(window)

    return feats.reset_index(drop=True)

# ------------------------------- Main runner -------------------------------- #
def main_multiple_tickers(tickers, period="2y", output_file="all_tickers_features.csv",
                          drop_threshold=0.10, horizons=(5, 20, 60)):
    """Build the feature/label table for every ticker and write it to one CSV.

    For each ticker: download daily prices, fetch fundamentals, compute features,
    then append forward returns and binary drop labels for each horizon.

    Parameters
    ----------
    tickers : iterable of ticker symbols.
    period : history window passed to ``yf.download``.
    output_file : path of the CSV to write.
    drop_threshold : a row is labelled 1 when the forward return is below
        ``-drop_threshold`` (default 0.10, i.e. a drop of more than 10%).
    horizons : forward horizons, in rows, for ``future_return_{h}d`` and
        ``label_{h}d``.

    A ticker that raises or returns no rows is reported and skipped. Nothing is
    returned; the result is only written to ``output_file``.
    """
    all_features = []

    for ticker in tqdm(tickers, desc="Processing tickers"):
        print(f"\nFetching data for {ticker}…")
        try:
            # Historical prices
            df = yf.download(ticker, period=period, interval="1d", auto_adjust=False)
            if df.empty:
                print(f"Warning: No data for {ticker}, skipping.")
                continue

            # Fundamentals
            fundamentals = fetch_fundamentals(ticker)

            # Features
            features = compute_simple_features(df, ticker, fundamentals)

            # Future returns (for supervised labels)
            for h in horizons:
                features[f"future_return_{h}d"] = (
                    features["close"].shift(-h) / features["close"] - 1
                )

            # Labels: 1 if the forward return is below -drop_threshold, else 0.
            # The last h rows have no forward return; a bare comparison would
            # silently label them 0 (NaN < x is False), so they are left missing.
            for h in horizons:
                future = features[f"future_return_{h}d"]
                features[f"label_{h}d"] = (
                    (future < -drop_threshold)
                    .astype("float")
                    .where(future.notna())
                    .astype("Int64")
                )

            all_features.append(features)

        except Exception as e:
            print(f"Error processing {ticker}: {e}")
            continue

    if all_features:
        final_df = pd.concat(all_features, ignore_index=True)
        final_df.to_csv(output_file, index=False)
        print(f"\n✅ Saved {final_df.shape[0]} rows to {output_file}")
    else:
        print("❌ No data processed.")

# ------------------------------- Run Example -------------------------------- #
if __name__ == "__main__":
    # Ticker universe grouped by sector; flattened in declaration order below.
    tickers_by_sector = {
        "Technology": ["AAPL", "MSFT", "NVDA", "GOOG", "META", "ORCL", "IBM", "ADBE", "INTC", "CSCO"],
        "Financials": ["JPM", "V", "BRK-B", "GS", "BAC", "MA"],
        "Consumer Staples": ["KO", "PEP", "PG", "WMT"],
        "Consumer Discretionary": ["AMZN", "DIS", "NFLX", "TSLA"],
        "Industrials": ["BA", "CAT", "GE"],
        "Energy": ["XOM", "CVX", "COP"],
        "Healthcare": ["JNJ", "UNH", "PFE", "MRK"],
        "Utilities / REITs": ["NEE", "PLD"],
    }
    tickers_list = [
        symbol
        for sector_symbols in tickers_by_sector.values()
        for symbol in sector_symbols
    ]

    # Two years of daily history for every symbol, written to a single CSV.
    main_multiple_tickers(tickers_list, period="2y", output_file="all_tickers_features.csv")


Processing tickers:   0%|          | 0/36 [00:00<?, ?it/s]


Fetching data for AAPL…


[*********************100%***********************]  1 of 1 completed
Processing tickers:   3%|▎         | 1/36 [00:00<00:31,  1.10it/s]


Fetching data for MSFT…


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed



Fetching data for NVDA…


Processing tickers:   8%|▊         | 3/36 [00:02<00:23,  1.42it/s]


Fetching data for GOOG…


[*********************100%***********************]  1 of 1 completed
Processing tickers:  11%|█         | 4/36 [00:02<00:23,  1.37it/s]


Fetching data for META…


[*********************100%***********************]  1 of 1 completed
Processing tickers:  14%|█▍        | 5/36 [00:03<00:23,  1.33it/s]


Fetching data for ORCL…


[*********************100%***********************]  1 of 1 completed
Processing tickers:  17%|█▋        | 6/36 [00:04<00:23,  1.29it/s]


Fetching data for IBM…


[*********************100%***********************]  1 of 1 completed
Processing tickers:  19%|█▉        | 7/36 [00:05<00:24,  1.18it/s]


Fetching data for ADBE…


[*********************100%***********************]  1 of 1 completed
Processing tickers:  22%|██▏       | 8/36 [00:06<00:23,  1.19it/s]


Fetching data for INTC…


[*********************100%***********************]  1 of 1 completed
Processing tickers:  25%|██▌       | 9/36 [00:07<00:25,  1.04it/s]


Fetching data for CSCO…


[*********************100%***********************]  1 of 1 completed
Processing tickers:  28%|██▊       | 10/36 [00:08<00:24,  1.08it/s]


Fetching data for JPM…


[*********************100%***********************]  1 of 1 completed
Processing tickers:  31%|███       | 11/36 [00:09<00:23,  1.08it/s]


Fetching data for V…


[*********************100%***********************]  1 of 1 completed
Processing tickers:  33%|███▎      | 12/36 [00:10<00:21,  1.14it/s]


Fetching data for BRK-B…


[*********************100%***********************]  1 of 1 completed
Processing tickers:  36%|███▌      | 13/36 [00:11<00:19,  1.16it/s]


Fetching data for GS…


[*********************100%***********************]  1 of 1 completed
Processing tickers:  39%|███▉      | 14/36 [00:11<00:17,  1.25it/s]


Fetching data for BAC…


[*********************100%***********************]  1 of 1 completed
Processing tickers:  42%|████▏     | 15/36 [00:12<00:16,  1.29it/s]


Fetching data for MA…


[*********************100%***********************]  1 of 1 completed
Processing tickers:  44%|████▍     | 16/36 [00:13<00:15,  1.31it/s]


Fetching data for KO…


[*********************100%***********************]  1 of 1 completed
Processing tickers:  47%|████▋     | 17/36 [00:14<00:15,  1.21it/s]


Fetching data for PEP…


[*********************100%***********************]  1 of 1 completed
Processing tickers:  50%|█████     | 18/36 [00:14<00:14,  1.20it/s]


Fetching data for PG…


[*********************100%***********************]  1 of 1 completed
Processing tickers:  53%|█████▎    | 19/36 [00:15<00:14,  1.19it/s]


Fetching data for WMT…


[*********************100%***********************]  1 of 1 completed
Processing tickers:  56%|█████▌    | 20/36 [00:16<00:14,  1.10it/s]


Fetching data for AMZN…


[*********************100%***********************]  1 of 1 completed
Processing tickers:  58%|█████▊    | 21/36 [00:17<00:12,  1.18it/s]


Fetching data for DIS…


[*********************100%***********************]  1 of 1 completed
Processing tickers:  61%|██████    | 22/36 [00:18<00:12,  1.14it/s]


Fetching data for NFLX…


[*********************100%***********************]  1 of 1 completed
Processing tickers:  64%|██████▍   | 23/36 [00:19<00:11,  1.18it/s]


Fetching data for TSLA…


[*********************100%***********************]  1 of 1 completed
Processing tickers:  67%|██████▋   | 24/36 [00:20<00:09,  1.24it/s]


Fetching data for BA…


[*********************100%***********************]  1 of 1 completed
Processing tickers:  69%|██████▉   | 25/36 [00:21<00:10,  1.09it/s]


Fetching data for CAT…


[*********************100%***********************]  1 of 1 completed
Processing tickers:  72%|███████▏  | 26/36 [00:22<00:09,  1.11it/s]


Fetching data for GE…


[*********************100%***********************]  1 of 1 completed
Processing tickers:  75%|███████▌  | 27/36 [00:23<00:08,  1.09it/s]


Fetching data for XOM…


[*********************100%***********************]  1 of 1 completed
Processing tickers:  78%|███████▊  | 28/36 [00:23<00:07,  1.08it/s]


Fetching data for CVX…


[*********************100%***********************]  1 of 1 completed
Processing tickers:  81%|████████  | 29/36 [00:24<00:06,  1.10it/s]


Fetching data for COP…


[*********************100%***********************]  1 of 1 completed
Processing tickers:  83%|████████▎ | 30/36 [00:25<00:05,  1.14it/s]


Fetching data for JNJ…


[*********************100%***********************]  1 of 1 completed
Processing tickers:  86%|████████▌ | 31/36 [00:26<00:04,  1.13it/s]


Fetching data for UNH…


[*********************100%***********************]  1 of 1 completed
Processing tickers:  89%|████████▉ | 32/36 [00:27<00:03,  1.15it/s]


Fetching data for PFE…


[*********************100%***********************]  1 of 1 completed
Processing tickers:  92%|█████████▏| 33/36 [00:28<00:02,  1.17it/s]


Fetching data for MRK…


[*********************100%***********************]  1 of 1 completed
Processing tickers:  94%|█████████▍| 34/36 [00:29<00:01,  1.12it/s]


Fetching data for NEE…


[*********************100%***********************]  1 of 1 completed
Processing tickers:  97%|█████████▋| 35/36 [00:30<00:00,  1.13it/s]


Fetching data for PLD…


[*********************100%***********************]  1 of 1 completed
Processing tickers: 100%|██████████| 36/36 [00:30<00:00,  1.17it/s]
  final_df = pd.concat(all_features, ignore_index=True)



✅ Saved 18108 rows to all_tickers_features.csv


In [4]:
import pandas as pd

# Input / Output paths
# NOTE(review): the feature-building cell writes "all_tickers_features.csv";
# confirm that "features_with_ratios.csv" is the intended input here.
INPUT_CSV = "features_with_ratios.csv"
OUTPUT_CSV = "all_tickers_features_clean.csv"

# Load data
df = pd.read_csv(INPUT_CSV)

# Drop rows with any NA/empty values, then drop the raw price column from the
# frame that is actually saved. Previously the drop was applied to `df` after
# `df_clean` had been derived, so it never reached the output file.
# NOTE(review): assumes "close" was meant to be excluded from the cleaned file.
df_clean = df.dropna().drop(columns=["close"], errors="ignore")

# Save cleaned file
df_clean.to_csv(OUTPUT_CSV, index=False)

print(f"✅ Cleaned data saved to {OUTPUT_CSV}")
print(f"Original rows: {len(df)}, Cleaned rows: {len(df_clean)}")


✅ Cleaned data saved to all_tickers_features_clean.csv
Original rows: 13788, Cleaned rows: 13788


In [5]:
import pandas as pd
import numpy as np

def add_decayed_sentiment_per_ticker(
    df: pd.DataFrame,
    ticker_col: str = "ticker",
    date_col: str = "date",
    sent_col: str = "sentiment_score",
    decay: float = 0.8,
    scale: float = 20.0,
    gap_aware: bool = False  # if True, decay^Δdays across date gaps
) -> pd.DataFrame:
    """
    Adds a 'decayed_sentiment' column computed as:
        score_t = (score_{t-1} * decay_eff) + sentiment_t * scale
    computed independently within each ticker, in date order.
    First row per ticker uses score = sentiment * scale.

    gap_aware=True -> decay_eff = decay ** Δdays (Δdays >= 1)
    gap_aware=False -> decay_eff = decay for every step.

    Returns a new frame (the input is not modified) sorted by ticker then date,
    keeping the original index labels and every original column, with
    ``date_col`` converted to datetime. Rows whose ticker is missing are
    dropped, because they belong to no group. An empty input yields an empty
    frame that still carries the 'decayed_sentiment' column.
    """
    out = df.copy()
    out[date_col] = pd.to_datetime(out[date_col])
    out = out.sort_values([ticker_col, date_col])

    def _decayed_scores(group: pd.DataFrame) -> list:
        """Run the recurrence over one ticker's rows (already in date order)."""
        sentiments = group[sent_col].astype(float).tolist()
        if gap_aware:
            # Whole days since the previous row; NaN for the first row.
            gaps = group[date_col].diff().dt.days.tolist()
        else:
            gaps = [None] * len(sentiments)

        running = 0.0
        scores = []
        for position, (sentiment, gap) in enumerate(zip(sentiments, gaps)):
            if gap_aware and position > 0:
                decay_eff = decay ** max(gap, 1)
            else:
                decay_eff = decay
            running = running * decay_eff + sentiment * scale
            scores.append(running)
        return scores

    # Iterate over the groups explicitly instead of groupby.apply: apply acting
    # on the grouping column is deprecated in pandas and would eventually drop
    # the ticker column from the result.
    pieces = []
    for _, group in out.groupby(ticker_col):
        group = group.copy()
        group["decayed_sentiment"] = _decayed_scores(group)
        pieces.append(group)

    if not pieces:
        empty = out.iloc[:0].copy()
        empty["decayed_sentiment"] = pd.Series(dtype="float64")
        return empty

    return pd.concat(pieces)

# ---------- Usage on your CSV ----------
input_csv = "all_tickers_features_clean.csv"   # replace with your actual file name
output_csv = "final training.csv"

df = pd.read_csv(input_csv)

# Per-ticker decayed sentiment. decay/scale are tunable; pass gap_aware=True to
# apply the decay once per calendar day elapsed between consecutive rows.
df_out = add_decayed_sentiment_per_ticker(
    df, "ticker", "date", "sentiment_score",
    decay=0.8, scale=1, gap_aware=False,
)

df_out.to_csv(output_csv, index=False)
print(f"✅ Saved with decayed column -> {output_csv}")

  .apply(_per_ticker))


✅ Saved with decayed column -> final training.csv


In [None]:
pip install --upgrade xgboost




In [23]:
# ---------------- TRAINING SCRIPT ---------------- #
from tqdm import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, classification_report
import joblib
import numpy as np

# ---------------- CONFIG ---------------- #
CUSTOM_THRESHOLD = 0.2   # lower cutoff → more recall for class 1

# ---------------- Load Data ---------------- #
df = pd.read_csv("final training.csv", parse_dates=["date"])

# ✅ Feature columns (technical + fundamentals + sentiment)
feature_cols = [
    "vol_5d", "vol_20d", "vol_60d",
    "drawdown_60d", "de_ratio",
    "prev_return_5d", "prev_return_20d", "prev_return_60d",
    "decayed_sentiment"
]

# ✅ Label columns
label_cols = ["label_5d", "label_20d", "label_60d"]

# ---------------- Data Cleaning ---------------- #
# Only keep columns that actually exist in the CSV
missing = [c for c in feature_cols + label_cols if c not in df.columns]
if missing:
    print(f"⚠️ Warning: Missing columns in dataset: {missing}")
    feature_cols = [c for c in feature_cols if c in df.columns]
    label_cols = [c for c in label_cols if c in df.columns]

# Drop rows with NaN in required columns
df = df.dropna(subset=feature_cols + label_cols).reset_index(drop=True)

# ---------------- Features / Labels ---------------- #
X = df[feature_cols]
y_dict = {label: df[label] for label in label_cols}

# Standardize features.
# NOTE(review): the scaler is fit on all rows, including those that later land
# in the test split. It is kept this way because one shared scaler is saved for
# inference below.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# To store trained models & predicted probabilities
models = {}
probabilities = pd.DataFrame({"date": df["date"], "ticker": df["ticker"]})

# ---------------- Train per label ---------------- #
for label_name in tqdm(label_cols, desc="Training labels"):
    y = y_dict[label_name]

    # A stratified split and ROC-AUC both need two classes; skip otherwise.
    if y.nunique() < 2:
        print(f"⚠️ Skipping {label_name}: only one class present.")
        continue

    # Stratified split to preserve class balance.
    # NOTE(review): rows are split at random, not by date. If the labels come
    # from overlapping forward-return windows, neighbouring rows of one ticker
    # can fall on both sides of the split and inflate the metrics below --
    # consider a time-based split.
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, test_size=0.2, random_state=42, stratify=y
    )

    # Handle class imbalance → weight minority class (1 = risky).
    # Guarded so a training split without positives cannot divide by zero.
    n_positive = int((y_train == 1).sum())
    n_negative = int((y_train == 0).sum())
    scale_pos_weight = n_negative / n_positive if n_positive else 1.0

    # `use_label_encoder` is intentionally not passed: XGBoost reports it as an
    # unused parameter.
    model = XGBClassifier(
        n_estimators=200,
        max_depth=5,
        learning_rate=0.1,
        eval_metric="logloss",
        random_state=42,
        scale_pos_weight=scale_pos_weight,
        n_jobs=-1
    )

    # Train
    model.fit(
        X_train, y_train,
        eval_set=[(X_test, y_test)],
        verbose=False
    )

    # Predict probabilities. The saved per-row probabilities cover every row,
    # so they mix in-sample (train) and out-of-sample (test) predictions.
    y_pred_prob_test = model.predict_proba(X_test)[:, 1]
    probabilities[f"prob_{label_name}"] = model.predict_proba(X_scaled)[:, 1]

    # Evaluate with default 0.5 threshold
    auc = roc_auc_score(y_test, y_pred_prob_test)
    print(f"\n📊 ROC-AUC for {label_name}: {auc:.4f}")
    print("Default threshold (0.5):")
    print(classification_report(y_test, (y_pred_prob_test >= 0.5).astype(int)))

    # Evaluate with custom threshold
    print(f"\nCustom threshold ({CUSTOM_THRESHOLD}):")
    print(classification_report(y_test, (y_pred_prob_test >= CUSTOM_THRESHOLD).astype(int)))

    # Save model
    models[label_name] = model
    joblib.dump(model, f"xgb_model_{label_name}.pkl")

# ---------------- Save Outputs ---------------- #
probabilities.to_csv("predicted_probabilities.csv", index=False)
print("\n✅ All models trained and probabilities saved!")

# Save the fitted scaler for inference
joblib.dump(scaler, "scaler.pkl")
print("✅ Scaler saved for inference!")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Training labels:  33%|███▎      | 1/3 [00:00<00:01,  1.04it/s]


📊 ROC-AUC for label_5d: 0.8300
Default threshold (0.5):
              precision    recall  f1-score   support

           0       0.99      0.98      0.98      2709
           1       0.27      0.39      0.32        49

    accuracy                           0.97      2758
   macro avg       0.63      0.68      0.65      2758
weighted avg       0.98      0.97      0.97      2758


Custom threshold (0.2):
              precision    recall  f1-score   support

           0       0.99      0.92      0.95      2709
           1       0.11      0.57      0.19        49

    accuracy                           0.91      2758
   macro avg       0.55      0.75      0.57      2758
weighted avg       0.98      0.91      0.94      2758



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Training labels:  67%|██████▋   | 2/3 [00:03<00:01,  1.67s/it]


📊 ROC-AUC for label_20d: 0.9156
Default threshold (0.5):
              precision    recall  f1-score   support

           0       0.98      0.95      0.96      2571
           1       0.49      0.69      0.57       187

    accuracy                           0.93      2758
   macro avg       0.73      0.82      0.77      2758
weighted avg       0.94      0.93      0.94      2758


Custom threshold (0.2):
              precision    recall  f1-score   support

           0       0.99      0.73      0.84      2571
           1       0.20      0.91      0.32       187

    accuracy                           0.74      2758
   macro avg       0.59      0.82      0.58      2758
weighted avg       0.94      0.74      0.81      2758



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Training labels: 100%|██████████| 3/3 [00:04<00:00,  1.56s/it]


📊 ROC-AUC for label_60d: 0.9535
Default threshold (0.5):
              precision    recall  f1-score   support

           0       0.97      0.91      0.94      2369
           1       0.61      0.85      0.71       389

    accuracy                           0.90      2758
   macro avg       0.79      0.88      0.82      2758
weighted avg       0.92      0.90      0.91      2758


Custom threshold (0.2):
              precision    recall  f1-score   support

           0       0.99      0.73      0.84      2369
           1       0.37      0.96      0.54       389

    accuracy                           0.77      2758
   macro avg       0.68      0.85      0.69      2758
weighted avg       0.90      0.77      0.80      2758


✅ All models trained and probabilities saved!
✅ Scaler saved for inference!





In [None]:
# Better explainability
import pandas as pd
import yfinance as yf
import numpy as np
import joblib
import shap
import matplotlib.pyplot as plt
import json

# -------------------- Nearest Trading Day -------------------- #
def get_nearest_trading_day(df, target_date):
    """Return the latest index entry of ``df`` that is on or before ``target_date``.

    ``target_date`` is parsed with ``pd.to_datetime``. When no index entry
    qualifies, the maximum of the empty selection is returned.
    """
    cutoff = pd.to_datetime(target_date)
    on_or_before = df.index <= cutoff
    return df.index[on_or_before].max()

# -------------------- Fundamentals -------------------- #
def fetch_fundamentals(ticker: str) -> dict:
    """Return selected fundamental ratios for ``ticker`` from Yahoo Finance.

    The result always has the keys ``de_ratio``, ``current_ratio``,
    ``quick_ratio``, ``roa``, ``roe`` and ``profit_margin``. Ratios absent from
    the Yahoo payload are ``None``; if the lookup raises, the error is printed
    and all six values are ``None``.
    """
    # (output key, Yahoo ``info`` field) pairs, in output order.
    wanted = (
        ("de_ratio", "debtToEquity"),
        ("current_ratio", "currentRatio"),
        ("quick_ratio", "quickRatio"),
        ("roa", "returnOnAssets"),
        ("roe", "returnOnEquity"),
        ("profit_margin", "profitMargins"),
    )
    try:
        info = yf.Ticker(ticker).info
        ratios = {}
        for key, yahoo_field in wanted:
            ratios[key] = info.get(yahoo_field)
        return ratios
    except Exception as e:
        print(f"Error fetching fundamentals for {ticker}: {e}")
        return {key: None for key, _ in wanted}

# -------------------- Feature Calculation -------------------- #
def compute_features_for_inference(df: pd.DataFrame, ticker: str, fundamentals: dict, target_date: str) -> dict:
    """Compute the price-based features for one ticker on one date.

    Parameters
    ----------
    df : price history with a DatetimeIndex and an ``"Adj Close"`` column.
    ticker : symbol echoed into the result.
    fundamentals : extra key/value pairs merged into the result unchanged.
    target_date : ``YYYY-MM-DD`` string. If it is not a date in the index, the
        latest index date on or before it is used instead (and reported).

    Returns
    -------
    dict with ``date`` (the date actually used, as a string), ``ticker``,
    ``vol_{5,20,60}d``, ``drawdown_60d``, ``prev_return_{5,20,60}d`` as plain
    floats, plus everything in ``fundamentals``.

    Raises
    ------
    ValueError : when no index date is on or before ``target_date``.
    """
    df = df.sort_index()
    close = df["Adj Close"]
    # With (field, ticker) MultiIndex columns -- which some yfinance versions
    # return even for a single symbol -- the selection above is a one-column
    # DataFrame; reduce it to a Series so each feature below is a scalar.
    if isinstance(close, pd.DataFrame):
        close = close.iloc[:, 0]

    # align to nearest trading day
    if target_date not in close.index.strftime('%Y-%m-%d'):
        nearest_date = get_nearest_trading_day(close, target_date)
        if pd.isna(nearest_date):
            raise ValueError(f"No trading data available near {target_date} for {ticker}")
        print(f"Using nearest trading day: {nearest_date.strftime('%Y-%m-%d')} instead of {target_date}")
        target_date = nearest_date.strftime('%Y-%m-%d')

    # Daily returns are shared by all three volatility windows.
    daily_return = close.pct_change()

    # Drawdown relative to the trailing 60-row high.
    rolling_max = close.rolling(60, min_periods=1).max()
    drawdown_60d = close / rolling_max - 1.0

    # Extract features for target date (float() keeps the values plain scalars).
    date_idx = close.index.get_loc(pd.to_datetime(target_date))
    features = {"date": target_date, "ticker": ticker}
    for window in (5, 20, 60):
        features[f"vol_{window}d"] = float(daily_return.rolling(window).std().iloc[date_idx])
    features["drawdown_60d"] = float(drawdown_60d.iloc[date_idx])
    for window in (5, 20, 60):
        features[f"prev_return_{window}d"] = float(close.pct_change(window).iloc[date_idx])

    features.update(fundamentals)
    return features

def get_ticker_features(ticker: str, target_date: str, lookback_years: int = 2) -> dict:
    """Download price history for ``ticker`` and build its feature dict for ``target_date``.

    Downloads ``lookback_years`` years of daily, non-auto-adjusted prices, then
    combines them with the ticker's fundamentals via
    ``compute_features_for_inference``.

    Raises ``ValueError`` when the download comes back empty.
    """
    history = yf.download(
        ticker,
        period=f"{lookback_years}y",
        interval="1d",
        auto_adjust=False,
    )
    if history.empty:
        raise ValueError(f"No data fetched for {ticker}.")
    return compute_features_for_inference(
        history, ticker, fetch_fundamentals(ticker), target_date
    )

# -------------------- Load Models -------------------- #
# One saved classifier per label, keyed by label name. The files are read with
# joblib.load at cell execution time, so they must exist in the working directory.
# NOTE(review): joblib.load unpickles its input -- only load files you trust.
models = {
    "label_5d": joblib.load("xgb_model_label_5d.pkl"),
    "label_20d": joblib.load("xgb_model_label_20d.pkl"),
    "label_60d": joblib.load("xgb_model_label_60d.pkl"),
}
# Columns selected from the feature row, in this order, before scaling/prediction.
# NOTE(review): this list differs from the `feature_cols` in the training cell,
# which includes "decayed_sentiment" and omits current_ratio, quick_ratio, roa,
# roe and profit_margin. Confirm the scaler/models on disk were fit on exactly
# these 13 columns before running this cell.
feature_cols = [
    "vol_5d", "vol_20d", "vol_60d",
    "drawdown_60d", "de_ratio",
    "current_ratio", "quick_ratio",
    "roa", "roe", "profit_margin",
    "prev_return_5d", "prev_return_20d", "prev_return_60d"
]

# -------------------- Creditworthiness Formula -------------------- #
def creditworthiness_from_prob(prob: float) -> float:
    """
    Map a risk probability to a credit-style score; higher risk gives a lower score.

    The probability is clipped to [1e-6, 1 - 1e-6], pushed through a decreasing
    logistic curve centred at 0.5 with slope 5, and rescaled onto a nominal
    300–850 scale. Because the slope is finite, the attainable scores run from
    roughly 341.7 (prob near 1) to roughly 808.3 (prob near 0); prob = 0.5
    gives 575. The result is rounded to two decimals.
    """
    bounded = np.clip(prob, 1e-6, 1 - 1e-6)
    logistic = 800 / (1 + np.exp(5 * (bounded - 0.5)))  # 0..800, decreasing in prob
    rescaled = 300 + (logistic / 800) * 550
    return round(rescaled, 2)

# -------------------- Main Calculation with Explainability -------------------- #
def calculate_creditworthiness_with_explain(features: dict, method: str = "weighted"):
    """Score one feature row and collect SHAP explanations for each label model.

    Parameters
    ----------
    features : dict holding at least every name in the module-level ``feature_cols``.
    method : how the per-label probabilities are combined -- ``"weighted"``
        (0.3 / 0.4 / 0.3 for the 5d / 20d / 60d labels), ``"geometric"``
        (geometric mean), ``"exponential"`` (mean raised to the power 1.5);
        any other value gives the plain mean.

    Returns
    -------
    ``(creditworthiness, probs, shap_metadata)``: the score from
    ``creditworthiness_from_prob``, a dict of label -> positive-class
    probability, and a dict of label -> ``{"base_value", "feature_values",
    "shap_values"}`` in which NaN entries are replaced by ``None``.

    The scaler is loaded from ``scaler.pkl`` on every call.
    """
    row = pd.DataFrame([features])

    # Scale with the scaler persisted at training time.
    fitted_scaler = joblib.load("scaler.pkl")
    X_scaled = fitted_scaler.transform(row[feature_cols])

    def _by_feature(values):
        # Pair values with feature names; NaN becomes None so the dict is JSON-friendly.
        return {
            name: (None if np.isnan(val) else float(val))
            for name, val in zip(feature_cols, values)
        }

    probs, shap_metadata = {}, {}
    for label, model in models.items():
        # Positive-class probability for the single row, as a plain float.
        probs[label] = float(model.predict_proba(X_scaled)[:, 1][0])

        # ---- SHAP local explanation ----
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_scaled)

        shap_metadata[label] = {
            "base_value": float(explainer.expected_value),
            "feature_values": _by_feature(X_scaled[0]),
            "shap_values": _by_feature(shap_values[0]),
        }

    # ---- Aggregate probability ----
    prob_list = list(probs.values())
    if method == "weighted":
        weights = {"label_5d": 0.3, "label_20d": 0.4, "label_60d": 0.3}
        avg_prob = sum(probs[label] * weights[label] for label in probs)
    elif method == "geometric":
        avg_prob = np.prod(prob_list) ** (1 / len(probs))
    elif method == "exponential":
        avg_prob = np.mean(prob_list) ** 1.5
    else:
        avg_prob = np.mean(prob_list)

    return creditworthiness_from_prob(avg_prob), probs, shap_metadata



# -------------------- JSON Output -------------------- #
def calculate_creditworthiness_with_explain_json(features: dict, method: str = "weighted"):
    """Run ``calculate_creditworthiness_with_explain`` and return its result as JSON text.

    Parameters
    ----------
    features : feature dict; must also contain ``"ticker"`` and ``"date"``.
    method : aggregation method, forwarded unchanged.

    Returns
    -------
    A JSON string (2-space indent) with the keys ``ticker``, ``date``,
    ``creditworthiness``, ``risk_probs`` and ``shap_explanations``.
    """
    # The scorer returns three values; the SHAP metadata must be unpacked here
    # because it is embedded in the JSON below.
    creditworthiness, probs, shap_metadata = calculate_creditworthiness_with_explain(features, method)
    # ---- Build JSON result ----
    result = {
        "ticker": features["ticker"],
        "date": features["date"],
        "creditworthiness": float(creditworthiness),  # plain float for JSON
        "risk_probs": probs,
        "shap_explanations": shap_metadata
    }
    return json.dumps(result, indent=2)

# -------------------- Example Usage -------------------- #
if __name__ == "__main__":
    ticker, target_date = "AMZN", "2025-03-20"

    features = get_ticker_features(ticker=ticker, target_date=target_date)
    # Despite its name, this holds the (score, probabilities, SHAP metadata)
    # tuple returned by the scorer, not a JSON string.
    result_json = calculate_creditworthiness_with_explain(features, "weighted")
    print(result_json)


[*********************100%***********************]  1 of 1 completed
  array = numpy.asarray(array, order=order, dtype=dtype)


(np.float64(623.16), {'label_5d': 0.2629871070384979, 'label_20d': 0.8717597126960754, 'label_60d': 0.005421677604317665}, {'label_5d': {'base_value': -0.02495281957089901, 'feature_values': {'vol_5d': 0.008297601571907131, 'vol_20d': 0.1792384566677455, 'vol_60d': -0.09668200482468826, 'drawdown_60d': -1.6894718901129662, 'de_ratio': -0.48729005325941716, 'current_ratio': -0.3746794461940098, 'quick_ratio': -0.2998552337251468, 'roa': -0.2773024710618573, 'roe': -0.33141132232538495, 'profit_margin': -0.6032362440684125, 'prev_return_5d': 0.03251073719905767, 'prev_return_20d': -1.616895591929384, 'prev_return_60d': -1.1530990227904674}, 'shap_values': {'vol_5d': -0.03189438208937645, 'vol_20d': 0.5902968645095825, 'vol_60d': -0.5501396059989929, 'drawdown_60d': 0.34279865026474, 'de_ratio': -0.05299476534128189, 'current_ratio': -0.3305078446865082, 'quick_ratio': 0.002138230949640274, 'roa': -0.2952125370502472, 'roe': 0.03989440202713013, 'profit_margin': -0.13787221908569336, 'pre

In [2]:
pip install gnews

Collecting gnews
  Downloading gnews-0.4.2-py3-none-any.whl.metadata (19 kB)
Collecting feedparser~=6.0.2 (from gnews)
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting dnspython (from gnews)
  Downloading dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Collecting sgmllib3k (from feedparser~=6.0.2->gnews)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading gnews-0.4.2-py3-none-any.whl (18 kB)
Downloading feedparser-6.0.11-py3-none-any.whl (81 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dnspython-2.7.0-py3-none-any.whl (313 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.6/313.6 kB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
  Created wheel for sgmllib3k: filename=s

In [25]:
# final_inference.py
import os
import json
import pandas as pd
import yfinance as yf
import numpy as np
import joblib
import shap
from gnews import GNews
from transformers import pipeline
from datetime import datetime, timedelta

# -------------------- Fundamentals -------------------- #
def fetch_fundamentals(ticker: str) -> dict:
    """Fetch fundamental ratios for ``ticker`` from Yahoo Finance, defaulting to 0.0.

    Returns a dict with the keys ``de_ratio``, ``current_ratio``,
    ``quick_ratio``, ``roa``, ``roe`` and ``profit_margin``. A ratio that is
    missing from the Yahoo payload -- or present with a ``None`` value -- is
    0.0. If the lookup raises, the error is printed and every ratio is 0.0.
    """
    # Output key -> field name inside the Yahoo ``info`` payload.
    yahoo_fields = {
        "de_ratio": "debtToEquity",
        "current_ratio": "currentRatio",
        "quick_ratio": "quickRatio",
        "roa": "returnOnAssets",
        "roe": "returnOnEquity",
        "profit_margin": "profitMargins",
    }
    try:
        info = yf.Ticker(ticker).info or {}
        ratios = {}
        for name, field in yahoo_fields.items():
            value = info.get(field)
            # dict.get's default only covers a missing key; a key that is
            # present with value None must be defaulted explicitly.
            ratios[name] = 0.0 if value is None else value
        return ratios
    except Exception as e:
        # Best effort: report the failure instead of hiding it, then fall back.
        print(f"Error fetching fundamentals for {ticker}: {e}")
        return dict.fromkeys(yahoo_fields, 0.0)

# -------------------- Sentiment & News -------------------- #
def get_company_name(stock_ticker: str) -> str:
    """Return the company's ``longName`` from Yahoo Finance, or ``""`` if unavailable.

    An empty string is returned when the lookup raises, when the payload has no
    ``longName``, or when ``longName`` is present but empty/``None``.
    """
    try:
        company = yf.Ticker(stock_ticker)
        # `or ""` also covers a key that exists with a None value, so the
        # function always returns a str as annotated.
        return company.info.get("longName") or ""
    except Exception:
        return ""

def fetch_company_news(company_name: str, date: str, window: int = 3, max_articles: int = 5) -> list:
    """Search Google News (via GNews) for ``company_name`` around ``date``.

    The search range runs from ``window`` days before ``date`` to ``window``
    days after it; when that collapses to a single day (``window=0``) the end
    is pushed one day later. ``date`` must be ``YYYY-MM-DD``.

    Returns at most ``max_articles`` results. Any exception, or an empty
    result, yields ``[]``.
    """
    try:
        client = GNews(language="en", country="US", max_results=max_articles)
        centre = datetime.strptime(date, "%Y-%m-%d")
        span = timedelta(days=window)
        first_day, last_day = centre - span, centre + span

        # Widen a zero-length range by one day.
        if first_day.date() == last_day.date():
            last_day = last_day + timedelta(days=1)

        client.start_date = (first_day.year, first_day.month, first_day.day)
        client.end_date = (last_day.year, last_day.month, last_day.day)

        articles = client.get_news(company_name)
        if not articles:
            return []
        return articles[:max_articles]
    except Exception:
        return []

# Shared Hugging Face pipeline, built on first use by _get_sentiment_pipeline().
_SENTIMENT_PIPELINE = None


def _get_sentiment_pipeline():
    """Return the shared sentiment pipeline, constructing it only once."""
    global _SENTIMENT_PIPELINE
    if _SENTIMENT_PIPELINE is None:
        _SENTIMENT_PIPELINE = pipeline(
            "sentiment-analysis",
            model="mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis",
        )
    return _SENTIMENT_PIPELINE


def analyze_sentiment_with_hf(news_data: list) -> float:
    """Average a signed sentiment score over a list of news articles.

    Each article's ``title`` and ``description`` are joined as
    ``"<title>. <description>"`` and classified. A ``POSITIVE`` label
    contributes ``+score``, ``NEGATIVE`` contributes ``-score`` and any other
    label contributes ``0.0``. Articles whose classification raises are skipped.

    Args:
        news_data: Article dicts; ``title`` / ``description`` may be missing
            or ``None`` and are then treated as empty strings.

    Returns:
        Mean signed score rounded to 4 decimals, or ``0.0`` when ``news_data``
        is empty or no article could be scored.
    """
    if not news_data:
        return 0.0

    # Building the pipeline is expensive; reuse one instance across calls
    # instead of constructing a new one every time this function runs.
    classify = _get_sentiment_pipeline()

    signed_scores = []
    for article in news_data:
        try:
            title = article.get("title") or ""
            description = article.get("description") or ""
            verdict = classify(f"{title}. {description}")[0]
            label, score = verdict["label"].upper(), verdict["score"]
        except Exception:
            # Best-effort: one bad article must not sink the whole batch.
            continue
        if label == "POSITIVE":
            signed_scores.append(score)
        elif label == "NEGATIVE":
            signed_scores.append(-score)
        else:
            signed_scores.append(0.0)

    if not signed_scores:
        return 0.0
    return round(sum(signed_scores) / len(signed_scores), 4)

# -------------------- Decayed Sentiment -------------------- #
def sentiment_decay(dates, sentiments, decay=0.8, scale=1.0):
    """Build a date-sorted frame with an exponentially decayed running score.

    Rows are ordered by date; the running score starts at 0 and is updated per
    row as ``previous * decay + sentiment * scale``.

    Args:
        dates: Values accepted by ``pd.to_datetime``, one per sentiment.
        sentiments: Raw sentiment values aligned with ``dates``.
        decay: Multiplier applied to the previous running score.
        scale: Multiplier applied to each new sentiment value.

    Returns:
        DataFrame with columns ``date``, ``sentiment`` and ``decayed_score``,
        indexed 0..n-1 in ascending date order.
    """
    frame = (
        pd.DataFrame({"date": pd.to_datetime(dates), "sentiment": sentiments})
        .sort_values("date")
        .reset_index(drop=True)
    )

    running = 0
    trail = []
    for raw in frame["sentiment"]:
        running = running * decay + raw * scale
        trail.append(running)

    frame["decayed_score"] = trail
    return frame

def compute_sentiment_features(ticker: str, target_date: str, lookback_days: int = 5) -> tuple[float, float]:
    """Score daily news sentiment for a ticker and decay it up to ``target_date``.

    For every calendar day from ``target_date - lookback_days`` through
    ``target_date`` (oldest first) the company's news is fetched with a
    zero-day window and scored; the daily scores are then run through
    ``sentiment_decay`` with ``decay=0.8`` and ``scale=1.0``.

    Returns:
        ``(decayed_score, latest_raw_score)`` taken from the last row of the
        decayed frame, or ``(0.0, 0.0)`` when no company name is found.
    """
    company_name = get_company_name(ticker)
    if not company_name:
        return 0.0, 0.0

    anchor = pd.to_datetime(target_date)
    # Oldest day first, finishing on the target date itself.
    days = [
        (anchor - timedelta(days=offset)).strftime("%Y-%m-%d")
        for offset in range(lookback_days, -1, -1)
    ]
    daily_scores = [
        analyze_sentiment_with_hf(
            fetch_company_news(company_name, day, window=0, max_articles=5)
        )
        for day in days
    ]

    decayed = sentiment_decay(days, daily_scores, decay=0.8, scale=1.0)
    return (
        float(decayed["decayed_score"].iloc[-1]),
        float(decayed["sentiment"].iloc[-1]),
    )

# -------------------- Technicals -------------------- #
def safe_get(series, date):
    """Read the value stored at ``date`` and return it as a finite-or-zero float.

    ``series.loc[date]`` may be a scalar, or a ``pd.Series`` when ``series`` is
    really a one-row-per-date frame; in that case its last element is used.

    Args:
        series: Object indexed by date that supports ``.loc``.
        date: Index label to read. A missing label raises ``KeyError``.

    Returns:
        The value as ``float``, or ``0.0`` when it is ``None``, NaN, or the
        looked-up row is empty.
    """
    val = series.loc[date]
    if isinstance(val, pd.Series):
        if val.empty:
            return 0.0
        val = val.iloc[-1]
    # ``val or 0.0`` does not catch NaN (NaN is truthy), so test explicitly;
    # otherwise warm-up rows of rolling windows leak NaN into the features.
    if val is None or pd.isna(val):
        return 0.0
    return float(val)

def compute_features_for_inference(df: pd.DataFrame, fundamentals: dict, target_date: str) -> dict:
    """Compute price-based features as of the last row on or before ``target_date``.

    The frame is sorted by index and de-duplicated (last row per index value
    wins); all indicators are derived from its ``"Adj Close"`` column.

    Args:
        df: Price history indexed by date, containing ``"Adj Close"``.
        fundamentals: Extra name -> value pairs merged into the result; falsy
            values (``None``, ``0``) become ``0.0``.
        target_date: Requested date; the latest index value not after it is used.

    Returns:
        Dict with ``vol_5d``/``vol_20d``/``vol_60d`` (rolling std of 1-period
        returns), ``drawdown_60d`` (close relative to its 60-row rolling max),
        ``prev_return_5d``/``_20d``/``_60d``, the fundamentals, and ``date``
        (the row actually used, ``YYYY-MM-DD``).

    Raises:
        ValueError: If no row exists on or before ``target_date``.
    """
    prices = df.sort_index()
    prices = prices.loc[~prices.index.duplicated(keep='last')]
    close = prices["Adj Close"]

    cutoff = pd.to_datetime(target_date)
    eligible_dates = prices.index[prices.index <= cutoff]
    if eligible_dates.empty:
        raise ValueError(f"No trading data available on or before {target_date}")
    actual_date = eligible_dates.max()

    # One-period returns are shared by all three volatility windows.
    daily_return = close.pct_change()
    series_by_feature = {
        "vol_5d": daily_return.rolling(5).std(),
        "vol_20d": daily_return.rolling(20).std(),
        "vol_60d": daily_return.rolling(60).std(),
        "drawdown_60d": close / close.rolling(60, min_periods=1).max() - 1.0,
        "prev_return_5d": close.pct_change(5),
        "prev_return_20d": close.pct_change(20),
        "prev_return_60d": close.pct_change(60),
    }
    features = {
        name: safe_get(series, actual_date)
        for name, series in series_by_feature.items()
    }

    # Include fundamentals
    for key, value in fundamentals.items():
        features[key] = float(value or 0.0)

    features["date"] = actual_date.strftime('%Y-%m-%d')
    return features

def get_ticker_features(ticker: str, target_date: str, lookback_years: int = 2) -> dict:
    """Assemble the full feature dict for one ticker and date.

    Downloads ``lookback_years`` of daily, non-auto-adjusted prices, fetches
    fundamentals, derives the price features, then adds ``decayed_sentiment``
    computed for the date that the price features actually resolved to.

    Returns:
        The dict from ``compute_features_for_inference`` plus
        ``decayed_sentiment`` (float).
    """
    history = yf.download(
        ticker,
        period=f"{lookback_years}y",
        interval="1d",
        auto_adjust=False,
        progress=False,
    )
    features = compute_features_for_inference(history, fetch_fundamentals(ticker), target_date)

    # Add sentiment
    decayed, _latest_raw = compute_sentiment_features(ticker, features["date"], lookback_days=5)
    features["decayed_sentiment"] = float(decayed)

    return features

# -------------------- Models & Features -------------------- #
# Label name -> pickled model file, loaded from the working directory.
_MODEL_FILES = (
    ("label_5d", "xgb_model_label_5d.pkl"),
    ("label_20d", "xgb_model_label_20d.pkl"),
    ("label_60d", "xgb_model_label_60d.pkl"),
)
# NOTE(review): joblib.load unpickles arbitrary objects; only load trusted files.
models = {label: joblib.load(path) for label, path in _MODEL_FILES}

# Column order handed to the scaler and the models.
feature_cols = (
    ["vol_5d", "vol_20d", "vol_60d"]
    + ["drawdown_60d", "de_ratio"]
    + ["prev_return_5d", "prev_return_20d", "prev_return_60d"]
    + ["decayed_sentiment"]
)

# Fitted scaler applied to feature_cols before prediction.
scaler = joblib.load("scaler.pkl")

# -------------------- Creditworthiness -------------------- #
def creditworthiness_from_prob(prob: float) -> float:
    """Map a probability onto a credit-style score; higher ``prob`` gives a lower score.

    ``prob`` is clipped to ``[1e-6, 1 - 1e-6]``, passed through a decreasing
    logistic curve centred on 0.5 with slope 5, and rescaled as
    ``300 + fraction * 550``. With this slope the result stays between roughly
    342 and 808; ``prob = 0.5`` gives exactly 575.0.

    Returns:
        Score rounded to 2 decimals.
    """
    eps = 1e-6
    bounded = np.clip(prob, eps, 1 - eps)
    logistic_800 = 800 / (1 + np.exp(5 * (bounded - 0.5)))
    return round(300 + (logistic_800 / 800) * 550, 2)

def calculate_creditworthiness_with_explain(features: dict, method: str = "weighted"):
    """Score one feature dict with every model and attach SHAP explanations.

    The columns in ``feature_cols`` are scaled with the module-level ``scaler``
    and fed to each entry of ``models``; the probability of class index 1 is
    recorded per label.
    NOTE(review): class 1 is presumably the "risk" class, given the JSON key
    ``risk_probs`` used by the caller; confirm against training.

    Args:
        features: Must contain every name in ``feature_cols``.
        method: ``"weighted"`` combines the labels with weights 0.3 / 0.4 / 0.3
            (5d / 20d / 60d); any other value uses the plain mean.

    Returns:
        ``(creditworthiness, probs, shap_metadata)`` where ``probs`` maps label
        to probability and ``shap_metadata`` maps label to ``base_value``,
        scaled ``feature_values`` and per-feature ``shap_values``.
    """
    row = pd.DataFrame([features])
    X_scaled = scaler.transform(row[feature_cols])

    probs = {}
    shap_metadata = {}
    for label, model in models.items():
        probs[label] = float(model.predict_proba(X_scaled)[:, 1][0])

        explainer = shap.TreeExplainer(model)
        contributions = explainer.shap_values(X_scaled)
        shap_metadata[label] = {
            "base_value": float(explainer.expected_value),
            "feature_values": dict(zip(feature_cols, map(float, X_scaled[0]))),
            "shap_values": dict(zip(feature_cols, map(float, contributions[0]))),
        }

    if method != "weighted":
        avg_prob = np.mean(list(probs.values()))
    else:
        weights = {"label_5d": 0.3, "label_20d": 0.4, "label_60d": 0.3}
        avg_prob = sum(prob * weights[label] for label, prob in probs.items())

    return creditworthiness_from_prob(avg_prob), probs, shap_metadata

def calculate_creditworthiness_with_explain_json(features: dict, method: str = "weighted"):
    """Run the explained scoring and serialise the outcome as indented JSON.

    Numeric feature values (``int``, ``float``, numpy numbers) are converted to
    plain ``float`` first; everything else, such as the ``ticker`` and ``date``
    strings, is passed through untouched.

    Returns:
        JSON text (2-space indent) with ``ticker``, ``date``,
        ``creditworthiness``, ``risk_probs`` and ``shap_explanations``.
        ``ticker`` and ``date`` default to ``""`` when absent from ``features``.
    """
    numeric_types = (int, float, np.number)
    normalized = {}
    for key, value in features.items():
        normalized[key] = float(value) if isinstance(value, numeric_types) else value

    creditworthiness, probs, shap_metadata = calculate_creditworthiness_with_explain(normalized, method)
    payload = {
        "ticker": normalized.get("ticker", ""),
        "date": normalized.get("date", ""),
        "creditworthiness": creditworthiness,
        "risk_probs": probs,
        "shap_explanations": shap_metadata,
    }
    return json.dumps(payload, indent=2)

# -------------------- Example -------------------- #
if __name__ == "__main__":
    # Demo: score a single ticker/date pair and print the JSON report.
    ticker, target_date = "AAPL", "2025-03-20"

    print(f"Fetching features for {ticker} on {target_date}...")
    # Tag the features with the ticker so it shows up in the report.
    features = {**get_ticker_features(ticker, target_date), "ticker": ticker}

    print("Features fetched. Calculating creditworthiness...")
    result_json = calculate_creditworthiness_with_explain_json(features, method="weighted")
    print(result_json)


Fetching features for AAPL on 2025-03-20...
Features fetched. Calculating creditworthiness...
{
  "ticker": "AAPL",
  "date": "2025-03-20",
  "creditworthiness": 788.62,
  "risk_probs": {
    "label_5d": 0.004004129208624363,
    "label_20d": 0.17098811268806458,
    "label_60d": 0.05164482071995735
  },
  "shap_explanations": {
    "label_5d": {
      "base_value": -0.02012277953326702,
      "feature_values": {
        "vol_5d": -0.3989725071591474,
        "vol_20d": 0.08216323751156003,
        "vol_60d": 0.048792744960614844,
        "drawdown_60d": -1.3994656870166289,
        "de_ratio": -0.5199108263249623,
        "prev_return_5d": 0.37996425365089115,
        "prev_return_20d": -1.6674786461599393,
        "prev_return_60d": -1.2751907627448145,
        "decayed_sentiment": -0.4519842699503451
      },
      "shap_values": {
        "vol_5d": -0.5016213059425354,
        "vol_20d": -0.5121458172798157,
        "vol_60d": -0.882170557975769,
        "drawdown_60d": -0.87248265

In [29]:
# final_inference.py
import os
import json
import pandas as pd
import yfinance as yf
import numpy as np
import joblib
import shap
from gnews import GNews
from transformers import pipeline
from datetime import datetime, timedelta

# Table read once at cell start, with its "date" column parsed as datetimes.
# get_ticker_features() looks up (ticker, date) rows here before falling back
# to yfinance. NOTE(review): presumably the training feature table; confirm
# it holds every column in feature_cols.
training_df = pd.read_csv("final training.csv", parse_dates=["date"])

# -------------------- Fundamentals -------------------- #
def fetch_fundamentals(ticker: str) -> dict:
    """Fetch a fixed set of fundamental ratios for ``ticker`` from Yahoo Finance.

    Args:
        ticker: Symbol handed to ``yf.Ticker``.

    Returns:
        Dict with the keys ``de_ratio``, ``current_ratio``, ``quick_ratio``,
        ``roa``, ``roe`` and ``profit_margin``. Every value is a ``float``; a
        ratio that is missing, ``None``, NaN or non-numeric in the Yahoo payload
        is reported as ``0.0``. If the lookup itself raises, the error is
        printed and every ratio is ``0.0``.
    """
    # Output name -> key in the ``info`` payload.
    yahoo_keys = {
        "de_ratio": "debtToEquity",
        "current_ratio": "currentRatio",
        "quick_ratio": "quickRatio",
        "roa": "returnOnAssets",
        "roe": "returnOnEquity",
        "profit_margin": "profitMargins",
    }
    try:
        info = yf.Ticker(ticker).info
        ratios = {}
        for name, yahoo_key in yahoo_keys.items():
            # ``.get(key, 0.0)`` would still hand back None when the key is
            # present with a null value, so normalise explicitly.
            try:
                value = float(info.get(yahoo_key))
            except (TypeError, ValueError):
                value = 0.0
            # NaN is the only float that is not equal to itself.
            ratios[name] = 0.0 if value != value else value
        return ratios
    except Exception as exc:
        # Best-effort lookup: keep going with neutral values, but say so.
        print(f"Error fetching fundamentals for {ticker}: {exc}")
        return {name: 0.0 for name in yahoo_keys}

# -------------------- Sentiment & News -------------------- #
def get_company_name(stock_ticker: str) -> str:
    """Look up the ``longName`` entry of a ticker's Yahoo Finance ``info``.

    Returns the stored value, or ``""`` when the key is absent or the lookup
    raises for any reason.
    """
    try:
        long_name = yf.Ticker(stock_ticker).info.get("longName", "")
    except Exception:
        long_name = ""
    return long_name

def fetch_company_news(company_name: str, date: str, window: int = 3, max_articles: int = 5) -> list:
    """Query GNews for articles about ``company_name`` around ``date``.

    Args:
        company_name: Search query passed to ``GNews.get_news``.
        date: Centre of the search range, formatted ``YYYY-MM-DD``.
        window: Number of days added on each side of ``date``.
        max_articles: Cap on results, given to GNews and re-applied to the list.

    Returns:
        At most ``max_articles`` results, or ``[]`` when nothing is found or
        any step raises (including an unparsable ``date``).
    """
    try:
        client = GNews(language="en", country="US", max_results=max_articles)
        centre = datetime.strptime(date, "%Y-%m-%d")
        half_width = timedelta(days=window)
        first_day, last_day = centre - half_width, centre + half_width

        # A range that starts and ends on the same day is widened by one day.
        if first_day.date() == last_day.date():
            last_day = last_day + timedelta(days=1)

        client.start_date = (first_day.year, first_day.month, first_day.day)
        client.end_date = (last_day.year, last_day.month, last_day.day)

        articles = client.get_news(company_name)
        if not articles:
            return []
        return articles[:max_articles]
    except Exception:
        return []

# Shared Hugging Face pipeline, built on first use by _get_sentiment_pipeline().
_SENTIMENT_PIPELINE = None


def _get_sentiment_pipeline():
    """Return the shared sentiment pipeline, constructing it only once."""
    global _SENTIMENT_PIPELINE
    if _SENTIMENT_PIPELINE is None:
        _SENTIMENT_PIPELINE = pipeline(
            "sentiment-analysis",
            model="mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis",
        )
    return _SENTIMENT_PIPELINE


def analyze_sentiment_with_hf(news_data: list) -> float:
    """Average a signed sentiment score over a list of news articles.

    Each article's ``title`` and ``description`` are joined as
    ``"<title>. <description>"`` and classified. A ``POSITIVE`` label
    contributes ``+score``, ``NEGATIVE`` contributes ``-score`` and any other
    label contributes ``0.0``. Articles whose classification raises are skipped.

    Args:
        news_data: Article dicts; ``title`` / ``description`` may be missing
            or ``None`` and are then treated as empty strings.

    Returns:
        Mean signed score rounded to 4 decimals, or ``0.0`` when ``news_data``
        is empty or no article could be scored.
    """
    if not news_data:
        return 0.0

    # Building the pipeline is expensive; reuse one instance across calls
    # instead of constructing a new one every time this function runs.
    classify = _get_sentiment_pipeline()

    signed_scores = []
    for article in news_data:
        try:
            title = article.get("title") or ""
            description = article.get("description") or ""
            verdict = classify(f"{title}. {description}")[0]
            label, score = verdict["label"].upper(), verdict["score"]
        except Exception:
            # Best-effort: one bad article must not sink the whole batch.
            continue
        if label == "POSITIVE":
            signed_scores.append(score)
        elif label == "NEGATIVE":
            signed_scores.append(-score)
        else:
            signed_scores.append(0.0)

    if not signed_scores:
        return 0.0
    return round(sum(signed_scores) / len(signed_scores), 4)

# -------------------- Decayed Sentiment -------------------- #
def sentiment_decay(dates, sentiments, decay=0.8, scale=1.0):
    """Build a date-sorted frame with an exponentially decayed running score.

    Rows are ordered by date; the running score starts at 0 and is updated per
    row as ``previous * decay + sentiment * scale``.

    Args:
        dates: Values accepted by ``pd.to_datetime``, one per sentiment.
        sentiments: Raw sentiment values aligned with ``dates``.
        decay: Multiplier applied to the previous running score.
        scale: Multiplier applied to each new sentiment value.

    Returns:
        DataFrame with columns ``date``, ``sentiment`` and ``decayed_score``,
        indexed 0..n-1 in ascending date order.
    """
    frame = (
        pd.DataFrame({"date": pd.to_datetime(dates), "sentiment": sentiments})
        .sort_values("date")
        .reset_index(drop=True)
    )

    running = 0
    trail = []
    for raw in frame["sentiment"]:
        running = running * decay + raw * scale
        trail.append(running)

    frame["decayed_score"] = trail
    return frame

def compute_sentiment_features(ticker: str, target_date: str, lookback_days: int = 5) -> tuple[float, float]:
    """Score daily news sentiment for a ticker and decay it up to ``target_date``.

    For every calendar day from ``target_date - lookback_days`` through
    ``target_date`` (oldest first) the company's news is fetched with a
    zero-day window and scored; the daily scores are then run through
    ``sentiment_decay`` with ``decay=0.8`` and ``scale=1.0``.

    Returns:
        ``(decayed_score, latest_raw_score)`` taken from the last row of the
        decayed frame, or ``(0.0, 0.0)`` when no company name is found.
    """
    company_name = get_company_name(ticker)
    if not company_name:
        return 0.0, 0.0

    anchor = pd.to_datetime(target_date)
    # Oldest day first, finishing on the target date itself.
    days = [
        (anchor - timedelta(days=offset)).strftime("%Y-%m-%d")
        for offset in range(lookback_days, -1, -1)
    ]
    daily_scores = [
        analyze_sentiment_with_hf(
            fetch_company_news(company_name, day, window=0, max_articles=5)
        )
        for day in days
    ]

    decayed = sentiment_decay(days, daily_scores, decay=0.8, scale=1.0)
    return (
        float(decayed["decayed_score"].iloc[-1]),
        float(decayed["sentiment"].iloc[-1]),
    )

# -------------------- Technicals -------------------- #
def safe_get(series, date):
    """Read the value stored at ``date`` and return it as a finite-or-zero float.

    ``series.loc[date]`` may be a scalar, or a ``pd.Series`` when ``series`` is
    really a one-row-per-date frame; in that case its last element is used.

    Args:
        series: Object indexed by date that supports ``.loc``.
        date: Index label to read. A missing label raises ``KeyError``.

    Returns:
        The value as ``float``, or ``0.0`` when it is ``None``, NaN, or the
        looked-up row is empty.
    """
    val = series.loc[date]
    if isinstance(val, pd.Series):
        if val.empty:
            return 0.0
        val = val.iloc[-1]
    # ``val or 0.0`` does not catch NaN (NaN is truthy), so test explicitly;
    # otherwise warm-up rows of rolling windows leak NaN into the features.
    if val is None or pd.isna(val):
        return 0.0
    return float(val)

def compute_features_for_inference(df: pd.DataFrame, fundamentals: dict, target_date: str) -> dict:
    """Compute price-based features as of the last row on or before ``target_date``.

    The frame is sorted by index and de-duplicated (last row per index value
    wins); all indicators are derived from its ``"Adj Close"`` column.

    Args:
        df: Price history indexed by date, containing ``"Adj Close"``.
        fundamentals: Extra name -> value pairs merged into the result; falsy
            values (``None``, ``0``) become ``0.0``.
        target_date: Requested date; the latest index value not after it is used.

    Returns:
        Dict with ``vol_5d``/``vol_20d``/``vol_60d`` (rolling std of 1-period
        returns), ``drawdown_60d`` (close relative to its 60-row rolling max),
        ``prev_return_5d``/``_20d``/``_60d``, the fundamentals, and ``date``
        (the row actually used, ``YYYY-MM-DD``).

    Raises:
        ValueError: If no row exists on or before ``target_date``.
    """
    prices = df.sort_index()
    prices = prices.loc[~prices.index.duplicated(keep='last')]
    close = prices["Adj Close"]

    cutoff = pd.to_datetime(target_date)
    eligible_dates = prices.index[prices.index <= cutoff]
    if eligible_dates.empty:
        raise ValueError(f"No trading data available on or before {target_date}")
    actual_date = eligible_dates.max()

    # One-period returns are shared by all three volatility windows.
    daily_return = close.pct_change()
    series_by_feature = {
        "vol_5d": daily_return.rolling(5).std(),
        "vol_20d": daily_return.rolling(20).std(),
        "vol_60d": daily_return.rolling(60).std(),
        "drawdown_60d": close / close.rolling(60, min_periods=1).max() - 1.0,
        "prev_return_5d": close.pct_change(5),
        "prev_return_20d": close.pct_change(20),
        "prev_return_60d": close.pct_change(60),
    }
    features = {
        name: safe_get(series, actual_date)
        for name, series in series_by_feature.items()
    }

    # Include fundamentals
    for key, value in fundamentals.items():
        features[key] = float(value or 0.0)

    features["date"] = actual_date.strftime('%Y-%m-%d')
    return features

def get_ticker_features(ticker: str, target_date: str, lookback_years: int = 2) -> dict:
    """Return the feature dict for one ticker and date, preferring ``training_df``.

    If ``training_df`` has a row whose ``ticker`` and ``date`` match, the first
    such row is returned as a dict (numeric values as ``float``, ``date`` as a
    ``YYYY-MM-DD`` string) and ``"using csv"`` is printed. Otherwise prices and
    fundamentals are pulled from yfinance, price features are computed, and
    ``decayed_sentiment`` is added for the date the price features resolved to.
    """
    wanted_day = pd.to_datetime(target_date)

    # Check if ticker & date exist in training CSV
    is_match = (training_df["ticker"] == ticker) & (training_df["date"] == wanted_day)
    cached = training_df[is_match]

    if cached.empty:
        # No stored row: fetch from yfinance and compute.
        history = yf.download(
            ticker,
            period=f"{lookback_years}y",
            interval="1d",
            auto_adjust=False,
            progress=False,
        )
        computed = compute_features_for_inference(history, fetch_fundamentals(ticker), target_date)

        # Add sentiment
        decayed, _latest_raw = compute_sentiment_features(ticker, computed["date"], lookback_days=5)
        computed["decayed_sentiment"] = float(decayed)
        return computed

    # Extract features directly from CSV
    record = cached.iloc[0].to_dict()
    print("using csv")
    record['date'] = record['date'].strftime('%Y-%m-%d')
    # Ensure numeric fields are floats
    numeric_types = (int, float, np.number)
    return {
        key: float(value) if isinstance(value, numeric_types) else value
        for key, value in record.items()
    }

# -------------------- Models & Features -------------------- #
# Label name -> pickled model file, loaded from the working directory.
_MODEL_FILES = (
    ("label_5d", "xgb_model_label_5d.pkl"),
    ("label_20d", "xgb_model_label_20d.pkl"),
    ("label_60d", "xgb_model_label_60d.pkl"),
)
# NOTE(review): joblib.load unpickles arbitrary objects; only load trusted files.
models = {label: joblib.load(path) for label, path in _MODEL_FILES}

# Column order handed to the scaler and the models.
feature_cols = (
    ["vol_5d", "vol_20d", "vol_60d"]
    + ["drawdown_60d", "de_ratio"]
    + ["prev_return_5d", "prev_return_20d", "prev_return_60d"]
    + ["decayed_sentiment"]
)

# Fitted scaler applied to feature_cols before prediction.
scaler = joblib.load("scaler.pkl")

# -------------------- Creditworthiness -------------------- #
def creditworthiness_from_prob(prob: float) -> float:
    """Map a probability onto a credit-style score; higher ``prob`` gives a lower score.

    ``prob`` is clipped to ``[1e-6, 1 - 1e-6]``, passed through a decreasing
    logistic curve centred on 0.5 with slope 5, and rescaled as
    ``300 + fraction * 550``. With this slope the result stays between roughly
    342 and 808; ``prob = 0.5`` gives exactly 575.0.

    Returns:
        Score rounded to 2 decimals.
    """
    eps = 1e-6
    bounded = np.clip(prob, eps, 1 - eps)
    logistic_800 = 800 / (1 + np.exp(5 * (bounded - 0.5)))
    return round(300 + (logistic_800 / 800) * 550, 2)

def calculate_creditworthiness_with_explain(features: dict, method: str = "weighted"):
    """Score one feature dict with every model and attach SHAP explanations.

    The columns in ``feature_cols`` are scaled with the module-level ``scaler``
    and fed to each entry of ``models``; the probability of class index 1 is
    recorded per label.
    NOTE(review): class 1 is presumably the "risk" class, given the JSON key
    ``risk_probs`` used by the caller; confirm against training.

    Args:
        features: Must contain every name in ``feature_cols``.
        method: ``"weighted"`` combines the labels with weights 0.3 / 0.4 / 0.3
            (5d / 20d / 60d); any other value uses the plain mean.

    Returns:
        ``(creditworthiness, probs, shap_metadata)`` where ``probs`` maps label
        to probability and ``shap_metadata`` maps label to ``base_value``,
        scaled ``feature_values`` and per-feature ``shap_values``.
    """
    row = pd.DataFrame([features])
    X_scaled = scaler.transform(row[feature_cols])

    probs = {}
    shap_metadata = {}
    for label, model in models.items():
        probs[label] = float(model.predict_proba(X_scaled)[:, 1][0])

        explainer = shap.TreeExplainer(model)
        contributions = explainer.shap_values(X_scaled)
        shap_metadata[label] = {
            "base_value": float(explainer.expected_value),
            "feature_values": dict(zip(feature_cols, map(float, X_scaled[0]))),
            "shap_values": dict(zip(feature_cols, map(float, contributions[0]))),
        }

    if method != "weighted":
        avg_prob = np.mean(list(probs.values()))
    else:
        weights = {"label_5d": 0.3, "label_20d": 0.4, "label_60d": 0.3}
        avg_prob = sum(prob * weights[label] for label, prob in probs.items())

    return creditworthiness_from_prob(avg_prob), probs, shap_metadata

def calculate_creditworthiness_with_explain_json(features: dict, method: str = "weighted"):
    """Run the explained scoring and serialise the outcome as indented JSON.

    Numeric feature values (``int``, ``float``, numpy numbers) are converted to
    plain ``float`` first; everything else, such as the ``ticker`` and ``date``
    strings, is passed through untouched.

    Returns:
        JSON text (2-space indent) with ``ticker``, ``date``,
        ``creditworthiness``, ``risk_probs`` and ``shap_explanations``.
        ``ticker`` and ``date`` default to ``""`` when absent from ``features``.
    """
    numeric_types = (int, float, np.number)
    normalized = {}
    for key, value in features.items():
        normalized[key] = float(value) if isinstance(value, numeric_types) else value

    creditworthiness, probs, shap_metadata = calculate_creditworthiness_with_explain(normalized, method)
    payload = {
        "ticker": normalized.get("ticker", ""),
        "date": normalized.get("date", ""),
        "creditworthiness": creditworthiness,
        "risk_probs": probs,
        "shap_explanations": shap_metadata,
    }
    return json.dumps(payload, indent=2)

# -------------------- Example -------------------- #
if __name__ == "__main__":
    # Demo: score a single ticker/date pair and print the JSON report.
    ticker, target_date = "AAPL", "2025-08-22"

    print(f"Fetching features for {ticker} on {target_date}...")
    # Tag the features with the ticker so it shows up in the report.
    features = {**get_ticker_features(ticker, target_date), "ticker": ticker}

    print("Features fetched. Calculating creditworthiness...")
    result_json = calculate_creditworthiness_with_explain_json(features, method="weighted")
    print(result_json)


Fetching features for AAPL on 2025-08-22...
Features fetched. Calculating creditworthiness...
{
  "ticker": "AAPL",
  "date": "2025-08-22",
  "creditworthiness": 795.67,
  "risk_probs": {
    "label_5d": 0.00023433212481904775,
    "label_20d": 0.01304598804563284,
    "label_60d": 0.17518840730190277
  },
  "shap_explanations": {
    "label_5d": {
      "base_value": -0.02012277953326702,
      "feature_values": {
        "vol_5d": -0.41091570826948537,
        "vol_20d": 0.22536989182690628,
        "vol_60d": -0.44502199806956755,
        "drawdown_60d": 0.5694536865537014,
        "de_ratio": -0.5199108263249623,
        "prev_return_5d": -0.53698655137329,
        "prev_return_20d": 0.5190814216414819,
        "prev_return_60d": 0.6127477575883555,
        "decayed_sentiment": -0.4519842699503451
      },
      "shap_values": {
        "vol_5d": -1.6020745038986206,
        "vol_20d": -0.9940552115440369,
        "vol_60d": -2.145747423171997,
        "drawdown_60d": -1.1226099729