In [1]:
!uv sync

[2K[2mResolved [1m185 packages[0m [2min 527ms[0m[0m                                       [0m
[2K[37m⠙[0m [2mPreparing packages...[0m (0/29)                                                  
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/29)------------------[0m[0m     0 B/384.84 KiB          [1A
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/29)------------------[0m[0m     0 B/384.84 KiB          [1A
[2mwrapt               [0m [32m[30m[2m------------------------------[0m[0m     0 B/118.64 KiB
[2K[2A[37m⠙[0m [2mPreparing packages...[0m (0/29)------------------[0m[0m     0 B/384.84 KiB          [2A
[2mwrapt               [0m [32m[30m[2m------------------------------[0m[0m     0 B/118.64 KiB
[2K[2A[37m⠙[0m [2mPreparing packages...[0m (0/29)------------------[0m[0m     0 B/384.84 KiB          [2A
[2mwrapt               [0m [32m[30m[2m------------------------------[0m[0m     0 B/118.64 KiB
[2K[2A[37m⠙[0m [2mPreparing 

In [2]:
# Environment managed via uv/pyproject; ensure dependencies installed before running

import json
import logging
import os
import re

from functools import lru_cache
from pathlib import Path
from textwrap import dedent
from typing import Any, Dict

import numpy as np
import pandas as pd
import yfinance as yf

from dotenv import load_dotenv
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_openai import ChatOpenAI
import ta


In [None]:
# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# OpenRouter credentials and endpoint. The key may be None here; it is checked
# when the client is first built.
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
OPENROUTER_BASE_URL = os.getenv("OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1")

# Model selection. Alternative (slower): "deepseek/deepseek-v3.2"
MODEL_NAME = "deepseek/deepseek-chat-v3.1"  # Faster model

# Market-data defaults.
DEFAULT_TICKER = "AAPL"
DEFAULT_PERIOD = "5y"
LOOKBACK_DAYS = 60

# LLM sampling settings.
LLM_TEMPERATURE = 0.3
MAX_TOKENS = 800

# Absolute path of the decision log; make sure its directory exists.
LOG_FILE = Path("llm_decision_log.txt").resolve()
LOG_FILE.parent.mkdir(parents=True, exist_ok=True)


In [4]:
def fetch_stock_data(ticker: str, period: str = DEFAULT_PERIOD) -> pd.DataFrame:
    """Download daily OHLCV history for ``ticker`` through yfinance.

    Args:
        ticker: Symbol forwarded to ``yf.download``.
        period: History length forwarded to ``yf.download``.

    Returns:
        The downloaded frame with MultiIndex columns reduced to the ticker's
        own columns, rows that are entirely NaN removed, and the index
        converted with ``pd.to_datetime``.

    Raises:
        ValueError: If the download comes back empty.
    """
    frame = yf.download(
        tickers=ticker,
        period=period,
        interval="1d",
        auto_adjust=False,
        progress=False,
    )
    if frame.empty:
        raise ValueError(f"No data retrieved for ticker {ticker} and period {period}.")

    columns = frame.columns
    if isinstance(columns, pd.MultiIndex):
        if ticker in columns.get_level_values(-1):
            # Keep only this ticker's slice of the column hierarchy.
            frame = frame.xs(ticker, axis=1, level=-1)
        else:
            # Ticker level not found: fall back to the outermost level labels.
            frame.columns = columns.get_level_values(0)

    # Collapse any OHLCV entry that is still a one-column DataFrame into a Series.
    for name in ("Open", "High", "Low", "Close", "Adj Close", "Volume"):
        if name not in frame.columns:
            continue
        if isinstance(frame[name], pd.DataFrame):
            frame[name] = frame[name].squeeze("columns")

    frame = frame.dropna(how="all")
    frame.index = pd.to_datetime(frame.index)
    return frame



def compute_indicators(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` extended with technical-indicator columns.

    Adds SMA (10/20/50/100/200), EMA (12/26), RSI(14), MACD with signal and
    histogram, Bollinger Bands (20, 2), stochastic %K/%D (14, 3), on-balance
    volume and the daily percentage change of ``Close``. Rows containing any
    NaN are dropped from the result.

    Args:
        df: Price frame providing ``Close``, ``High``, ``Low`` and ``Volume``.

    Raises:
        ValueError: If ``df`` is empty.
    """
    if df.empty:
        raise ValueError("Input price DataFrame is empty.")

    enriched = df.copy()
    close, high, low, volume = (enriched[name] for name in ("Close", "High", "Low", "Volume"))

    # Moving averages of the close.
    for span in (10, 20, 50, 100, 200):
        enriched[f"SMA_{span}"] = ta.trend.SMAIndicator(close=close, window=span).sma_indicator()
    for span in (12, 26):
        enriched[f"EMA_{span}"] = ta.trend.EMAIndicator(close=close, window=span).ema_indicator()

    enriched["RSI_14"] = ta.momentum.RSIIndicator(close=close, window=14).rsi()

    macd_calc = ta.trend.MACD(close=close)
    enriched["MACD"] = macd_calc.macd()
    enriched["MACD_Signal"] = macd_calc.macd_signal()
    enriched["MACD_Hist"] = macd_calc.macd_diff()

    bands = ta.volatility.BollingerBands(close=close, window=20, window_dev=2)
    enriched["BB_Upper"] = bands.bollinger_hband()
    enriched["BB_Middle"] = bands.bollinger_mavg()
    enriched["BB_Lower"] = bands.bollinger_lband()

    stochastic = ta.momentum.StochasticOscillator(
        high=high, low=low, close=close, window=14, smooth_window=3
    )
    enriched["Stoch_K"] = stochastic.stoch()
    enriched["Stoch_D"] = stochastic.stoch_signal()

    enriched["OBV"] = ta.volume.OnBalanceVolumeIndicator(
        close=close, volume=volume
    ).on_balance_volume()

    enriched["Daily_Return"] = close.pct_change()

    return enriched.dropna()


In [5]:
def build_llm_input(df: pd.DataFrame, lookback_days: int = LOOKBACK_DAYS) -> str:
    """Render the last ``lookback_days`` rows of ``df`` as text for the LLM.

    The result is a short performance summary (row count, net close change,
    annualized volatility estimate, average volume) followed by one line per
    row listing the close, indicators, volume and daily return.

    Args:
        df: Indicator-enriched price frame with a datetime-like index.
        lookback_days: Number of trailing rows to include.

    Raises:
        ValueError: If ``df`` is empty.
    """
    if df.empty:
        raise ValueError("Cannot build LLM input from empty DataFrame.")

    # (label shown in the text, source column, format spec), in output order.
    fields = (
        ("Close", "Close", ".2f"),
        ("SMA10", "SMA_10", ".2f"),
        ("SMA50", "SMA_50", ".2f"),
        ("SMA200", "SMA_200", ".2f"),
        ("EMA12", "EMA_12", ".2f"),
        ("EMA26", "EMA_26", ".2f"),
        ("RSI14", "RSI_14", ".1f"),
        ("MACD", "MACD", ".3f"),
        ("Signal", "MACD_Signal", ".3f"),
        ("Hist", "MACD_Hist", ".3f"),
        ("BBU", "BB_Upper", ".2f"),
        ("BBM", "BB_Middle", ".2f"),
        ("BBL", "BB_Lower", ".2f"),
        ("StochK", "Stoch_K", ".1f"),
        ("StochD", "Stoch_D", ".1f"),
        ("OBV", "OBV", ".0f"),
        ("Vol", "Volume", ".0f"),
        ("Ret", "Daily_Return", ".4f"),
    )

    window = df.tail(lookback_days)
    present = [column for _, column, _ in fields if column in window.columns]
    snapshot = window[present].copy().round(dict.fromkeys(present, 4))

    nan = float("nan")
    lines = []
    for stamp, row in snapshot.iterrows():
        # Columns absent from the frame are rendered as "nan".
        rendered = ", ".join(
            f"{label}={float(row.get(column, nan)):{spec}}"
            for label, column, spec in fields
        )
        lines.append(f"{stamp.date().isoformat()} | {rendered}")

    closes = window["Close"]
    price_change_pct = (closes.iloc[-1] / closes.iloc[0] - 1) * 100

    daily_returns = window["Daily_Return"].dropna()
    if daily_returns.empty:
        annual_vol = 0.0
    else:
        annual_vol = daily_returns.std() * np.sqrt(252) * 100

    if "Volume" in window.columns:
        avg_volume = window["Volume"].mean()
    else:
        avg_volume = float("nan")

    summary = "\n".join(
        [
            "Recent performance summary:",
            f"- Lookback window: {len(window)} trading days",
            f"- Net close change: {price_change_pct:.2f}%",
            f"- Annualized volatility (est.): {annual_vol:.2f}%",
            f"- Average volume: {avg_volume:,.0f}",
        ]
    )

    return f"{summary}\n\nDaily snapshots (most recent last):\n" + "\n".join(lines)

def build_llm_prompt(ticker: str, market_snapshot: str, lookback_days: int = LOOKBACK_DAYS) -> str:
    """Compose the instruction payload for the LLM.

    Args:
        ticker: Symbol named in the prompt.
        market_snapshot: Pre-rendered market context; may span many lines.
        lookback_days: Window size quoted in the prompt text.

    Returns:
        The prompt: role/data preamble, the snapshot, then the output
        instructions, separated by blank lines and with no template
        indentation.
    """
    # Dedent the static template *before* substituting values. Dedenting after
    # interpolation does nothing here: the snapshot's lines start at column 0,
    # so the common margin is empty and every template line would keep its
    # source-code indentation in the prompt.
    preamble = dedent(
        """
        You are an expert quantitative analyst supporting an automated daily stock trading system.
        Evaluate the provided market context for ticker {ticker}.

        Data characteristics:
        - Frequency: daily candles (one decision per trading day).
        - Horizon: predict the next trading day's closing price behavior only.
        - Goal: produce BUY, SELL, HOLD confidence scores that sum to ~1.

        Market context extracted from the last {lookback_days} trading days:
        """
    ).strip().format(ticker=ticker, lookback_days=lookback_days)

    instructions = dedent(
        """
        Instructions:
        - Analyze trends, momentum, volatility, and mean-reversion signals from the data.
        - Determine whether the next day's closing price is likely to rise sharply (BUY), fall sharply (SELL), or stay relatively neutral (HOLD).
        - Return a strict JSON object with keys: buy_confidence, sell_confidence, hold_confidence, next_day_view, explanation.
        - Confidence values must be floats between 0 and 1 and collectively sum to approximately 1.
        - Set next_day_view to BUY, SELL, or HOLD depending on the dominant signal.
        - Provide a concise explanation capturing your reasoning; this will be logged privately.
        - Do not include any additional text outside the JSON object.
        """
    ).strip()

    # The snapshot is inserted verbatim so its own layout is preserved.
    return f"{preamble}\n\n{market_snapshot}\n\n{instructions}"


In [6]:
@lru_cache(maxsize=1)
def get_llm_client() -> ChatOpenAI:
    """Build the ChatOpenAI client pointed at OpenRouter; cached after the first success.

    Raises:
        EnvironmentError: If ``OPENROUTER_API_KEY`` is unset or empty.
    """
    if OPENROUTER_API_KEY:
        settings = {
            "model": MODEL_NAME,
            "openai_api_key": OPENROUTER_API_KEY,
            "openai_api_base": OPENROUTER_BASE_URL,
            "temperature": LLM_TEMPERATURE,
            "max_tokens": MAX_TOKENS,
            "timeout": 90,
        }
        return ChatOpenAI(**settings)

    raise EnvironmentError("OPENROUTER_API_KEY environment variable is not set.")

def invoke_llm(prompt: str) -> str:
    """Send ``prompt`` to the cached LLM client and return the reply's ``content``.

    A fixed system message asking for strict JSON precedes the prompt.
    """
    llm = get_llm_client()

    system_text = (
        "You are a disciplined trading assistant. "
        "Follow instructions exactly and respond with strict JSON."
    )
    conversation = [
        SystemMessage(content=system_text),
        HumanMessage(content=prompt),
    ]

    reply = llm.invoke(conversation)
    return reply.content

def _strip_json_fences(text: str) -> str:
    """Strip a leading ``` / ```json fence and a trailing ``` fence from ``text``."""
    cleaned = text.strip()
    if cleaned.startswith("```"):
        cleaned = re.sub(r"^```(?:json)?", "", cleaned, flags=re.IGNORECASE).strip()
        cleaned = re.sub(r"```$", "", cleaned).strip()

    return cleaned

def parse_llm_decision(raw_text: str) -> Dict[str, Any]:
    """Parse the LLM JSON payload and enforce the expected structure.

    Args:
        raw_text: Raw model reply, optionally wrapped in a Markdown code fence
            or surrounded by extra text.

    Returns:
        A dict with ``buy_confidence``, ``sell_confidence`` and
        ``hold_confidence`` (each clamped to [0, 1], then rescaled to sum
        to 1), ``next_day_view`` (``BUY``/``SELL``/``HOLD``; anything else
        becomes ``HOLD``) and ``explanation`` (stripped string).

    Raises:
        ValueError: If no JSON object can be decoded, the JSON is not an
            object, a required key is missing, a confidence is non-numeric or
            NaN, or the clamped confidences sum to zero.
    """
    invalid_json = "LLM response is not valid JSON."
    cleaned = _strip_json_fences(raw_text)

    try:
        payload = json.loads(cleaned)
    except json.JSONDecodeError:
        # Fall back to the outermost {...} span when extra text surrounds the JSON.
        match = re.search(r"\{.*\}", cleaned, flags=re.DOTALL)
        if not match:
            raise ValueError(invalid_json) from None
        try:
            payload = json.loads(match.group())
        except json.JSONDecodeError:
            # Report the same error as above instead of a raw decoder message.
            raise ValueError(invalid_json) from None

    # Valid JSON that is not an object (list, number, string) has no keys to check.
    if not isinstance(payload, dict):
        raise ValueError("LLM response JSON must be an object.")

    confidence_keys = ("buy_confidence", "sell_confidence", "hold_confidence")
    required_keys = {*confidence_keys, "next_day_view", "explanation"}

    missing = required_keys - payload.keys()
    if missing:
        raise ValueError(f"LLM response missing keys: {sorted(missing)}")

    confidences = {}
    for key in confidence_keys:
        try:
            value = float(payload[key])
        except (TypeError, ValueError):
            raise ValueError(f"Confidence value for {key} is not numeric.") from None

        # json.loads accepts the literal NaN, and min(1.0, nan) evaluates to 1.0,
        # so without this check a NaN would silently become full confidence.
        if np.isnan(value):
            raise ValueError(f"Confidence value for {key} is not numeric.")

        confidences[key] = max(0.0, min(1.0, value))

    total = sum(confidences.values())
    if total <= 0:
        raise ValueError("Confidence scores sum to zero.")

    decision: Dict[str, Any] = {key: score / total for key, score in confidences.items()}
    decision["next_day_view"] = str(payload["next_day_view"]).upper().strip()
    decision["explanation"] = str(payload["explanation"]).strip()

    if decision["next_day_view"] not in {"BUY", "SELL", "HOLD"}:
        decision["next_day_view"] = "HOLD"

    return decision


In [7]:
# Send root-logger records at INFO and above to the decision log file.
# force=True makes this cell safe to re-run: the root logger is reconfigured
# even if it already has handlers.
logging.basicConfig(
    force=True,
    level=logging.INFO,
    filename=str(LOG_FILE),
    format="%(asctime)s | %(levelname)s | %(message)s",
)

def log_llm_decision(ticker: str, last_date: pd.Timestamp, payload: Dict[str, Any]) -> None:
    """Write one JSON line describing the decision to the root logger at INFO.

    Args:
        ticker: Symbol the decision refers to.
        last_date: Date of the latest data row used; logged as ``YYYY-MM-DD``.
        payload: Decision dict providing the three confidence scores,
            ``next_day_view`` and ``explanation``.
    """
    entry = {
        "ticker": ticker,
        "last_data_date": pd.Timestamp(last_date).strftime("%Y-%m-%d"),
    }
    # Copy the decision fields in a fixed order so log lines stay uniform.
    for field in (
        "buy_confidence",
        "sell_confidence",
        "hold_confidence",
        "next_day_view",
        "explanation",
    ):
        entry[field] = payload[field]

    logging.info(json.dumps(entry, ensure_ascii=False))


In [8]:
def get_llm_decision_vector(
    ticker: str = DEFAULT_TICKER,
    period: str = DEFAULT_PERIOD,
    lookback_days: int = LOOKBACK_DAYS,
) -> tuple[np.ndarray, Dict[str, Any]]:
    """Run fetch -> indicators -> prompt -> LLM -> parse -> log for one ticker.

    Returns:
        ``(vector, payload)``: ``vector`` is a float array ordered
        ``[buy, sell, hold]`` and ``payload`` is the parsed decision dict.
    """
    enriched = compute_indicators(fetch_stock_data(ticker=ticker, period=period))

    prompt = build_llm_prompt(
        ticker=ticker,
        market_snapshot=build_llm_input(enriched, lookback_days=lookback_days),
        lookback_days=lookback_days,
    )

    decision = parse_llm_decision(invoke_llm(prompt))

    # Log against the most recent row that fed the prompt.
    log_llm_decision(ticker, enriched.index[-1], decision)

    scores = [
        decision[key]
        for key in ("buy_confidence", "sell_confidence", "hold_confidence")
    ]
    return np.array(scores, dtype=float), decision

In [9]:
# One live LLM call for the default ticker; prints the vector, the view and the reasoning.
vector, payload = get_llm_decision_vector(DEFAULT_TICKER)

print(
    f"LLM decision vector for {DEFAULT_TICKER}: {vector}",
    f"Decision for {DEFAULT_TICKER}: {payload.get('next_day_view', 'HOLD')}",
    f"Reasoning: {payload.get('explanation', '')}",
    sep="\n",
)

LLM decision vector for NVDA: [0.45 0.25 0.3 ]
Decision for NVDA: BUY
Reasoning: Recent uptrend with price above key SMAs, strong momentum from MACD positive histogram and rising RSI (51.0), high volatility supports potential sharp moves, and oversold recovery signals from recent lows suggest bullish bias for next session.


In [11]:
import time

# ---------------------------------------------------------
# DATASET GENERATION CELL
# ---------------------------------------------------------

def _label_next_day_return(next_return, threshold):
    """Classify a next-day return as BUY (> threshold), SELL (< -threshold) or HOLD."""
    if next_return > threshold:
        return "BUY"
    if next_return < -threshold:
        return "SELL"
    return "HOLD"


def generate_training_dataset(
    ticker=DEFAULT_TICKER,
    days=30,
    threshold=0.005,
    period="2y",
    output_path="training_data.csv",
    request_delay=0.5,
):
    """Replay the LLM over recent history and save predictions next to the realized outcome.

    For each of the last ``days`` rows that have a following row, the LLM is
    shown only data up to and including that row, and its answer is stored
    together with the next row's ``Daily_Return`` and the label derived from it.

    Args:
        ticker: Symbol to evaluate.
        days: Number of decision points to replay.
        threshold: Absolute next-day return above which the ground truth is
            BUY (positive) or SELL (negative); otherwise HOLD.
        period: History length requested from ``fetch_stock_data``.
        output_path: CSV file the collected rows are written to.
        request_delay: Seconds to sleep after each decision point.

    Returns:
        None. Progress and errors are printed; rows are written to
        ``output_path`` when at least one decision point succeeded.
    """
    print(f"Generating training data for {ticker} over the last {days} days...")

    # A non-positive count has nothing to replay (and a negative one would
    # index past the end of the history below).
    if days <= 0:
        print("No data generated.")
        return

    # Fetch the full history once; indicators are computed on the whole frame.
    full_data = compute_indicators(fetch_stock_data(ticker, period=period))

    valid_dates = full_data.index
    # Require a 20-row margin on top of the replay window.
    if len(valid_dates) < days + 20:
        print("Not enough data.")
        return

    # Last `days` positions that still have a following row for the ground truth.
    last_row = len(valid_dates) - 1
    target_indices = range(last_row - days, last_row)

    dataset = []
    for count, idx in enumerate(target_indices, start=1):
        current_date = valid_dates[idx]
        next_date = valid_dates[idx + 1]

        # Restrict the view to rows up to and including the decision date.
        past_data = full_data.loc[:current_date]

        try:
            market_snapshot = build_llm_input(past_data, lookback_days=LOOKBACK_DAYS)
            prompt = build_llm_prompt(ticker, market_snapshot, lookback_days=LOOKBACK_DAYS)
            decision = parse_llm_decision(invoke_llm(prompt))

            # Daily_Return on next_date is the change from current_date to next_date.
            next_return = full_data.loc[next_date, "Daily_Return"]
            ground_truth = _label_next_day_return(next_return, threshold)

            print(f"[{count}/{days}] {current_date.date()} -> Prediction: {decision['next_day_view']:4} | Truth: {ground_truth:4} (Ret: {next_return:.2%})")

            dataset.append({
                "Date": current_date.date(),
                "Buy_Conf": decision["buy_confidence"],
                "Sell_Conf": decision["sell_confidence"],
                "Hold_Conf": decision["hold_confidence"],
                "LLM_View": decision["next_day_view"],
                "Ground_Truth": ground_truth,
                "Next_Return": next_return
            })

        except Exception as e:
            # Best effort: report the failing date and continue with the next one.
            print(f"Error on {current_date.date()}: {e}")

        # Polite delay to avoid aggressive rate limits
        time.sleep(request_delay)

    if dataset:
        df_out = pd.DataFrame(dataset)
        df_out.to_csv(output_path, index=False)
        print(f"\nSuccessfully saved {len(df_out)} rows to {output_path}")
    else:
        print("No data generated.")

# Runs on every execution of this cell: one LLM request per replayed day (30 here). Comment out to skip.
generate_training_dataset(days=30)


Generating training data for NVDA over the last 30 days...
[1/30] 2025-10-24 -> Prediction: BUY  | Truth: BUY  (Ret: 2.81%)
[2/30] 2025-10-27 -> Prediction: BUY  | Truth: BUY  (Ret: 4.98%)
[3/30] 2025-10-28 -> Prediction: BUY  | Truth: BUY  (Ret: 2.99%)
[4/30] 2025-10-29 -> Prediction: BUY  | Truth: SELL (Ret: -2.00%)
[5/30] 2025-10-30 -> Prediction: BUY  | Truth: HOLD (Ret: -0.20%)
[6/30] 2025-10-31 -> Prediction: BUY  | Truth: BUY  (Ret: 2.17%)
[7/30] 2025-11-03 -> Prediction: BUY  | Truth: SELL (Ret: -3.96%)
[8/30] 2025-11-04 -> Prediction: BUY  | Truth: SELL (Ret: -1.75%)
[9/30] 2025-11-05 -> Prediction: BUY  | Truth: SELL (Ret: -3.65%)
[10/30] 2025-11-06 -> Prediction: SELL | Truth: HOLD (Ret: 0.04%)
[11/30] 2025-11-07 -> Prediction: HOLD | Truth: BUY  (Ret: 5.79%)
[12/30] 2025-11-10 -> Prediction: BUY  | Truth: SELL (Ret: -2.96%)
[13/30] 2025-11-11 -> Prediction: BUY  | Truth: HOLD (Ret: 0.33%)
[14/30] 2025-11-12 -> Prediction: HOLD | Truth: SELL (Ret: -3.58%)
[15/30] 2025-11-13 