# AI for Market Trend Analysis — Run in Jupyter Notebook
This notebook reproduces the same pipeline as `market_trend_analysis.py` (feature engineering → model training → evaluation → save model).  
It also shows how to **load the saved model (`rf_trend_model.joblib`)** and generate predictions.

**Files used (keep in the same folder as this notebook):**
- `spx_daily.csv`
- `requirements.txt`
- `market_trend_analysis.py` (reference)
- (optional) `spx_trend_features_2005_2025.csv` and `rf_trend_model.joblib`

## 1) Install dependencies (run once)
If you are using Jupyter inside a virtual environment, run the cell below **once**.

In [1]:
# If you are running this inside Jupyter, you can install requirements like this:
# (If pip doesn't work, see the "Steps to run" section in the chat message.)
!pip install -r requirements.txt



## 2) Imports

In [2]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import TimeSeriesSplit

import joblib

## 3) Load raw dataset

In [3]:
RAW_CSV = "spx_daily.csv"
df = pd.read_csv(RAW_CSV)
df["Date"] = pd.to_datetime(df["Date"])
df = df.sort_values("Date").reset_index(drop=True)

df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,1789-05-01,0.51,0.51,0.51,0.51,0.0
1,1789-06-01,0.51,0.51,0.51,0.51,0.0
2,1789-07-01,0.5,0.5,0.5,0.5,0.0
3,1789-08-01,0.5,0.51,0.5,0.51,0.0
4,1789-09-01,0.51,0.51,0.5,0.51,0.0


## 4) Feature engineering (same logic as the script)

In [4]:
def add_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.sort_values("Date").copy()

    # Returns
    df["ret_1"] = df["Close"].pct_change(1)
    df["ret_5"] = df["Close"].pct_change(5)
    df["ret_10"] = df["Close"].pct_change(10)

    # Volatility
    df["vol_10"] = df["ret_1"].rolling(10).std()
    df["vol_20"] = df["ret_1"].rolling(20).std()

    # Moving averages
    df["sma_10"] = df["Close"].rolling(10).mean()
    df["sma_20"] = df["Close"].rolling(20).mean()
    df["sma_50"] = df["Close"].rolling(50).mean()
    df["sma_200"] = df["Close"].rolling(200).mean()

    # Normalized distance of close from SMA
    for n in [10, 20, 50, 200]:
        df[f"close_sma{n}"] = (df["Close"] - df[f"sma_{n}"]) / df[f"sma_{n}"]

    # RSI(14)
    delta = df["Close"].diff()
    gain = delta.clip(lower=0).rolling(14).mean()
    loss = (-delta.clip(upper=0)).rolling(14).mean()
    rs = gain / (loss + 1e-9)
    df["rsi_14"] = 100 - (100 / (1 + rs))

    # MACD (12-26 EMA) + signal (9 EMA)
    ema12 = df["Close"].ewm(span=12, adjust=False).mean()
    ema26 = df["Close"].ewm(span=26, adjust=False).mean()
    df["macd"] = ema12 - ema26
    df["macd_signal"] = df["macd"].ewm(span=9, adjust=False).mean()
    df["macd_hist"] = df["macd"] - df["macd_signal"]

    # Volume change
    df["vol_chg_5"] = df["Volume"].replace(0, np.nan).pct_change(5)

    return df

def make_dataset(df: pd.DataFrame, horizon_days: int = 5):
    df = df.copy()

    # Trend regime label: bullish if SMA20 > SMA50
    df["trend_now"] = (df["sma_20"] > df["sma_50"]).astype(int)
    df["target"] = df["trend_now"].shift(-horizon_days)

    features = [
        "ret_1", "ret_5", "ret_10",
        "vol_10", "vol_20",
        "close_sma10", "close_sma20", "close_sma50", "close_sma200",
        "rsi_14",
        "macd", "macd_signal", "macd_hist",
        "vol_chg_5",
    ]

    df = df.dropna().copy()
    return df, features

## 5) Build processed dataset + save `spx_trend_features_2005_2025.csv`

In [5]:
PROCESSED_CSV = "spx_trend_features_2005_2025.csv"

df2 = df[df["Date"] >= "2005-01-01"].copy()
df2 = add_features(df2)
df2, features = make_dataset(df2, horizon_days=5)

keep_cols = ["Date", "Open", "High", "Low", "Close", "Volume"] + features + ["target"]
df2[keep_cols].to_csv(PROCESSED_CSV, index=False)

print("Saved:", PROCESSED_CSV, "rows=", len(df2))
df2.head()

Saved: spx_trend_features_2005_2025.csv rows= 5081


Unnamed: 0,Date,Open,High,Low,Close,Volume,ret_1,ret_5,ret_10,vol_10,...,close_sma20,close_sma50,close_sma200,rsi_14,macd,macd_signal,macd_hist,vol_chg_5,trend_now,target
34521,2005-10-17,1186.57,1191.21,1184.48,1190.1,1141428000.0,0.002975,0.002333,-0.029836,0.006924,...,-0.012138,-0.022007,-0.00755,35.344037,-10.696988,-8.296368,-2.400619,-0.064399,0,0.0
34522,2005-10-18,1190.1,1190.1,1178.13,1178.14,1220561000.0,-0.01005,-0.00568,-0.029914,0.006933,...,-0.020309,-0.031119,-0.017425,30.21546,-11.179742,-8.873043,-2.306699,-0.044379,0,0.0
34523,2005-10-19,1178.14,1195.76,1170.55,1195.76,1501994000.0,0.014956,0.015352,-0.000527,0.00764,...,-0.00506,-0.016052,-0.002762,34.765178,-10.02498,-9.103431,-0.92155,0.085221,0,0.0
34524,2005-10-20,1195.76,1197.3,1173.3,1177.8,1454028000.0,-0.01502,0.000816,-0.01149,0.008954,...,-0.0185,-0.030011,-0.017716,29.023768,-10.438714,-9.370487,-1.068227,0.113179,0,0.0
34525,2005-10-21,1177.8,1186.46,1174.92,1179.59,1372733000.0,0.00152,-0.005883,-0.013638,0.00885,...,-0.015544,-0.027604,-0.016189,30.5764,-10.501113,-9.596612,-0.904501,0.12882,0,0.0


## 6) TimeSeriesSplit Cross-Validation (optional but recommended)

In [6]:
X = df2[features].astype(float)
y = df2["target"].astype(int)

tscv = TimeSeriesSplit(n_splits=5)
probs, preds, trues = [], [], []

cv_model = RandomForestClassifier(
    n_estimators=250,
    max_depth=8,
    min_samples_leaf=10,
    random_state=42,
    class_weight="balanced_subsample",
    n_jobs=-1
)

for tr_idx, te_idx in tscv.split(X):
    cv_model.fit(X.iloc[tr_idx], y.iloc[tr_idx])
    p = cv_model.predict_proba(X.iloc[te_idx])[:, 1]
    pr = (p >= 0.5).astype(int)
    probs.append(p)
    preds.append(pr)
    trues.append(y.iloc[te_idx].to_numpy())

probs = np.concatenate(probs)
preds = np.concatenate(preds)
trues = np.concatenate(trues)

cv_metrics = {
    "ROC_AUC": float(roc_auc_score(trues, probs)),
    "Accuracy": float(accuracy_score(trues, preds)),
    "F1": float(f1_score(trues, preds)),
}
cv_metrics

{'ROC_AUC': 0.990054832667142,
 'Accuracy': 0.9513002364066194,
 'F1': 0.9657920956492859}

## 7) Train/Test split (chronological) + evaluation on test set

In [7]:
split_date = pd.Timestamp("2023-01-01")
train_mask = df2["Date"] < split_date

X_train, y_train = X[train_mask], y[train_mask]
X_test, y_test = X[~train_mask], y[~train_mask]

final_model = RandomForestClassifier(
    n_estimators=400,
    max_depth=8,
    min_samples_leaf=10,
    random_state=42,
    class_weight="balanced_subsample",
    n_jobs=-1
)
final_model.fit(X_train, y_train)

p = final_model.predict_proba(X_test)[:, 1]
pred = (p >= 0.5).astype(int)

test_metrics = {
    "ROC_AUC": float(roc_auc_score(y_test, p)),
    "Accuracy": float(accuracy_score(y_test, pred)),
    "F1": float(f1_score(y_test, pred)),
    "Precision": float(precision_score(y_test, pred)),
    "Recall": float(recall_score(y_test, pred)),
    "ConfusionMatrix": confusion_matrix(y_test, pred).tolist()
}
test_metrics

{'ROC_AUC': 0.9956557638537034,
 'Accuracy': 0.9666221628838452,
 'F1': 0.9777777777777777,
 'Precision': 0.9700176366843033,
 'Recall': 0.985663082437276,
 'ConfusionMatrix': [[174, 17], [8, 550]]}

## 8) Save model to `rf_trend_model.joblib`

In [8]:
MODEL_OUT = "rf_trend_model.joblib"
joblib.dump({"model": final_model, "features": features}, MODEL_OUT)
print("Saved:", MODEL_OUT)

Saved: rf_trend_model.joblib


## 9) Load saved model + generate predictions

In [9]:
bundle = joblib.load("rf_trend_model.joblib")
model = bundle["model"]
feat = bundle["features"]

# Predict on the last 10 rows
last = df2.tail(10).copy()
X_last = last[feat].astype(float)

probs_last = model.predict_proba(X_last)[:, 1]
preds_last = (probs_last >= 0.5).astype(int)

out = last[["Date","Close"]].copy()
out["bullish_prob_5d"] = probs_last
out["predicted_trend_5d"] = preds_last
out

Unnamed: 0,Date,Close,bullish_prob_5d,predicted_trend_5d
39592,2025-12-12,6827.41,0.965961,1
39593,2025-12-15,6816.51,0.953843,1
39594,2025-12-16,6800.26,0.98881,1
39595,2025-12-17,6721.43,0.946751,1
39596,2025-12-18,6774.76,0.973052,1
39597,2025-12-19,6834.5,0.993494,1
39598,2025-12-22,6878.49,0.996545,1
39599,2025-12-23,6909.79,0.999416,1
39600,2025-12-24,6932.05,0.983955,1
39601,2025-12-26,6929.94,0.980313,1


## 10) Next step (UI)
To run the Streamlit UI (`app.py`), open a **terminal** in the same folder and run:

```bash
pip install streamlit
python -m streamlit run app.py
```

The UI loads `rf_trend_model.joblib` and predicts using `spx_trend_features_2005_2025.csv`.

## 11) Launch the Streamlit UI (100% beginner-safe, with diagnostics)

If the UI is not launching, it is almost always one of these:
- Jupyter is running in the wrong folder (so `app.py` / model / csv not found)
- Streamlit is not installed in the current Python kernel
- Port 8501 is already used
- Streamlit started but you did not open the URL in the browser

The cells below:
1) Show your current folder and files
2) Validate required files
3) Test Streamlit installation using `streamlit hello`
4) Launch your app on a free port and **auto-open the browser**


In [10]:
import os, sys, socket, textwrap
from pathlib import Path

print("Python:", sys.executable)
print("CWD:", os.getcwd())
print("\nFiles in CWD (first 50):")
for i, f in enumerate(sorted(os.listdir())):
    if i>=50: 
        print("..."); break
    print(" -", f)

required = ["app.py", "rf_trend_model.joblib", "spx_trend_features_2005_2025.csv"]
missing = [f for f in required if not Path(f).exists()]
print("\nRequired files:", required)
print("Missing files:", missing)

if missing:
    print("\n❌ Fix: Move ALL project files + this notebook into ONE folder, then reopen the notebook from that folder.")
    print("   OR run: os.chdir(r'YOUR_PROJECT_FOLDER_PATH')  (then re-run this cell)")


Python: C:\Anaconda\python.exe
CWD: C:\Users\Manish Kumar Singh\Desktop\AI_Market_Trend_Analysis

Files in CWD (first 50):
 - .ipynb_checkpoints
 - AI_Market_Trend_Analysis_Report_Style.docx
 - AI_Market_Trend_Analysis_Report_Style.pdf
 - AI_Market_Trend_Analysis_Slides.pdf
 - AI_Market_Trend_Analysis_Slides_VISUAL.pdf
 - AI_for_Market_Trend_Analysis.pptx
 - Market_Trend_Analysis_Run_in_Jupyter.ipynb
 - ModuleE_Submission (4).pdf
 - Project Submission Guidelines (1).pdf
 - To speak.docx
 - app.py
 - market_trend_analysis.py
 - requirements.txt
 - rf_trend_model.joblib
 - spx_daily.csv
 - spx_trend_features_2005_2025.csv

Required files: ['app.py', 'rf_trend_model.joblib', 'spx_trend_features_2005_2025.csv']
Missing files: []


In [11]:
import sys, socket, subprocess, time, webbrowser
from pathlib import Path

def port_free(port: int) -> bool:
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        return s.connect_ex(("127.0.0.1", port)) != 0

# Ensure streamlit installed
try:
    import streamlit  # noqa
    print("✅ streamlit version:", streamlit.__version__)
except Exception as e:
    print("streamlit not found in this kernel. Installing now...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "streamlit"])
    import streamlit  # noqa
    print("✅ streamlit version:", streamlit.__version__)

# Quick sanity test (should open a demo page)
print("\nRunning 'streamlit hello' test (10 seconds)...")
test_proc = subprocess.Popen([sys.executable, "-m", "streamlit", "hello", "--server.headless", "true", "--server.port", "8509"],
                             stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
time.sleep(2)
print("✅ If Streamlit works, you can open: http://localhost:8509 (optional)")
# Stop test to free resources
test_proc.terminate()

# Choose port for your app
port = 8501 if port_free(8501) else (8502 if port_free(8502) else 8503)
url = f"http://localhost:{port}"
print("\nLaunching your app on:", url)

# Validate files before launch
for f in ["app.py"]:
    if not Path(f).exists():
        raise FileNotFoundError(f"'{f}' not found in current folder. Please fix CWD and try again.")

# Start streamlit (keep it running)
cmd = [sys.executable, "-m", "streamlit", "run", "app.py",
       "--server.headless", "true",
       "--server.port", str(port),
       "--browser.gatherUsageStats", "false"]

# Launch in a way that doesn't block notebook, and auto-open browser
proc = subprocess.Popen(cmd)
time.sleep(2)
webbrowser.open(url)

print("\n✅ UI should open in your browser now.")
print("If it doesn't open automatically, copy-paste this URL into Chrome:")
print(url)
print("\nTo STOP the UI later, run in a new cell:")
print("proc.terminate()")


✅ streamlit version: 1.51.0

Running 'streamlit hello' test (10 seconds)...
✅ If Streamlit works, you can open: http://localhost:8509 (optional)

Launching your app on: http://localhost:8501

✅ UI should open in your browser now.
If it doesn't open automatically, copy-paste this URL into Chrome:
http://localhost:8501

To STOP the UI later, run in a new cell:
proc.terminate()
