In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import joblib

In [None]:
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestRegressor
import os

# Input folder holding the preprocessed per-ticker CSVs, and output folder
# where the fitted models are written.
DATA_DIR = "data/processed/"
MODEL_DIR = "models/"

# exist_ok=True creates the folder when it is missing and does nothing when it
# already exists, avoiding the check-then-create race of a separate
# os.path.exists() test.
os.makedirs(MODEL_DIR, exist_ok=True)

# Every CSV in the processed folder is treated as one dataset to train on.
# os.listdir() returns entries in arbitrary order, so sort them to get a
# reproducible training order and log output.
files = sorted(f for f in os.listdir(DATA_DIR) if f.endswith(".csv"))

# Columns fed to the model as input features.
FEATURES = ["Close", "High", "Low", "Open", "Volume", "daily_return", "vol_20", "sentiment"]

# Fitted models, keyed by the ticker derived from each file name.
models = {}

# Train and persist one random-forest regressor per CSV file.
for file in files:
    # Strip only the final extension so dotted tickers (e.g. "BRK.B.csv")
    # keep their full symbol; splitting on the first "." would collapse them
    # and let different tickers overwrite each other's saved model.
    ticker = os.path.splitext(file)[0]
    df = pd.read_csv(os.path.join(DATA_DIR, file))

    # Target is the Close value of the following row. This assumes the rows
    # are already in chronological order -- TODO confirm against the
    # preprocessing step.
    df['target'] = df['Close'].shift(-1)
    # Drop only rows that lack a value the model actually consumes (the last
    # row always lacks a target); NaNs in unrelated columns should not
    # discard otherwise usable training rows.
    df.dropna(subset=[*FEATURES, 'target'], inplace=True)

    X = df[FEATURES]
    y = df['target']

    # Positional split that preserves row order: first 80% train, last 20%
    # held out as the test portion.
    split = int(len(df) * 0.8)
    if split == 0:
        # An empty training slice would make fit() raise and abort the whole
        # loop; skip this file and carry on with the remaining ones.
        print(f"Skipping {ticker}: not enough rows to train a model")
        continue
    X_train, X_test = X.iloc[:split], X.iloc[split:]
    y_train, y_test = y.iloc[:split], y.iloc[split:]

    # Fixed random_state keeps the fitted forest reproducible between runs.
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    models[ticker] = model

    # Persist the fitted model as <MODEL_DIR>/<ticker>_rf.pkl.
    joblib.dump(model, os.path.join(MODEL_DIR, f"{ticker}_rf.pkl"))
    print(f"Trained and saved model for {ticker}")