In [274]:
import yfinance as yf
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, classification_report

In [275]:
def fetch_stock_data(ticker, start_date="1990-01-02 00:00:00-05:00", end_date=None):
    """Download price history for ``ticker`` through yfinance.

    Parameters
    ----------
    ticker : str
        Symbol handed to ``yf.Ticker`` (e.g. ``"^GSPC"``).
    start_date, end_date : str, datetime-like or None
        Bounds forwarded to ``Ticker.history``. String values are first
        reduced to ``YYYY-MM-DD``; any time-of-day or UTC offset in the
        string is discarded. Non-string values (including ``None``) are
        forwarded untouched.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``Ticker.history`` without its
        ``"Dividends"`` and ``"Stock Splits"`` columns.

    Raises
    ------
    KeyError
        If either of those two columns is missing from the download.
    """

    def _as_day_string(bound):
        # yfinance documents string bounds as "YYYY-MM-DD". The default above
        # carries a time and an offset, so normalise strings before the call.
        if isinstance(bound, str):
            return pd.Timestamp(bound).strftime("%Y-%m-%d")
        return bound

    history = yf.Ticker(ticker).history(
        start=_as_day_string(start_date),
        end=_as_day_string(end_date),
    )
    # Non-mutating drop: returns a new frame instead of editing in place.
    return history.drop(columns=["Dividends", "Stock Splits"])

In [276]:
def create_target_column(data, horizon=1):
    """Add ``Tomorrow`` and ``Target`` columns to ``data`` in place.

    ``Tomorrow`` is the ``Close`` value ``horizon`` rows ahead. ``Target`` is
    1 where that future close is strictly greater than the row's own close,
    otherwise 0. The final ``horizon`` rows have no future close, so their
    ``Tomorrow`` is NaN and their ``Target`` is 0 (a comparison with NaN is
    False).

    Returns the same DataFrame object that was passed in.
    """
    future_close = data["Close"].shift(-horizon)
    data["Tomorrow"] = future_close
    data["Target"] = future_close.gt(data["Close"]).astype(int)
    return data

In [277]:
def train_random_forest_model(train_data, predictors, target):
    """Fit a ``RandomForestClassifier`` on selected columns of ``train_data``.

    ``predictors`` names the feature columns and ``target`` the label column.
    The forest is built with 500 estimators, ``min_samples_split=50`` and a
    fixed ``random_state`` of 1, then fitted and returned.
    """
    features = train_data[predictors]
    labels = train_data[target]

    forest = RandomForestClassifier(
        random_state=1,
        min_samples_split=50,
        n_estimators=500,
    )
    forest.fit(features, labels)
    return forest

In [278]:
def generate_additional_ticker_features(sp500, ticker_data, horizon, ticker):
    """Add one ticker's up/down predictions to the S&P 500 frame as a feature.

    Parameters
    ----------
    sp500 : pandas.DataFrame
        Frame that receives the new ``Prediction_<ticker>`` column. It is not
        modified; an extended copy is returned.
    ticker_data : pandas.DataFrame
        Price history for ``ticker``. Must hold the ``Close``, ``Volume``,
        ``Open``, ``High`` and ``Low`` columns. ``create_target_column`` adds
        ``Tomorrow``/``Target`` to this frame in place.
    horizon : int
        Look-ahead, in rows, used to build the ticker's ``Target``.
    ticker : str
        Label used in the new column's name.

    Returns
    -------
    pandas.DataFrame
        ``sp500`` plus ``Prediction_<ticker>`` (aligned on the index), with
        every column forward-filled.
    """
    ticker_predictors = ["Close", "Volume", "Open", "High", "Low"]

    # Forward the caller's horizon; it used to be hard-coded to 1 here.
    ticker_data = create_target_column(ticker_data, horizon=horizon)

    # Train model for the additional ticker.
    ticker_model = train_random_forest_model(ticker_data, ticker_predictors, "Target")

    # NOTE(review): predict() fits the model again, and both the training and
    # the scoring frame are `ticker_data`, so these are in-sample predictions
    # from a model that has already seen each row's label. Treat the resulting
    # feature as look-ahead-biased until it is produced walk-forward.
    ticker_predictions = predict(ticker_data, ticker_data, ticker_predictors, ticker_model)

    # assign() aligns the prediction series on the index and returns a new
    # frame, so the caller's `sp500` object is left as it was.
    feature_column = f"Prediction_{ticker}"
    enriched = sp500.assign(**{feature_column: ticker_predictions["Predictions"]})

    # Dates missing from the ticker's index carry the last prediction forward.
    return enriched.ffill()

In [279]:
def predict(train, test, predictors, model, threshold=0.6):
    """Fit ``model`` on ``train`` and return thresholded predictions for ``test``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames holding the ``predictors`` columns and a ``"Target"`` column.
    predictors : list of str
        Feature column names.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict_proba(X)``. It is re-fitted
        here, so any earlier fit on the same object is replaced.
    threshold : float, default 0.6
        Minimum probability in column 1 of ``predict_proba`` for a row to be
        predicted as 1.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``test``, with the actual ``"Target"`` and a float
        ``"Predictions"`` column of 0.0/1.0.
    """
    model.fit(train[predictors], train["Target"])

    # Column 1 of predict_proba is the second class the model was fitted on.
    positive_proba = model.predict_proba(test[predictors])[:, 1]

    # One comparison replaces the two in-place masked assignments; the cast
    # keeps the float 0.0/1.0 values the original produced.
    preds = pd.Series(
        (positive_proba >= threshold).astype(float),
        index=test.index,
        name="Predictions",
    )
    return pd.concat([test["Target"], preds], axis=1)

In [280]:
def generate_features(data, horizons):
    """Add rolling ratio and trend features to ``data`` for each horizon.

    For every ``h`` in ``horizons`` two columns are written onto ``data``
    (the input frame is modified):

    * ``Close_Ratio_<h>``: ``Close`` divided by its ``h``-row rolling mean.
    * ``Trend_<h>``: sum of ``Target`` over the ``h`` rows before the current
      one. The one-row shift keeps a row's own label out of its feature.

    Parameters
    ----------
    data : pandas.DataFrame
        Must hold ``"Close"`` and ``"Target"`` columns.
    horizons : iterable of int
        Rolling window lengths.

    Returns
    -------
    (pandas.DataFrame, list of str)
        A copy of ``data`` without rows that have a missing value in any
        column other than ``"Target"``, and the new column names.
    """
    new_predictors = []

    # Shift only the label column, once. The previous code shifted and summed
    # every column per horizon just to read "Target" back out, which is
    # wasteful and fails on non-numeric columns with current pandas.
    prior_targets = data["Target"].shift(1)

    for horizon in horizons:
        ratio_column = f"Close_Ratio_{horizon}"
        data[ratio_column] = data["Close"] / data["Close"].rolling(horizon).mean()

        trend_column = f"Trend_{horizon}"
        data[trend_column] = prior_targets.rolling(horizon).sum()

        new_predictors += [ratio_column, trend_column]

    # Rolling windows leave NaN in the leading rows; drop rows with a gap in
    # any column except "Target".
    data = data.dropna(subset=data.columns[data.columns != "Target"])
    return data, new_predictors

In [281]:
def backtest(data, model, predictors, start=2500, step=250):
    """Walk forward through ``data``, predicting ``step`` rows at a time.

    Each fold trains on all rows before position ``i`` and scores rows
    ``i`` to ``i + step`` via ``predict``, for ``i = start, start + step, ...``.
    ``predict`` re-fits ``model`` on every fold.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows in the order they should be walked; needs the ``predictors``
        columns and ``"Target"``.
    model : estimator
        Passed through to ``predict``.
    predictors : list of str
        Feature column names.
    start : int, default 2500
        Number of leading rows reserved for the first training window.
    step : int, default 250
        Number of rows scored per fold.

    Returns
    -------
    pandas.DataFrame
        The per-fold frames returned by ``predict``, concatenated in order.

    Raises
    ------
    ValueError
        If no fold can be formed (for example ``data`` has no more than
        ``start`` rows), or if ``step`` is 0.
    """
    n_rows = data.shape[0]
    fold_starts = range(start, n_rows, step)

    # Fail with a clear message instead of pd.concat's
    # "No objects to concatenate" when there is nothing to score.
    if len(fold_starts) == 0:
        raise ValueError(
            f"backtest cannot form a test window: data has {n_rows} rows "
            f"but start={start}, step={step}"
        )

    all_predictions = []
    for i in fold_starts:
        train = data.iloc[0:i].copy()
        test = data.iloc[i:(i + step)].copy()
        all_predictions.append(predict(train, test, predictors, model))

    return pd.concat(all_predictions)

In [282]:
def calculate_precision(predictions):
    """Print the precision of ``Predictions`` measured against ``Target``.

    ``predictions`` is a frame with both columns. The score comes from
    ``precision_score`` and is printed, not returned.
    """
    actual, predicted = predictions["Target"], predictions["Predictions"]
    print("Precision:", precision_score(actual, predicted))

In [283]:
def display_value_counts(predictions):
    """Print how often each value occurs in the ``Predictions`` column."""
    counts = predictions["Predictions"].value_counts()
    print("Value Counts of Predictions:", counts, sep="\n")

In [284]:
def display_actual_value_counts(data):
    """Print how often each value occurs in the ``Target`` column."""
    counts = data["Target"].value_counts()
    print("Actual Value Counts:", counts, sep="\n")

In [285]:
# End-to-end run: download the S&P 500 index, label each row with whether the
# next close is higher, add one prediction feature per large-cap ticker, then
# backtest a forest on price columns plus those features.
sp500 = fetch_stock_data("^GSPC", start_date="1990-01-02")
sp500 = create_target_column(sp500, horizon=1)

# Train the S&P 500 model first
# NOTE(review): `model` is not referenced again in this cell; the backtest
# below uses `final_model`. Confirm no later cell needs it before removing.
predictors = ["Close", "Volume", "Open", "High", "Low"]
model = train_random_forest_model(sp500, predictors, "Target")

# Process data/train model for additional tickers
# Each pass adds a `Prediction_<ticker>` column to sp500.
# NOTE(review): generate_additional_ticker_features calls
# predict(ticker_data, ticker_data, ...), so each ticker model scores the same
# rows it was fitted on, and Target comes from the following row's Close.
# These features are therefore in-sample; the precision printed at the end of
# this cell is likely optimistic until they are generated walk-forward.
additional_tickers = ["AAPL", "MSFT", "AMZN", "NVDA", "GOOGL", "META", "GOOG", "TSLA", "UNH"]
for ticker in additional_tickers:
    additional_data = fetch_stock_data(ticker, start_date="1990-01-02")
    sp500 = generate_additional_ticker_features(sp500, additional_data, horizon=1, ticker=ticker)

# Final feature set: the raw price/volume columns plus one prediction column per ticker.
combined_predictors = predictors + [f"Prediction_{ticker}" for ticker in additional_tickers]

# Forward-fill cannot fill leading gaps, so dropna() removes every row that
# still has a missing value (e.g. dates before a ticker's first prediction).
sp500 = sp500.ffill().dropna()

# NOTE(review): this fit on the full frame is superseded during the backtest,
# because predict() calls model.fit again on each fold's training slice.
final_model = train_random_forest_model(sp500, combined_predictors, "Target")

# Walk-forward evaluation with backtest's defaults (start=2500, step=250),
# followed by the precision and the distribution of predicted labels.
backtest_results = backtest(sp500, final_model, combined_predictors)
calculate_precision(backtest_results)
display_value_counts(backtest_results)

Precision: 0.7663043478260869
Value Counts of Predictions:
Predictions
0.0    226
1.0    184
Name: count, dtype: int64


In [286]:
# Show the backtest frame: actual Target next to the thresholded Predictions.
backtest_results

Unnamed: 0_level_0,Target,Predictions
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-04-26 00:00:00-04:00,1,0.0
2022-04-27 00:00:00-04:00,1,1.0
2022-04-28 00:00:00-04:00,0,0.0
2022-04-29 00:00:00-04:00,1,1.0
2022-05-02 00:00:00-04:00,1,1.0
...,...,...
2023-12-05 00:00:00-05:00,0,0.0
2023-12-06 00:00:00-05:00,1,1.0
2023-12-07 00:00:00-05:00,1,1.0
2023-12-08 00:00:00-05:00,1,0.0
