In [19]:
import yfinance as yf
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score
from FRED_data import fetch_fred_data, important_series

In [20]:
def fetch_stock_data(ticker, start_date="1990-01-02 00:00:00-05:00", end_date=None):
    """Download price history for ``ticker`` and attach the rolling features.

    Parameters
    ----------
    ticker : str
        Symbol handed to ``yf.Ticker``.
    start_date, end_date
        Forwarded unchanged to ``Ticker.history`` as ``start`` / ``end``.

    Returns
    -------
    pandas.DataFrame
        The history frame without the "Dividends" / "Stock Splits" columns
        (when they are present), with 10-period "Magnitude" and "Velocity"
        columns appended in that order.
    """
    history = yf.Ticker(ticker).history(start=start_date, end=end_date)

    # Only ask pandas to drop the corporate-action columns that actually came back.
    unwanted = [name for name in ("Dividends", "Stock Splits") if name in history.columns]
    history = history.drop(columns=unwanted, errors='ignore')

    history = calculate_magnitude(history, window=10)
    return calculate_velocity(history, window=10)

In [21]:
def create_economic_data_column(data, economic_data):
    """Add every column of ``economic_data`` to ``data``, aligned on ``data``'s index.

    For each series the index is localized to UTC, converted to
    America/New_York, and then reindexed onto ``data.index`` with forward
    fill, so each row of ``data`` receives the most recent earlier observation.

    ``data`` is modified in place (one new column per series, named with the
    string form of the series id) and is also returned.
    """
    for series_id in economic_data.columns:
        as_utc = economic_data[series_id].tz_localize('UTC')
        as_eastern = as_utc.tz_convert('America/New_York')
        # Forward fill carries the last published value onto later rows.
        data[f"{series_id}"] = as_eastern.reindex(data.index, method='ffill')

    return data

In [22]:
def calculate_velocity(data, window=10):
    """Add a "Velocity" column: rolling mean of the percent change in "Close".

    The period-over-period change of "Close" is expressed in percent
    (multiplied by 100) and then averaged over ``window`` rows. ``data`` is
    modified in place and returned.
    """
    pct_move = data['Close'].pct_change() * 100
    data['Velocity'] = pct_move.rolling(window=window).mean()
    return data

In [23]:
def calculate_magnitude(data, window=10):
    """Add a "Magnitude" column: rolling mean of the "High" minus "Low" range.

    The per-row range is averaged over ``window`` rows. ``data`` is modified
    in place and returned.
    """
    bar_range = data['High'] - data['Low']
    data['Magnitude'] = bar_range.rolling(window=window).mean()
    return data

In [24]:
def create_target_column(data, horizon=1):
    """Add "Tomorrow" (the "Close" ``horizon`` rows ahead) and a 0/1 "Target".

    "Target" is 1 where the future close is strictly above the current close.
    The last ``horizon`` rows have no future close, so "Tomorrow" is NaN there
    and the comparison yields 0 for "Target".

    ``data`` is modified in place and returned.
    """
    future_close = data["Close"].shift(-horizon)
    data["Tomorrow"] = future_close
    data["Target"] = future_close.gt(data["Close"]).astype(int)
    return data

In [25]:
def train_random_forest_model(train_data, predictors, target):
    """Fit a random forest on ``train_data[predictors]`` against ``train_data[target]``.

    Missing predictor values are replaced with 0 directly on ``train_data``
    (the caller's frame is modified). The forest uses 100 trees,
    ``min_samples_split=50`` and a fixed ``random_state`` of 1.

    Returns the fitted ``RandomForestClassifier``.
    """
    train_data[predictors] = train_data[predictors].fillna(0)

    features = train_data[predictors]
    labels = train_data[target]

    forest = RandomForestClassifier(n_estimators=100, min_samples_split=50, random_state=1)
    forest.fit(features, labels)
    return forest

In [26]:
def generate_additional_ticker_features(sp500, ticker_data, horizon, ticker):
    """Build a per-ticker prediction column aligned to the S&P 500 index.

    Parameters
    ----------
    sp500 : pandas.DataFrame
        Only its index is used; the result is indexed by it.
    ticker_data : pandas.DataFrame
        Price frame for the ticker; it gains "Tomorrow"/"Target" columns and
        has NaNs in the predictor columns replaced with 0 by the helpers.
    horizon : int
        Number of rows ahead used to build the ticker's "Target".
    ticker : str
        Used to name the output column ``Prediction_<ticker>``.

    Returns
    -------
    pandas.DataFrame
        One column, ``Prediction_<ticker>``, indexed by ``sp500.index``.
        Dates the ticker has no row for hold NaN.
    """
    # Honour the caller's horizon; it used to be ignored in favour of a literal 1.
    ticker_data = create_target_column(ticker_data, horizon=horizon)
    ticker_predictors = ["Close", "Volume", "Open", "High", "Low"]

    # Train model for the additional ticker
    ticker_model = train_random_forest_model(ticker_data, ticker_predictors, "Target")

    # Generate predictions for the additional ticker.
    # NOTE(review): the same frame is passed as both train and test, so these
    # are in-sample predictions (the model scores rows it was fitted on).
    # Confirm this is intended before trusting downstream precision figures.
    ticker_predictions = predict(ticker_data, ticker_data, ticker_predictors, ticker_model)

    # Assigning onto an empty frame with sp500's index aligns the predictions by date.
    ticker_predictions_df = pd.DataFrame(index=sp500.index)
    ticker_predictions_df[f"Prediction_{ticker}"] = ticker_predictions["Predictions"]

    return ticker_predictions_df

In [27]:
def predict(train, test, predictors, model, threshold=0.6):
    """Fit ``model`` on ``train`` and return thresholded predictions for ``test``.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain ``predictors`` and a "Target" column.
    test : pandas.DataFrame
        Must contain ``predictors`` and "Target". NaNs in its predictor
        columns are replaced with 0 in place.
    predictors : list
        Column names used as model inputs.
    model
        Object exposing ``fit(X, y)`` and ``predict_proba(X)``; it is refitted
        on every call.
    threshold : float, default 0.6
        Minimum probability of the second class (column 1 of
        ``predict_proba``) for a row to be labelled 1.0; rows below it get 0.0.

    Returns
    -------
    pandas.DataFrame
        Columns "Target" (from ``test``) and "Predictions" (float 0.0/1.0),
        indexed like ``test``.
    """
    test[predictors] = test[predictors].fillna(0)

    model.fit(train[predictors], train["Target"])
    positive_proba = model.predict_proba(test[predictors])[:, 1]

    # One vectorised comparison instead of overwriting the probability array twice.
    labels = (positive_proba >= threshold).astype(float)

    preds = pd.Series(labels, index=test.index, name="Predictions")
    return pd.concat([test["Target"], preds], axis=1)

In [28]:
def generate_features(data, horizons):
    """Add rolling ratio and trend features for each horizon.

    For every ``horizon`` two columns are added to ``data`` in place:

    * ``Close_Ratio_<horizon>``: "Close" divided by its ``horizon``-row
      rolling mean.
    * ``Trend_<horizon>``: rolling sum over ``horizon`` rows of "Target"
      shifted by one row, so a row only sees outcomes of earlier rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain "Close" and "Target".
    horizons : iterable of int
        Rolling window lengths.

    Returns
    -------
    (pandas.DataFrame, list of str)
        A new frame with rows dropped where any column other than "Target"
        is NaN, and the list of added column names.
    """
    new_predictors = []

    close = data["Close"]
    # Shift once, on the Target series only. Shifting and summing the whole
    # frame to then pick one column is wasteful and breaks on any non-numeric
    # column.
    prior_targets = data["Target"].shift(1)

    for horizon in horizons:
        ratio_column = f"Close_Ratio_{horizon}"
        data[ratio_column] = close / close.rolling(horizon).mean()

        trend_column = f"Trend_{horizon}"
        data[trend_column] = prior_targets.rolling(horizon).sum()

        new_predictors += [ratio_column, trend_column]

    feature_columns = data.columns[data.columns != "Target"]
    data = data.dropna(subset=feature_columns)
    return data, new_predictors

In [29]:
def backtest(data, model, predictors, start=2500, step=250):
    """Walk forward through ``data``, predicting ``step`` rows at a time.

    Starting at row ``start``, each round trains on every row before the
    split point and predicts the next ``step`` rows via ``predict``. The
    predictions for each round are also written into ``data`` itself under
    the "Predictions_SP500" column.

    Returns the concatenation of every round's Target/Predictions frame.
    ``pd.concat`` raises ``ValueError`` when no round runs (``data`` has
    ``start`` rows or fewer).
    """
    fold_frames = []
    total_rows = data.shape[0]

    for split in range(start, total_rows, step):
        history = data.iloc[:split].copy()
        holdout = data.iloc[split:split + step].copy()

        fold = predict(history, holdout, predictors, model)
        fold_frames.append(fold)

        # Mirror this round's predictions onto the caller's frame.
        data.loc[holdout.index, 'Predictions_SP500'] = fold['Predictions']

    return pd.concat(fold_frames)

In [30]:
def calculate_precision(predictions):
    """Print the precision of the "Predictions" column against "Target"."""
    actual = predictions["Target"]
    predicted = predictions["Predictions"]
    print("Precision:", precision_score(actual, predicted))

In [31]:
def display_value_counts(predictions):
    """Print how often each value occurs in the "Predictions" column."""
    counts = predictions["Predictions"].value_counts()
    print("Value Counts of Predictions:", counts, sep="\n")

In [32]:
def display_actual_value_counts(data):
    """Print how often each value occurs in the "Target" column."""
    counts = data["Target"].value_counts()
    print("Actual Value Counts:", counts, sep="\n")

In [33]:
# End-to-end run: build the S&P 500 frame, derive one prediction column per
# additional ticker, merge those columns in, then backtest a combined model.
sp500 = fetch_stock_data("^GSPC", start_date="1990-01-02")
sp500 = create_target_column(sp500, horizon=1)

# Train the S&P 500 model first
# NOTE(review): `model` is not referenced again in this cell; the backtest
# below uses `final_model`. Confirm whether this fit is still needed.
predictors = ["Close", "Volume", "Open", "High", "Low"]
model = train_random_forest_model(sp500, predictors, "Target")

# Process data/train model for additional tickers
# additional_tickers = ["AAPL", "MSFT", "AMZN", "NVDA", "GOOGL", "META", "GOOG", "TSLA", "UNH"]
additional_tickers = ["AAPL", "MSFT", "AMZN", "NVDA", "GOOGL", "GOOG", "UNH"]
additional_predictions_dfs = []

# One network fetch and one model fit per ticker; each result is a single
# "Prediction_<ticker>" column indexed by sp500's dates.
for ticker in additional_tickers:
    additional_data = fetch_stock_data(ticker, start_date="1990-01-02")
    ticker_predictions_df = generate_additional_ticker_features(sp500, additional_data, horizon=1, ticker=ticker)
    additional_predictions_dfs.append(ticker_predictions_df)

# Merge all additional ticker predictions into the S&P 500 dataframe
# (index-on-index merge, one ticker column at a time).
for i, ticker in enumerate(additional_tickers):
    sp500 = pd.merge(sp500, additional_predictions_dfs[i], left_index=True, right_index=True)

combined_predictors = predictors + [f"Prediction_{ticker}" for ticker in additional_tickers]

# Forward-fill gaps, then drop every row that still has a NaN. Leading NaNs
# cannot be forward-filled, so rows before all columns have a value are removed.
sp500 = sp500.ffill().dropna()

final_model = train_random_forest_model(sp500, combined_predictors, "Target")

# Walk-forward evaluation with backtest's default start/step.
backtest_results = backtest(sp500, final_model, combined_predictors)
calculate_precision(backtest_results)
display_value_counts(backtest_results)

# backtest() already writes this column for the rows it tested; this assigns it
# again from the returned frame (aligned on index), leaving NaN for rows that
# fall before the first test window.
sp500['Predictions_SP500'] = backtest_results['Predictions']

Precision: 0.7555831265508685
Value Counts of Predictions:
Predictions
0.0    1559
1.0     806
Name: count, dtype: int64


In [34]:
# Show the backtest output: the actual Target next to the thresholded prediction for each tested row.
backtest_results

Unnamed: 0_level_0,Target,Predictions
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-07-25 00:00:00-04:00,1,0.0
2014-07-28 00:00:00-04:00,0,0.0
2014-07-29 00:00:00-04:00,1,1.0
2014-07-30 00:00:00-04:00,0,0.0
2014-07-31 00:00:00-04:00,0,0.0
...,...,...
2023-12-08 00:00:00-05:00,1,1.0
2023-12-11 00:00:00-05:00,1,1.0
2023-12-12 00:00:00-05:00,1,0.0
2023-12-13 00:00:00-05:00,1,0.0


In [35]:
# Show the merged frame: prices, rolling features, per-ticker predictions and the backtest predictions.
sp500

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Magnitude,Velocity,Tomorrow,Target,Prediction_AAPL,Prediction_MSFT,Prediction_AMZN,Prediction_NVDA,Prediction_GOOGL,Prediction_GOOG,Prediction_UNH,Predictions_SP500
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2004-08-19 00:00:00-04:00,1095.170044,1095.170044,1086.280029,1091.229980,1249400000,11.916016,0.101541,1098.349976,1,0.0,0.0,0.0,0.0,1.0,1.0,0.0,
2004-08-20 00:00:00-04:00,1091.229980,1100.260010,1089.569946,1098.349976,1199900000,11.138025,0.321596,1095.680054,0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,
2004-08-23 00:00:00-04:00,1098.349976,1101.400024,1094.729980,1095.680054,1021900000,11.256030,0.285539,1096.189941,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,
2004-08-24 00:00:00-04:00,1095.680054,1100.939941,1092.819946,1096.189941,1092500000,10.686023,0.160453,1104.959961,1,0.0,1.0,0.0,1.0,1.0,1.0,1.0,
2004-08-25 00:00:00-04:00,1096.189941,1106.290039,1093.239990,1104.959961,1192200000,10.679028,0.270577,1105.089966,1,1.0,0.0,0.0,1.0,1.0,1.0,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-08 00:00:00-05:00,4576.200195,4609.229980,4574.060059,4604.370117,3707010000,31.729053,0.099176,4622.439941,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2023-12-11 00:00:00-05:00,4593.390137,4623.709961,4593.390137,4622.439941,3823210000,33.341016,0.157963,4643.700195,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2023-12-12 00:00:00-05:00,4618.299805,4643.930176,4608.089844,4643.700195,3808380000,34.162012,0.194155,4707.089844,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2023-12-13 00:00:00-05:00,4646.200195,4709.689941,4643.229980,4707.089844,5063650000,36.758984,0.340124,4719.549805,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
