In [77]:
import yfinance as yf
import pandas as pd
from sklearn.metrics import precision_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import classification_report

In [78]:
sp500 = yf.Ticker("^GSPC")

In [79]:
sp500 = sp500.history(period="max")
del sp500["Dividends"]
del sp500["Stock Splits"]

In [80]:
sp500["Tomorrow"] = sp500["Close"].shift(-1)
sp500["Target"] = (sp500["Tomorrow"] > sp500["Close"]).astype(int)
sp500 = sp500.loc["1990-01-02 00:00:00-05:00":].copy()

0: Stock price goes down 
1: Stock price goes


In [92]:
model = RandomForestClassifier(n_estimators=200, min_samples_split=50, random_state=1)

train = sp500.iloc[:-100]
test = sp500.iloc[-100:]

predictors = ["Close", "Volume", "Open", "High", "Low"]
model.fit(train[predictors], train["Target"])

In [None]:
def preprocess_data(data):
    scaler = StandardScaler()
    data[predictors] = scaler.fit_transform(data[predictors])
    return data

In [93]:
def predict(train, test, predictors, model):
    """
    Train the model on the training data and make predictions on the test data.

    Parameters:
    - train: Training data
    - test: Test data
    - predictors: List of feature names
    - model: Machine learning model

    Returns:
    - combined: DataFrame containing actual and predicted values
    """
    model.fit(train[predictors], train["Target"])
    preds = model.predict_proba(test[predictors])[:,1]
    preds[preds >=.6] = 1
    preds[preds <.6] = 0
    preds = pd.Series(preds, index=test.index, name="Predictions")
    combined = pd.concat([test["Target"], preds], axis=1)
    return combined

In [95]:
from sklearn.metrics import classification_report

def backtest(data, model, predictors, n_splits=5):
    all_predictions = []
    tscv = TimeSeriesSplit(n_splits=n_splits)

    for train_idx, test_idx in tscv.split(data):
        train, test = data.iloc[train_idx], data.iloc[test_idx]
        predictions = predict(train, test, predictors, model)
        all_predictions.append(predictions)
        print(f"Evaluation for fold {len(all_predictions)}:")
        print(classification_report(test["Target"], predictions["Predictions"]))

    return all_predictions


In [96]:
def generate_features(data, horizons):
    new_predictors = []

    # Drop NaN values based on columns other than "Tomorrow"
    data = data.dropna(subset=data.columns[data.columns != "Tomorrow"])

    for horizon in horizons:
        rolling_averages = data["Close"].rolling(horizon).mean()

        ratio_column = f"Close_Ratio_{horizon}"
        trend_column = f"Trend_{horizon}"

        data[ratio_column] = data["Close"] / rolling_averages
        data[trend_column] = data.shift(1).rolling(horizon).sum()["Target"]

        new_predictors += [ratio_column, trend_column]

    return data, new_predictors

In [97]:
predictions = backtest(sp500, model, new_predictors)
combined_predictions = pd.concat(predictions)
value_counts = combined_predictions["Predictions"].value_counts()

print("Value Counts of Predictions:")
print(value_counts)

NameError: name 'TimeSeriesSplit' is not defined

In [91]:
predictions = backtest(sp500, model, new_predictors)
combined_predictions = pd.concat(predictions)

precision = precision_score(combined_predictions["Target"], combined_predictions["Predictions"])
print("Precision:", precision)

Precision: 0.572289156626506
