In [114]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.metrics import precision_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer

In [115]:
# Yahoo Finance ticker handle for the symbol "^GSPC" (the S&P 500 index).
# No price data is requested here; the history is fetched in the next cell.
sp500 = yf.Ticker("^GSPC")

In [116]:
# Fetch the full available price history (period="max") and discard the
# "Dividends" and "Stock Splits" columns, which are not used as features.
#
# The frame is built from a fresh Ticker rather than from the current value
# of `sp500`: this cell rebinds `sp500` to a DataFrame, so calling
# `sp500.history(...)` again on a re-run would raise AttributeError.
sp500 = (
    yf.Ticker("^GSPC")
    .history(period="max")
    .drop(columns=["Dividends", "Stock Splits"])
)

In [117]:
# Label every row with the following row's close ("Tomorrow") and a binary
# "Target" that is 1 when that next close is strictly higher than the row's
# own close. The final row has no next close (NaN), so its comparison is
# False and its Target is 0.
# Afterwards keep only the rows from 1990-01-02 onwards, as an independent copy.
sp500 = (
    sp500
    .assign(
        Tomorrow=lambda frame: frame["Close"].shift(-1),
        Target=lambda frame: (frame["Tomorrow"] > frame["Close"]).astype(int),
    )
    .loc["1990-01-02 00:00:00-05:00":]
    .copy()
)

0: Stock price goes down (or is unchanged) tomorrow
1: Stock price goes up tomorrow


In [118]:
# Raw price/volume columns used as features for the baseline fit.
predictors = ["Close", "Volume", "Open", "High", "Low"]

# Positional split: the last 100 rows are held out, everything before them
# is used for training.
train, test = sp500.iloc[:-100], sp500.iloc[-100:]

# Random forest with a fixed random_state so repeated fits are reproducible.
model = RandomForestClassifier(
    n_estimators=200,
    min_samples_split=50,
    random_state=1,
)
model.fit(train[predictors], train["Target"])

In [119]:
def preprocess_data(data, columns=None):
    """
    Return a copy of the data with the selected columns standardised.

    Parameters:
    - data: DataFrame containing the columns to scale
    - columns: Optional list of column names to scale; when omitted, the
      notebook-level `predictors` list is used

    Returns:
    - scaled: New DataFrame with the selected columns replaced by their
      StandardScaler output; the frame passed in is left unmodified

    Note: the scaler is fitted on the frame that is passed in, so its
    statistics reflect every row of that frame.
    """
    if columns is None:
        columns = predictors
    columns = list(columns)

    # Work on a copy: assigning into the caller's frame would silently change
    # it, and raises SettingWithCopy warnings when the caller passes a slice.
    scaled = data.copy()
    scaled[columns] = StandardScaler().fit_transform(scaled[columns])
    return scaled

In [120]:
def predict(train, test, predictors, model, threshold=0.6):
    """
    Train the model on the training data and make predictions on the test data.

    A test row is predicted as 1 only when the value in the second column of
    `model.predict_proba` is at least `threshold`; otherwise it is predicted
    as 0.

    Parameters:
    - train: Training data (must contain the predictors and a "Target" column)
    - test: Test data (must contain the predictors and a "Target" column)
    - predictors: List of feature names
    - model: Machine learning model exposing `fit` and `predict_proba`
    - threshold: Minimum probability required to predict 1 (default 0.6,
      the value that used to be hard-coded)

    Returns:
    - combined: DataFrame indexed like `test`, containing the actual "Target"
      values and the "Predictions" (floats, 0.0 or 1.0)
    """
    model.fit(train[predictors], train["Target"])
    positive_probability = model.predict_proba(test[predictors])[:, 1]

    # One comparison replaces the two in-place overwrites. The result is cast
    # to float so the predictions keep the 0.0 / 1.0 representation.
    labels = (positive_probability >= threshold).astype(float)

    preds = pd.Series(labels, index=test.index, name="Predictions")
    combined = pd.concat([test["Target"], preds], axis=1)

    return combined

In [121]:
from sklearn.metrics import classification_report

def backtest(data, model, predictors, n_splits=5):
    """
    Evaluate the model over successive time-ordered train/test splits.

    The data is partitioned with TimeSeriesSplit; for each split the model is
    refitted via `predict` on the training rows and scored on the test rows,
    and a classification report for that fold is printed.

    Parameters:
    - data: DataFrame containing the predictors and a "Target" column
    - model: Machine learning model passed through to `predict`
    - predictors: List of feature names
    - n_splits: Number of TimeSeriesSplit folds

    Returns:
    - list with one DataFrame per fold, as returned by `predict`
    """
    np.random.seed(1)
    splitter = TimeSeriesSplit(n_splits=n_splits)
    fold_results = []

    for fold_number, (train_rows, test_rows) in enumerate(splitter.split(data), start=1):
        train_fold = data.iloc[train_rows]
        test_fold = data.iloc[test_rows].copy()

        fold_predictions = predict(train_fold, test_fold, predictors, model)
        fold_results.append(fold_predictions)

        print(f"Evaluation for fold {fold_number}:")
        print(classification_report(test_fold["Target"], fold_predictions["Predictions"]))

    return fold_results


In [122]:
def generate_features(data, horizons):
    """
    Add rolling-window features for each horizon and drop incomplete rows.

    For every horizon two columns are added:
    - "Close_Ratio_<horizon>": the close divided by its rolling mean over the
      horizon
    - "Trend_<horizon>": the rolling sum, over the horizon, of the "Target"
      values of the preceding rows

    Parameters:
    - data: DataFrame with at least "Close" and "Target" columns
    - horizons: Iterable of rolling-window sizes (in rows)

    Returns:
    - featured: New DataFrame with the added columns, without the rows that
      have a missing value in any column other than "Target"; the frame
      passed in is left unmodified
    - new_predictors: List of the added column names, in creation order
    """
    # Copy so the caller's frame does not gain columns as a side effect.
    featured = data.copy()
    new_predictors = []

    close = featured["Close"]
    # Shift by one row so the trend of a row only counts earlier targets.
    # Rolling over this single column (instead of shifting and summing the
    # whole frame and then selecting "Target") avoids aggregating unrelated
    # columns, which is wasteful and fails when one of them is non-numeric.
    previous_targets = featured["Target"].shift(1)

    for horizon in horizons:
        ratio_column = f"Close_Ratio_{horizon}"
        trend_column = f"Trend_{horizon}"

        featured[ratio_column] = close / close.rolling(horizon).mean()
        featured[trend_column] = previous_targets.rolling(horizon).sum()

        new_predictors += [ratio_column, trend_column]

    required_columns = [column for column in featured.columns if column != "Target"]
    featured = featured.dropna(subset=required_columns)

    return featured, new_predictors


In [123]:
def calculate_precision(predictions):
    """
    Print the precision of the predictions against the actual targets.

    Parameters:
    - predictions: DataFrame with "Target" (actual) and "Predictions" columns
    """
    actual, predicted = predictions["Target"], predictions["Predictions"]
    print("Precision:", precision_score(actual, predicted))

In [124]:
def display_value_counts(predictions):
    """
    Print how often each predicted label occurs.

    Parameters:
    - predictions: DataFrame with a "Predictions" column
    """
    predicted_counts = predictions["Predictions"].value_counts()
    print("Value Counts of Predictions:", predicted_counts, sep="\n")

In [133]:
def display_actual_value_counts(data):
    """
    Print how often each actual target label occurs.

    Parameters:
    - data: DataFrame with a "Target" column
    """
    target_counts = data["Target"].value_counts()
    report_lines = ("Actual Value Counts:", target_counts)
    print(*report_lines, sep="\n")

In [125]:
# Add the rolling ratio/trend features for windows of 2, 5, 60, 250 and 1000
# rows. `sp500` is rebound to the returned frame, which no longer contains
# rows with missing feature values.
sp500, new_predictors = generate_features(sp500, [2, 5, 60, 250, 1000])
# Time-ordered backtest using only the engineered features; returns one
# DataFrame of actual vs. predicted labels per fold and prints a report per fold.
predictions = backtest(sp500, model, new_predictors)
# Stack the per-fold results into a single frame for the summary cells below.
combined_predictions = pd.concat(predictions)

Evaluation for fold 1:
              precision    recall  f1-score   support

           0       0.52      0.22      0.31       643
           1       0.49      0.78      0.60       615

    accuracy                           0.50      1258
   macro avg       0.50      0.50      0.46      1258
weighted avg       0.50      0.50      0.45      1258

Evaluation for fold 2:
              precision    recall  f1-score   support

           0       0.46      0.75      0.57       570
           1       0.57      0.28      0.37       688

    accuracy                           0.49      1258
   macro avg       0.51      0.51      0.47      1258
weighted avg       0.52      0.49      0.46      1258

Evaluation for fold 3:
              precision    recall  f1-score   support

           0       0.45      0.81      0.58       557
           1       0.58      0.21      0.31       701

    accuracy                           0.48      1258
   macro avg       0.51      0.51      0.44      1258
weigh

In [130]:
# Precision of the thresholded predictions pooled over all backtest folds.
calculate_precision(combined_predictions)

Precision: 0.527806385169928


In [131]:
# How many rows were predicted as 0 vs. 1 across all backtest folds.
display_value_counts(combined_predictions)

Value Counts of Predictions:
Predictions
0.0    4348
1.0    1942
Name: count, dtype: int64


In [134]:
# Class balance of the actual labels over every row of `sp500`, not only the
# rows that were part of a backtest test fold, so the total here is larger
# than the number of predictions counted above.
display_actual_value_counts(sp500)

Actual Value Counts:
Target
1    4057
0    3493
Name: count, dtype: int64
