In [80]:
# Import necessary libraries
import yfinance as yf 
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

In [81]:
# Fetch the full daily price history for the "^FTSE" ticker from Yahoo Finance.
# NOTE(review): the variable is named `sp500`, but "^FTSE" is the FTSE 100 index
# symbol (the S&P 500 symbol is "^GSPC") — confirm which index is intended.
sp500 = yf.Ticker("^FTSE")
sp500 = sp500.history(period="max")
sp500

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1984-01-03 00:00:00+00:00,997.500000,1001.400024,997.500000,997.500000,0,0.0,0.0
1984-01-04 00:00:00+00:00,997.500000,999.500000,993.299988,998.599976,0,0.0,0.0
1984-01-05 00:00:00+00:00,1007.099976,1015.799988,1007.099976,1015.799988,0,0.0,0.0
1984-01-06 00:00:00+00:00,1019.000000,1029.300049,1019.000000,1029.000000,0,0.0,0.0
1984-01-09 00:00:00+00:00,1030.599976,1035.400024,1030.599976,1034.599976,0,0.0,0.0
...,...,...,...,...,...,...,...
2024-08-19 00:00:00+01:00,8311.400391,8374.000000,8286.500000,8356.900391,637975500,0.0,0.0
2024-08-20 00:00:00+01:00,8356.900391,8356.900391,8270.700195,8273.299805,492210300,0.0,0.0
2024-08-21 00:00:00+01:00,8273.299805,8295.900391,8263.000000,8283.400391,825049200,0.0,0.0
2024-08-22 00:00:00+01:00,8283.400391,8318.599609,8277.000000,8288.000000,797295800,0.0,0.0


In [82]:
# Remove the corporate-action columns; neither is used as a model feature
sp500 = sp500.drop(columns=["Stock Splits", "Dividends"])

In [83]:
# Create target variable: 1 when the NEXT day's close is above today's close, else 0.
# Note: despite its name, "Open_Tomorrow" holds the next day's *Close* (Close shifted by -1).
# The final row has no next day, so its "Open_Tomorrow" is NaN and its Target evaluates to 0.
sp500["Open_Tomorrow"] = sp500["Close"].shift(-1)
sp500["Target"] = sp500["Open_Tomorrow"].gt(sp500["Close"]).astype(int)

In [85]:
# Select data from 2000 onwards (label-based slice on the date index).
# .copy() gives an independent frame so later column assignments do not write
# into a slice of the original data.
sp500 = sp500.loc["2000-01-01":].copy()
sp500

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Open_Tomorrow,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2000-01-04 00:00:00+00:00,6930.200195,6930.200195,6662.899902,6665.899902,633449000,6535.899902,0
2000-01-05 00:00:00+00:00,6665.899902,6665.899902,6500.399902,6535.899902,670234000,6447.200195,0
2000-01-06 00:00:00+00:00,6535.899902,6547.299805,6424.399902,6447.200195,785532000,6504.799805,1
2000-01-07 00:00:00+00:00,6447.200195,6532.399902,6438.799805,6504.799805,888306000,6607.700195,1
2000-01-10 00:00:00+00:00,6504.799805,6634.200195,6504.799805,6607.700195,735455000,6518.899902,0
...,...,...,...,...,...,...,...
2024-08-19 00:00:00+01:00,8311.400391,8374.000000,8286.500000,8356.900391,637975500,8273.299805,0
2024-08-20 00:00:00+01:00,8356.900391,8356.900391,8270.700195,8273.299805,492210300,8283.400391,1
2024-08-21 00:00:00+01:00,8273.299805,8295.900391,8263.000000,8283.400391,825049200,8288.000000,1
2024-08-22 00:00:00+01:00,8283.400391,8318.599609,8277.000000,8288.000000,797295800,8327.799805,1


In [86]:
# Build the baseline RandomForest classifier (construction only; fitting happens in a later cell).
# random_state is fixed so repeated runs give the same forest.
model = RandomForestClassifier(
    n_estimators=100,
    min_samples_split=100,
    random_state=1,
)



In [87]:
# Chronological split: the last 100 rows are held out for testing,
# everything before them is used for training
train, test = sp500.iloc[:-100], sp500.iloc[-100:]

In [88]:
# Raw price/volume columns used as the baseline feature set
predictors = [
    "Close",
    "Volume",
    "Open",
    "High",
    "Low",
]

In [89]:
# Fit the model on the training rows, using the `predictors` columns as features
# and the 0/1 "Target" column as the label
model.fit(train[predictors], train["Target"])

In [90]:
# Predict the Target class for every row of the held-out test set
preds = model.predict(test[predictors])

In [91]:
# Wrap the predictions in a pandas Series that shares the test set's index,
# so they line up with test["Target"]
preds = pd.Series(preds, index=test.index)

In [92]:
# Evaluate initial model performance.
# Precision = of the rows predicted as 1, the fraction whose Target is really 1.
precision_score(test["Target"], preds)

0.6

In [93]:
# Combine actual and predicted values side by side for further analysis.
# `preds` was created without a name, so its column is labelled 0 in `combined`.
combined = pd.concat([test["Target"], preds], axis=1)

In [94]:
# Fit a model on one window and score the following window
def predict(train, test, predictors, model):
    """Fit `model` on `train` and return the test labels beside its predictions.

    Args:
        train: DataFrame holding the `predictors` columns and a "Target" column.
        test: DataFrame with the same columns, used only for prediction.
        predictors: list of feature column names.
        model: object exposing `fit(X, y)` and `predict(X)`; it is refit in place.

    Returns:
        DataFrame indexed like `test` with two columns: "Target" (actual labels)
        and "Predictions" (model output).
    """
    features_train, labels_train = train[predictors], train["Target"]
    model.fit(features_train, labels_train)

    predicted = pd.Series(
        model.predict(test[predictors]),
        index=test.index,
        name="Predictions",
    )
    return pd.concat([test["Target"], predicted], axis=1)

In [95]:
# Function to perform backtesting
def backtest(data, model, predictors, start=2500, step=250):
    """Backtest `model` over `data` using an expanding training window.

    For each i in range(start, len(data), step) the model is refit (through
    `predict`) on rows [0, i) and then used to predict rows [i, i + step).
    The last test window may be shorter than `step`.

    Args:
        data: DataFrame with the `predictors` columns and a "Target" column,
            ordered so that earlier rows come first.
        model: estimator handed to `predict`; it is refit for every window.
        predictors: list of feature column names.
        start: number of leading rows in the first training window.
        step: number of rows in each test window.

    Returns:
        The per-window frames returned by `predict`, concatenated in order.

    Raises:
        ValueError: if `step` is not positive, or if `data` does not have more
            than `start` rows (no test window could be formed).
    """
    n_rows = data.shape[0]

    # Fail early with a clear message; otherwise the loop never runs and
    # pd.concat([]) raises an opaque "No objects to concatenate" ValueError.
    if step <= 0:
        raise ValueError(f"step must be a positive integer, got step={step}")
    if n_rows <= start:
        raise ValueError(
            f"backtest needs more than start={start} rows to form a test window, "
            f"but data has only {n_rows} rows"
        )

    all_predictions = []
    for i in range(start, n_rows, step):
        # Train on everything before row i, test on the next `step` rows
        train = data.iloc[0:i].copy()
        test = data.iloc[i:(i + step)].copy()
        all_predictions.append(predict(train, test, predictors, model))
    return pd.concat(all_predictions)
    

In [96]:
# Perform backtesting of the baseline model on the raw `predictors` columns
# (default window sizes of `backtest` are used)
predictions = backtest(sp500, model, predictors)

In [97]:
# Analyze predictions: how many rows were predicted as 0 versus 1 across the backtest
predictions["Predictions"].value_counts()

0    2182
1    1543
Name: Predictions, dtype: int64

In [98]:
# Precision of the backtested predictions, pooled over all test windows
precision_score(predictions["Target"], predictions["Predictions"])

0.5418016850291639

In [99]:
# Define horizons (in rows, i.e. trading days) for feature engineering
horizons = [2,5,60,250,1000]
new_predictors = []

for horizon in horizons:
    # Ratio of today's close to its rolling mean over the horizon.
    # Only the "Close" series is rolled: rolling the whole frame would recompute
    # every column (including features added by earlier iterations) for nothing.
    ratio_column = f"Close_Ratio_{horizon}"
    sp500[ratio_column] = sp500["Close"] / sp500["Close"].rolling(horizon).mean()

    # Number of Target == 1 rows among the previous `horizon` rows.
    # shift(1) keeps the current row's own Target out of its feature.
    trend_column = f"Trend_{horizon}"
    sp500[trend_column] = sp500["Target"].shift(1).rolling(horizon).sum()

    new_predictors += [ratio_column, trend_column]

# The engineered features are used together with the raw price/volume columns
new_predictors += predictors

In [100]:
# Drop rows with NaN values resulting from feature engineering.
# Rolling windows are NaN until a full window of prior rows exists, so the earliest
# rows (up to the longest horizon) are removed; dropna() also removes any row whose
# "Open_Tomorrow" is NaN.
sp500 = sp500.dropna()

In [101]:
# Build a second RandomForest with more trees and a smaller split threshold
# (construction only; it is fitted inside the backtest)
model = RandomForestClassifier(
    n_estimators=200,
    min_samples_split=50,
    random_state=1,
)

In [102]:
# Perform backtesting with the engineered features plus the raw columns (`new_predictors`)
predictions = backtest(sp500, model, new_predictors)

In [103]:
# Analyze predictions following new features: count of rows predicted 0 versus 1
predictions["Predictions"].value_counts()

0    1534
1    1190
Name: Predictions, dtype: int64

In [104]:
# Precision of the backtest that used the engineered features
precision_score(predictions["Target"], predictions["Predictions"])

0.5470588235294118

In [105]:
# Refine further with ensemble approach

In [106]:
# Base estimators for the ensemble: a gradient-boosted model with library defaults
# and a RandomForest with a fixed seed
xgb_model = XGBClassifier()
rf_model = RandomForestClassifier(
    n_estimators=200,
    min_samples_split=50,
    random_state=1,
)

In [107]:
# Soft-voting ensemble: the class probabilities of both members are averaged
ensemble_model = VotingClassifier(
    estimators=[
        ('rf', rf_model),
        ('xgb', xgb_model),
    ],
    voting='soft',
)

In [108]:
# Train and predict using the ensemble model.
# NOTE(review): `train` and `test` are the split made earlier, before the feature
# engineering and dropna steps, and only the raw `predictors` are used here — the
# engineered `new_predictors` columns are not part of this model. Confirm this is intended.
ensemble_model.fit(train[predictors], train["Target"])
# Keep column 1 of predict_proba: the probability of class 1 (Target == 1)
ensemble_preds = ensemble_model.predict_proba(test[predictors])[:, 1]

In [109]:
# Turn probabilities into hard labels: 1 at or above the 0.6 cut-off, 0 below it.
# The array keeps its original floating dtype.
ensemble_preds = (ensemble_preds >= 0.6).astype(ensemble_preds.dtype)

In [110]:
# Evaluate ensemble model on the held-out test rows.
# `ensemble_preds` holds thresholded 0/1 values at this point, not probabilities.
ensemble_precision = precision_score(test["Target"], ensemble_preds)
print("Ensemble Precision Score:", ensemble_precision)

Ensemble Precision Score: 0.5


In [111]:
# Prepare the latest available data.
# `sp500` has already been through dropna(), so this is the last row that survived it.
latest_data = sp500.iloc[-1:]  # Slice (not a scalar index) so the result stays a one-row DataFrame

In [112]:
# Calculate new features for the latest data
latest_data = latest_data.copy()

for horizon in horizons:
    ratio_column = f"Close_Ratio_{horizon}"
    trend_column = f"Trend_{horizon}"

    # Rolling statistics need the full history: `latest_data` has a single row,
    # so shifting/rolling it directly can only ever produce NaN. Take the value
    # at the last row of the series computed over `sp500` instead.
    latest_data[ratio_column] = latest_data["Close"] / sp500["Close"].rolling(horizon).mean().iloc[-1]
    latest_data[trend_column] = sp500["Target"].shift(1).rolling(horizon).sum().iloc[-1]

In [113]:
# Keep only the raw `predictors` columns, which are the features the ensemble was fitted on.
# Any Close_Ratio_*/Trend_* columns on `latest_data` are dropped by this selection.
latest_data = latest_data[predictors]

In [114]:
# Use the trained ensemble model to make predictions for the single row in `latest_data`.
# The same 0.6 cut-off as for the test-set evaluation is applied.
latest_preds = ensemble_model.predict_proba(latest_data)[:, 1]  # Probability of the positive class
latest_preds_binary = (latest_preds >= 0.6).astype(int)  # Convert to binary prediction

In [115]:
# Display the predicted probability of Target == 1 for the row in `latest_data`
f"Predicted probability for the next day: {latest_preds[0]}"

'Predicted probability for the next day: 0.321754560998294'

In [116]:
# Display the thresholded (probability >= 0.6) prediction for the same row
f"Binary prediction (1 for up, 0 for down): {latest_preds_binary[0]}"

'Binary prediction (1 for up, 0 for down): 0'