# 04 - Forecasting using Random Forests

In [1]:
import sys
# Display the path of the Python interpreter running this kernel.
sys.executable

'/usr/local/bin/python'

## Imports

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

import constants as cnst
import stock_utils as su

# Lift the column-display limit so wide frames are rendered with all of their columns.
pd.set_option('display.max_columns', None)

NSE_DATA_DIR = PosixPath('../data/NSE') | Valid: True
PROCESSED_DATA_DIR = PosixPath('../data/processed') | Valid: True


## Constants

In [3]:
# Symbols returned by the project helper for the NSE data directory.
stock_symbols = su.get_all_stock_symbols(cnst.NSE_DATA_DIR)
stock_symbols

['HDFCBANK', 'ITBEES', 'JUBLFOOD']

In [4]:
# Symbol modelled in the rest of the notebook, chosen by position in `stock_symbols`.
# NOTE(review): index 2 depends on the order `su.get_all_stock_symbols` returns — confirm it is stable.
STOCK_SYMBOL = stock_symbols[2]
STOCK_SYMBOL

'JUBLFOOD'

## Data loading

### Stock data

In [5]:
# Processed frame for the chosen symbol, read from '<symbol>-processed.parquet'.
stock_df = pd.read_parquet(cnst.PROCESSED_DATA_DIR / f'{STOCK_SYMBOL}-processed.parquet')
stock_df

Unnamed: 0,Date,Open,High,Low,LTP,Close,VWAP,52W H,52W L,Volume,Value,#Trades,Range,IsGreen,Is52WLow,Is52WHigh,Day,Month,Year,Weekday,DayOfYear,Quarter,DaysSinceLastTradingSession,Close 7MA,Close 15MA,Close 30MA,Close 60MA,Range 7MA,Range 15MA,Range 30MA,Range 60MA,VWAP 7MA,VWAP 15MA,VWAP 30MA,VWAP 60MA,Volume 7MA,Volume 15MA,Volume 30MA,Volume 60MA,Value 7MA,Value 15MA,Value 30MA,Value 60MA,#Trades 7MA,#Trades 15MA,#Trades 30MA,#Trades 60MA,Target 3D,Target 7D,Target 15D,Target 30D
0,2020-01-01,1656.95,1673.00,1652.80,1664.6,1663.15,1664.37,1673.00,1077.9,1142694,1.901871e+09,27428,20.20,1,0,1,1,1,2020,2,1,1,1,1663.15,1663.15,1663.15,1663.15,20.20,20.20,20.20,20.20,1664.37,1664.37,1664.37,1664.37,1142694,1142694,1142694,1142694,1901871196,1901871196,1901871196,1901871196,27428,27428,27428,27428,1679.60,1727.80,1773.80,1825.55
1,2020-01-02,1665.00,1688.35,1655.50,1686.0,1682.70,1675.06,1688.35,1077.9,840514,1.407907e+09,26148,32.85,1,0,1,2,1,2020,3,2,1,1,1672.93,1672.93,1672.93,1672.93,26.52,26.52,26.52,26.52,1669.72,1669.72,1669.72,1669.72,991604,991604,991604,991604,1654889284,1654889284,1654889284,1654889284,26788,26788,26788,26788,1693.65,1731.35,1793.55,1820.00
2,2020-01-03,1681.00,1696.90,1668.05,1679.0,1682.75,1685.46,1696.90,1077.9,975751,1.644585e+09,26219,28.85,1,0,1,3,1,2020,4,3,1,1,1676.20,1676.20,1676.20,1676.20,27.30,27.30,27.30,27.30,1674.96,1674.96,1674.96,1674.96,986319,986319,986319,986319,1651454555,1651454555,1651454555,1651454555,26598,26598,26598,26598,1725.70,1720.50,1810.10,1835.10
3,2020-01-06,1670.30,1688.85,1664.25,1679.0,1679.60,1676.81,1696.90,1077.9,586899,9.841203e+08,20012,24.60,1,0,0,6,1,2020,0,6,1,3,1677.05,1677.05,1677.05,1677.05,26.62,26.62,26.62,26.62,1675.42,1675.42,1675.42,1675.42,886464,886464,886464,886464,1484620980,1484620980,1484620980,1484620980,24951,24951,24951,24951,1714.30,1753.45,1822.90,1812.60
4,2020-01-07,1688.00,1702.50,1684.05,1689.4,1693.65,1693.57,1702.50,1077.9,845385,1.431717e+09,23848,18.45,1,0,1,7,1,2020,1,7,1,1,1680.37,1680.37,1680.37,1680.37,24.99,24.99,24.99,24.99,1679.05,1679.05,1679.05,1679.05,878248,878248,878248,878248,1474040255,1474040255,1474040255,1474040255,24731,24731,24731,24731,1727.80,1746.70,1754.30,1833.05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1027,2024-01-29,508.75,518.45,504.10,518.0,513.15,510.72,586.95,412.1,6212204,3.172690e+09,77868,14.35,1,0,0,29,1,2024,0,29,1,4,520.95,524.86,544.52,539.78,14.04,13.58,13.55,11.82,520.32,525.42,545.46,540.22,3169424,2587621,2427123,2134424,1640447875,1353563596,1320179854,1158761061,64065,56940,50662,42078,501.90,,,
1028,2024-01-30,517.15,521.90,508.05,513.0,510.15,513.73,586.95,412.1,2464481,1.266071e+09,34342,13.85,0,0,0,30,1,2024,1,30,1,1,518.93,523.25,542.67,539.93,14.17,12.90,13.50,11.96,519.10,523.67,543.66,540.42,3160963,2591987,2453664,2166395,1633013466,1351599355,1330811888,1175295089,58966,54449,50569,42443,494.85,,,
1029,2024-01-31,514.00,526.15,509.50,518.2,519.55,518.54,586.95,412.1,4259420,2.208667e+09,92424,16.65,1,0,0,31,1,2024,2,31,1,1,517.20,522.91,541.35,540.14,15.08,12.91,13.80,12.12,517.10,522.96,542.20,540.61,3445825,2697419,2544398,2211641,1776201812,1404377593,1375624151,1199049589,60480,58094,52034,43717,,,,
1030,2024-02-01,513.00,518.45,498.10,501.4,501.90,505.93,586.95,412.1,6644517,3.361652e+09,131585,20.35,0,0,0,1,2,2024,3,32,1,1,514.16,521.51,539.25,540.04,15.99,13.71,13.99,12.34,514.12,521.94,540.13,540.57,4301178,3028044,2686249,2305913,2206989109,1569930520,1442436250,1246708400,76467,64211,54870,45393,,,,


### Standardized data

In [6]:
# Standardized frame for the chosen symbol, read from '<symbol>-standardized.parquet'.
standardized_df = pd.read_parquet(cnst.PROCESSED_DATA_DIR / f'{STOCK_SYMBOL}-standardized.parquet')
standardized_df

Unnamed: 0,Open,High,Low,LTP,VWAP,52W H,52W L,IsGreen,Is52WLow,Is52WHigh,Day,Month,Year,Weekday,DayOfYear,Quarter,DaysSinceLastTradingSession,Close 7MA,Close 15MA,Close 30MA,Close 60MA,Range 7MA,Range 15MA,Range 30MA,Range 60MA,VWAP 7MA,VWAP 15MA,VWAP 30MA,VWAP 60MA,Volume 7MA,Volume 15MA,Volume 30MA,Volume 60MA,Value 7MA,Value 15MA,Value 30MA,Value 60MA,#Trades 7MA,#Trades 15MA,#Trades 30MA,#Trades 60MA,Target 3D,Target 7D,Target 15D,Target 30D
0,0.996,1.006,0.994,1.001,1.001,1.006,0.648,1,0,1,1,1,2020,2,1,1,1,1.000,1.000,1.000,1.000,0.012,0.012,0.012,0.012,1.001,1.001,1.001,1.001,1.000,1.000,1.000,1.000,1.000,1.000,1.000,1.000,1.000,1.000,1.000,1.000,1.010,1.039,1.067,1.098
1,0.989,1.003,0.984,1.002,0.995,1.003,0.641,1,0,1,2,1,2020,3,2,1,1,0.994,0.994,0.994,0.994,0.016,0.016,0.016,0.016,0.992,0.992,0.992,0.992,1.180,1.180,1.180,1.180,1.175,1.175,1.175,1.175,1.024,1.024,1.024,1.024,1.007,1.029,1.066,1.082
2,0.999,1.008,0.991,0.998,1.002,1.008,0.641,1,0,1,3,1,2020,4,3,1,1,0.996,0.996,0.996,0.996,0.016,0.016,0.016,0.016,0.995,0.995,0.995,0.995,1.011,1.011,1.011,1.011,1.004,1.004,1.004,1.004,1.014,1.014,1.014,1.014,1.026,1.022,1.076,1.091
3,0.994,1.006,0.991,1.000,0.998,1.010,0.642,1,0,0,6,1,2020,0,6,1,3,0.998,0.998,0.998,0.998,0.016,0.016,0.016,0.016,0.998,0.998,0.998,0.998,1.510,1.510,1.510,1.510,1.509,1.509,1.509,1.509,1.247,1.247,1.247,1.247,1.021,1.044,1.085,1.079
4,0.997,1.005,0.994,0.997,1.000,1.005,0.636,1,0,1,7,1,2020,1,7,1,1,0.992,0.992,0.992,0.992,0.015,0.015,0.015,0.015,0.991,0.991,0.991,0.991,1.039,1.039,1.039,1.039,1.030,1.030,1.030,1.030,1.037,1.037,1.037,1.037,1.020,1.031,1.036,1.082
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1027,0.991,1.010,0.982,1.009,0.995,1.144,0.803,1,0,0,29,1,2024,0,29,1,4,1.015,1.023,1.061,1.052,0.027,0.026,0.026,0.023,1.014,1.024,1.063,1.053,0.510,0.417,0.391,0.344,0.517,0.427,0.416,0.365,0.823,0.731,0.651,0.540,0.978,,,
1028,1.014,1.023,0.996,1.006,1.007,1.151,0.808,0,0,0,30,1,2024,1,30,1,1,1.017,1.026,1.064,1.058,0.028,0.025,0.026,0.023,1.018,1.027,1.066,1.059,1.283,1.052,0.996,0.879,1.290,1.068,1.051,0.928,1.717,1.585,1.473,1.236,0.970,,,
1029,0.989,1.013,0.981,0.997,0.998,1.130,0.793,1,0,0,31,1,2024,2,31,1,1,0.995,1.006,1.042,1.040,0.029,0.025,0.027,0.023,0.995,1.007,1.044,1.041,0.809,0.633,0.597,0.519,0.804,0.636,0.623,0.543,0.654,0.629,0.563,0.473,,,,
1030,1.022,1.033,0.992,0.999,1.008,1.169,0.821,0,0,0,1,2,2024,3,32,1,1,1.024,1.039,1.074,1.076,0.032,0.027,0.028,0.025,1.024,1.040,1.076,1.077,0.647,0.456,0.404,0.347,0.657,0.467,0.429,0.371,0.581,0.488,0.417,0.345,,,,


## Modelling

### Target columns

In [7]:
# Every column whose name matches the "Target.*" pattern; these are the columns the models predict.
target_cols = standardized_df.filter(regex = "Target.*").columns.to_list()
target_cols

['Target 3D', 'Target 7D', 'Target 15D', 'Target 30D']

In [8]:
# Summary statistics of the target columns (count also shows how many rows have each target).
standardized_df[target_cols].describe()

Unnamed: 0,Target 3D,Target 7D,Target 15D,Target 30D
count,1029.0,1025.0,1017.0,1002.0
mean,0.999603,0.99888,0.997663,0.995811
std,0.057643,0.08587,0.125663,0.180719
min,0.197,0.187,0.169,0.166
25%,0.982,0.973,0.955,0.931
50%,1.001,1.004,1.004,1.019
75%,1.023,1.036,1.061,1.096
max,1.22,1.232,1.222,1.353


### Data processing

In [9]:
def get_training_data(target_col: str):
    """Return the feature frame and label series for one target column.

    Reads the module-level `standardized_df` and `target_cols`. Only rows where
    `target_col` is non-null are kept; all target columns are removed from the
    features. Both results are independent copies. The target name and the two
    shapes are printed as a side effect.

    Parameters
    ----------
    target_col : str
        Name of the column in `standardized_df` to use as the label.

    Returns
    -------
    tuple
        `(X_df, y)`: the feature DataFrame and the label Series.
    """
    print(f"Target: {target_col}")

    # Build the row filter once and reuse it for both features and labels.
    has_target = standardized_df[target_col].notnull()
    labelled_rows = standardized_df.loc[has_target]

    features = labelled_rows.drop(columns = target_cols).copy()
    labels = labelled_rows[target_col].copy()

    print(f"X.shape: {features.shape}")
    print(f"y.shape: {labels.shape}")

    return features, labels

### Randomized search parameters

In [10]:
# Candidate hyper-parameter values the search samples from.
param_dict = {
    "n_estimators": [75, 100, 125],
    "max_depth": [4, 5, 6],
    "max_features": ["log2", "sqrt", 0.25],
    "max_samples": [0.75, 1.0],
}

# One row per feature (every non-target column); a column is added per trained target.
feature_importances = pd.DataFrame(
    index = standardized_df.columns.drop(target_cols).to_list()
)

# Maps each prediction column name to the error stored for that target's model.
expected_errors = dict()

### Model building

In [11]:
def get_model(n_iter: int = 10, cv = 5):
    """Build an unfitted randomized hyper-parameter search over a random forest regressor.

    The forest uses the squared-error criterion, all available cores and the
    project's `cnst.RANDOM_STATE`; candidates are sampled from the module-level
    `param_dict` with the same random state.

    Parameters
    ----------
    n_iter : int, default 10
        Number of parameter settings sampled from `param_dict`.
    cv : int, cross-validation splitter or iterable, default 5
        Forwarded unchanged to `RandomizedSearchCV`. With an integer and a
        regressor, scikit-learn uses unshuffled K-fold, so folds are not
        forward-chained in time. NOTE(review): the rows appear to be in date
        order with forward-looking targets — consider passing a
        `TimeSeriesSplit` here so validation folds never precede training data.

    Returns
    -------
    RandomizedSearchCV
        The unfitted search object; call `.fit(X, y)` on it.
    """
    forest = RandomForestRegressor(
        criterion = "squared_error",
        n_jobs = -1,
        random_state = cnst.RANDOM_STATE
    )

    return RandomizedSearchCV(
        forest,
        param_dict,
        n_iter = n_iter,
        cv = cv,
        random_state = cnst.RANDOM_STATE
    )

def print_results(y, preds):
    """Print fit-quality metrics for predictions `preds` against true values `y`.

    Prints, each to three decimals: the standard deviation of `y` (as a scale
    reference), R2, root mean squared error and mean absolute error.

    Parameters
    ----------
    y : object exposing `.std()` (a pandas Series in this notebook)
        True target values.
    preds : array-like
        Predicted values, same length as `y`.
    """
    # Take the root explicitly rather than passing `squared = False`: that keyword
    # was deprecated in scikit-learn 1.4 and later removed. For the single-target
    # `y` used here the result is the same value.
    rmse = mean_squared_error(y, preds) ** 0.5

    print(f"Target std: {y.std():.3f}")
    print(f"R2: {r2_score(y, preds):.3f}")
    # Labelled RMSE (was "MSE"): the printed value is the root of the mean squared error.
    print(f"RMSE: {rmse:.3f}")
    print(f"MAE: {mean_absolute_error(y, preds):.3f}")

def expected_error(y, preds):
    """Return the root mean squared error of `preds` against `y`.

    Parameters
    ----------
    y : array-like
        True target values.
    preds : array-like
        Predicted values, same length as `y`.

    Returns
    -------
    float
        Square root of `mean_squared_error(y, preds)`.
    """
    # Root taken explicitly instead of `squared = False`, which was deprecated in
    # scikit-learn 1.4 and later removed; identical for single-target inputs.
    return mean_squared_error(y, preds) ** 0.5

## Training

### `Target 3D`

In [12]:
# Select the first entry of `target_cols` and build its features / labels.
target_col = target_cols[0]
X, y = get_training_data(target_col)

Target: Target 3D
X.shape: (1029, 41)
y.shape: (1029,)


In [13]:
# Run the hyper-parameter search on this target's data, then show the winning settings.
model = get_model().fit(X, y)
model.best_params_

{'n_estimators': 100,
 'max_samples': 0.75,
 'max_features': 'log2',
 'max_depth': 4}

In [14]:
# Five best-ranked candidates from the search, by cross-validated test score rank.
pd.DataFrame(model.cv_results_).sort_values('rank_test_score').head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_samples,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
9,0.341125,0.022682,0.043078,0.002382,100,0.75,log2,4,"{'n_estimators': 100, 'max_samples': 0.75, 'ma...",-0.105762,-0.203451,-0.042683,-0.14541,-0.597828,-0.219027,0.196508,1
5,0.411941,0.037371,0.047566,0.007922,125,1.0,sqrt,5,"{'n_estimators': 125, 'max_samples': 1.0, 'max...",-0.170757,-0.180884,-0.049039,-0.243704,-0.639029,-0.256683,0.201297,2
8,0.326654,0.030949,0.041847,0.003999,100,1.0,sqrt,4,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",-0.287024,-0.222525,-0.048034,-0.082304,-0.703645,-0.268706,0.234556,3
1,0.224469,0.005449,0.030149,0.000887,75,1.0,sqrt,5,"{'n_estimators': 75, 'max_samples': 1.0, 'max_...",-0.197486,-0.166144,-0.050774,-0.523577,-0.546341,-0.296864,0.20058,4
4,0.229358,0.014892,0.032408,0.00537,75,0.75,log2,5,"{'n_estimators': 75, 'max_samples': 0.75, 'max...",-0.202419,-0.297502,-0.045524,-0.246255,-1.098945,-0.378129,0.370126,5


In [15]:
# Store this target's importances as a column, aligned on feature name.
# Column assignment (instead of `join`) keeps the cell re-runnable: joining a
# column that already exists raises because the column names overlap.
feature_importances[target_col] = pd.Series(
    model.best_estimator_.feature_importances_,
    index = model.best_estimator_.feature_names_in_
)

# NOTE: these metrics are in-sample — `X` / `y` are the rows the search was fitted on.
preds = model.predict(X)
print_results(y, preds)

Target std: 0.058
R2: 0.382
MSE: 0.045
MAE: 0.027


In [16]:
pred_col_name = f'Pred {target_col}'
# Keep this target's error so it can be reported next to the latest forecast.
expected_errors[pred_col_name] = expected_error(y, preds)

# Predict for every row (including those without a known target) and scale the
# predicted ratio by that row's Close to express it as a price.
stock_df[pred_col_name] = (
    stock_df['Close']
    .mul(model.predict(standardized_df.drop(columns = target_cols)))
    .round(2)
)
stock_df[['Date', 'Close', target_col, pred_col_name]].dropna().tail(10)

Unnamed: 0,Date,Close,Target 3D,Pred Target 3D
1019,2024-01-16,530.4,531.65,528.03
1020,2024-01-17,527.7,523.2,525.07
1021,2024-01-18,524.3,520.6,521.89
1022,2024-01-19,531.65,525.5,529.16
1023,2024-01-20,523.2,508.25,520.6
1024,2024-01-23,520.6,513.15,517.95
1025,2024-01-24,525.5,510.15,522.59
1026,2024-01-25,508.25,519.55,505.55
1027,2024-01-29,513.15,501.9,509.95
1028,2024-01-30,510.15,494.85,506.27


### `Target 7D`

In [17]:
# Select the second entry of `target_cols` and build its features / labels.
target_col = target_cols[1]
X, y = get_training_data(target_col)

Target: Target 7D
X.shape: (1025, 41)
y.shape: (1025,)


In [18]:
# Run the hyper-parameter search on this target's data, then show the winning settings.
model = get_model().fit(X, y)
model.best_params_

{'n_estimators': 100,
 'max_samples': 0.75,
 'max_features': 'log2',
 'max_depth': 4}

In [19]:
# Five best-ranked candidates from the search, by cross-validated test score rank.
pd.DataFrame(model.cv_results_).sort_values('rank_test_score').head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_samples,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
9,0.334493,0.0279,0.041558,0.004745,100,0.75,log2,4,"{'n_estimators': 100, 'max_samples': 0.75, 'ma...",-0.595097,-0.213687,-0.105277,-0.524685,-1.512798,-0.590309,0.496471,1
5,0.405978,0.016638,0.04755,0.005247,125,1.0,sqrt,5,"{'n_estimators': 125, 'max_samples': 1.0, 'max...",-0.858605,-0.350368,-0.103269,-0.277671,-2.018555,-0.721694,0.695425,2
8,0.295192,0.002531,0.040395,0.003196,100,1.0,sqrt,4,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",-0.727444,-0.237336,-0.098432,-0.967495,-1.624255,-0.730993,0.547301,3
4,0.266855,0.021942,0.036038,0.005487,75,0.75,log2,5,"{'n_estimators': 75, 'max_samples': 0.75, 'max...",-0.595197,-0.274468,-0.103693,-0.488965,-2.244738,-0.741412,0.770691,4
1,0.253323,0.015195,0.031349,0.000811,75,1.0,sqrt,5,"{'n_estimators': 75, 'max_samples': 1.0, 'max_...",-0.771311,-0.322124,-0.106815,-0.866732,-1.953157,-0.804028,0.639349,5


In [20]:
# Store this target's importances as a column, aligned on feature name.
# Column assignment (instead of `join`) keeps the cell re-runnable: joining a
# column that already exists raises because the column names overlap.
feature_importances[target_col] = pd.Series(
    model.best_estimator_.feature_importances_,
    index = model.best_estimator_.feature_names_in_
)

# NOTE: these metrics are in-sample — `X` / `y` are the rows the search was fitted on.
preds = model.predict(X)
print_results(y, preds)

Target std: 0.086
R2: 0.523
MSE: 0.059
MAE: 0.039


In [21]:
pred_col_name = f'Pred {target_col}'
# Keep this target's error so it can be reported next to the latest forecast.
expected_errors[pred_col_name] = expected_error(y, preds)

# Predict for every row (including those without a known target) and scale the
# predicted ratio by that row's Close to express it as a price.
stock_df[pred_col_name] = (
    stock_df['Close']
    .mul(model.predict(standardized_df.drop(columns = target_cols)))
    .round(2)
)
stock_df[['Date', 'Close', target_col, pred_col_name]].dropna().tail(10)

Unnamed: 0,Date,Close,Target 7D,Pred Target 7D
1015,2024-01-10,522.9,531.65,517.84
1016,2024-01-11,527.7,523.2,522.65
1017,2024-01-12,525.65,520.6,520.12
1018,2024-01-15,532.95,525.5,528.18
1019,2024-01-16,530.4,508.25,524.38
1020,2024-01-17,527.7,513.15,522.23
1021,2024-01-18,524.3,510.15,519.13
1022,2024-01-19,531.65,519.55,526.49
1023,2024-01-20,523.2,501.9,513.65
1024,2024-01-23,520.6,494.85,510.76


### `Target 15D`

In [22]:
# Select the third entry of `target_cols` and build its features / labels.
target_col = target_cols[2]
X, y = get_training_data(target_col)

Target: Target 15D
X.shape: (1017, 41)
y.shape: (1017,)


In [23]:
# Run the hyper-parameter search on this target's data, then show the winning settings.
model = get_model().fit(X, y)
model.best_params_

{'n_estimators': 100,
 'max_samples': 0.75,
 'max_features': 'log2',
 'max_depth': 4}

In [24]:
# Five best-ranked candidates from the search, by cross-validated test score rank.
pd.DataFrame(model.cv_results_).sort_values('rank_test_score').head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_samples,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
9,0.296049,0.00659,0.042908,0.001182,100,0.75,log2,4,"{'n_estimators': 100, 'max_samples': 0.75, 'ma...",-0.589624,-0.464989,-0.165404,-0.834354,-1.782239,-0.767322,0.551332,1
8,0.321486,0.019383,0.041836,0.003247,100,1.0,sqrt,4,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",-0.819337,-0.349367,-0.181787,-1.11141,-1.74722,-0.841824,0.560551,2
3,0.261965,0.017718,0.033827,0.004036,75,1.0,0.25,4,"{'n_estimators': 75, 'max_samples': 1.0, 'max_...",-0.952617,-0.463546,-0.215755,-1.670887,-1.482192,-0.956999,0.561858,3
5,0.436917,0.034237,0.050057,0.004468,125,1.0,sqrt,5,"{'n_estimators': 125, 'max_samples': 1.0, 'max...",-0.949453,-0.525073,-0.209537,-1.140407,-2.022056,-0.969305,0.618473,4
1,0.272377,0.024552,0.036435,0.005841,75,1.0,sqrt,5,"{'n_estimators': 75, 'max_samples': 1.0, 'max_...",-0.832803,-0.674324,-0.210947,-1.212188,-2.258859,-1.037824,0.689815,5


In [25]:
# Store this target's importances as a column, aligned on feature name.
# Column assignment (instead of `join`) keeps the cell re-runnable: joining a
# column that already exists raises because the column names overlap.
feature_importances[target_col] = pd.Series(
    model.best_estimator_.feature_importances_,
    index = model.best_estimator_.feature_names_in_
)

# NOTE: these metrics are in-sample — `X` / `y` are the rows the search was fitted on.
preds = model.predict(X)
print_results(y, preds)

Target std: 0.126
R2: 0.597
MSE: 0.080
MAE: 0.055


In [26]:
pred_col_name = f'Pred {target_col}'
# Keep this target's error so it can be reported next to the latest forecast.
expected_errors[pred_col_name] = expected_error(y, preds)

# Predict for every row (including those without a known target) and scale the
# predicted ratio by that row's Close to express it as a price.
stock_df[pred_col_name] = (
    stock_df['Close']
    .mul(model.predict(standardized_df.drop(columns = target_cols)))
    .round(2)
)
stock_df[['Date', 'Close', target_col, pred_col_name]].dropna().tail(10)

Unnamed: 0,Date,Close,Target 15D,Pred Target 15D
1007,2023-12-29,565.05,531.65,566.03
1008,2024-01-01,561.55,523.2,547.51
1009,2024-01-02,558.6,520.6,537.53
1010,2024-01-03,554.0,525.5,537.08
1011,2024-01-04,561.15,508.25,545.2
1012,2024-01-05,555.55,513.15,542.46
1013,2024-01-08,534.35,510.15,518.73
1014,2024-01-09,524.6,519.55,507.37
1015,2024-01-10,522.9,501.9,506.36
1016,2024-01-11,527.7,494.85,511.65


### `Target 30D`

In [27]:
# Select the fourth entry of `target_cols` and build its features / labels.
target_col = target_cols[3]
X, y = get_training_data(target_col)

Target: Target 30D
X.shape: (1002, 41)
y.shape: (1002,)


In [28]:
# Run the hyper-parameter search on this target's data, then show the winning settings.
model = get_model().fit(X, y)
model.best_params_

{'n_estimators': 100,
 'max_samples': 0.75,
 'max_features': 'log2',
 'max_depth': 4}

In [29]:
# Five best-ranked candidates from the search, by cross-validated test score rank.
pd.DataFrame(model.cv_results_).sort_values('rank_test_score').head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_samples,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
9,0.342728,0.024187,0.044693,0.001511,100,0.75,log2,4,"{'n_estimators': 100, 'max_samples': 0.75, 'ma...",-1.151616,-1.334395,-0.570081,-1.883484,-3.091856,-1.606287,0.85301,1
8,0.355658,0.023491,0.045982,0.003799,100,1.0,sqrt,4,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",-1.455326,-1.255291,-0.594135,-2.151575,-2.888354,-1.668936,0.786365,2
4,0.241629,0.004278,0.042194,0.022669,75,0.75,log2,5,"{'n_estimators': 75, 'max_samples': 0.75, 'max...",-1.641196,-1.182404,-0.556987,-1.465153,-4.042478,-1.777644,1.190803,3
0,0.396392,0.06297,0.047873,0.005868,100,1.0,log2,5,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",-1.532617,-1.456461,-0.591991,-2.367521,-3.308106,-1.851339,0.920009,4
1,0.24714,0.006867,0.035623,0.005137,75,1.0,sqrt,5,"{'n_estimators': 75, 'max_samples': 1.0, 'max_...",-1.445402,-1.374358,-0.615038,-2.180784,-4.518471,-2.026811,1.340814,5


In [30]:
# Store this target's importances as a column, aligned on feature name.
# Column assignment (instead of `join`) keeps the cell re-runnable: joining a
# column that already exists raises because the column names overlap.
feature_importances[target_col] = pd.Series(
    model.best_estimator_.feature_importances_,
    index = model.best_estimator_.feature_names_in_
)

# NOTE: these metrics are in-sample — `X` / `y` are the rows the search was fitted on.
preds = model.predict(X)
print_results(y, preds)

Target std: 0.181
R2: 0.644
MSE: 0.108
MAE: 0.078


In [31]:
pred_col_name = f'Pred {target_col}'
# Keep this target's error so it can be reported next to the latest forecast.
expected_errors[pred_col_name] = expected_error(y, preds)

# Predict for every row (including those without a known target) and scale the
# predicted ratio by that row's Close to express it as a price.
stock_df[pred_col_name] = (
    stock_df['Close']
    .mul(model.predict(standardized_df.drop(columns = target_cols)))
    .round(2)
)
stock_df[['Date', 'Close', target_col, pred_col_name]].dropna().tail(10)

Unnamed: 0,Date,Close,Target 30D,Pred Target 30D
992,2023-12-07,561.0,531.65,559.06
993,2023-12-08,559.75,523.2,557.41
994,2023-12-11,560.55,520.6,558.96
995,2023-12-12,558.3,525.5,558.55
996,2023-12-13,556.15,508.25,558.21
997,2023-12-14,569.05,513.15,567.13
998,2023-12-15,565.45,510.15,564.57
999,2023-12-18,559.1,519.55,558.71
1000,2023-12-19,564.95,501.9,563.52
1001,2023-12-20,562.35,494.85,563.18


## Feature importances

In [32]:
# Average each feature's importance across the columns, then show all values as
# percentages (one decimal), most important first.
feature_importances['Mean'] = feature_importances.mean(axis = 'columns')
feature_importances.sort_values('Mean', ascending = False).mul(100).round(1)

Unnamed: 0,Target 3D,Target 7D,Target 15D,Target 30D,Mean
DayOfYear,5.7,5.8,12.0,10.7,8.5
Month,2.3,4.6,7.4,9.3,5.9
Range 60MA,3.6,4.2,7.8,8.0,5.9
DaysSinceLastTradingSession,9.6,6.9,2.8,1.4,5.2
Range 30MA,4.6,4.1,5.1,4.3,4.5
52W H,3.6,3.1,4.1,6.6,4.4
Close 60MA,3.2,2.8,5.1,5.9,4.3
VWAP 60MA,2.2,3.4,5.2,5.8,4.2
Range 15MA,4.4,5.9,2.4,3.9,4.1
52W L,1.8,6.4,2.9,4.2,3.8


## Forecasts

In [33]:
# Date, Close and every prediction column for the ten most recent rows.
stock_df.filter(regex = "(Date)|(Close$)|(Pred.*)").tail(10)

Unnamed: 0,Date,Close,Pred Target 3D,Pred Target 7D,Pred Target 15D,Pred Target 30D
1022,2024-01-19,531.65,529.16,526.49,515.57,490.53
1023,2024-01-20,523.2,520.6,513.65,489.53,480.88
1024,2024-01-23,520.6,517.95,510.76,505.4,473.21
1025,2024-01-24,525.5,522.59,519.87,507.05,483.22
1026,2024-01-25,508.25,505.55,496.83,482.29,454.05
1027,2024-01-29,513.15,509.95,501.79,487.39,465.5
1028,2024-01-30,510.15,506.27,495.44,478.52,461.95
1029,2024-01-31,519.55,516.26,507.92,492.07,475.77
1030,2024-02-01,501.9,500.32,494.18,473.14,446.33
1031,2024-02-02,494.85,492.0,487.22,466.03,437.97


In [34]:
# Last row of the Date / Close / prediction columns.
latest_preds = stock_df.filter(regex = "(Date)|(Close$)|(Pred.*)").iloc[-1]

print(f"Date: {latest_preds['Date'].date()}")
print(f"Close: {latest_preds['Close']}")

# The stored errors are in the standardized targets' units; multiplying by Close
# mirrors how the predictions themselves were converted to prices.
for pred, relative_error in expected_errors.items():
    print(f"{pred}: {latest_preds[pred]} ± {relative_error * latest_preds['Close']:.2f}")

Date: 2024-02-02
Close: 494.85
Pred Target 3D: 492.0 ± 22.42
Pred Target 7D: 487.22 ± 29.33
Pred Target 15D: 466.03 ± 39.44
Pred Target 30D: 437.97 ± 53.33
