# 04 - Forecasting using Random Forests

In [1]:
import sys
# Display the path of the Python interpreter running this kernel (environment sanity check).
sys.executable

'/usr/local/bin/python'

## Imports

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

import constants as cnst
import stock_utils as su

# `None` removes the column limit, so wide frames are displayed without column truncation.
pd.set_option('display.max_columns', None)

NSE_DATA_DIR = PosixPath('../data/NSE') | Valid: True
PROCESSED_DATA_DIR = PosixPath('../data/processed') | Valid: True


## Constants

In [3]:
# Symbols returned by the project helper for the NSE data directory.
stock_symbols = su.get_all_stock_symbols(cnst.NSE_DATA_DIR)

stock_symbols

['HDFCBANK', 'ITBEES']

In [4]:
# The rest of the notebook models a single symbol: the first one in the list.
STOCK_SYMBOL = stock_symbols[0]
STOCK_SYMBOL

'HDFCBANK'

## Data loading

### Stock data

In [5]:
# Processed frame for the selected symbol.
stock_df = pd.read_parquet(
    cnst.PROCESSED_DATA_DIR / f'{STOCK_SYMBOL}-processed.parquet'
)

stock_df

Unnamed: 0,Date,Open,High,Low,LTP,Close,VWAP,52W H,52W L,Volume,Value,#Trades,Range,IsGreen,Is52WLow,Is52WHigh,Day,Month,Year,Weekday,DayOfYear,Quarter,DaysSinceLastTradingSession,Close 7MA,Close 15MA,Close 30MA,Close 60MA,Range 7MA,Range 15MA,Range 30MA,Range 60MA,VWAP 7MA,VWAP 15MA,VWAP 30MA,VWAP 60MA,Volume 7MA,Volume 15MA,Volume 30MA,Volume 60MA,Value 7MA,Value 15MA,Value 30MA,Value 60MA,#Trades 7MA,#Trades 15MA,#Trades 30MA,#Trades 60MA,Target 3D,Target 7D,Target 15D,Target 30D
0,2020-01-01,1276.10,1280.00,1270.60,1279.00,1278.60,1276.64,2503.3,1084.00,1836849,2.345001e+09,46625,9.40,1,0,0,1,1,2020,2,1,1,1,1278.60,1278.60,1278.60,1278.60,9.40,9.40,9.40,9.40,1276.64,1276.64,1276.64,1276.64,1836849,1836849,1836849,1836849,2345000988,2345000988,2345000988,2345000988,46625,46625,46625,46625,1240.95,1282.70,1240.85,1240.60
1,2020-01-02,1279.00,1288.00,1279.00,1286.00,1286.75,1284.56,2503.3,1084.00,3068583,3.941792e+09,104570,9.00,1,0,0,2,1,2020,3,2,1,1,1282.68,1282.68,1282.68,1282.68,9.20,9.20,9.20,9.20,1280.60,1280.60,1280.60,1280.60,2452716,2452716,2452716,2452716,3143396262,3143396262,3143396262,3143396262,75597,75597,75597,75597,1260.60,1286.00,1244.85,1249.00
2,2020-01-03,1282.20,1285.00,1263.60,1268.50,1268.40,1270.48,2503.3,1084.00,5427775,6.895886e+09,157066,21.40,0,0,0,3,1,2020,4,3,1,1,1277.92,1277.92,1277.92,1277.92,13.27,13.27,13.27,13.27,1277.23,1277.23,1277.23,1277.23,3444402,3444402,3444402,3444402,4394226092,4394226092,4394226092,4394226092,102753,102753,102753,102753,1257.30,1289.50,1244.55,1241.40
3,2020-01-06,1260.00,1261.80,1236.00,1240.25,1240.95,1247.24,2503.3,1084.00,5445093,6.791348e+09,155007,25.80,0,0,0,6,1,2020,0,6,1,3,1268.68,1268.68,1268.68,1268.68,16.40,16.40,16.40,16.40,1269.73,1269.73,1269.73,1269.73,3944575,3944575,3944575,3944575,4993506527,4993506527,4993506527,4993506527,115817,115817,115817,115817,1271.40,1284.25,1213.20,1219.35
4,2020-01-07,1258.90,1271.45,1252.25,1261.00,1260.60,1261.48,2503.3,1084.00,7362247,9.287302e+09,189026,19.20,1,0,0,7,1,2020,1,7,1,1,1267.06,1267.06,1267.06,1267.06,16.96,16.96,16.96,16.96,1268.08,1268.08,1268.08,1268.08,4628109,4628109,4628109,4628109,5852265530,5852265530,5852265530,5852265530,130458,130458,130458,130458,1282.70,1287.65,1223.20,1217.15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,2024-01-16,1673.00,1683.65,1658.10,1678.00,1679.15,1672.44,1757.5,1460.25,12661250,2.117514e+10,347404,25.55,1,0,0,16,1,2024,1,16,1,1,1658.86,1678.20,1664.56,1585.82,25.30,24.76,22.80,20.58,1660.48,1678.64,1664.72,1585.57,10797971,12592121,16089685,15332293,17939626497,21173698410,26756826044,24372218817,297593,324871,336519,323849,1470.65,,,
1021,2024-01-17,1570.00,1596.80,1528.40,1542.15,1537.50,1565.65,1757.5,1460.25,85072618,1.331936e+11,2098772,68.40,0,0,0,17,1,2024,2,17,1,1,1640.87,1667.15,1661.69,1586.06,31.88,27.46,24.33,21.35,1645.31,1669.96,1662.71,1586.34,21665597,17363326,18054925,16574386,34817957164,28526480964,29781095464,26325036311,544785,443365,393241,354372,1478.85,,,
1022,2024-01-18,1494.00,1515.00,1480.05,1490.00,1486.15,1495.03,1757.5,1460.25,80535465,1.204027e+11,1582497,34.95,0,0,0,18,1,2024,3,18,1,1,1617.39,1652.54,1656.97,1585.73,32.44,28.49,24.77,21.46,1621.62,1655.54,1658.30,1585.93,31361600,21263141,20095890,17730176,49013774376,34039123030,32747198969,28048301982,732341,520660,434624,376597,,,,
1023,2024-01-19,1505.95,1510.25,1468.40,1474.90,1470.65,1483.88,1757.5,1460.25,54800269,8.131686e+10,1275220,41.85,0,0,0,19,1,2024,4,19,1,1,1590.92,1636.63,1651.64,1585.30,35.76,30.02,25.57,21.63,1597.43,1640.64,1653.48,1585.56,38039080,24082779,21581725,18407581,58727408569,38036670502,34902710598,29048300599,889730,588066,469801,391650,,,,


### Standardized data

In [6]:
# Standardized frame for the same symbol; this is what the models are trained on.
standardized_df = pd.read_parquet(
    cnst.PROCESSED_DATA_DIR / f'{STOCK_SYMBOL}-standardized.parquet'
)

standardized_df

Unnamed: 0,Open,High,Low,LTP,VWAP,52W H,52W L,IsGreen,Is52WLow,Is52WHigh,Day,Month,Year,Weekday,DayOfYear,Quarter,DaysSinceLastTradingSession,Close 7MA,Close 15MA,Close 30MA,Close 60MA,Range 7MA,Range 15MA,Range 30MA,Range 60MA,VWAP 7MA,VWAP 15MA,VWAP 30MA,VWAP 60MA,Volume 7MA,Volume 15MA,Volume 30MA,Volume 60MA,Value 7MA,Value 15MA,Value 30MA,Value 60MA,#Trades 7MA,#Trades 15MA,#Trades 30MA,#Trades 60MA,Target 3D,Target 7D,Target 15D,Target 30D
0,0.998,1.001,0.994,1.000,0.998,1.958,0.848,1,0,0,1,1,2020,2,1,1,1,1.000,1.000,1.000,1.000,0.007,0.007,0.007,0.007,0.998,0.998,0.998,0.998,1.000,1.000,1.000,1.000,1.000,1.000,1.000,1.000,1.000,1.000,1.000,1.000,0.971,1.003,0.970,0.970
1,0.994,1.001,0.994,0.999,0.998,1.945,0.842,1,0,0,2,1,2020,3,2,1,1,0.997,0.997,0.997,0.997,0.007,0.007,0.007,0.007,0.995,0.995,0.995,0.995,0.799,0.799,0.799,0.799,0.797,0.797,0.797,0.797,0.723,0.723,0.723,0.723,0.980,0.999,0.967,0.971
2,1.011,1.013,0.996,1.000,1.002,1.974,0.855,0,0,0,3,1,2020,4,3,1,1,1.008,1.008,1.008,1.008,0.010,0.010,0.010,0.010,1.007,1.007,1.007,1.007,0.635,0.635,0.635,0.635,0.637,0.637,0.637,0.637,0.654,0.654,0.654,0.654,0.991,1.017,0.981,0.979
3,1.015,1.017,0.996,0.999,1.005,2.017,0.874,0,0,0,6,1,2020,0,6,1,3,1.022,1.022,1.022,1.022,0.013,0.013,0.013,0.013,1.023,1.023,1.023,1.023,0.724,0.724,0.724,0.724,0.735,0.735,0.735,0.735,0.747,0.747,0.747,0.747,1.025,1.035,0.978,0.983
4,0.999,1.009,0.993,1.000,1.001,1.986,0.860,1,0,0,7,1,2020,1,7,1,1,1.005,1.005,1.005,1.005,0.013,0.013,0.013,0.013,1.006,1.006,1.006,1.006,0.629,0.629,0.629,0.629,0.630,0.630,0.630,0.630,0.690,0.690,0.690,0.690,1.018,1.021,0.970,0.966
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,0.996,1.003,0.987,0.999,0.996,1.047,0.870,1,0,0,16,1,2024,1,16,1,1,0.988,0.999,0.991,0.944,0.015,0.015,0.014,0.012,0.989,1.000,0.991,0.944,0.853,0.995,1.271,1.211,0.847,1.000,1.264,1.151,0.857,0.935,0.969,0.932,0.876,,,
1021,1.021,1.039,0.994,1.003,1.018,1.143,0.950,0,0,0,17,1,2024,2,17,1,1,1.067,1.084,1.081,1.032,0.021,0.018,0.016,0.014,1.070,1.086,1.081,1.032,0.255,0.204,0.212,0.195,0.261,0.214,0.224,0.198,0.260,0.211,0.187,0.169,0.962,,,
1022,1.005,1.019,0.996,1.003,1.006,1.183,0.983,0,0,0,18,1,2024,3,18,1,1,1.088,1.112,1.115,1.067,0.022,0.019,0.017,0.014,1.091,1.114,1.116,1.067,0.389,0.264,0.250,0.220,0.407,0.283,0.272,0.233,0.463,0.329,0.275,0.238,,,,
1023,1.024,1.027,0.998,1.003,1.009,1.195,0.993,0,0,0,19,1,2024,4,19,1,1,1.082,1.113,1.123,1.078,0.024,0.020,0.017,0.015,1.086,1.116,1.124,1.078,0.694,0.439,0.394,0.336,0.722,0.468,0.429,0.357,0.698,0.461,0.368,0.307,,,,


## Modelling

### Target columns

In [7]:
# Target columns are the ones whose name matches the "Target" pattern.
target_cols = standardized_df.columns[
    standardized_df.columns.str.contains("Target.*")
].to_list()
target_cols

['Target 3D', 'Target 7D', 'Target 15D', 'Target 30D']

In [8]:
# Summary statistics per target; `count` only includes rows where the target is not null.
standardized_df[target_cols].describe()

Unnamed: 0,Target 3D,Target 7D,Target 15D,Target 30D
count,1022.0,1018.0,1010.0,995.0
mean,1.000904,1.002511,1.005721,1.012089
std,0.031153,0.046105,0.064164,0.086329
min,0.82,0.718,0.668,0.629
25%,0.985,0.978,0.968,0.963
50%,1.0,1.002,1.003,1.009
75%,1.01675,1.026,1.043,1.06
max,1.178,1.219,1.231,1.322


### Data processing

In [9]:
def get_training_data(target_col: str):
    """Return the feature frame and target series for one target column.

    Only rows of the notebook-level `standardized_df` whose `target_col`
    value is not null are kept. Every column listed in the notebook-level
    `target_cols` is removed from the features. The target name and the
    resulting shapes are printed.

    Parameters
    ----------
    target_col : str
        Name of the target column to train on.

    Returns
    -------
    tuple
        `(X_df, y)`: feature DataFrame and target Series, both copies.
    """
    print(f"Target: {target_col}")

    # Build the "target is known" mask once and reuse it for X and y.
    has_target = standardized_df[target_col].notnull()
    labelled_rows = standardized_df[has_target]

    X_df = labelled_rows.drop(columns = target_cols).copy()
    y = labelled_rows[target_col].copy()

    print(f"X.shape: {X_df.shape}")
    print(f"y.shape: {y.shape}")

    return X_df, y

### Randomized search parameters

In [10]:
# Candidate hyper-parameter values sampled by the search.
param_dict = dict(
    n_estimators = [75, 100, 125],
    max_depth = [4, 5, 6],
    max_features = ["log2", "sqrt", 0.25],
    max_samples = [0.75, 1.0],
)

# One row per feature; each training section adds a column for its target.
feature_importances = pd.DataFrame(
    index = standardized_df.columns.drop(target_cols).to_list()
)

# Maps each prediction column name to the error estimate of its model.
expected_errors = dict()

### Model building

In [11]:
def get_model(n_iter: int = 10, cv = 5):
    """Build an unfitted randomized hyper-parameter search over a random forest.

    The search samples `n_iter` candidates from the notebook-level
    `param_dict` and evaluates each one with cross-validation, using the
    estimator's default scorer (R^2 for a regressor). Both the forest and
    the search are seeded with `cnst.RANDOM_STATE`.

    Parameters
    ----------
    n_iter : int, default 10
        Number of parameter settings sampled from `param_dict`.
    cv : int, cross-validation splitter or iterable, default 5
        Forwarded unchanged to `RandomizedSearchCV`. An integer yields
        consecutive, unshuffled folds. The rows appear to be in
        chronological order, so a `TimeSeriesSplit` instance can be passed
        to validate only on rows that follow the training rows.

    Returns
    -------
    RandomizedSearchCV
        Unfitted search object; call `.fit(X, y)` on it.
    """
    forest = RandomForestRegressor(
        criterion = "squared_error",
        n_jobs = -1,
        random_state = cnst.RANDOM_STATE
    )

    search = RandomizedSearchCV(
        forest,
        param_dict,
        n_iter = n_iter,
        cv = cv,
        random_state = cnst.RANDOM_STATE
    )

    return search

def print_results(y, preds):
    """Print fit metrics for predictions `preds` against true values `y`.

    Prints the standard deviation of `y`, then R^2, RMSE and MAE, each with
    three decimals.

    Parameters
    ----------
    y : pandas.Series
        True target values; must provide a `.std()` method.
    preds : array-like
        Predictions aligned element-wise with `y`.
    """
    # Take the root explicitly instead of passing `squared = False`: that
    # keyword is deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = mean_squared_error(y, preds) ** 0.5

    print(f"Target std: {y.std():.3f}")
    print(f"R2: {r2_score(y, preds):.3f}")
    # Labelled RMSE because the value is the root of the mean squared error.
    print(f"RMSE: {rmse:.3f}")
    print(f"MAE: {mean_absolute_error(y, preds):.3f}")

def expected_error(y, preds):
    """Return the root mean squared error of `preds` against `y`.

    Parameters
    ----------
    y : array-like
        True target values.
    preds : array-like
        Predictions aligned element-wise with `y`.

    Returns
    -------
    float
        Square root of the mean squared error.
    """
    # Explicit root instead of `squared = False`, which is deprecated in
    # scikit-learn 1.4 and removed in 1.6.
    return mean_squared_error(y, preds) ** 0.5

## Training

### `Target 3D`

In [12]:
# Pick the first target column and build its training set.
target_col = target_cols[0]
X, y = get_training_data(target_col)

Target: Target 3D
X.shape: (1022, 41)
y.shape: (1022,)


In [13]:
# Run the hyper-parameter search on the training set and show the winning settings.
model = get_model()
model.fit(X, y)
model.best_params_

{'n_estimators': 100,
 'max_samples': 0.75,
 'max_features': 'log2',
 'max_depth': 4}

In [14]:
# Five best-ranked candidates of the search (rank 1 has the highest mean test score).
pd.DataFrame(model.cv_results_).sort_values(by = 'rank_test_score').head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_samples,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
9,0.370955,0.01742,0.045594,0.001901,100,0.75,log2,4,"{'n_estimators': 100, 'max_samples': 0.75, 'ma...",-0.029775,-0.016731,-0.025116,0.002177,0.018198,-0.010249,0.017933,1
8,0.368575,0.022287,0.046923,0.003967,100,1.0,sqrt,4,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",-0.027244,-0.024496,-0.02378,-0.001474,0.012458,-0.012907,0.015698,2
0,0.54119,0.047691,0.06404,0.005567,100,1.0,log2,5,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",-0.037833,-0.030381,-0.030807,-0.003433,0.015476,-0.017395,0.020207,3
4,0.313265,0.028186,0.041903,0.005304,75,0.75,log2,5,"{'n_estimators': 75, 'max_samples': 0.75, 'max...",-0.046225,-0.023832,-0.028429,-0.010089,0.009387,-0.019838,0.018629,4
5,0.562382,0.05641,0.066554,0.013919,125,1.0,sqrt,5,"{'n_estimators': 125, 'max_samples': 1.0, 'max...",-0.028307,-0.027365,-0.046113,-0.006985,0.00908,-0.019938,0.01908,5


In [15]:
# Index-aligned column assignment instead of `DataFrame.join`: join raises on
# an already-existing column name, so the cell could not be re-run as-is.
feature_importances[target_col] = pd.Series(
    model.best_estimator_.feature_importances_,
    index = model.best_estimator_.feature_names_in_
)

# NOTE: X is the data the model was fitted on, so these metrics are in-sample.
preds = model.predict(X)
print_results(y, preds)

Target std: 0.031
R2: 0.207
MSE: 0.028
MAE: 0.020


In [16]:
pred_col_name = 'Pred ' + target_col
# `preds` comes from the training rows, so this error estimate is in-sample.
expected_errors[pred_col_name] = expected_error(y, preds)

# Predict for every row (including those without a known target) and scale
# the prediction by the Close price of the same row.
stock_df[pred_col_name] = (
    stock_df['Close'] * model.predict(standardized_df.drop(columns = target_cols))
).round(2)
stock_df[['Date', 'Close', target_col, pred_col_name]].dropna().tail(10)

Unnamed: 0,Date,Close,Target 3D,Pred Target 3D
1012,2024-01-04,1690.85,1650.5,1685.01
1013,2024-01-05,1682.2,1655.95,1674.88
1014,2024-01-08,1663.45,1649.0,1654.91
1015,2024-01-09,1650.5,1641.2,1644.65
1016,2024-01-10,1655.95,1672.8,1648.96
1017,2024-01-11,1649.0,1679.15,1643.94
1018,2024-01-12,1641.2,1537.5,1635.47
1019,2024-01-15,1672.8,1486.15,1660.21
1020,2024-01-16,1679.15,1470.65,1668.12
1021,2024-01-17,1537.5,1478.85,1519.49


### `Target 7D`

In [17]:
# Pick the second target column and build its training set.
target_col = target_cols[1]
X, y = get_training_data(target_col)

Target: Target 7D
X.shape: (1018, 41)
y.shape: (1018,)


In [18]:
# Run the hyper-parameter search on the training set and show the winning settings.
model = get_model()
model.fit(X, y)
model.best_params_

{'n_estimators': 100,
 'max_samples': 0.75,
 'max_features': 'log2',
 'max_depth': 4}

In [19]:
# Five best-ranked candidates of the search (rank 1 has the highest mean test score).
pd.DataFrame(model.cv_results_).sort_values(by = 'rank_test_score').head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_samples,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
9,0.391771,0.023002,0.045184,0.003455,100,0.75,log2,4,"{'n_estimators': 100, 'max_samples': 0.75, 'ma...",-0.04413,-0.077612,-0.149349,0.009914,-0.031395,-0.058515,0.053368,1
8,0.406806,0.019437,0.052271,0.006312,100,1.0,sqrt,4,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",-0.070306,-0.069298,-0.128129,0.000695,-0.032475,-0.059903,0.043071,2
3,0.279092,0.01781,0.040632,0.003763,75,1.0,0.25,4,"{'n_estimators': 75, 'max_samples': 1.0, 'max_...",-0.077212,-0.088351,-0.169286,0.022966,-0.015896,-0.065556,0.065906,3
4,0.379976,0.054883,0.049706,0.009212,75,0.75,log2,5,"{'n_estimators': 75, 'max_samples': 0.75, 'max...",-0.093062,-0.096707,-0.138941,0.004018,-0.066147,-0.078168,0.047239,4
1,0.324952,0.022019,0.044632,0.000953,75,1.0,sqrt,5,"{'n_estimators': 75, 'max_samples': 1.0, 'max_...",-0.104088,-0.101219,-0.146563,0.010935,-0.050445,-0.078276,0.054017,5


In [20]:
# Index-aligned column assignment instead of `DataFrame.join`: join raises on
# an already-existing column name, so the cell could not be re-run as-is.
feature_importances[target_col] = pd.Series(
    model.best_estimator_.feature_importances_,
    index = model.best_estimator_.feature_names_in_
)

# NOTE: X is the data the model was fitted on, so these metrics are in-sample.
preds = model.predict(X)
print_results(y, preds)

Target std: 0.046
R2: 0.275
MSE: 0.039
MAE: 0.029


In [21]:
pred_col_name = 'Pred ' + target_col
# `preds` comes from the training rows, so this error estimate is in-sample.
expected_errors[pred_col_name] = expected_error(y, preds)

# Predict for every row (including those without a known target) and scale
# the prediction by the Close price of the same row.
stock_df[pred_col_name] = (
    stock_df['Close'] * model.predict(standardized_df.drop(columns = target_cols))
).round(2)
stock_df[['Date', 'Close', target_col, pred_col_name]].dropna().tail(10)

Unnamed: 0,Date,Close,Target 7D,Pred Target 7D
1008,2023-12-29,1709.25,1650.5,1714.72
1009,2024-01-01,1698.1,1655.95,1684.87
1010,2024-01-02,1699.1,1649.0,1689.64
1011,2024-01-03,1672.9,1641.2,1663.53
1012,2024-01-04,1690.85,1672.8,1682.5
1013,2024-01-05,1682.2,1679.15,1674.77
1014,2024-01-08,1663.45,1537.5,1644.3
1015,2024-01-09,1650.5,1486.15,1635.05
1016,2024-01-10,1655.95,1470.65,1634.63
1017,2024-01-11,1649.0,1478.85,1629.75


### `Target 15D`

In [22]:
# Pick the third target column and build its training set.
target_col = target_cols[2]
X, y = get_training_data(target_col)

Target: Target 15D
X.shape: (1010, 41)
y.shape: (1010,)


In [23]:
# Run the hyper-parameter search on the training set and show the winning settings.
model = get_model()
model.fit(X, y)
model.best_params_

{'n_estimators': 100,
 'max_samples': 0.75,
 'max_features': 'log2',
 'max_depth': 4}

In [24]:
# Five best-ranked candidates of the search (rank 1 has the highest mean test score).
pd.DataFrame(model.cv_results_).sort_values(by = 'rank_test_score').head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_samples,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
9,0.380012,0.032854,0.045434,0.001819,100,0.75,log2,4,"{'n_estimators': 100, 'max_samples': 0.75, 'ma...",-0.149794,-0.126462,-0.227065,0.019661,-0.296189,-0.15597,0.106274,1
8,0.382554,0.052639,0.05127,0.005999,100,1.0,sqrt,4,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",-0.140925,-0.172908,-0.251921,0.031193,-0.24998,-0.156908,0.103521,2
5,0.507095,0.02772,0.058401,0.005171,125,1.0,sqrt,5,"{'n_estimators': 125, 'max_samples': 1.0, 'max...",-0.196576,-0.183304,-0.287463,-0.009229,-0.258648,-0.187044,0.096886,3
1,0.329078,0.011327,0.045562,0.002065,75,1.0,sqrt,5,"{'n_estimators': 75, 'max_samples': 1.0, 'max_...",-0.181165,-0.17382,-0.308994,-0.007704,-0.275462,-0.189429,0.104913,4
4,0.297595,0.014836,0.044537,0.005777,75,0.75,log2,5,"{'n_estimators': 75, 'max_samples': 0.75, 'max...",-0.20313,-0.167687,-0.311213,0.01352,-0.32395,-0.198492,0.121955,5


In [25]:
# Index-aligned column assignment instead of `DataFrame.join`: join raises on
# an already-existing column name, so the cell could not be re-run as-is.
feature_importances[target_col] = pd.Series(
    model.best_estimator_.feature_importances_,
    index = model.best_estimator_.feature_names_in_
)

# NOTE: X is the data the model was fitted on, so these metrics are in-sample.
preds = model.predict(X)
print_results(y, preds)

Target std: 0.064
R2: 0.394
MSE: 0.050
MAE: 0.039


In [26]:
pred_col_name = 'Pred ' + target_col
# `preds` comes from the training rows, so this error estimate is in-sample.
expected_errors[pred_col_name] = expected_error(y, preds)

# Predict for every row (including those without a known target) and scale
# the prediction by the Close price of the same row.
stock_df[pred_col_name] = (
    stock_df['Close'] * model.predict(standardized_df.drop(columns = target_cols))
).round(2)
stock_df[['Date', 'Close', target_col, pred_col_name]].dropna().tail(10)

Unnamed: 0,Date,Close,Target 15D,Pred Target 15D
1000,2023-12-18,1655.7,1650.5,1664.29
1001,2023-12-19,1652.9,1655.95,1659.89
1002,2023-12-20,1657.0,1649.0,1662.36
1003,2023-12-21,1686.7,1641.2,1691.51
1004,2023-12-22,1670.85,1672.8,1676.92
1005,2023-12-26,1682.45,1679.15,1690.65
1006,2023-12-27,1703.3,1537.5,1708.89
1007,2023-12-28,1705.25,1486.15,1712.2
1008,2023-12-29,1709.25,1470.65,1715.4
1009,2024-01-01,1698.1,1478.85,1673.27


### `Target 30D`

In [27]:
# Pick the fourth target column and build its training set.
target_col = target_cols[3]
X, y = get_training_data(target_col)

Target: Target 30D
X.shape: (995, 41)
y.shape: (995,)


In [28]:
# Run the hyper-parameter search on the training set and show the winning settings.
model = get_model()
model.fit(X, y)
model.best_params_

{'n_estimators': 100,
 'max_samples': 1.0,
 'max_features': 'sqrt',
 'max_depth': 4}

In [29]:
# Five best-ranked candidates of the search (rank 1 has the highest mean test score).
pd.DataFrame(model.cv_results_).sort_values(by = 'rank_test_score').head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_samples,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
8,0.372271,0.013669,0.045414,0.001899,100,1.0,sqrt,4,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",-0.354177,-0.096145,-0.643364,0.038969,-0.321205,-0.275184,0.234376,1
9,0.350192,0.012828,0.048206,0.003916,100,0.75,log2,4,"{'n_estimators': 100, 'max_samples': 0.75, 'ma...",-0.298608,-0.072442,-0.696006,0.002033,-0.359565,-0.284917,0.245862,2
4,0.303377,0.03729,0.043841,0.005868,75,0.75,log2,5,"{'n_estimators': 75, 'max_samples': 0.75, 'max...",-0.368166,-0.075456,-0.788485,0.017922,-0.374612,-0.317759,0.2824,3
0,0.495822,0.083831,0.063319,0.019661,100,1.0,log2,5,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",-0.30418,-0.099664,-0.871815,-0.013129,-0.361173,-0.329992,0.299554,4
5,0.487665,0.027817,0.056943,0.004579,125,1.0,sqrt,5,"{'n_estimators': 125, 'max_samples': 1.0, 'max...",-0.443828,-0.101359,-0.8707,0.040003,-0.463088,-0.367794,0.317825,5


In [30]:
# Index-aligned column assignment instead of `DataFrame.join`: join raises on
# an already-existing column name, so the cell could not be re-run as-is.
feature_importances[target_col] = pd.Series(
    model.best_estimator_.feature_importances_,
    index = model.best_estimator_.feature_names_in_
)

# NOTE: X is the data the model was fitted on, so these metrics are in-sample.
preds = model.predict(X)
print_results(y, preds)

Target std: 0.086
R2: 0.560
MSE: 0.057
MAE: 0.045


In [31]:
pred_col_name = 'Pred ' + target_col
# `preds` comes from the training rows, so this error estimate is in-sample.
expected_errors[pred_col_name] = expected_error(y, preds)

# Predict for every row (including those without a known target) and scale
# the prediction by the Close price of the same row.
stock_df[pred_col_name] = (
    stock_df['Close'] * model.predict(standardized_df.drop(columns = target_cols))
).round(2)
stock_df[['Date', 'Close', target_col, pred_col_name]].dropna().tail(10)

Unnamed: 0,Date,Close,Target 30D,Pred Target 30D
985,2023-11-24,1532.1,1650.5,1573.98
986,2023-11-28,1528.65,1655.95,1576.27
987,2023-11-29,1559.15,1649.0,1592.13
988,2023-11-30,1558.8,1641.2,1578.88
989,2023-12-01,1555.4,1672.8,1586.4
990,2023-12-04,1609.4,1679.15,1622.73
991,2023-12-05,1623.7,1537.5,1632.21
992,2023-12-06,1627.8,1486.15,1635.71
993,2023-12-07,1630.45,1470.65,1640.16
994,2023-12-08,1653.2,1478.85,1665.37


## Feature importances

In [32]:
# Average importance of each feature across the columns of the frame.
feature_importances['Mean'] = feature_importances.mean(axis = 'columns')
# Shown as percentages, most important feature first.
feature_importances.sort_values(by = 'Mean', ascending = False).mul(100).round(1)

Unnamed: 0,Target 3D,Target 7D,Target 15D,Target 30D,Mean
52W H,7.0,6.4,9.0,10.4,8.2
Range 60MA,5.1,10.3,5.1,9.1,7.4
DayOfYear,2.4,5.0,7.6,12.9,7.0
Month,1.7,2.7,6.4,9.1,5.0
DaysSinceLastTradingSession,0.1,0.4,7.2,9.8,4.4
Range 30MA,2.2,3.3,3.6,5.3,3.6
Range 7MA,3.3,5.0,3.0,2.3,3.4
#Trades 60MA,3.0,3.6,5.2,1.9,3.4
VWAP 60MA,4.5,2.6,3.9,2.2,3.3
Close 15MA,6.6,2.8,2.5,1.1,3.2


## Forecasts

In [33]:
# Date, Close and every prediction column for the ten most recent rows.
stock_df.filter(regex = "(Date)|(Close$)|(Pred.*)").tail(10)

Unnamed: 0,Date,Close,Pred Target 3D,Pred Target 7D,Pred Target 15D,Pred Target 30D
1015,2024-01-09,1650.5,1644.65,1635.05,1634.33,1613.72
1016,2024-01-10,1655.95,1648.96,1634.63,1640.57,1625.4
1017,2024-01-11,1649.0,1643.94,1629.75,1633.27,1616.87
1018,2024-01-12,1641.2,1635.47,1623.65,1627.85,1600.76
1019,2024-01-15,1672.8,1660.21,1661.33,1658.7,1644.0
1020,2024-01-16,1679.15,1668.12,1667.41,1663.33,1648.26
1021,2024-01-17,1537.5,1519.49,1474.23,1486.26,1492.15
1022,2024-01-18,1486.15,1451.94,1406.97,1433.03,1460.29
1023,2024-01-19,1470.65,1440.31,1431.58,1406.34,1449.48
1024,2024-01-20,1478.85,1484.74,1493.51,1478.8,1479.15


In [34]:
# Most recent row, restricted to Date, Close and the prediction columns.
latest_preds = stock_df.filter(regex = "(Date)|(Close$)|(Pred.*)").iloc[-1]
print(f"Date: {latest_preds['Date'].date()}")
print(f"Close: {latest_preds['Close']}")

# Each stored error is scaled by the latest Close before being printed as the ± band.
for pred, relative_error in expected_errors.items():
    print(f"{pred}: {latest_preds[pred]} ± {relative_error * latest_preds['Close']:.2f}")

Date: 2024-01-20
Close: 1478.85
Pred Target 3D: 1484.74 ± 41.00
Pred Target 7D: 1493.51 ± 58.03
Pred Target 15D: 1478.8 ± 73.85
Pred Target 30D: 1479.15 ± 84.60
