# 04 - Forecasting using Random Forests

In [1]:
import sys
sys.executable

'/usr/local/bin/python'

## Imports

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

import constants as cnst
import stock_utils as su

pd.set_option('display.max_columns', None)

NSE_DATA_DIR = PosixPath('../data/NSE') | Valid: True
PROCESSED_DATA_DIR = PosixPath('../data/processed') | Valid: True


## Constants

In [3]:
stock_symbols = su.get_all_stock_symbols(
    cnst.NSE_DATA_DIR
)

stock_symbols

['DEEPAKFERT', 'HDFCBANK', 'INDIGOPNTS', 'ITBEES', 'JUBLFOOD']

In [4]:
STOCK_SYMBOL = stock_symbols[2]
STOCK_SYMBOL

'INDIGOPNTS'

## Data loading

### Stock data

In [5]:
stock_df = pd.read_parquet(
    cnst.PROCESSED_DATA_DIR.joinpath(f'{STOCK_SYMBOL}-processed.parquet')
)

stock_df

Unnamed: 0,Date,Open,High,Low,LTP,Close,VWAP,52W H,52W L,Volume,Value,#Trades,Range,IsGreen,Is52WLow,Is52WHigh,Day,Month,Year,Weekday,DayOfYear,Quarter,DaysSinceLastTradingSession,Close 7MA,Close 15MA,Close 30MA,Close 60MA,Range 7MA,Range 15MA,Range 30MA,Range 60MA,VWAP 7MA,VWAP 15MA,VWAP 30MA,VWAP 60MA,Volume 7MA,Volume 15MA,Volume 30MA,Volume 60MA,Value 7MA,Value 15MA,Value 30MA,Value 60MA,#Trades 7MA,#Trades 15MA,#Trades 30MA,#Trades 60MA,Target 3D,Target 7D,Target 15D,Target 30D
0,2021-02-02,2607.50,3129.00,2436.05,3129.00,3117.15,2684.35,3129.00,2436.05,12652036,3.396245e+10,812580,692.95,1,1,1,2,2,2021,1,33,1,1,3117.15,3117.15,3117.15,3117.15,692.95,692.95,692.95,692.95,2684.35,2684.35,2684.35,2684.35,12652036,12652036,12652036,12652036,33962446981,33962446981,33962446981,33962446981,812580,812580,812580,812580,2630.05,2649.40,2510.95,2367.80
1,2021-02-03,3239.00,3329.95,2831.25,2953.45,2924.25,3085.39,3329.95,2436.05,2955005,9.117329e+09,375333,498.70,0,0,1,3,2,2021,2,34,1,1,3020.70,3020.70,3020.70,3020.70,595.82,595.82,595.82,595.82,2884.87,2884.87,2884.87,2884.87,7803520,7803520,7803520,7803520,21539888009,21539888009,21539888009,21539888009,593956,593956,593956,593956,2677.05,2610.75,2487.95,2305.80
2,2021-02-04,2948.85,2967.95,2853.00,2866.00,2873.75,2902.67,3329.95,2436.05,476054,1.381829e+09,73464,114.95,0,0,0,4,2,2021,3,35,1,1,2971.72,2971.72,2971.72,2971.72,435.53,435.53,435.53,435.53,2890.80,2890.80,2890.80,2890.80,5361031,5361031,5361031,5361031,14820534938,14820534938,14820534938,14820534938,420459,420459,420459,420459,2704.85,2595.65,2499.70,2285.45
3,2021-02-05,2866.00,2900.00,2603.00,2621.00,2630.05,2694.77,3329.95,2436.05,587315,1.582680e+09,103675,297.00,0,0,0,5,2,2021,4,36,1,1,2886.30,2886.30,2886.30,2886.30,400.90,400.90,400.90,400.90,2841.80,2841.80,2841.80,2841.80,4167602,4167602,4167602,4167602,11511071310,11511071310,11511071310,11511071310,341263,341263,341263,341263,2666.25,2564.55,2522.20,2378.50
4,2021-02-08,2590.00,2736.00,2551.00,2700.00,2677.05,2666.44,3329.95,2436.05,507665,1.353659e+09,55033,185.00,1,0,0,8,2,2021,0,39,1,3,2844.45,2844.45,2844.45,2844.45,357.72,357.72,357.72,357.72,2806.72,2806.72,2806.72,2806.72,3435615,3435615,3435615,3435615,9479588872,9479588872,9479588872,9479588872,284017,284017,284017,284017,2649.40,2642.50,2506.15,2367.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
755,2024-02-19,1517.00,1517.00,1495.10,1512.85,1511.80,1507.80,1679.95,981.95,59787,9.014660e+07,7776,21.90,0,0,0,19,2,2024,0,50,1,3,1473.09,1449.51,1460.62,1472.00,55.49,47.82,45.23,36.15,1482.80,1455.74,1463.98,1474.14,229740,141834,134431,98886,338786942,207849552,197851208,145956748,24460,16612,13663,10986,1451.05,,,
756,2024-02-20,1521.00,1521.00,1492.25,1503.00,1503.50,1503.27,1679.95,981.95,48951,7.358659e+07,6424,28.75,0,0,0,20,2,2024,1,51,1,1,1484.54,1453.20,1461.24,1472.52,43.88,46.74,45.16,36.13,1490.49,1459.14,1464.43,1474.61,170199,142950,134887,98819,252862455,209635951,198552770,145880554,18176,16698,13675,10962,1443.50,,,
757,2024-02-21,1505.00,1515.95,1475.50,1490.10,1492.05,1496.00,1679.95,981.95,66167,9.898576e+07,9274,40.45,0,0,0,21,2,2024,2,52,1,1,1490.64,1456.44,1461.30,1472.68,40.30,47.90,45.71,36.48,1493.76,1462.89,1464.66,1474.92,104563,144848,135534,99479,156387430,212617675,199532276,146875939,12797,16863,13722,11043,,,,
758,2024-02-22,1487.00,1488.90,1446.05,1451.00,1451.05,1460.34,1679.95,981.95,98408,1.437093e+08,14285,42.85,0,0,0,22,2,2024,3,53,1,1,1487.30,1458.40,1460.31,1471.87,39.04,47.95,46.40,36.79,1491.52,1464.93,1464.15,1474.39,94564,148361,137455,100396,141408559,217840828,202316250,148192613,11945,17202,13916,11152,,,,


### Standardized data

In [6]:
standardized_df = pd.read_parquet(
    cnst.PROCESSED_DATA_DIR.joinpath(f'{STOCK_SYMBOL}-standardized.parquet')
)

standardized_df

Unnamed: 0,Open,High,Low,LTP,VWAP,52W H,52W L,IsGreen,Is52WLow,Is52WHigh,Day,Month,Year,Weekday,DayOfYear,Quarter,DaysSinceLastTradingSession,Close 7MA,Close 15MA,Close 30MA,Close 60MA,Range 7MA,Range 15MA,Range 30MA,Range 60MA,VWAP 7MA,VWAP 15MA,VWAP 30MA,VWAP 60MA,Volume 7MA,Volume 15MA,Volume 30MA,Volume 60MA,Value 7MA,Value 15MA,Value 30MA,Value 60MA,#Trades 7MA,#Trades 15MA,#Trades 30MA,#Trades 60MA,Target 3D,Target 7D,Target 15D,Target 30D
0,0.837,1.004,0.781,1.004,0.861,1.004,0.781,1,1,1,2,2,2021,1,33,1,1,1.000,1.000,1.000,1.000,0.222,0.222,0.222,0.222,0.861,0.861,0.861,0.861,1.000,1.000,1.000,1.000,1.000,1.000,1.000,1.000,1.000,1.000,1.000,1.000,0.844,0.850,0.806,0.760
1,1.108,1.139,0.968,1.010,1.055,1.139,0.833,0,0,1,3,2,2021,2,34,1,1,1.033,1.033,1.033,1.033,0.204,0.204,0.204,0.204,0.987,0.987,0.987,0.987,2.641,2.641,2.641,2.641,2.363,2.363,2.363,2.363,1.582,1.582,1.582,1.582,0.915,0.893,0.851,0.789
2,1.026,1.033,0.993,0.997,1.010,1.159,0.848,0,0,0,4,2,2021,3,35,1,1,1.034,1.034,1.034,1.034,0.152,0.152,0.152,0.152,1.006,1.006,1.006,1.006,11.261,11.261,11.261,11.261,10.725,10.725,10.725,10.725,5.723,5.723,5.723,5.723,0.941,0.903,0.870,0.795
3,1.090,1.103,0.990,0.997,1.025,1.266,0.926,0,0,0,5,2,2021,4,36,1,1,1.097,1.097,1.097,1.097,0.152,0.152,0.152,0.152,1.081,1.081,1.081,1.081,7.096,7.096,7.096,7.096,7.273,7.273,7.273,7.273,3.292,3.292,3.292,3.292,1.014,0.975,0.959,0.904
4,0.967,1.022,0.953,1.009,0.996,1.244,0.910,1,0,0,8,2,2021,0,39,1,3,1.063,1.063,1.063,1.063,0.134,0.134,0.134,0.134,1.048,1.048,1.048,1.048,6.767,6.767,6.767,6.767,7.003,7.003,7.003,7.003,5.161,5.161,5.161,5.161,0.990,0.987,0.936,0.884
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
755,1.003,1.003,0.989,1.001,0.997,1.111,0.650,0,0,0,19,2,2024,0,50,1,3,0.974,0.959,0.966,0.974,0.037,0.032,0.030,0.024,0.981,0.963,0.968,0.975,3.843,2.372,2.248,1.654,3.758,2.306,2.195,1.619,3.146,2.136,1.757,1.413,0.960,,,
756,1.012,1.012,0.993,1.000,1.000,1.117,0.653,0,0,0,20,2,2024,1,51,1,1,0.987,0.967,0.972,0.979,0.029,0.031,0.030,0.024,0.991,0.970,0.974,0.981,3.477,2.920,2.756,2.019,3.436,2.849,2.698,1.982,2.829,2.599,2.129,1.706,0.960,,,
757,1.009,1.016,0.989,0.999,1.003,1.126,0.658,0,0,0,21,2,2024,2,52,1,1,0.999,0.976,0.979,0.987,0.027,0.032,0.031,0.024,1.001,0.980,0.982,0.989,1.580,2.189,2.048,1.503,1.580,2.148,2.016,1.484,1.380,1.818,1.480,1.191,,,,
758,1.025,1.026,0.997,1.000,1.006,1.158,0.677,0,0,0,22,2,2024,3,53,1,1,1.025,1.005,1.006,1.014,0.027,0.033,0.032,0.025,1.028,1.010,1.009,1.016,0.961,1.508,1.397,1.020,0.984,1.516,1.408,1.031,0.836,1.204,0.974,0.781,,,,


## Modelling

### Target columns

In [7]:
target_cols = standardized_df.filter(regex = "Target.*").columns.to_list()
target_cols

['Target 3D', 'Target 7D', 'Target 15D', 'Target 30D']

In [8]:
standardized_df[target_cols].describe()

Unnamed: 0,Target 3D,Target 7D,Target 15D,Target 30D
count,757.0,753.0,745.0,730.0
mean,0.997703,0.995247,0.990046,0.981677
std,0.032368,0.046782,0.06778,0.10407
min,0.844,0.831,0.786,0.741
25%,0.981,0.967,0.949,0.913
50%,0.996,0.99,0.982,0.973
75%,1.012,1.018,1.026,1.03
max,1.13,1.206,1.246,1.399


### Data processing

In [9]:
def get_training_data(target_col: str):
    print(f"Target: {target_col}")
    X_df = standardized_df[standardized_df[target_col].notnull()].drop(columns = target_cols).copy()
    y = standardized_df[standardized_df[target_col].notnull()][target_col].copy()
    print(f"X.shape: {X_df.shape}")
    print(f"y.shape: {y.shape}")

    return X_df, y

### Grid search parameters

In [10]:
param_dict = {
    "n_estimators": [75, 100, 125],
    "max_depth": [4, 5, 6],
    "max_features": ["log2", "sqrt", 0.25],
    "max_samples": [0.75, 1.0]
}

feature_importances = pd.DataFrame(
    index = standardized_df.drop(columns = target_cols).columns.to_list()
)

expected_errors = {}

### Model building

In [11]:
def get_model():
    rf_model = RandomForestRegressor(
        criterion = "squared_error",
        n_jobs = -1,
        random_state = cnst.RANDOM_STATE
    )

    grid_cv = RandomizedSearchCV(
        rf_model,
        param_dict,
        n_iter = 12,
        cv = 5, 
        random_state = cnst.RANDOM_STATE
    )

    return grid_cv

def print_results(y, preds):
    print(f"Target std: {y.std():.3f}")
    print(f"R2: {r2_score(y, preds):.3f}")
    print(f"MSE: {mean_squared_error(y, preds, squared = False):.3f}")
    print(f"MAE: {mean_absolute_error(y, preds):.3f}")

def expected_error(y, preds):
    return mean_squared_error(y, preds, squared = False)

## Training

### `Target 3D`

In [12]:
target_col = target_cols[0]
X, y = get_training_data(target_col)

Target: Target 3D
X.shape: (757, 41)
y.shape: (757,)


In [13]:
model = get_model()
model.fit(X, y)
model.best_params_

{'n_estimators': 100,
 'max_samples': 1.0,
 'max_features': 'sqrt',
 'max_depth': 4}

In [14]:
pd.DataFrame(model.cv_results_).sort_values('rank_test_score').iloc[:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_samples,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
8,0.378822,0.054787,0.055981,0.024287,100,1.0,sqrt,4,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",0.036915,-0.032029,-0.000469,-0.126141,-0.068924,-0.03813,0.056181,1
11,0.34579,0.059729,0.046594,0.006061,100,0.75,0.25,4,"{'n_estimators': 100, 'max_samples': 0.75, 'ma...",0.032593,-0.029613,0.001659,-0.115657,-0.09173,-0.04055,0.055699,2
4,0.271293,0.066909,0.038537,0.007707,75,0.75,log2,5,"{'n_estimators': 75, 'max_samples': 0.75, 'max...",0.036704,-0.011821,0.015673,-0.151257,-0.113562,-0.044853,0.074093,3
9,0.318828,0.019843,0.044305,0.002383,100,0.75,log2,4,"{'n_estimators': 100, 'max_samples': 0.75, 'ma...",0.023905,-0.014992,0.009926,-0.150091,-0.10075,-0.0464,0.067539,4
3,0.221184,0.010192,0.030083,0.000571,75,1.0,0.25,4,"{'n_estimators': 75, 'max_samples': 1.0, 'max_...",0.048832,-0.03482,0.009431,-0.160746,-0.097643,-0.046989,0.074859,5


In [15]:
feature_importances = feature_importances.join(
    pd.DataFrame(
        model.best_estimator_.feature_importances_,
        index = model.best_estimator_.feature_names_in_,
        columns = [target_col]
    )
)

preds = model.predict(X)
print_results(y, preds)

Target std: 0.032
R2: 0.237
MSE: 0.028
MAE: 0.020


In [16]:
pred_col_name = f'Pred {target_col}'
expected_errors[pred_col_name] = expected_error(y, preds)

stock_df[pred_col_name] = (
    model.predict(standardized_df.drop(columns = target_cols)) * stock_df['Close']
).round(2)
stock_df[['Date', 'Close', target_col, pred_col_name]].dropna().iloc[-10:, :]

Unnamed: 0,Date,Close,Target 3D,Pred Target 3D
747,2024-02-07,1436.45,1449.4,1427.09
748,2024-02-08,1442.3,1474.4,1431.9
749,2024-02-09,1423.3,1458.4,1414.29
750,2024-02-12,1449.4,1499.6,1439.41
751,2024-02-13,1474.4,1494.7,1463.85
752,2024-02-14,1458.4,1511.8,1446.84
753,2024-02-15,1499.6,1503.5,1487.35
754,2024-02-16,1494.7,1492.05,1484.24
755,2024-02-19,1511.8,1451.05,1500.46
756,2024-02-20,1503.5,1443.5,1491.73


### `Target 7D`

In [17]:
target_col = target_cols[1]
X, y = get_training_data(target_col)

Target: Target 7D
X.shape: (753, 41)
y.shape: (753,)


In [18]:
model = get_model()
model.fit(X, y)
model.best_params_

{'n_estimators': 100,
 'max_samples': 0.75,
 'max_features': 'log2',
 'max_depth': 4}

In [19]:
pd.DataFrame(model.cv_results_).sort_values('rank_test_score').iloc[:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_samples,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
9,0.328777,0.011958,0.044084,0.001425,100,0.75,log2,4,"{'n_estimators': 100, 'max_samples': 0.75, 'ma...",-0.109439,-0.127337,-0.064514,-0.25456,-0.399068,-0.190984,0.121685,1
0,0.395085,0.066212,0.049211,0.00981,100,1.0,log2,5,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",-0.12361,-0.082019,-0.061839,-0.290847,-0.43793,-0.199249,0.144017,2
11,0.351291,0.025982,0.044139,0.00103,100,0.75,0.25,4,"{'n_estimators': 100, 'max_samples': 0.75, 'ma...",-0.117564,-0.091178,-0.093344,-0.205603,-0.500835,-0.201705,0.155262,3
5,0.50286,0.052936,0.066833,0.022065,125,1.0,sqrt,5,"{'n_estimators': 125, 'max_samples': 1.0, 'max...",-0.131279,-0.081137,-0.045449,-0.328942,-0.474572,-0.212276,0.163707,4
10,0.475096,0.048244,0.054862,0.001836,125,1.0,log2,6,"{'n_estimators': 125, 'max_samples': 1.0, 'max...",-0.072386,-0.115927,-0.062371,-0.331915,-0.494887,-0.215497,0.170561,5


In [20]:
feature_importances = feature_importances.join(
    pd.DataFrame(
        model.best_estimator_.feature_importances_,
        index = model.best_estimator_.feature_names_in_,
        columns = [target_col]
    )
)

preds = model.predict(X)
print_results(y, preds)

Target std: 0.047
R2: 0.334
MSE: 0.038
MAE: 0.029


In [21]:
pred_col_name = f'Pred {target_col}'
expected_errors[pred_col_name] = expected_error(y, preds)

stock_df[pred_col_name] = (
    model.predict(standardized_df.drop(columns = target_cols)) * stock_df['Close']
).round(2)
stock_df[['Date', 'Close', target_col, pred_col_name]].dropna().iloc[-10:, :]

Unnamed: 0,Date,Close,Target 7D,Pred Target 7D
743,2024-02-01,1421.6,1449.4,1409.39
744,2024-02-02,1418.9,1474.4,1405.1
745,2024-02-05,1413.15,1458.4,1400.67
746,2024-02-06,1407.05,1499.6,1396.3
747,2024-02-07,1436.45,1494.7,1419.81
748,2024-02-08,1442.3,1511.8,1426.53
749,2024-02-09,1423.3,1503.5,1408.17
750,2024-02-12,1449.4,1492.05,1429.56
751,2024-02-13,1474.4,1451.05,1452.43
752,2024-02-14,1458.4,1443.5,1438.91


### `Target 15D`

In [22]:
target_col = target_cols[2]
X, y = get_training_data(target_col)

Target: Target 15D
X.shape: (745, 41)
y.shape: (745,)


In [23]:
model = get_model()
model.fit(X, y)
model.best_params_

{'n_estimators': 100,
 'max_samples': 0.75,
 'max_features': 0.25,
 'max_depth': 4}

In [24]:
pd.DataFrame(model.cv_results_).sort_values('rank_test_score').iloc[:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_samples,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
11,0.318568,0.038576,0.042939,0.000938,100,0.75,0.25,4,"{'n_estimators': 100, 'max_samples': 0.75, 'ma...",0.171689,-0.196879,-0.12486,0.126647,-0.408724,-0.086425,0.214278,1
2,0.318065,0.013869,0.043056,0.00124,100,1.0,0.25,5,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",0.173685,-0.156256,-0.043322,0.117885,-0.55881,-0.093363,0.2604,2
3,0.242236,0.022691,0.032492,0.001572,75,1.0,0.25,4,"{'n_estimators': 75, 'max_samples': 1.0, 'max_...",0.157957,-0.20722,-0.104864,0.127326,-0.456851,-0.09673,0.226728,3
0,0.396848,0.057809,0.052304,0.00902,100,1.0,log2,5,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",0.098867,-0.280255,-0.126063,0.105203,-0.405006,-0.121451,0.202759,4
8,0.299585,0.01057,0.042813,0.001677,100,1.0,sqrt,4,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",0.112519,-0.294472,-0.183575,0.047488,-0.330737,-0.129756,0.179183,5


In [25]:
feature_importances = feature_importances.join(
    pd.DataFrame(
        model.best_estimator_.feature_importances_,
        index = model.best_estimator_.feature_names_in_,
        columns = [target_col]
    )
)

preds = model.predict(X)
print_results(y, preds)

Target std: 0.068
R2: 0.569
MSE: 0.044
MAE: 0.034


In [26]:
pred_col_name = f'Pred {target_col}'
expected_errors[pred_col_name] = expected_error(y, preds)

stock_df[pred_col_name] = (
    model.predict(standardized_df.drop(columns = target_cols)) * stock_df['Close']
).round(2)
stock_df[['Date', 'Close', target_col, pred_col_name]].dropna().iloc[-10:, :]

Unnamed: 0,Date,Close,Target 15D,Pred Target 15D
735,2024-01-19,1469.25,1449.4,1420.45
736,2024-01-20,1460.5,1474.4,1444.95
737,2024-01-23,1416.0,1458.4,1392.04
738,2024-01-24,1431.1,1499.6,1422.15
739,2024-01-25,1450.0,1494.7,1446.29
740,2024-01-29,1465.4,1511.8,1455.33
741,2024-01-30,1448.15,1503.5,1448.4
742,2024-01-31,1443.45,1492.05,1435.99
743,2024-02-01,1421.6,1451.05,1404.25
744,2024-02-02,1418.9,1443.5,1401.27


### `Target 30D`

In [27]:
target_col = target_cols[3]
X, y = get_training_data(target_col)

Target: Target 30D
X.shape: (730, 41)
y.shape: (730,)


In [28]:
model = get_model()
model.fit(X, y)
model.best_params_

{'n_estimators': 100, 'max_samples': 1.0, 'max_features': 0.25, 'max_depth': 5}

In [29]:
pd.DataFrame(model.cv_results_).sort_values('rank_test_score').iloc[:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_samples,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
2,0.367525,0.019824,0.046133,0.006877,100,1.0,0.25,5,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",0.044723,-0.97909,-0.296504,0.056091,-0.493821,-0.33372,0.384367,1
11,0.314672,0.018752,0.041463,0.002726,100,0.75,0.25,4,"{'n_estimators': 100, 'max_samples': 0.75, 'ma...",-0.009936,-1.080451,-0.2408,0.073017,-0.430185,-0.337671,0.411276,2
6,0.407641,0.021873,0.051479,0.00618,125,0.75,0.25,5,"{'n_estimators': 125, 'max_samples': 0.75, 'ma...",0.113867,-1.02634,-0.243746,0.062128,-0.600047,-0.338828,0.427936,3
7,0.246272,0.012006,0.033036,0.00296,75,0.75,0.25,5,"{'n_estimators': 75, 'max_samples': 0.75, 'max...",0.125857,-1.128464,-0.258766,0.050975,-0.610655,-0.364211,0.462273,4
3,0.235908,0.010502,0.032041,0.00159,75,1.0,0.25,4,"{'n_estimators': 75, 'max_samples': 1.0, 'max_...",0.050517,-1.221821,-0.314546,0.060289,-0.426579,-0.370428,0.46773,5


In [30]:
feature_importances = feature_importances.join(
    pd.DataFrame(
        model.best_estimator_.feature_importances_,
        index = model.best_estimator_.feature_names_in_,
        columns = [target_col]
    )
)

preds = model.predict(X)
print_results(y, preds)

Target std: 0.104
R2: 0.860
MSE: 0.039
MAE: 0.029


In [31]:
pred_col_name = f'Pred {target_col}'
expected_errors[pred_col_name] = expected_error(y, preds)

stock_df[pred_col_name] = (
    model.predict(standardized_df.drop(columns = target_cols)) * stock_df['Close']
).round(2)
stock_df[['Date', 'Close', target_col, pred_col_name]].dropna().iloc[-10:, :]

Unnamed: 0,Date,Close,Target 30D,Pred Target 30D
720,2023-12-29,1490.5,1449.4,1449.9
721,2024-01-01,1493.2,1474.4,1476.32
722,2024-01-02,1482.15,1458.4,1459.21
723,2024-01-03,1490.35,1499.6,1480.73
724,2024-01-04,1494.8,1494.7,1484.55
725,2024-01-05,1498.75,1511.8,1492.37
726,2024-01-08,1484.9,1503.5,1469.1
727,2024-01-09,1490.1,1492.05,1483.15
728,2024-01-10,1480.65,1451.05,1448.48
729,2024-01-11,1471.5,1443.5,1431.89


## Feature importances

In [32]:
feature_importances['Mean'] = feature_importances.mean(axis = 1)
(feature_importances.sort_values('Mean', ascending = False) * 100).round(1)

Unnamed: 0,Target 3D,Target 7D,Target 15D,Target 30D,Mean
DayOfYear,7.2,11.3,22.4,22.7,15.9
Month,3.1,6.4,15.4,11.0,9.0
52W H,4.5,10.0,5.2,6.3,6.5
Range 60MA,4.6,4.7,5.7,10.6,6.4
Year,0.4,1.8,5.0,11.7,4.7
Close 30MA,4.4,5.4,2.6,3.1,3.9
Range 30MA,2.6,3.4,4.8,4.3,3.8
Range 7MA,6.0,3.8,3.0,1.8,3.7
Close 60MA,5.1,3.6,2.8,2.7,3.6
VWAP 60MA,3.7,4.1,2.8,3.1,3.4


## Forecasts

In [33]:
stock_df.filter(regex = "(Date)|(Close$)|(Pred.*)").iloc[-10:, :]

Unnamed: 0,Date,Close,Pred Target 3D,Pred Target 7D,Pred Target 15D,Pred Target 30D
750,2024-02-12,1449.4,1439.41,1429.56,1416.72,1390.84
751,2024-02-13,1474.4,1463.85,1452.43,1423.89,1422.36
752,2024-02-14,1458.4,1446.84,1438.91,1445.04,1406.26
753,2024-02-15,1499.6,1487.35,1479.88,1457.85,1483.24
754,2024-02-16,1494.7,1484.24,1476.74,1463.2,1487.97
755,2024-02-19,1511.8,1500.46,1497.74,1497.67,1549.28
756,2024-02-20,1503.5,1491.73,1489.38,1484.41,1558.06
757,2024-02-21,1492.05,1481.49,1474.41,1457.44,1503.28
758,2024-02-22,1451.05,1442.92,1437.14,1429.57,1463.83
759,2024-02-23,1443.5,1436.32,1429.68,1420.5,1455.8


In [34]:
latest_preds = stock_df.iloc[-1]
print(f"Date: {latest_preds['Date'].date()}")
print(f"Close: {latest_preds['Close']}")

for pred in expected_errors.keys():
    print(f"{pred}: {latest_preds[pred]} ± {expected_errors[pred] * latest_preds['Close']:.2f}")

Date: 2024-02-23
Close: 1443.5
Pred Target 3D: 1436.32 ± 40.78
Pred Target 7D: 1429.68 ± 55.09
Pred Target 15D: 1420.5 ± 64.19
Pred Target 30D: 1455.8 ± 56.23
