# 04 - Forecasting using Random Forests

In [1]:
# Show which Python interpreter this kernel is running on.
import sys
sys.executable

'/usr/local/bin/python'

## Imports

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

import constants as cnst
import stock_utils as su

# Never truncate columns when a DataFrame is displayed.
pd.options.display.max_columns = None

NSE_DATA_DIR = PosixPath('../data/NSE') | Valid: True
SCREENER_EPS_DATA_DIR = PosixPath('../data/Screener.in/EPS') | Valid: True
PROCESSED_DATA_DIR = PosixPath('../data/processed') | Valid: True


## Constants

In [3]:
# Symbols discovered by the helper under the NSE data directory.
stock_symbols = su.get_all_stock_symbols(cnst.NSE_DATA_DIR)
stock_symbols

['DEEPAKFERT', 'HDFCBANK', 'INDIGOPNTS', 'ITBEES', 'JUBLFOOD']

In [4]:
# Symbol modelled in this notebook, picked by position in `stock_symbols`.
# NOTE(review): a positional pick presumably changes if the set of available
# symbols changes — confirm whether selecting by name is preferable.
STOCK_SYMBOL = stock_symbols[1]
STOCK_SYMBOL

'HDFCBANK'

## Loading stock data

In [5]:
# Load the data for the selected symbol; the rest of the notebook reads its
# `.standardized` frame (model inputs/targets) and `.processed` frame (prices).
stock_data = su.StockData(STOCK_SYMBOL)
stock_data

Symbol: HDFCBANK
Total records: 1049
First record: 2020-01-01
Last record: 2024-02-26

## Modelling

### Target columns

In [6]:
# Every column of the standardized frame whose name matches "Target.*".
target_cols = list(
    stock_data.standardized.filter(regex = "Target.*").columns
)
target_cols

['Target 3D', 'Target 7D', 'Target 15D', 'Target 30D']

In [7]:
# Summary statistics of the standardized target columns.
stock_data.standardized[target_cols].describe()

Unnamed: 0,Target 3D,Target 7D,Target 15D,Target 30D
count,1046.0,1042.0,1034.0,1019.0
mean,1.000772,1.001874,1.003761,1.008427
std,0.030943,0.046347,0.065198,0.08858
min,0.82,0.718,0.668,0.629
25%,0.985,0.977,0.967,0.9595
50%,1.0,1.002,1.0015,1.006
75%,1.01675,1.025,1.042,1.058
max,1.178,1.219,1.231,1.322


### Data processing

In [8]:
def get_training_data(target_col: str):
    """Build the feature frame and target series for one target column.

    Only rows of `stock_data.standardized` where `target_col` is non-null are
    kept. All columns listed in `target_cols` are dropped from the features,
    so no target is used as an input.

    Prints the target name and the shapes of both results.

    Returns
    -------
    (X_df, y)
        Copies of the feature DataFrame and of the `target_col` Series.
    """
    print(f"Target: {target_col}")

    standardized = stock_data.standardized
    has_target = standardized[target_col].notnull()
    labelled_rows = standardized[has_target]

    X_df = labelled_rows.drop(columns = target_cols).copy()
    y = labelled_rows[target_col].copy()

    print(f"X.shape: {X_df.shape}")
    print(f"y.shape: {y.shape}")

    return X_df, y

### Randomized search parameters

In [9]:
# Candidate hyperparameter values that the search in `get_model` samples from.
param_dict = dict(
    n_estimators = [75, 100, 125],
    max_depth = [4, 5, 6],
    max_features = ["log2", "sqrt", 0.25],
    max_samples = [0.75, 1.0],
)

# One row per feature column (targets excluded); one column per target is
# added as each model is trained.
feature_importances = pd.DataFrame(
    index = list(stock_data.standardized.drop(columns = target_cols).columns)
)

# Maps each prediction column name to the error recorded for that model.
expected_errors = dict()

### Model building

In [10]:
def get_model(n_iter = 12, cv = 5):
    """Build an unfitted randomized hyperparameter search over a random forest.

    Parameters
    ----------
    n_iter : int, default 12
        Number of parameter settings sampled from `param_dict`.
    cv : int, cross-validation splitter or iterable, default 5
        Cross-validation strategy, passed unchanged to `RandomizedSearchCV`.
        NOTE(review): the rows look like time-ordered daily records; passing a
        time-aware splitter such as `sklearn.model_selection.TimeSeriesSplit`
        may give more honest scores — confirm before relying on CV results.

    Returns
    -------
    RandomizedSearchCV
        Unfitted search wrapping a `RandomForestRegressor`. Both the forest
        and the parameter sampling are seeded with `cnst.RANDOM_STATE`.
    """
    forest = RandomForestRegressor(
        criterion = "squared_error",
        n_jobs = -1,
        random_state = cnst.RANDOM_STATE
    )

    search = RandomizedSearchCV(
        forest,
        param_dict,
        n_iter = n_iter,
        cv = cv,
        random_state = cnst.RANDOM_STATE
    )

    return search

def print_results(y, preds):
    """Print summary regression metrics for `preds` against the targets `y`.

    Reports the standard deviation of `y`, R2, RMSE and MAE, each formatted to
    three decimals.

    The third metric is a root-mean-squared error, so it is labelled "RMSE".
    It is computed as the square root of `mean_squared_error` instead of via
    the `squared = False` flag, which recent scikit-learn releases deprecate
    and later remove.
    """
    print(f"Target std: {y.std():.3f}")
    print(f"R2: {r2_score(y, preds):.3f}")
    print(f"RMSE: {mean_squared_error(y, preds) ** 0.5:.3f}")
    print(f"MAE: {mean_absolute_error(y, preds):.3f}")

def expected_error(y, preds):
    """Return the root-mean-squared error of `preds` against `y`.

    The square root is taken explicitly instead of passing `squared = False`
    to `mean_squared_error`: that flag is deprecated in recent scikit-learn
    releases and later removed. For a single target the value is identical.
    """
    return mean_squared_error(y, preds) ** 0.5

## Training

### `Target 3D`

In [11]:
# Work on the first target column and build its training set.
target_col = target_cols[0]
X, y = get_training_data(target_col)

Target: Target 3D
X.shape: (1046, 43)
y.shape: (1046,)


In [12]:
# `fit` returns the fitted search object, so build and fit in one step.
model = get_model().fit(X, y)
model.best_params_

{'n_estimators': 75, 'max_samples': 1.0, 'max_features': 0.25, 'max_depth': 4}

In [13]:
# Five best-ranked parameter settings from the search.
pd.DataFrame(model.cv_results_).sort_values(by = 'rank_test_score').head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_samples,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
3,0.213845,0.012613,0.031345,0.002105,75,1.0,0.25,4,"{'n_estimators': 75, 'max_samples': 1.0, 'max_...",-0.03762,-0.0244,-0.00742,-0.03264,-0.024807,-0.025377,0.01026,1
11,0.263777,0.00278,0.04068,0.007036,100,0.75,0.25,4,"{'n_estimators': 100, 'max_samples': 0.75, 'ma...",-0.040361,-0.020075,-0.021294,-0.036158,-0.02488,-0.028553,0.008189,2
8,0.27187,0.020119,0.04397,0.005175,100,1.0,sqrt,4,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",-0.045263,-0.014727,-0.015395,-0.036394,-0.039504,-0.030257,0.012731,3
9,0.262344,0.002359,0.040962,0.001338,100,0.75,log2,4,"{'n_estimators': 100, 'max_samples': 0.75, 'ma...",-0.034815,-0.02499,-0.021937,-0.047532,-0.024327,-0.03072,0.00949,4
0,0.353297,0.048518,0.037397,0.00544,100,1.0,log2,5,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",-0.046526,-0.02232,-0.022712,-0.047768,-0.028542,-0.033574,0.011306,5


In [14]:
# Store this target's importances as a column, aligned on feature name.
# Column assignment keeps the cell re-runnable: `DataFrame.join` raises on a
# second run because the column name already exists.
feature_importances[target_col] = pd.Series(
    model.best_estimator_.feature_importances_,
    index = model.best_estimator_.feature_names_in_
)

# In-sample metrics: X is the same data the search was fitted on. The
# held-out scores are the ones in `model.cv_results_`.
preds = model.predict(X)
print_results(y, preds)

Target std: 0.031
R2: 0.220
MSE: 0.027
MAE: 0.020


In [15]:
pred_col_name = f'Pred {target_col}'
# `preds` are the predictions on the training rows, so this error is in-sample.
expected_errors[pred_col_name] = expected_error(y, preds)

# Predict for every row, including those whose target is still unknown.
all_features = stock_data.standardized.drop(columns = target_cols)
pred_factor = model.predict(all_features)

# Scale the model output by Close and round to 2 decimals.
# NOTE(review): presumably the standardized target is relative to Close — confirm.
stock_data.processed[pred_col_name] = (
    pred_factor * stock_data.processed['Close']
).round(2)

stock_data.processed[['Date', 'Close', target_col, pred_col_name]].dropna().tail(10)

Unnamed: 0,Date,Close,Target 3D,Pred Target 3D
1036,2024-02-08,1403.05,1394.45,1404.99
1037,2024-02-09,1403.6,1384.05,1405.42
1038,2024-02-12,1390.0,1414.05,1397.46
1039,2024-02-13,1394.45,1419.9,1400.51
1040,2024-02-14,1384.05,1417.1,1390.59
1041,2024-02-15,1414.05,1454.3,1413.9
1042,2024-02-16,1419.9,1439.15,1417.67
1043,2024-02-19,1417.1,1419.55,1414.87
1044,2024-02-20,1454.3,1420.6,1452.97
1045,2024-02-21,1439.15,1422.3,1437.37


### `Target 7D`

In [16]:
# Work on the second target column and build its training set.
target_col = target_cols[1]
X, y = get_training_data(target_col)

Target: Target 7D
X.shape: (1042, 43)
y.shape: (1042,)


In [17]:
# `fit` returns the fitted search object, so build and fit in one step.
model = get_model().fit(X, y)
model.best_params_

{'n_estimators': 100,
 'max_samples': 0.75,
 'max_features': 'log2',
 'max_depth': 4}

In [18]:
# Five best-ranked parameter settings from the search.
pd.DataFrame(model.cv_results_).sort_values(by = 'rank_test_score').head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_samples,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
9,0.314068,0.011068,0.044109,0.001671,100,0.75,log2,4,"{'n_estimators': 100, 'max_samples': 0.75, 'ma...",-0.021028,-0.075999,-0.046217,-0.1193,-0.046708,-0.06185,0.033588,1
11,0.296377,0.017627,0.043978,0.001168,100,0.75,0.25,4,"{'n_estimators': 100, 'max_samples': 0.75, 'ma...",-0.051373,-0.101307,-0.056445,-0.104872,-0.063652,-0.07553,0.022866,2
8,0.347106,0.018807,0.04271,0.000586,100,1.0,sqrt,4,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",-0.027042,-0.102422,-0.076779,-0.131414,-0.066854,-0.080902,0.035014,3
4,0.265757,0.026446,0.034689,0.005876,75,0.75,log2,5,"{'n_estimators': 75, 'max_samples': 0.75, 'max...",-0.064413,-0.11044,-0.076016,-0.120589,-0.054299,-0.085152,0.025926,4
3,0.226297,0.022795,0.035369,0.006966,75,1.0,0.25,4,"{'n_estimators': 75, 'max_samples': 1.0, 'max_...",-0.027189,-0.149166,-0.046587,-0.141099,-0.072156,-0.087239,0.049441,5


In [19]:
# Store this target's importances as a column, aligned on feature name.
# Column assignment keeps the cell re-runnable: `DataFrame.join` raises on a
# second run because the column name already exists.
feature_importances[target_col] = pd.Series(
    model.best_estimator_.feature_importances_,
    index = model.best_estimator_.feature_names_in_
)

# In-sample metrics: X is the same data the search was fitted on. The
# held-out scores are the ones in `model.cv_results_`.
preds = model.predict(X)
print_results(y, preds)

Target std: 0.046
R2: 0.285
MSE: 0.039
MAE: 0.029


In [20]:
pred_col_name = f'Pred {target_col}'
# `preds` are the predictions on the training rows, so this error is in-sample.
expected_errors[pred_col_name] = expected_error(y, preds)

# Predict for every row, including those whose target is still unknown.
all_features = stock_data.standardized.drop(columns = target_cols)
pred_factor = model.predict(all_features)

# Scale the model output by Close and round to 2 decimals.
# NOTE(review): presumably the standardized target is relative to Close — confirm.
stock_data.processed[pred_col_name] = (
    pred_factor * stock_data.processed['Close']
).round(2)

stock_data.processed[['Date', 'Close', target_col, pred_col_name]].dropna().tail(10)

Unnamed: 0,Date,Close,Target 7D,Pred Target 7D
1032,2024-02-02,1446.15,1394.45,1440.34
1033,2024-02-05,1444.85,1384.05,1433.36
1034,2024-02-06,1444.1,1414.05,1432.89
1035,2024-02-07,1429.95,1419.9,1426.68
1036,2024-02-08,1403.05,1417.1,1408.11
1037,2024-02-09,1403.6,1454.3,1407.89
1038,2024-02-12,1390.0,1439.15,1391.19
1039,2024-02-13,1394.45,1419.55,1398.94
1040,2024-02-14,1384.05,1420.6,1393.02
1041,2024-02-15,1414.05,1422.3,1407.84


### `Target 15D`

In [21]:
# Work on the third target column and build its training set.
target_col = target_cols[2]
X, y = get_training_data(target_col)

Target: Target 15D
X.shape: (1034, 43)
y.shape: (1034,)


In [22]:
# `fit` returns the fitted search object, so build and fit in one step.
model = get_model().fit(X, y)
model.best_params_

{'n_estimators': 100,
 'max_samples': 1.0,
 'max_features': 'sqrt',
 'max_depth': 4}

In [23]:
# Five best-ranked parameter settings from the search.
pd.DataFrame(model.cv_results_).sort_values(by = 'rank_test_score').head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_samples,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
8,0.309437,0.026857,0.041742,0.000683,100,1.0,sqrt,4,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",0.040532,-0.197437,-0.285222,-0.311817,-0.158178,-0.182424,0.124766,1
0,0.382564,0.070662,0.047246,0.009939,100,1.0,log2,5,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",0.014874,-0.215506,-0.277654,-0.281439,-0.191014,-0.190148,0.108323,2
9,0.286574,0.002462,0.043276,0.001373,100,0.75,log2,4,"{'n_estimators': 100, 'max_samples': 0.75, 'ma...",0.031235,-0.241635,-0.247271,-0.352181,-0.175167,-0.197004,0.127412,3
3,0.224104,0.003142,0.03121,0.001632,75,1.0,0.25,4,"{'n_estimators': 75, 'max_samples': 1.0, 'max_...",0.006445,-0.260519,-0.268535,-0.347639,-0.207788,-0.215607,0.119682,4
4,0.246008,0.012191,0.033298,0.002309,75,0.75,log2,5,"{'n_estimators': 75, 'max_samples': 0.75, 'max...",0.02637,-0.305454,-0.274615,-0.379733,-0.226326,-0.231952,0.138437,5


In [24]:
# Store this target's importances as a column, aligned on feature name.
# Column assignment keeps the cell re-runnable: `DataFrame.join` raises on a
# second run because the column name already exists.
feature_importances[target_col] = pd.Series(
    model.best_estimator_.feature_importances_,
    index = model.best_estimator_.feature_names_in_
)

# In-sample metrics: X is the same data the search was fitted on. The
# held-out scores are the ones in `model.cv_results_`.
preds = model.predict(X)
print_results(y, preds)

Target std: 0.065
R2: 0.436
MSE: 0.049
MAE: 0.038


In [25]:
pred_col_name = f'Pred {target_col}'
# `preds` are the predictions on the training rows, so this error is in-sample.
expected_errors[pred_col_name] = expected_error(y, preds)

# Predict for every row, including those whose target is still unknown.
all_features = stock_data.standardized.drop(columns = target_cols)
pred_factor = model.predict(all_features)

# Scale the model output by Close and round to 2 decimals.
# NOTE(review): presumably the standardized target is relative to Close — confirm.
stock_data.processed[pred_col_name] = (
    pred_factor * stock_data.processed['Close']
).round(2)

stock_data.processed[['Date', 'Close', target_col, pred_col_name]].dropna().tail(10)

Unnamed: 0,Date,Close,Target 15D,Pred Target 15D
1024,2024-01-20,1478.85,1394.45,1456.39
1025,2024-01-23,1427.35,1384.05,1389.6
1026,2024-01-24,1455.9,1414.05,1429.46
1027,2024-01-25,1434.9,1419.9,1414.65
1028,2024-01-29,1454.65,1417.1,1437.05
1029,2024-01-30,1444.3,1454.3,1425.68
1030,2024-01-31,1462.55,1439.15,1444.31
1031,2024-02-01,1466.35,1419.55,1444.75
1032,2024-02-02,1446.15,1420.6,1426.7
1033,2024-02-05,1444.85,1422.3,1421.56


### `Target 30D`

In [26]:
# Work on the fourth target column and build its training set.
target_col = target_cols[3]
X, y = get_training_data(target_col)

Target: Target 30D
X.shape: (1019, 43)
y.shape: (1019,)


In [27]:
# `fit` returns the fitted search object, so build and fit in one step.
model = get_model().fit(X, y)
model.best_params_

{'n_estimators': 100,
 'max_samples': 0.75,
 'max_features': 'log2',
 'max_depth': 4}

In [28]:
# Five best-ranked parameter settings from the search.
pd.DataFrame(model.cv_results_).sort_values(by = 'rank_test_score').head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_samples,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
9,0.30034,0.024595,0.041821,0.00052,100,0.75,log2,4,"{'n_estimators': 100, 'max_samples': 0.75, 'ma...",-0.018913,-0.207718,-0.637816,-0.363467,-0.219846,-0.289552,0.205695,1
8,0.287435,0.003303,0.043354,0.001848,100,1.0,sqrt,4,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",0.031488,-0.293939,-0.674412,-0.374733,-0.174264,-0.297172,0.233006,2
4,0.254899,0.008807,0.031196,0.000623,75,0.75,log2,5,"{'n_estimators': 75, 'max_samples': 0.75, 'max...",-0.007036,-0.156361,-0.737013,-0.403118,-0.193908,-0.299487,0.252716,3
0,0.362978,0.074204,0.047416,0.007967,100,1.0,log2,5,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",-0.01758,-0.253744,-0.770939,-0.344493,-0.201081,-0.317567,0.250561,4
11,0.306517,0.019499,0.041355,0.002984,100,0.75,0.25,4,"{'n_estimators': 100, 'max_samples': 0.75, 'ma...",0.003033,-0.296588,-0.818814,-0.277989,-0.206912,-0.319454,0.271391,5


In [29]:
# Store this target's importances as a column, aligned on feature name.
# Column assignment keeps the cell re-runnable: `DataFrame.join` raises on a
# second run because the column name already exists.
feature_importances[target_col] = pd.Series(
    model.best_estimator_.feature_importances_,
    index = model.best_estimator_.feature_names_in_
)

# In-sample metrics: X is the same data the search was fitted on. The
# held-out scores are the ones in `model.cv_results_`.
preds = model.predict(X)
print_results(y, preds)

Target std: 0.089
R2: 0.545
MSE: 0.060
MAE: 0.047


In [30]:
pred_col_name = f'Pred {target_col}'
# `preds` are the predictions on the training rows, so this error is in-sample.
expected_errors[pred_col_name] = expected_error(y, preds)

# Predict for every row, including those whose target is still unknown.
all_features = stock_data.standardized.drop(columns = target_cols)
pred_factor = model.predict(all_features)

# Scale the model output by Close and round to 2 decimals.
# NOTE(review): presumably the standardized target is relative to Close — confirm.
stock_data.processed[pred_col_name] = (
    pred_factor * stock_data.processed['Close']
).round(2)

stock_data.processed[['Date', 'Close', target_col, pred_col_name]].dropna().tail(10)

Unnamed: 0,Date,Close,Target 30D,Pred Target 30D
1009,2024-01-01,1698.1,1394.45,1583.27
1010,2024-01-02,1699.1,1384.05,1584.76
1011,2024-01-03,1672.9,1414.05,1562.8
1012,2024-01-04,1690.85,1419.9,1580.44
1013,2024-01-05,1682.2,1417.1,1579.23
1014,2024-01-08,1663.45,1454.3,1567.54
1015,2024-01-09,1650.5,1439.15,1554.65
1016,2024-01-10,1655.95,1419.55,1557.94
1017,2024-01-11,1649.0,1420.6,1552.8
1018,2024-01-12,1641.2,1422.3,1546.11


## Feature importances

In [31]:
# Row-wise average importance across the columns, then show everything as
# percentages (one decimal), highest average first.
feature_importances['Mean'] = feature_importances.mean(axis = 'columns')
feature_importances.sort_values(by = 'Mean', ascending = False).mul(100).round(1)

Unnamed: 0,Target 3D,Target 7D,Target 15D,Target 30D,Mean
52W H,13.1,7.6,8.1,9.0,9.5
Range 60MA,4.3,9.9,6.1,8.9,7.3
DayOfYear,2.8,5.2,9.7,9.9,6.9
PE,4.3,6.0,5.3,10.5,6.5
Month,0.6,3.0,6.7,6.5,4.2
DaysSinceLastTradingSession,0.1,0.4,8.4,7.5,4.1
Close 60MA,6.6,3.6,3.2,2.2,3.9
VWAP 60MA,5.2,3.0,3.1,2.2,3.4
52W L,0.7,2.6,5.3,4.8,3.3
Range 7MA,2.7,5.1,3.2,1.8,3.2


## Forecasts

In [32]:
# Date, Close and every prediction column for the last ten rows.
stock_data.processed.filter(regex = "(Date)|(Close$)|(Pred.*)").tail(10)

Unnamed: 0,Date,Close,Pred Target 3D,Pred Target 7D,Pred Target 15D,Pred Target 30D
1039,2024-02-13,1394.45,1400.51,1398.94,1371.33,1385.14
1040,2024-02-14,1384.05,1390.59,1393.02,1367.27,1364.6
1041,2024-02-15,1414.05,1413.9,1407.84,1387.91,1404.11
1042,2024-02-16,1419.9,1417.67,1411.66,1392.79,1412.51
1043,2024-02-19,1417.1,1414.87,1411.84,1389.13,1408.57
1044,2024-02-20,1454.3,1452.97,1446.76,1423.22,1433.83
1045,2024-02-21,1439.15,1437.37,1432.88,1409.56,1428.34
1046,2024-02-22,1419.55,1418.07,1426.87,1394.43,1410.04
1047,2024-02-23,1420.6,1418.55,1418.62,1392.32,1414.98
1048,2024-02-26,1422.3,1421.12,1422.61,1393.7,1413.89


In [33]:
# Last row of the processed frame: the most recent record.
latest_preds = stock_data.processed.iloc[-1]
latest_close = latest_preds['Close']

print(f"Date: {latest_preds['Date'].date()}")
print(f"Close: {latest_close}")

# The recorded error is multiplied by the latest Close to give the band
# printed after "±".
for pred, relative_error in expected_errors.items():
    print(f"{pred}: {latest_preds[pred]} ± {relative_error * latest_close:.2f}")

Date: 2024-02-26
Close: 1422.3
Pred Target 3D: 1421.12 ± 38.84
Pred Target 7D: 1422.61 ± 55.71
Pred Target 15D: 1393.7 ± 69.63
Pred Target 30D: 1413.89 ± 84.92
