# 04 - Forecasting using Random Forests

In [1]:
# Environment sanity check: display the path of the Python interpreter
# that the kernel is running.
import sys
sys.executable

'/usr/local/bin/python'

## Imports

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

import constants as cnst
import stock_utils as su

# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

NSE_DATA_DIR = PosixPath('../data/NSE') | Valid: True
SCREENER_EPS_DATA_DIR = PosixPath('../data/Screener.in/EPS') | Valid: True
PROCESSED_DATA_DIR = PosixPath('../data/processed') | Valid: True


## Constants

In [3]:
# Symbols returned by the project helper for the NSE data directory
# (presumably one per data file found there -- confirm in stock_utils).
stock_symbols = su.get_all_stock_symbols(
    cnst.NSE_DATA_DIR
)

stock_symbols

['DEEPAKFERT', 'HDFCBANK', 'INDIGOPNTS', 'ITBEES', 'JUBLFOOD']

In [4]:
# Stock modelled in this notebook, chosen by position in `stock_symbols`.
# NOTE(review): the selection is positional, so it changes if the list of
# available symbols changes.
STOCK_SYMBOL = stock_symbols[1]
STOCK_SYMBOL

'HDFCBANK'

## Loading stock data

In [5]:
# Load the selected symbol through the project's StockData wrapper; later
# cells read its `.standardized` and `.processed` DataFrames.
stock_data = su.StockData(STOCK_SYMBOL)
stock_data

Symbol: HDFCBANK
Total records: 1049
First record: 2020-01-01
Last record: 2024-02-26

## Modelling

### Target columns

In [6]:
# Every column whose name matches the regex "Target.*" is treated as a
# prediction target; all remaining columns are used as features.
target_cols = stock_data.standardized.filter(regex = "Target.*").columns.to_list()
target_cols

['Target 3D', 'Target 7D', 'Target 15D', 'Target 30D']

In [7]:
# Summary statistics of the targets; the differing counts show how many
# rows have a non-null value for each target.
stock_data.standardized[target_cols].describe()

Unnamed: 0,Target 3D,Target 7D,Target 15D,Target 30D
count,1046.0,1042.0,1034.0,1019.0
mean,1.000772,1.001874,1.003761,1.008427
std,0.030943,0.046347,0.065198,0.08858
min,0.82,0.718,0.668,0.629
25%,0.985,0.977,0.967,0.9595
50%,1.0,1.002,1.0015,1.006
75%,1.01675,1.025,1.042,1.058
max,1.178,1.219,1.231,1.322


### Data processing

In [8]:
def get_training_data(target_col: str):
    """Build the feature frame and target series for one target column.

    Only rows of `stock_data.standardized` where `target_col` is not null
    are kept. Every column listed in the global `target_cols` is removed
    from the features, so no target leaks into the feature frame.

    The target name and the resulting shapes are printed as a side effect.

    Args:
        target_col: name of the target column to train against.

    Returns:
        Tuple `(X_df, y)`: the feature DataFrame and the target Series,
        both independent copies of the underlying data.
    """
    print(f"Target: {target_col}")

    standardized = stock_data.standardized
    has_target = standardized[target_col].notnull()
    labelled_rows = standardized[has_target]

    X_df = labelled_rows.drop(columns = target_cols).copy()
    y = labelled_rows[target_col].copy()

    print(f"X.shape: {X_df.shape}")
    print(f"y.shape: {y.shape}")

    return X_df, y

### Randomized search parameters

In [9]:
# Hyperparameter candidates sampled by the randomized search built in
# get_model() (3 * 3 * 3 * 2 = 54 possible combinations).
param_dict = {
    "n_estimators": [75, 100, 125],
    "max_depth": [4, 5, 6],
    "max_features": ["log2", "sqrt", 0.25],
    "max_samples": [0.75, 1.0]
}

# One row per feature (every non-target column); a column of importances
# is added for each target once its model has been trained.
feature_importances = pd.DataFrame(
    index = stock_data.standardized.drop(columns = target_cols).columns.to_list()
)

# Maps 'Pred <target>' -> value returned by expected_error() for that target.
expected_errors = {}

### Model building

In [10]:
def get_model():
    """Create an unfitted randomized hyperparameter search over a random forest.

    The base estimator is a `RandomForestRegressor` using the squared-error
    criterion and all available cores. The search samples 12 candidates from
    the global `param_dict` and evaluates each with `cv = 5`. Both the forest
    and the candidate sampling are seeded with `cnst.RANDOM_STATE`.

    Returns:
        A `RandomizedSearchCV` instance ready to be fitted.
    """
    base_forest = RandomForestRegressor(
        random_state = cnst.RANDOM_STATE,
        n_jobs = -1,
        criterion = "squared_error"
    )

    search_settings = {
        "n_iter": 12,
        "cv": 5,
        "random_state": cnst.RANDOM_STATE
    }

    return RandomizedSearchCV(base_forest, param_dict, **search_settings)

def print_results(y, preds):
    """Print summary fit metrics for predictions against the true targets.

    Prints the standard deviation of `y` (a reference for the error scale),
    R2, RMSE and MAE, each formatted with three decimals.

    Args:
        y: true target values (a single target column).
        preds: predicted values, same length as `y`.
    """
    # RMSE is computed as sqrt(MSE) rather than via `squared = False`: that
    # keyword was deprecated in scikit-learn 1.4 and removed in later releases.
    rmse = mean_squared_error(y, preds) ** 0.5

    print(f"Target std: {y.std():.3f}")
    print(f"R2: {r2_score(y, preds):.3f}")
    # The value is a root-mean-squared error, so it is labelled RMSE.
    print(f"RMSE: {rmse:.3f}")
    print(f"MAE: {mean_absolute_error(y, preds):.3f}")

def expected_error(y, preds):
    """Return the root-mean-squared error between `y` and `preds`.

    Args:
        y: true target values (a single target column).
        preds: predicted values, same length as `y`.

    Returns:
        The RMSE as a float, in the same units as `y`.
    """
    # sqrt(MSE) instead of `squared = False`, which was deprecated in
    # scikit-learn 1.4 and removed in later releases.
    return mean_squared_error(y, preds) ** 0.5

## Training

### `Target 3D`

In [11]:
# Select the first target column and build its feature frame / target series.
target_col = target_cols[0]
X, y = get_training_data(target_col)

Target: Target 3D
X.shape: (1046, 42)
y.shape: (1046,)


In [12]:
# Run the randomized hyperparameter search for this target and show the
# best configuration that was sampled.
model = get_model()
model.fit(X, y)
model.best_params_

{'n_estimators': 100,
 'max_samples': 0.75,
 'max_features': 0.25,
 'max_depth': 4}

In [13]:
# Five best-ranked candidates from the search (rank 1 = best mean test score).
pd.DataFrame(model.cv_results_).sort_values('rank_test_score').head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_samples,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
11,0.270165,0.01252,0.039167,0.00457,100,0.75,0.25,4,"{'n_estimators': 100, 'max_samples': 0.75, 'ma...",-0.019328,-0.021522,-0.012225,-0.045927,-0.032951,-0.026391,0.011823,1
9,0.267341,0.014625,0.042444,0.001973,100,0.75,log2,4,"{'n_estimators': 100, 'max_samples': 0.75, 'ma...",-0.032314,-0.021108,-0.022319,-0.048712,-0.021999,-0.02929,0.010536,2
4,0.211445,0.011875,0.030124,0.000602,75,0.75,log2,5,"{'n_estimators': 75, 'max_samples': 0.75, 'max...",-0.034964,-0.027685,-0.009729,-0.051164,-0.032439,-0.031196,0.013324,3
3,0.199006,0.008069,0.030368,0.001574,75,1.0,0.25,4,"{'n_estimators': 75, 'max_samples': 1.0, 'max_...",-0.014437,-0.025811,-0.019361,-0.041703,-0.054957,-0.031254,0.015,4
8,0.265937,0.011007,0.051622,0.020499,100,1.0,sqrt,4,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",-0.032832,-0.018382,-0.013327,-0.0369,-0.067608,-0.03381,0.019028,5


In [14]:
# Store this target's importances as a column, aligned on feature name.
# Column assignment is used instead of DataFrame.join so the cell can be
# re-run: join raises on an already-existing column name, whereas assignment
# simply overwrites the column.
feature_importances[target_col] = pd.Series(
    model.best_estimator_.feature_importances_,
    index = model.best_estimator_.feature_names_in_
)

# NOTE(review): X is the same data the search was fitted on, so the metrics
# printed here are in-sample; compare them with the cross-validated scores
# in `model.cv_results_` before relying on them.
preds = model.predict(X)
print_results(y, preds)

Target std: 0.031
R2: 0.221
MSE: 0.027
MAE: 0.020


In [15]:
# Name of the column that will hold this target's predictions, and the
# error estimate recorded under the same name.
pred_col_name = f'Pred {target_col}'
expected_errors[pred_col_name] = expected_error(y, preds)

# Predict for every row of the standardized data and scale the prediction
# by the close price, rounded to two decimals.
stock_data.processed[pred_col_name] = stock_data.processed['Close'].mul(
    model.predict(stock_data.standardized.drop(columns = target_cols))
).round(2)

# Last ten rows with no missing value in any of the displayed columns.
stock_data.processed[
    ['Date', 'Close', target_col, pred_col_name]
].dropna().tail(10)

Unnamed: 0,Date,Close,Target 3D,Pred Target 3D
1036,2024-02-08,1403.05,1394.45,1402.63
1037,2024-02-09,1403.6,1384.05,1402.19
1038,2024-02-12,1390.0,1414.05,1396.76
1039,2024-02-13,1394.45,1419.9,1394.84
1040,2024-02-14,1384.05,1417.1,1387.28
1041,2024-02-15,1414.05,1454.3,1412.28
1042,2024-02-16,1419.9,1439.15,1417.4
1043,2024-02-19,1417.1,1419.55,1414.63
1044,2024-02-20,1454.3,1420.6,1448.95
1045,2024-02-21,1439.15,1422.3,1436.6


### `Target 7D`

In [16]:
# Select the second target column and build its feature frame / target series.
target_col = target_cols[1]
X, y = get_training_data(target_col)

Target: Target 7D
X.shape: (1042, 42)
y.shape: (1042,)


In [17]:
# Run the randomized hyperparameter search for this target and show the
# best configuration that was sampled.
model = get_model()
model.fit(X, y)
model.best_params_

{'n_estimators': 100,
 'max_samples': 0.75,
 'max_features': 'log2',
 'max_depth': 4}

In [18]:
# Five best-ranked candidates from the search (rank 1 = best mean test score).
pd.DataFrame(model.cv_results_).sort_values('rank_test_score').head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_samples,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
9,0.296791,0.011041,0.041865,0.00345,100,0.75,log2,4,"{'n_estimators': 100, 'max_samples': 0.75, 'ma...",-0.03293,-0.110657,-0.080167,-0.106163,-0.019988,-0.069981,0.037255,1
8,0.323118,0.035729,0.044191,0.006025,100,1.0,sqrt,4,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",-0.034311,-0.118398,-0.078624,-0.103745,-0.035913,-0.074198,0.03436,2
11,0.31655,0.013435,0.042961,0.001197,100,0.75,0.25,4,"{'n_estimators': 100, 'max_samples': 0.75, 'ma...",-0.034203,-0.159479,-0.053992,-0.12008,-0.045941,-0.082739,0.04862,3
1,0.291925,0.027797,0.036992,0.005406,75,1.0,sqrt,5,"{'n_estimators': 75, 'max_samples': 1.0, 'max_...",-0.059681,-0.154528,-0.095193,-0.111438,-0.035882,-0.091344,0.041205,4
4,0.2454,0.00672,0.033081,0.004064,75,0.75,log2,5,"{'n_estimators': 75, 'max_samples': 0.75, 'max...",-0.040306,-0.164764,-0.105982,-0.131762,-0.020261,-0.092615,0.054567,5


In [19]:
# Store this target's importances as a column, aligned on feature name.
# Column assignment is used instead of DataFrame.join so the cell can be
# re-run: join raises on an already-existing column name, whereas assignment
# simply overwrites the column.
feature_importances[target_col] = pd.Series(
    model.best_estimator_.feature_importances_,
    index = model.best_estimator_.feature_names_in_
)

# NOTE(review): X is the same data the search was fitted on, so the metrics
# printed here are in-sample; compare them with the cross-validated scores
# in `model.cv_results_` before relying on them.
preds = model.predict(X)
print_results(y, preds)

Target std: 0.046
R2: 0.287
MSE: 0.039
MAE: 0.029


In [20]:
# Name of the column that will hold this target's predictions, and the
# error estimate recorded under the same name.
pred_col_name = f'Pred {target_col}'
expected_errors[pred_col_name] = expected_error(y, preds)

# Predict for every row of the standardized data and scale the prediction
# by the close price, rounded to two decimals.
stock_data.processed[pred_col_name] = stock_data.processed['Close'].mul(
    model.predict(stock_data.standardized.drop(columns = target_cols))
).round(2)

# Last ten rows with no missing value in any of the displayed columns.
stock_data.processed[
    ['Date', 'Close', target_col, pred_col_name]
].dropna().tail(10)

Unnamed: 0,Date,Close,Target 7D,Pred Target 7D
1032,2024-02-02,1446.15,1394.45,1438.56
1033,2024-02-05,1444.85,1384.05,1435.64
1034,2024-02-06,1444.1,1414.05,1432.01
1035,2024-02-07,1429.95,1419.9,1426.21
1036,2024-02-08,1403.05,1417.1,1412.09
1037,2024-02-09,1403.6,1454.3,1412.54
1038,2024-02-12,1390.0,1439.15,1398.11
1039,2024-02-13,1394.45,1419.55,1403.42
1040,2024-02-14,1384.05,1420.6,1386.82
1041,2024-02-15,1414.05,1422.3,1410.53


### `Target 15D`

In [21]:
# Select the third target column and build its feature frame / target series.
target_col = target_cols[2]
X, y = get_training_data(target_col)

Target: Target 15D
X.shape: (1034, 42)
y.shape: (1034,)


In [22]:
# Run the randomized hyperparameter search for this target and show the
# best configuration that was sampled.
model = get_model()
model.fit(X, y)
model.best_params_

{'n_estimators': 100,
 'max_samples': 0.75,
 'max_features': 'log2',
 'max_depth': 4}

In [23]:
# Five best-ranked candidates from the search (rank 1 = best mean test score).
pd.DataFrame(model.cv_results_).sort_values('rank_test_score').head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_samples,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
9,0.302965,0.01953,0.043503,0.001179,100,0.75,log2,4,"{'n_estimators': 100, 'max_samples': 0.75, 'ma...",0.009085,-0.194554,-0.26167,-0.35707,-0.111869,-0.183216,0.125363,1
8,0.29818,0.010406,0.043273,0.003184,100,1.0,sqrt,4,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",0.013014,-0.216282,-0.272611,-0.377861,-0.137009,-0.19815,0.131528,2
4,0.271843,0.021393,0.032411,0.001233,75,0.75,log2,5,"{'n_estimators': 75, 'max_samples': 0.75, 'max...",0.008348,-0.159439,-0.334981,-0.315715,-0.192275,-0.198812,0.123868,3
0,0.395202,0.064694,0.047619,0.004686,100,1.0,log2,5,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",-0.000501,-0.220673,-0.278967,-0.320393,-0.197104,-0.203528,0.110366,4
11,0.341055,0.016997,0.050369,0.006646,100,0.75,0.25,4,"{'n_estimators': 100, 'max_samples': 0.75, 'ma...",0.05208,-0.294063,-0.26231,-0.382759,-0.191988,-0.215808,0.147302,5


In [24]:
# Store this target's importances as a column, aligned on feature name.
# Column assignment is used instead of DataFrame.join so the cell can be
# re-run: join raises on an already-existing column name, whereas assignment
# simply overwrites the column.
feature_importances[target_col] = pd.Series(
    model.best_estimator_.feature_importances_,
    index = model.best_estimator_.feature_names_in_
)

# NOTE(review): X is the same data the search was fitted on, so the metrics
# printed here are in-sample; compare them with the cross-validated scores
# in `model.cv_results_` before relying on them.
preds = model.predict(X)
print_results(y, preds)

Target std: 0.065
R2: 0.413
MSE: 0.050
MAE: 0.039


In [25]:
# Name of the column that will hold this target's predictions, and the
# error estimate recorded under the same name.
pred_col_name = f'Pred {target_col}'
expected_errors[pred_col_name] = expected_error(y, preds)

# Predict for every row of the standardized data and scale the prediction
# by the close price, rounded to two decimals.
stock_data.processed[pred_col_name] = stock_data.processed['Close'].mul(
    model.predict(stock_data.standardized.drop(columns = target_cols))
).round(2)

# Last ten rows with no missing value in any of the displayed columns.
stock_data.processed[
    ['Date', 'Close', target_col, pred_col_name]
].dropna().tail(10)

Unnamed: 0,Date,Close,Target 15D,Pred Target 15D
1024,2024-01-20,1478.85,1394.45,1444.63
1025,2024-01-23,1427.35,1384.05,1375.66
1026,2024-01-24,1455.9,1414.05,1427.91
1027,2024-01-25,1434.9,1419.9,1404.85
1028,2024-01-29,1454.65,1417.1,1431.8
1029,2024-01-30,1444.3,1454.3,1421.64
1030,2024-01-31,1462.55,1439.15,1434.88
1031,2024-02-01,1466.35,1419.55,1432.93
1032,2024-02-02,1446.15,1420.6,1415.09
1033,2024-02-05,1444.85,1422.3,1408.98


### `Target 30D`

In [26]:
# Select the fourth target column and build its feature frame / target series.
target_col = target_cols[3]
X, y = get_training_data(target_col)

Target: Target 30D
X.shape: (1019, 42)
y.shape: (1019,)


In [27]:
# Run the randomized hyperparameter search for this target and show the
# best configuration that was sampled.
model = get_model()
model.fit(X, y)
model.best_params_

{'n_estimators': 100,
 'max_samples': 1.0,
 'max_features': 'sqrt',
 'max_depth': 4}

In [28]:
# Five best-ranked candidates from the search (rank 1 = best mean test score).
pd.DataFrame(model.cv_results_).sort_values('rank_test_score').head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_samples,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
8,0.369484,0.047072,0.05007,0.006901,100,1.0,sqrt,4,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",-0.013865,-0.194606,-0.678875,-0.308763,-0.158778,-0.270978,0.224649,1
9,0.340228,0.013707,0.047091,0.005511,100,0.75,log2,4,"{'n_estimators': 100, 'max_samples': 0.75, 'ma...",0.010018,-0.243402,-0.623067,-0.34536,-0.155628,-0.271488,0.210961,2
0,0.422674,0.088246,0.048402,0.005345,100,1.0,log2,5,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",-0.083865,-0.187196,-0.797842,-0.308726,-0.1107,-0.297666,0.26198,3
1,0.247257,0.01304,0.032997,0.00498,75,1.0,sqrt,5,"{'n_estimators': 75, 'max_samples': 1.0, 'max_...",-0.030525,-0.298846,-0.766485,-0.281514,-0.145247,-0.304523,0.250764,4
5,0.447679,0.023664,0.055891,0.002815,125,1.0,sqrt,5,"{'n_estimators': 125, 'max_samples': 1.0, 'max...",-0.015921,-0.30558,-0.817705,-0.291289,-0.152504,-0.3166,0.271737,5


In [29]:
# Store this target's importances as a column, aligned on feature name.
# Column assignment is used instead of DataFrame.join so the cell can be
# re-run: join raises on an already-existing column name, whereas assignment
# simply overwrites the column.
feature_importances[target_col] = pd.Series(
    model.best_estimator_.feature_importances_,
    index = model.best_estimator_.feature_names_in_
)

# NOTE(review): X is the same data the search was fitted on, so the metrics
# printed here are in-sample; compare them with the cross-validated scores
# in `model.cv_results_` before relying on them.
preds = model.predict(X)
print_results(y, preds)

Target std: 0.089
R2: 0.566
MSE: 0.058
MAE: 0.046


In [30]:
# Name of the column that will hold this target's predictions, and the
# error estimate recorded under the same name.
pred_col_name = f'Pred {target_col}'
expected_errors[pred_col_name] = expected_error(y, preds)

# Predict for every row of the standardized data and scale the prediction
# by the close price, rounded to two decimals.
stock_data.processed[pred_col_name] = stock_data.processed['Close'].mul(
    model.predict(stock_data.standardized.drop(columns = target_cols))
).round(2)

# Last ten rows with no missing value in any of the displayed columns.
stock_data.processed[
    ['Date', 'Close', target_col, pred_col_name]
].dropna().tail(10)

Unnamed: 0,Date,Close,Target 30D,Pred Target 30D
1009,2024-01-01,1698.1,1394.45,1555.5
1010,2024-01-02,1699.1,1384.05,1548.74
1011,2024-01-03,1672.9,1414.05,1536.42
1012,2024-01-04,1690.85,1419.9,1544.91
1013,2024-01-05,1682.2,1417.1,1546.82
1014,2024-01-08,1663.45,1454.3,1530.25
1015,2024-01-09,1650.5,1439.15,1519.72
1016,2024-01-10,1655.95,1419.55,1526.09
1017,2024-01-11,1649.0,1420.6,1525.22
1018,2024-01-12,1641.2,1422.3,1501.76


## Feature importances

In [31]:
# Average each feature's importance across the columns collected so far,
# then show the table as percentages sorted by that average.
feature_importances['Mean'] = feature_importances.mean(axis = 1)
feature_importances.sort_values('Mean', ascending = False).mul(100).round(1)

Unnamed: 0,Target 3D,Target 7D,Target 15D,Target 30D,Mean
52W H,11.1,8.0,8.0,10.2,9.3
Range 60MA,4.5,9.2,6.4,10.3,7.6
DayOfYear,3.4,6.2,9.5,10.1,7.3
PE,2.4,6.7,3.3,8.6,5.2
Month,0.8,2.9,4.5,8.9,4.3
Range 30MA,2.5,3.0,3.0,5.9,3.6
VWAP 60MA,7.1,2.5,2.2,2.5,3.6
52W L,1.6,2.7,3.8,5.7,3.5
DaysSinceLastTradingSession,0.3,0.4,8.4,4.5,3.4
Range 15MA,3.0,3.2,4.5,2.8,3.4


## Forecasts

In [32]:
# Date, close price and every prediction column for the ten most recent rows.
stock_data.processed.filter(regex = "(Date)|(Close$)|(Pred.*)").tail(10)

Unnamed: 0,Date,Close,Pred Target 3D,Pred Target 7D,Pred Target 15D,Pred Target 30D
1039,2024-02-13,1394.45,1394.84,1403.42,1369.28,1355.84
1040,2024-02-14,1384.05,1387.28,1386.82,1357.87,1344.51
1041,2024-02-15,1414.05,1412.28,1410.53,1380.65,1379.96
1042,2024-02-16,1419.9,1417.4,1413.3,1380.96,1391.44
1043,2024-02-19,1417.1,1414.63,1411.54,1380.72,1387.63
1044,2024-02-20,1454.3,1448.95,1447.4,1413.57,1413.83
1045,2024-02-21,1439.15,1436.6,1437.43,1401.06,1404.44
1046,2024-02-22,1419.55,1416.02,1425.17,1384.0,1392.53
1047,2024-02-23,1420.6,1418.47,1418.49,1383.91,1391.3
1048,2024-02-26,1422.3,1417.85,1419.63,1379.94,1391.24


In [33]:
# Most recent row of the processed data.
latest_preds = stock_data.processed.iloc[-1]
print(f"Date: {latest_preds['Date'].date()}")
print(f"Close: {latest_preds['Close']}")

# The band printed after each forecast is the value stored in
# `expected_errors` for that target, scaled by the latest close price.
# NOTE(review): those values were computed from predictions on the training
# rows, so the band is an in-sample estimate.
for pred in expected_errors:
    print(f"{pred}: {latest_preds[pred]} ± {expected_errors[pred] * latest_preds['Close']:.2f}")

Date: 2024-02-26
Close: 1422.3
Pred Target 3D: 1417.85 ± 38.82
Pred Target 7D: 1419.63 ± 55.65
Pred Target 15D: 1379.94 ± 71.03
Pred Target 30D: 1391.24 ± 82.94
