# 04 - Forecasting using Random Forests

In [1]:
import sys
sys.executable

'/usr/local/bin/python'

## Imports

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score, root_mean_squared_error, mean_absolute_error

import stock_utils.constants as cnst
import stock_utils.data as sd

pd.set_option('display.max_columns', None)

NSE_DATA_DIR = PosixPath('../data/NSE') | Valid: True
SCREENER_EPS_DATA_DIR = PosixPath('../data/Screener.in/EPS') | Valid: True
PROCESSED_DATA_DIR = PosixPath('../data/processed') | Valid: True


## Constants

In [3]:
stock_symbols = sd.get_all_stock_symbols(
    cnst.NSE_DATA_DIR
)

stock_symbols

['DEEPAKFERT', 'HDFCBANK', 'INDIGOPNTS', 'ITBEES', 'JUBLFOOD', 'NH']

In [4]:
STOCK_SYMBOL = stock_symbols[1]
STOCK_SYMBOL

'HDFCBANK'

## Loading stock data

In [5]:
stock_data = sd.StockData(STOCK_SYMBOL)
stock_data

Symbol: HDFCBANK
Total records: 1097
First record: 2020-01-01
Last record: 2024-05-09

## Modelling

### Target columns

In [6]:
target_cols = stock_data.standardized.filter(regex = "Target.*").columns.to_list()
target_cols

['Target 3D', 'Target 7D', 'Target 15D', 'Target 30D']

In [7]:
stock_data.standardized[target_cols].describe()

Unnamed: 0,Target 3D,Target 7D,Target 15D,Target 30D
count,1094.0,1090.0,1082.0,1067.0
mean,1.000856,1.002144,1.00449,1.009055
std,0.030511,0.04568,0.064045,0.087293
min,0.82,0.718,0.668,0.629
25%,0.986,0.978,0.96825,0.961
50%,1.0,1.002,1.0035,1.008
75%,1.016,1.025,1.04175,1.057
max,1.178,1.219,1.231,1.322


### Data processing

In [8]:
def get_training_data(target_col: str):
    print(f"Target: {target_col}")
    X_df = stock_data.standardized[
        stock_data.standardized[target_col].notnull()
    ].drop(columns = target_cols).copy()
    y = stock_data.standardized[
        stock_data.standardized[target_col].notnull()
    ][target_col].copy()
    print(f"X.shape: {X_df.shape}")
    print(f"y.shape: {y.shape}")

    return X_df, y

### Grid search parameters

In [9]:
param_dict = {
    "n_estimators": [75, 100, 125],
    "max_depth": [4, 5, 6],
    "max_features": ["log2", "sqrt", 0.25],
    "max_samples": [0.75, 1.0]
}

feature_importances = pd.DataFrame(
    index = stock_data.standardized.drop(columns = target_cols).columns.to_list()
)

expected_errors = {}

### Model building

In [10]:
def get_model():
    rf_model = RandomForestRegressor(
        criterion = "squared_error",
        n_jobs = -1,
        random_state = cnst.RANDOM_STATE
    )

    grid_cv = RandomizedSearchCV(
        rf_model,
        param_dict,
        n_iter = 12,
        cv = 5, 
        random_state = cnst.RANDOM_STATE
    )

    return grid_cv

def print_results(y, preds):
    print(f"Target std: {y.std():.3f}")
    print(f"R2: {r2_score(y, preds):.3f}")
    print(f"MSE: {root_mean_squared_error(y, preds):.3f}")
    print(f"MAE: {mean_absolute_error(y, preds):.3f}")

def expected_error(y, preds):
    return root_mean_squared_error(y, preds)

## Training

### `Target 3D`

In [11]:
target_col = target_cols[0]
X, y = get_training_data(target_col)

Target: Target 3D
X.shape: (1094, 43)
y.shape: (1094,)


In [12]:
model = get_model()
model.fit(X, y)
model.best_params_

{'n_estimators': 100,
 'max_samples': 0.75,
 'max_features': 'log2',
 'max_depth': 4}

In [13]:
pd.DataFrame(model.cv_results_).sort_values('rank_test_score').iloc[:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_samples,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
9,0.277629,0.035036,0.0623,0.017133,100,0.75,log2,4,"{'n_estimators': 100, 'max_samples': 0.75, 'ma...",-0.009058,-0.025443,-0.004481,-0.011058,0.007005,-0.008607,0.010492,1
8,0.276722,0.019009,0.051737,0.00635,100,1.0,sqrt,4,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",-0.019188,-0.014869,-0.0154,-0.011873,-0.001954,-0.012657,0.005835,2
11,0.249389,0.019759,0.046124,0.004891,100,0.75,0.25,4,"{'n_estimators': 100, 'max_samples': 0.75, 'ma...",-0.030489,-0.024542,-0.015106,-0.016061,0.001096,-0.01702,0.010684,3
0,0.309388,0.055052,0.054712,0.006897,100,1.0,log2,5,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",-0.055402,-0.010189,-0.01695,-0.01306,0.005021,-0.018116,0.020077,4
4,0.2058,0.020833,0.037303,0.004884,75,0.75,log2,5,"{'n_estimators': 75, 'max_samples': 0.75, 'max...",-0.022251,-0.032574,-0.017783,-0.014485,-0.014931,-0.020405,0.006684,5


In [14]:
feature_importances = feature_importances.join(
    pd.DataFrame(
        model.best_estimator_.feature_importances_,
        index = model.best_estimator_.feature_names_in_,
        columns = [target_col]
    )
)

preds = model.predict(X)
print_results(y, preds)

Target std: 0.031
R2: 0.201
MSE: 0.027
MAE: 0.020


In [15]:
pred_col_name = f'Pred {target_col}'
expected_errors[pred_col_name] = expected_error(y, preds)

stock_data.processed[pred_col_name] = (
    model.predict(
        stock_data.standardized.drop(columns = target_cols)
    ) * stock_data.processed['Close']
).round(2)
stock_data.processed[['Date', 'Close', target_col, pred_col_name]].dropna().iloc[-10:, :]

Unnamed: 0,Date,Close,Target 3D,Pred Target 3D
1084,2024-04-22,1512.2,1510.75,1512.36
1085,2024-04-23,1507.6,1509.8,1507.52
1086,2024-04-24,1511.7,1529.5,1511.44
1087,2024-04-25,1510.75,1520.1,1511.49
1088,2024-04-26,1509.8,1532.25,1510.61
1089,2024-04-29,1529.5,1519.6,1531.24
1090,2024-04-30,1520.1,1522.65,1521.66
1091,2024-05-02,1532.25,1506.15,1532.74
1092,2024-05-03,1519.6,1482.65,1519.86
1093,2024-05-06,1522.65,1447.5,1522.8


### `Target 7D`

In [16]:
target_col = target_cols[1]
X, y = get_training_data(target_col)

Target: Target 7D
X.shape: (1090, 43)
y.shape: (1090,)


In [17]:
model = get_model()
model.fit(X, y)
model.best_params_

{'n_estimators': 100,
 'max_samples': 0.75,
 'max_features': 'log2',
 'max_depth': 4}

In [18]:
pd.DataFrame(model.cv_results_).sort_values('rank_test_score').iloc[:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_samples,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
9,0.246277,0.020449,0.046189,0.003061,100,0.75,log2,4,"{'n_estimators': 100, 'max_samples': 0.75, 'ma...",-0.037433,-0.095369,-0.028301,-0.027939,0.032941,-0.03122,0.040717,1
4,0.189452,0.007182,0.039229,0.00453,75,0.75,log2,5,"{'n_estimators': 75, 'max_samples': 0.75, 'max...",-0.045455,-0.099527,-0.053043,-0.030762,0.024189,-0.040919,0.039861,2
8,0.303106,0.034815,0.052529,0.006353,100,1.0,sqrt,4,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",-0.066876,-0.131874,-0.047498,-0.030493,0.027253,-0.049897,0.051658,3
0,0.315822,0.031487,0.055271,0.008454,100,1.0,log2,5,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",-0.042799,-0.156294,-0.044024,-0.02717,0.020324,-0.049993,0.058056,4
10,0.357625,0.018627,0.065274,0.009511,125,1.0,log2,6,"{'n_estimators': 125, 'max_samples': 1.0, 'max...",-0.038974,-0.166789,-0.079284,-0.048563,0.043493,-0.058024,0.067882,5


In [19]:
feature_importances = feature_importances.join(
    pd.DataFrame(
        model.best_estimator_.feature_importances_,
        index = model.best_estimator_.feature_names_in_,
        columns = [target_col]
    )
)

preds = model.predict(X)
print_results(y, preds)

Target std: 0.046
R2: 0.279
MSE: 0.039
MAE: 0.028


In [20]:
pred_col_name = f'Pred {target_col}'
expected_errors[pred_col_name] = expected_error(y, preds)

stock_data.processed[pred_col_name] = (
    model.predict(
        stock_data.standardized.drop(columns = target_cols)
    ) * stock_data.processed['Close']
).round(2)
stock_data.processed[['Date', 'Close', target_col, pred_col_name]].dropna().iloc[-10:, :]

Unnamed: 0,Date,Close,Target 7D,Pred Target 7D
1080,2024-04-15,1494.7,1510.75,1495.24
1081,2024-04-16,1509.25,1509.8,1508.35
1082,2024-04-18,1494.7,1529.5,1496.19
1083,2024-04-19,1531.3,1520.1,1530.38
1084,2024-04-22,1512.2,1532.25,1514.64
1085,2024-04-23,1507.6,1519.6,1509.99
1086,2024-04-24,1511.7,1522.65,1516.06
1087,2024-04-25,1510.75,1506.15,1514.21
1088,2024-04-26,1509.8,1482.65,1512.94
1089,2024-04-29,1529.5,1447.5,1530.63


### `Target 15D`

In [21]:
target_col = target_cols[2]
X, y = get_training_data(target_col)

Target: Target 15D
X.shape: (1082, 43)
y.shape: (1082,)


In [22]:
model = get_model()
model.fit(X, y)
model.best_params_

{'n_estimators': 100,
 'max_samples': 0.75,
 'max_features': 'log2',
 'max_depth': 4}

In [23]:
pd.DataFrame(model.cv_results_).sort_values('rank_test_score').iloc[:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_samples,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
9,0.241323,0.026083,0.045781,0.004514,100,0.75,log2,4,"{'n_estimators': 100, 'max_samples': 0.75, 'ma...",-0.018736,-0.163683,-0.131792,-0.102372,-0.018999,-0.087117,0.059003,1
11,0.303201,0.034938,0.05771,0.009238,100,0.75,0.25,4,"{'n_estimators': 100, 'max_samples': 0.75, 'ma...",-0.041046,-0.170329,-0.161971,-0.111912,-0.01336,-0.099724,0.063103,2
7,0.191279,0.010649,0.038866,0.00525,75,0.75,0.25,5,"{'n_estimators': 75, 'max_samples': 0.75, 'max...",-0.084604,-0.132202,-0.201168,-0.08386,-0.008456,-0.102058,0.063448,3
3,0.192592,0.009971,0.039279,0.005763,75,1.0,0.25,4,"{'n_estimators': 75, 'max_samples': 1.0, 'max_...",-0.086553,-0.199231,-0.130295,-0.125827,0.028724,-0.102636,0.07503,4
0,0.297733,0.031331,0.055163,0.005133,100,1.0,log2,5,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",-0.078499,-0.163983,-0.174112,-0.095494,-0.00318,-0.103054,0.062284,5


In [24]:
feature_importances = feature_importances.join(
    pd.DataFrame(
        model.best_estimator_.feature_importances_,
        index = model.best_estimator_.feature_names_in_,
        columns = [target_col]
    )
)

preds = model.predict(X)
print_results(y, preds)

Target std: 0.064
R2: 0.403
MSE: 0.049
MAE: 0.038


In [25]:
pred_col_name = f'Pred {target_col}'
expected_errors[pred_col_name] = expected_error(y, preds)

stock_data.processed[pred_col_name] = (
    model.predict(
        stock_data.standardized.drop(columns = target_cols)
    ) * stock_data.processed['Close']
).round(2)
stock_data.processed[['Date', 'Close', target_col, pred_col_name]].dropna().iloc[-10:, :]

Unnamed: 0,Date,Close,Target 15D,Pred Target 15D
1072,2024-04-02,1480.15,1510.75,1479.49
1073,2024-04-03,1482.3,1509.8,1482.29
1074,2024-04-04,1527.6,1529.5,1518.08
1075,2024-04-05,1549.55,1520.1,1542.97
1076,2024-04-08,1546.6,1532.25,1535.94
1077,2024-04-09,1548.55,1519.6,1539.62
1078,2024-04-10,1536.35,1522.65,1528.43
1079,2024-04-12,1518.95,1506.15,1514.38
1080,2024-04-15,1494.7,1482.65,1498.15
1081,2024-04-16,1509.25,1447.5,1505.04


### `Target 30D`

In [26]:
target_col = target_cols[3]
X, y = get_training_data(target_col)

Target: Target 30D
X.shape: (1067, 43)
y.shape: (1067,)


In [27]:
model = get_model()
model.fit(X, y)
model.best_params_

{'n_estimators': 75, 'max_samples': 1.0, 'max_features': 0.25, 'max_depth': 4}

In [28]:
pd.DataFrame(model.cv_results_).sort_values('rank_test_score').iloc[:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_samples,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
3,0.210057,0.042465,0.040489,0.006796,75,1.0,0.25,4,"{'n_estimators': 75, 'max_samples': 1.0, 'max_...",0.02358,0.096147,-0.653056,-0.162674,0.013155,-0.13657,0.271905,1
0,0.307032,0.03612,0.056592,0.001975,100,1.0,log2,5,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",0.059328,0.152382,-0.726041,-0.142814,-0.03596,-0.138621,0.309672,2
8,0.297456,0.026342,0.066665,0.023174,100,1.0,sqrt,4,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",0.034604,0.08062,-0.56558,-0.194075,-0.049519,-0.13879,0.232973,3
11,0.261441,0.01308,0.049955,0.005932,100,0.75,0.25,4,"{'n_estimators': 100, 'max_samples': 0.75, 'ma...",0.02251,0.058691,-0.649658,-0.17501,0.007943,-0.147105,0.264007,4
9,0.305468,0.013924,0.056513,0.006402,100,0.75,log2,4,"{'n_estimators': 100, 'max_samples': 0.75, 'ma...",0.002042,0.040183,-0.539293,-0.193481,-0.055944,-0.149299,0.210509,5


In [29]:
feature_importances = feature_importances.join(
    pd.DataFrame(
        model.best_estimator_.feature_importances_,
        index = model.best_estimator_.feature_names_in_,
        columns = [target_col]
    )
)

preds = model.predict(X)
print_results(y, preds)

Target std: 0.087
R2: 0.625
MSE: 0.053
MAE: 0.043


In [30]:
pred_col_name = f'Pred {target_col}'
expected_errors[pred_col_name] = expected_error(y, preds)

stock_data.processed[pred_col_name] = (
    model.predict(
        stock_data.standardized.drop(columns = target_cols)
    ) * stock_data.processed['Close']
).round(2)
stock_data.processed[['Date', 'Close', target_col, pred_col_name]].dropna().iloc[-10:, :]

Unnamed: 0,Date,Close,Target 30D,Pred Target 30D
1057,2024-03-07,1446.1,1510.75,1479.34
1058,2024-03-11,1427.8,1509.8,1460.06
1059,2024-03-12,1459.55,1529.5,1480.41
1060,2024-03-13,1460.4,1520.1,1487.19
1061,2024-03-14,1455.45,1532.25,1483.79
1062,2024-03-15,1452.65,1519.6,1473.63
1063,2024-03-18,1446.05,1522.65,1476.2
1064,2024-03-19,1449.35,1506.15,1476.73
1065,2024-03-20,1431.05,1482.65,1456.67
1066,2024-03-21,1445.75,1447.5,1465.97


## Feature importances

In [31]:
feature_importances['Mean'] = feature_importances.mean(axis = 1)
(feature_importances.sort_values('Mean', ascending = False) * 100).round(1)

Unnamed: 0,Target 3D,Target 7D,Target 15D,Target 30D,Mean
52W H,7.1,9.4,10.1,13.6,10.1
Range 60MA,5.3,9.9,5.7,10.8,7.9
PE,3.5,5.3,6.9,11.8,6.9
DayOfYear,1.6,4.9,7.4,11.9,6.5
DaysSinceLastTradingSession,0.2,0.4,9.6,7.9,4.5
VWAP 60MA,6.4,3.1,2.2,2.7,3.6
Range 7MA,4.4,5.4,2.4,2.0,3.6
Range 15MA,2.6,3.4,4.0,4.1,3.5
Close 60MA,5.0,3.4,3.0,2.6,3.5
Month,0.5,2.0,5.2,5.8,3.4


## Forecasts

In [32]:
stock_data.processed.filter(regex = "(Date)|(Close$)|(Pred.*)").iloc[-10:, :]

Unnamed: 0,Date,Close,Pred Target 3D,Pred Target 7D,Pred Target 15D,Pred Target 30D
1087,2024-04-25,1510.75,1511.49,1514.21,1504.91,1529.5
1088,2024-04-26,1509.8,1510.61,1512.94,1503.74,1531.43
1089,2024-04-29,1529.5,1531.24,1530.63,1522.41,1521.52
1090,2024-04-30,1520.1,1521.66,1521.95,1515.11,1520.84
1091,2024-05-02,1532.25,1532.74,1530.68,1525.94,1528.06
1092,2024-05-03,1519.6,1519.86,1519.3,1520.04,1532.36
1093,2024-05-06,1522.65,1522.8,1522.17,1527.94,1528.0
1094,2024-05-07,1506.15,1506.62,1507.44,1522.69,1551.91
1095,2024-05-08,1482.65,1483.5,1485.51,1507.0,1545.83
1096,2024-05-09,1447.5,1448.3,1457.39,1479.56,1514.72


In [33]:
latest_preds = stock_data.processed.iloc[-1]
print(f"Date: {latest_preds['Date'].date()}")
print(f"Close: {latest_preds['Close']}")

for pred in expected_errors.keys():
    print(f"{pred}: {latest_preds[pred]} ± {expected_errors[pred] * latest_preds['Close']:.2f}")

Date: 2024-05-09
Close: 1447.5
Pred Target 3D: 1448.3 ± 39.46
Pred Target 7D: 1457.39 ± 56.11
Pred Target 15D: 1479.56 ± 71.60
Pred Target 30D: 1514.72 ± 77.36
