# 04 - Forecasting using Random Forests

In [1]:
import sys
sys.executable

'/usr/local/bin/python'

## Imports

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

import stock_utils.constants as cnst
import stock_utils.data as sd

pd.set_option('display.max_columns', None)

NSE_DATA_DIR = PosixPath('../data/NSE') | Valid: True
SCREENER_EPS_DATA_DIR = PosixPath('../data/Screener.in/EPS') | Valid: True
PROCESSED_DATA_DIR = PosixPath('../data/processed') | Valid: True


## Constants

In [3]:
stock_symbols = sd.get_all_stock_symbols(
    cnst.NSE_DATA_DIR
)

stock_symbols

['DEEPAKFERT', 'HDFCBANK', 'INDIGOPNTS', 'ITBEES', 'JUBLFOOD', 'NH']

In [4]:
STOCK_SYMBOL = stock_symbols[5]
STOCK_SYMBOL

'NH'

## Loading stock data

In [5]:
stock_data = sd.StockData(STOCK_SYMBOL)
stock_data

Symbol: NH
Total records: 1087
First record: 2020-01-01
Last record: 2024-04-26

## Modelling

### Target columns

In [6]:
target_cols = stock_data.standardized.filter(regex = "Target.*").columns.to_list()
target_cols

['Target 3D', 'Target 7D', 'Target 15D', 'Target 30D']

In [7]:
stock_data.standardized[target_cols].describe()

Unnamed: 0,Target 3D,Target 7D,Target 15D,Target 30D
count,1084.0,1080.0,1072.0,1057.0
mean,1.004396,1.009989,1.021326,1.041851
std,0.036063,0.052965,0.07481,0.100385
min,0.788,0.726,0.705,0.644
25%,0.98375,0.98,0.974,0.98
50%,1.002,1.004,1.015,1.04
75%,1.021,1.03825,1.065,1.102
max,1.173,1.285,1.265,1.393


### Data processing

In [8]:
def get_training_data(target_col: str):
    print(f"Target: {target_col}")
    X_df = stock_data.standardized[
        stock_data.standardized[target_col].notnull()
    ].drop(columns = target_cols).copy()
    y = stock_data.standardized[
        stock_data.standardized[target_col].notnull()
    ][target_col].copy()
    print(f"X.shape: {X_df.shape}")
    print(f"y.shape: {y.shape}")

    return X_df, y

### Grid search parameters

In [9]:
param_dict = {
    "n_estimators": [75, 100, 125],
    "max_depth": [4, 5, 6],
    "max_features": ["log2", "sqrt", 0.25],
    "max_samples": [0.75, 1.0]
}

feature_importances = pd.DataFrame(
    index = stock_data.standardized.drop(columns = target_cols).columns.to_list()
)

expected_errors = {}

### Model building

In [10]:
def get_model():
    rf_model = RandomForestRegressor(
        criterion = "squared_error",
        n_jobs = -1,
        random_state = cnst.RANDOM_STATE
    )

    grid_cv = RandomizedSearchCV(
        rf_model,
        param_dict,
        n_iter = 12,
        cv = 5, 
        random_state = cnst.RANDOM_STATE
    )

    return grid_cv

def print_results(y, preds):
    print(f"Target std: {y.std():.3f}")
    print(f"R2: {r2_score(y, preds):.3f}")
    print(f"MSE: {mean_squared_error(y, preds, squared = False):.3f}")
    print(f"MAE: {mean_absolute_error(y, preds):.3f}")

def expected_error(y, preds):
    return mean_squared_error(y, preds, squared = False)

## Training

### `Target 3D`

In [11]:
target_col = target_cols[0]
X, y = get_training_data(target_col)

Target: Target 3D
X.shape: (1084, 42)
y.shape: (1084,)


In [12]:
model = get_model()
model.fit(X, y)
model.best_params_

{'n_estimators': 100,
 'max_samples': 0.75,
 'max_features': 'log2',
 'max_depth': 4}

In [13]:
pd.DataFrame(model.cv_results_).sort_values('rank_test_score').iloc[:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_samples,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
9,0.395707,0.038837,0.049106,0.004792,100,0.75,log2,4,"{'n_estimators': 100, 'max_samples': 0.75, 'ma...",-0.032453,0.001232,0.012817,-0.051736,-0.01258,-0.016544,0.023175,1
4,0.319432,0.035391,0.040192,0.006882,75,0.75,log2,5,"{'n_estimators': 75, 'max_samples': 0.75, 'max...",-0.022015,0.000519,0.002843,-0.069614,-0.025638,-0.022781,0.026083,2
0,0.477026,0.065897,0.056947,0.014722,100,1.0,log2,5,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",-0.037998,-0.008177,0.006989,-0.07766,-0.023013,-0.027972,0.029011,3
8,0.382104,0.019461,0.063024,0.026586,100,1.0,sqrt,4,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",-0.037316,-0.024752,0.005422,-0.078584,-0.021747,-0.031396,0.027403,4
3,0.296757,0.015422,0.041311,0.005747,75,1.0,0.25,4,"{'n_estimators': 75, 'max_samples': 1.0, 'max_...",-0.06953,-0.015357,0.005314,-0.072024,-0.011267,-0.032573,0.031962,5


In [14]:
feature_importances = feature_importances.join(
    pd.DataFrame(
        model.best_estimator_.feature_importances_,
        index = model.best_estimator_.feature_names_in_,
        columns = [target_col]
    )
)

preds = model.predict(X)
print_results(y, preds)

Target std: 0.036
R2: 0.171
MSE: 0.033
MAE: 0.025


In [15]:
pred_col_name = f'Pred {target_col}'
expected_errors[pred_col_name] = expected_error(y, preds)

stock_data.processed[pred_col_name] = (
    model.predict(
        stock_data.standardized.drop(columns = target_cols)
    ) * stock_data.processed['Close']
).round(2)
stock_data.processed[['Date', 'Close', target_col, pred_col_name]].dropna().iloc[-10:, :]

Unnamed: 0,Date,Close,Target 3D,Pred Target 3D
1074,2024-04-08,1325.25,1288.4,1324.81
1075,2024-04-09,1317.7,1269.15,1318.64
1076,2024-04-10,1313.05,1277.5,1315.67
1077,2024-04-12,1288.4,1238.5,1292.85
1078,2024-04-15,1269.15,1242.45,1275.2
1079,2024-04-16,1277.5,1255.55,1281.6
1080,2024-04-18,1238.5,1251.3,1245.17
1081,2024-04-19,1242.45,1262.6,1249.1
1082,2024-04-22,1255.55,1266.55,1262.05
1083,2024-04-23,1251.3,1275.5,1257.9


### `Target 7D`

In [16]:
target_col = target_cols[1]
X, y = get_training_data(target_col)

Target: Target 7D
X.shape: (1080, 42)
y.shape: (1080,)


In [17]:
model = get_model()
model.fit(X, y)
model.best_params_

{'n_estimators': 100,
 'max_samples': 0.75,
 'max_features': 'log2',
 'max_depth': 4}

In [18]:
pd.DataFrame(model.cv_results_).sort_values('rank_test_score').iloc[:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_samples,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
9,0.374014,0.033945,0.045183,0.001903,100,0.75,log2,4,"{'n_estimators': 100, 'max_samples': 0.75, 'ma...",-0.009973,-0.058103,-0.036907,-0.104978,0.000741,-0.041844,0.037693,1
8,0.376497,0.045579,0.047527,0.00722,100,1.0,sqrt,4,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",-0.009224,-0.067602,-0.041245,-0.127246,0.010756,-0.046912,0.04829,2
1,0.342375,0.040877,0.044807,0.001492,75,1.0,sqrt,5,"{'n_estimators': 75, 'max_samples': 1.0, 'max_...",0.008678,-0.084058,-0.080558,-0.149902,0.007702,-0.059628,0.060636,3
4,0.269733,0.022583,0.038048,0.005932,75,0.75,log2,5,"{'n_estimators': 75, 'max_samples': 0.75, 'max...",-0.007179,-0.071099,-0.111527,-0.149875,0.004347,-0.067067,0.059222,4
5,0.572898,0.053501,0.064735,0.005415,125,1.0,sqrt,5,"{'n_estimators': 125, 'max_samples': 1.0, 'max...",0.002661,-0.08856,-0.104491,-0.186561,0.017745,-0.071841,0.074943,5


In [19]:
feature_importances = feature_importances.join(
    pd.DataFrame(
        model.best_estimator_.feature_importances_,
        index = model.best_estimator_.feature_names_in_,
        columns = [target_col]
    )
)

preds = model.predict(X)
print_results(y, preds)

Target std: 0.053
R2: 0.268
MSE: 0.045
MAE: 0.035


In [20]:
pred_col_name = f'Pred {target_col}'
expected_errors[pred_col_name] = expected_error(y, preds)

stock_data.processed[pred_col_name] = (
    model.predict(
        stock_data.standardized.drop(columns = target_cols)
    ) * stock_data.processed['Close']
).round(2)
stock_data.processed[['Date', 'Close', target_col, pred_col_name]].dropna().iloc[-10:, :]

Unnamed: 0,Date,Close,Target 7D,Pred Target 7D
1070,2024-04-02,1292.45,1288.4,1297.04
1071,2024-04-03,1290.5,1269.15,1295.59
1072,2024-04-04,1289.3,1277.5,1294.36
1073,2024-04-05,1286.25,1238.5,1291.19
1074,2024-04-08,1325.25,1242.45,1327.26
1075,2024-04-09,1317.7,1255.55,1322.32
1076,2024-04-10,1313.05,1251.3,1317.85
1077,2024-04-12,1288.4,1262.6,1293.42
1078,2024-04-15,1269.15,1266.55,1276.91
1079,2024-04-16,1277.5,1275.5,1282.47


### `Target 15D`

In [21]:
target_col = target_cols[2]
X, y = get_training_data(target_col)

Target: Target 15D
X.shape: (1072, 42)
y.shape: (1072,)


In [22]:
model = get_model()
model.fit(X, y)
model.best_params_

{'n_estimators': 100,
 'max_samples': 0.75,
 'max_features': 'log2',
 'max_depth': 4}

In [23]:
pd.DataFrame(model.cv_results_).sort_values('rank_test_score').iloc[:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_samples,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
9,0.349907,0.037354,0.047286,0.006614,100,0.75,log2,4,"{'n_estimators': 100, 'max_samples': 0.75, 'ma...",-0.030011,-0.061798,-0.226328,-0.195572,0.012516,-0.100239,0.093926,1
8,0.325199,0.003324,0.043621,0.001122,100,1.0,sqrt,4,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",-0.014514,-0.083437,-0.277305,-0.197155,-0.016066,-0.117695,0.103863,2
4,0.265046,0.00433,0.037336,0.005265,75,0.75,log2,5,"{'n_estimators': 75, 'max_samples': 0.75, 'max...",-0.032849,-0.018973,-0.364673,-0.253762,-0.03081,-0.140213,0.142458,3
0,0.482243,0.066364,0.052313,0.007031,100,1.0,log2,5,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",-0.035401,-0.091883,-0.363678,-0.281719,-0.012728,-0.157082,0.14008,4
10,0.630633,0.108118,0.068582,0.013474,125,1.0,log2,6,"{'n_estimators': 125, 'max_samples': 1.0, 'max...",-0.033333,-0.109363,-0.410255,-0.309252,-0.000751,-0.172591,0.16005,5


In [24]:
feature_importances = feature_importances.join(
    pd.DataFrame(
        model.best_estimator_.feature_importances_,
        index = model.best_estimator_.feature_names_in_,
        columns = [target_col]
    )
)

preds = model.predict(X)
print_results(y, preds)

Target std: 0.075
R2: 0.369
MSE: 0.059
MAE: 0.047


In [25]:
pred_col_name = f'Pred {target_col}'
expected_errors[pred_col_name] = expected_error(y, preds)

stock_data.processed[pred_col_name] = (
    model.predict(
        stock_data.standardized.drop(columns = target_cols)
    ) * stock_data.processed['Close']
).round(2)
stock_data.processed[['Date', 'Close', target_col, pred_col_name]].dropna().iloc[-10:, :]

Unnamed: 0,Date,Close,Target 15D,Pred Target 15D
1062,2024-03-19,1223.3,1288.4,1243.53
1063,2024-03-20,1197.6,1269.15,1214.5
1064,2024-03-21,1226.2,1277.5,1245.16
1065,2024-03-22,1232.5,1238.5,1250.65
1066,2024-03-26,1251.65,1242.45,1256.68
1067,2024-03-27,1270.1,1255.55,1272.13
1068,2024-03-28,1283.8,1251.3,1287.66
1069,2024-04-01,1296.3,1262.6,1304.82
1070,2024-04-02,1292.45,1266.55,1300.43
1071,2024-04-03,1290.5,1275.5,1298.75


### `Target 30D`

In [26]:
target_col = target_cols[3]
X, y = get_training_data(target_col)

Target: Target 30D
X.shape: (1057, 42)
y.shape: (1057,)


In [27]:
model = get_model()
model.fit(X, y)
model.best_params_

{'n_estimators': 100,
 'max_samples': 0.75,
 'max_features': 'log2',
 'max_depth': 4}

In [28]:
pd.DataFrame(model.cv_results_).sort_values('rank_test_score').iloc[:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_samples,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
9,0.337047,0.017546,0.045807,0.002666,100,0.75,log2,4,"{'n_estimators': 100, 'max_samples': 0.75, 'ma...",-0.258053,-0.039628,-0.528021,-0.114605,0.268149,-0.134431,0.261399,1
8,0.338979,0.024809,0.042383,0.001288,100,1.0,sqrt,4,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",-0.250049,-0.087118,-0.545124,-0.203888,0.297374,-0.157761,0.273069,2
0,0.420324,0.062286,0.049931,0.007202,100,1.0,log2,5,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",-0.23308,-0.060048,-0.638422,-0.253354,0.340911,-0.168799,0.31732,3
4,0.284261,0.023885,0.04131,0.004626,75,0.75,log2,5,"{'n_estimators': 75, 'max_samples': 0.75, 'max...",-0.315123,-0.066219,-0.584735,-0.211413,0.328351,-0.169828,0.301299,4
5,0.465399,0.035497,0.054377,0.003021,125,1.0,sqrt,5,"{'n_estimators': 125, 'max_samples': 1.0, 'max...",-0.250975,-0.103316,-0.727609,-0.24773,0.369013,-0.192123,0.351016,5


In [29]:
feature_importances = feature_importances.join(
    pd.DataFrame(
        model.best_estimator_.feature_importances_,
        index = model.best_estimator_.feature_names_in_,
        columns = [target_col]
    )
)

preds = model.predict(X)
print_results(y, preds)

Target std: 0.100
R2: 0.501
MSE: 0.071
MAE: 0.056


In [30]:
pred_col_name = f'Pred {target_col}'
expected_errors[pred_col_name] = expected_error(y, preds)

stock_data.processed[pred_col_name] = (
    model.predict(
        stock_data.standardized.drop(columns = target_cols)
    ) * stock_data.processed['Close']
).round(2)
stock_data.processed[['Date', 'Close', target_col, pred_col_name]].dropna().iloc[-10:, :]

Unnamed: 0,Date,Close,Target 30D,Pred Target 30D
1047,2024-02-27,1381.65,1288.4,1354.36
1048,2024-02-28,1364.3,1269.15,1340.67
1049,2024-02-29,1341.95,1277.5,1323.44
1050,2024-03-01,1296.25,1238.5,1288.18
1051,2024-03-02,1319.9,1242.45,1297.16
1052,2024-03-04,1307.65,1255.55,1284.55
1053,2024-03-05,1246.4,1251.3,1267.14
1054,2024-03-06,1204.7,1262.6,1214.83
1055,2024-03-07,1179.75,1266.55,1192.9
1056,2024-03-11,1276.1,1275.5,1264.67


## Feature importances

In [31]:
feature_importances['Mean'] = feature_importances.mean(axis = 1)
(feature_importances.sort_values('Mean', ascending = False) * 100).round(1)

Unnamed: 0,Target 3D,Target 7D,Target 15D,Target 30D,Mean
52W L,4.0,4.3,8.1,16.8,8.3
DayOfYear,3.4,6.0,8.5,15.0,8.2
Month,1.3,2.8,6.6,8.3,4.7
Range 30MA,4.4,6.5,3.5,3.3,4.4
Range 7MA,3.5,4.5,6.8,2.9,4.4
Range 60MA,3.1,5.6,2.8,5.6,4.3
Range 15MA,3.5,6.0,4.4,2.6,4.1
Volume 15MA,5.7,8.5,1.7,0.5,4.1
Close 60MA,3.2,5.1,4.3,3.1,4.0
52W H,4.4,4.2,3.7,3.2,3.9


## Forecasts

In [32]:
stock_data.processed.filter(regex = "(Date)|(Close$)|(Pred.*)").iloc[-10:, :]

Unnamed: 0,Date,Close,Pred Target 3D,Pred Target 7D,Pred Target 15D,Pred Target 30D
1077,2024-04-12,1288.4,1292.85,1293.42,1301.85,1349.65
1078,2024-04-15,1269.15,1275.2,1276.91,1288.58,1344.91
1079,2024-04-16,1277.5,1281.6,1282.47,1297.97,1350.01
1080,2024-04-18,1238.5,1245.17,1247.46,1259.49,1318.36
1081,2024-04-19,1242.45,1249.1,1251.13,1261.81,1328.31
1082,2024-04-22,1255.55,1262.05,1263.9,1275.67,1343.89
1083,2024-04-23,1251.3,1257.9,1259.48,1271.63,1339.88
1084,2024-04-24,1262.6,1269.12,1270.5,1284.38,1350.93
1085,2024-04-25,1266.55,1272.31,1273.33,1288.85,1354.58
1086,2024-04-26,1275.5,1280.65,1282.65,1299.07,1365.75


In [33]:
latest_preds = stock_data.processed.iloc[-1]
print(f"Date: {latest_preds['Date'].date()}")
print(f"Close: {latest_preds['Close']}")

for pred in expected_errors.keys():
    print(f"{pred}: {latest_preds[pred]} ± {expected_errors[pred] * latest_preds['Close']:.2f}")

Date: 2024-04-26
Close: 1275.5
Pred Target 3D: 1280.65 ± 41.87
Pred Target 7D: 1282.65 ± 57.79
Pred Target 15D: 1299.07 ± 75.73
Pred Target 30D: 1365.75 ± 90.44
