# 04 - Forecasting using Random Forests

In [1]:
import sys
sys.executable

'/usr/local/bin/python'

## Imports

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

import stock_utils.constants as cnst
import stock_utils.data as sd

pd.set_option('display.max_columns', None)

NSE_DATA_DIR = PosixPath('../data/NSE') | Valid: True
SCREENER_EPS_DATA_DIR = PosixPath('../data/Screener.in/EPS') | Valid: True
PROCESSED_DATA_DIR = PosixPath('../data/processed') | Valid: True


## Constants

In [3]:
stock_symbols = sd.get_all_stock_symbols(
    cnst.NSE_DATA_DIR
)

stock_symbols

['DEEPAKFERT', 'HDFCBANK', 'INDIGOPNTS', 'ITBEES', 'JUBLFOOD']

In [4]:
STOCK_SYMBOL = stock_symbols[2]
STOCK_SYMBOL

'INDIGOPNTS'

## Loading stock data

In [5]:
stock_data = sd.StockData(STOCK_SYMBOL)
stock_data

Symbol: INDIGOPNTS
Total records: 775
First record: 2021-02-02
Last record: 2024-03-15

## Modelling

### Target columns

In [6]:
target_cols = stock_data.standardized.filter(regex = "Target.*").columns.to_list()
target_cols

['Target 3D', 'Target 7D', 'Target 15D', 'Target 30D']

In [7]:
stock_data.standardized[target_cols].describe()

Unnamed: 0,Target 3D,Target 7D,Target 15D,Target 30D
count,772.0,768.0,760.0,745.0
mean,0.997374,0.994462,0.989138,0.980921
std,0.032186,0.04671,0.067656,0.103178
min,0.844,0.831,0.786,0.741
25%,0.98,0.966,0.946,0.914
50%,0.995,0.99,0.981,0.97
75%,1.011,1.017,1.02425,1.029
max,1.13,1.206,1.246,1.399


### Data processing

In [8]:
def get_training_data(target_col: str):
    print(f"Target: {target_col}")
    X_df = stock_data.standardized[
        stock_data.standardized[target_col].notnull()
    ].drop(columns = target_cols).copy()
    y = stock_data.standardized[
        stock_data.standardized[target_col].notnull()
    ][target_col].copy()
    print(f"X.shape: {X_df.shape}")
    print(f"y.shape: {y.shape}")

    return X_df, y

### Grid search parameters

In [9]:
param_dict = {
    "n_estimators": [75, 100, 125],
    "max_depth": [4, 5, 6],
    "max_features": ["log2", "sqrt", 0.25],
    "max_samples": [0.75, 1.0]
}

feature_importances = pd.DataFrame(
    index = stock_data.standardized.drop(columns = target_cols).columns.to_list()
)

expected_errors = {}

### Model building

In [10]:
def get_model():
    rf_model = RandomForestRegressor(
        criterion = "squared_error",
        n_jobs = -1,
        random_state = cnst.RANDOM_STATE
    )

    grid_cv = RandomizedSearchCV(
        rf_model,
        param_dict,
        n_iter = 12,
        cv = 5, 
        random_state = cnst.RANDOM_STATE
    )

    return grid_cv

def print_results(y, preds):
    print(f"Target std: {y.std():.3f}")
    print(f"R2: {r2_score(y, preds):.3f}")
    print(f"MSE: {mean_squared_error(y, preds, squared = False):.3f}")
    print(f"MAE: {mean_absolute_error(y, preds):.3f}")

def expected_error(y, preds):
    return mean_squared_error(y, preds, squared = False)

## Training

### `Target 3D`

In [11]:
target_col = target_cols[0]
X, y = get_training_data(target_col)

Target: Target 3D
X.shape: (772, 43)
y.shape: (772,)


In [12]:
model = get_model()
model.fit(X, y)
model.best_params_

{'n_estimators': 100,
 'max_samples': 0.75,
 'max_features': 0.25,
 'max_depth': 4}

In [13]:
pd.DataFrame(model.cv_results_).sort_values('rank_test_score').iloc[:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_samples,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
11,0.378923,0.007653,0.043545,0.001786,100,0.75,0.25,4,"{'n_estimators': 100, 'max_samples': 0.75, 'ma...",0.037918,-0.016509,0.019163,-0.087376,-0.003063,-0.009973,0.042951,1
5,0.332034,0.009606,0.048154,0.009602,125,1.0,sqrt,5,"{'n_estimators': 125, 'max_samples': 1.0, 'max...",0.050183,0.005791,0.034823,-0.136213,-0.009361,-0.010955,0.066039,2
3,0.2328,0.015964,0.03533,0.007401,75,1.0,0.25,4,"{'n_estimators': 75, 'max_samples': 1.0, 'max_...",0.054443,-0.037365,0.015795,-0.107094,-0.027441,-0.020332,0.054341,3
8,0.334937,0.044103,0.042346,0.00805,100,1.0,sqrt,4,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",0.027182,-0.019435,0.008719,-0.115706,-0.005765,-0.021001,0.049817,4
10,0.487795,0.009812,0.047805,0.005165,125,1.0,log2,6,"{'n_estimators': 125, 'max_samples': 1.0, 'max...",0.04495,-0.015318,0.027864,-0.11572,-0.056981,-0.023041,0.058335,5


In [14]:
feature_importances = feature_importances.join(
    pd.DataFrame(
        model.best_estimator_.feature_importances_,
        index = model.best_estimator_.feature_names_in_,
        columns = [target_col]
    )
)

preds = model.predict(X)
print_results(y, preds)

Target std: 0.032
R2: 0.280
MSE: 0.027
MAE: 0.020


In [15]:
pred_col_name = f'Pred {target_col}'
expected_errors[pred_col_name] = expected_error(y, preds)

stock_data.processed[pred_col_name] = (
    model.predict(
        stock_data.standardized.drop(columns = target_cols)
    ) * stock_data.processed['Close']
).round(2)
stock_data.processed[['Date', 'Close', target_col, pred_col_name]].dropna().iloc[-10:, :]

Unnamed: 0,Date,Close,Target 3D,Pred Target 3D
762,2024-02-28,1403.4,1408.15,1398.02
763,2024-02-29,1398.85,1386.35,1393.52
764,2024-03-01,1400.95,1388.45,1392.5
765,2024-03-02,1408.15,1374.2,1401.96
766,2024-03-04,1386.35,1363.55,1377.13
767,2024-03-05,1388.45,1367.85,1384.14
768,2024-03-06,1374.2,1337.2,1366.12
769,2024-03-07,1363.55,1308.65,1354.94
770,2024-03-11,1367.85,1329.0,1360.32
771,2024-03-12,1337.2,1341.45,1332.85


### `Target 7D`

In [16]:
target_col = target_cols[1]
X, y = get_training_data(target_col)

Target: Target 7D
X.shape: (768, 43)
y.shape: (768,)


In [17]:
model = get_model()
model.fit(X, y)
model.best_params_

{'n_estimators': 100,
 'max_samples': 0.75,
 'max_features': 'log2',
 'max_depth': 4}

In [18]:
pd.DataFrame(model.cv_results_).sort_values('rank_test_score').iloc[:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_samples,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
9,0.272849,0.019683,0.040879,0.003948,100,0.75,log2,4,"{'n_estimators': 100, 'max_samples': 0.75, 'ma...",-0.019792,-0.115362,-0.007379,-0.210934,-0.143751,-0.099444,0.076768,1
4,0.301757,0.055929,0.030563,0.000579,75,0.75,log2,5,"{'n_estimators': 75, 'max_samples': 0.75, 'max...",-0.029399,-0.111115,0.008629,-0.202663,-0.180662,-0.103042,0.082351,2
11,0.274985,0.014247,0.038561,0.006406,100,0.75,0.25,4,"{'n_estimators': 100, 'max_samples': 0.75, 'ma...",-0.127721,-0.072387,0.021213,-0.183901,-0.18695,-0.109949,0.077882,3
0,0.399678,0.027886,0.044215,0.003243,100,1.0,log2,5,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",-0.041461,-0.092471,0.002287,-0.303025,-0.127296,-0.112393,0.105016,4
8,0.276019,0.013961,0.042747,0.004867,100,1.0,sqrt,4,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",-0.053957,-0.110719,-0.028149,-0.237967,-0.136757,-0.11351,0.073309,5


In [19]:
feature_importances = feature_importances.join(
    pd.DataFrame(
        model.best_estimator_.feature_importances_,
        index = model.best_estimator_.feature_names_in_,
        columns = [target_col]
    )
)

preds = model.predict(X)
print_results(y, preds)

Target std: 0.047
R2: 0.372
MSE: 0.037
MAE: 0.028


In [20]:
pred_col_name = f'Pred {target_col}'
expected_errors[pred_col_name] = expected_error(y, preds)

stock_data.processed[pred_col_name] = (
    model.predict(
        stock_data.standardized.drop(columns = target_cols)
    ) * stock_data.processed['Close']
).round(2)
stock_data.processed[['Date', 'Close', target_col, pred_col_name]].dropna().iloc[-10:, :]

Unnamed: 0,Date,Close,Target 7D,Pred Target 7D
758,2024-02-22,1451.05,1408.15,1428.07
759,2024-02-23,1443.5,1386.35,1422.51
760,2024-02-26,1427.8,1388.45,1408.43
761,2024-02-27,1423.4,1374.2,1404.52
762,2024-02-28,1403.4,1363.55,1383.42
763,2024-02-29,1398.85,1367.85,1386.0
764,2024-03-01,1400.95,1337.2,1377.64
765,2024-03-02,1408.15,1308.65,1380.56
766,2024-03-04,1386.35,1329.0,1372.94
767,2024-03-05,1388.45,1341.45,1376.19


### `Target 15D`

In [21]:
target_col = target_cols[2]
X, y = get_training_data(target_col)

Target: Target 15D
X.shape: (760, 43)
y.shape: (760,)


In [22]:
model = get_model()
model.fit(X, y)
model.best_params_

{'n_estimators': 125,
 'max_samples': 0.75,
 'max_features': 0.25,
 'max_depth': 5}

In [23]:
pd.DataFrame(model.cv_results_).sort_values('rank_test_score').iloc[:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_samples,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
6,0.374931,0.019314,0.046026,0.004717,125,0.75,0.25,5,"{'n_estimators': 125, 'max_samples': 0.75, 'ma...",-0.068783,-0.119798,0.090969,0.138721,0.028335,0.013889,0.096375,1
7,0.24111,0.019349,0.032606,0.001345,75,0.75,0.25,5,"{'n_estimators': 75, 'max_samples': 0.75, 'max...",-0.107759,-0.147207,0.104029,0.122333,0.016973,-0.002326,0.108931,2
5,0.362315,0.026023,0.045549,0.004474,125,1.0,sqrt,5,"{'n_estimators': 125, 'max_samples': 1.0, 'max...",0.020884,-0.204366,0.061035,0.107479,-0.004076,-0.003809,0.107166,3
8,0.30496,0.044281,0.040139,0.003444,100,1.0,sqrt,4,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",0.112215,-0.234586,0.009139,0.077678,0.011649,-0.004781,0.121459,4
2,0.346893,0.018146,0.043899,0.002061,100,1.0,0.25,5,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",-0.082012,-0.163671,0.081917,0.122916,-0.00191,-0.008552,0.104853,5


In [24]:
feature_importances = feature_importances.join(
    pd.DataFrame(
        model.best_estimator_.feature_importances_,
        index = model.best_estimator_.feature_names_in_,
        columns = [target_col]
    )
)

preds = model.predict(X)
print_results(y, preds)

Target std: 0.068
R2: 0.713
MSE: 0.036
MAE: 0.028


In [25]:
pred_col_name = f'Pred {target_col}'
expected_errors[pred_col_name] = expected_error(y, preds)

stock_data.processed[pred_col_name] = (
    model.predict(
        stock_data.standardized.drop(columns = target_cols)
    ) * stock_data.processed['Close']
).round(2)
stock_data.processed[['Date', 'Close', target_col, pred_col_name]].dropna().iloc[-10:, :]

Unnamed: 0,Date,Close,Target 15D,Pred Target 15D
750,2024-02-12,1449.4,1408.15,1392.02
751,2024-02-13,1474.4,1386.35,1397.81
752,2024-02-14,1458.4,1388.45,1402.4
753,2024-02-15,1499.6,1374.2,1402.55
754,2024-02-16,1494.7,1363.55,1393.12
755,2024-02-19,1511.8,1367.85,1405.11
756,2024-02-20,1503.5,1337.2,1400.92
757,2024-02-21,1492.05,1308.65,1393.28
758,2024-02-22,1451.05,1329.0,1388.29
759,2024-02-23,1443.5,1341.45,1394.66


### `Target 30D`

In [26]:
target_col = target_cols[3]
X, y = get_training_data(target_col)

Target: Target 30D
X.shape: (745, 43)
y.shape: (745,)


In [27]:
model = get_model()
model.fit(X, y)
model.best_params_

{'n_estimators': 100, 'max_samples': 1.0, 'max_features': 0.25, 'max_depth': 5}

In [28]:
pd.DataFrame(model.cv_results_).sort_values('rank_test_score').iloc[:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_samples,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
2,0.296929,0.025054,0.041522,0.001926,100,1.0,0.25,5,"{'n_estimators': 100, 'max_samples': 1.0, 'max...",-0.282838,-0.369066,0.052453,0.068979,-0.295786,-0.165251,0.186904,1
9,0.259801,0.007272,0.039454,0.002553,100,0.75,log2,4,"{'n_estimators': 100, 'max_samples': 0.75, 'ma...",-0.074957,-0.874346,-0.132123,-0.007625,-0.248858,-0.267582,0.313543,2
6,0.355419,0.011469,0.045871,0.005692,125,0.75,0.25,5,"{'n_estimators': 125, 'max_samples': 0.75, 'ma...",-0.332737,-0.483939,0.05001,0.081201,-0.721201,-0.281333,0.30932,3
11,0.270634,0.009856,0.040086,0.006229,100,0.75,0.25,4,"{'n_estimators': 100, 'max_samples': 0.75, 'ma...",-0.24093,-0.573692,0.012525,0.033261,-0.641132,-0.281994,0.283507,4
5,0.36816,0.022471,0.051562,0.008356,125,1.0,sqrt,5,"{'n_estimators': 125, 'max_samples': 1.0, 'max...",-0.024348,-0.785331,-0.095174,0.025793,-0.570729,-0.289958,0.326317,5


In [29]:
feature_importances = feature_importances.join(
    pd.DataFrame(
        model.best_estimator_.feature_importances_,
        index = model.best_estimator_.feature_names_in_,
        columns = [target_col]
    )
)

preds = model.predict(X)
print_results(y, preds)

Target std: 0.103
R2: 0.874
MSE: 0.037
MAE: 0.028


In [30]:
pred_col_name = f'Pred {target_col}'
expected_errors[pred_col_name] = expected_error(y, preds)

stock_data.processed[pred_col_name] = (
    model.predict(
        stock_data.standardized.drop(columns = target_cols)
    ) * stock_data.processed['Close']
).round(2)
stock_data.processed[['Date', 'Close', target_col, pred_col_name]].dropna().iloc[-10:, :]

Unnamed: 0,Date,Close,Target 30D,Pred Target 30D
735,2024-01-19,1469.25,1408.15,1404.71
736,2024-01-20,1460.5,1386.35,1390.15
737,2024-01-23,1416.0,1388.45,1364.48
738,2024-01-24,1431.1,1374.2,1369.6
739,2024-01-25,1450.0,1363.55,1376.82
740,2024-01-29,1465.4,1367.85,1396.41
741,2024-01-30,1448.15,1337.2,1369.46
742,2024-01-31,1443.45,1308.65,1361.97
743,2024-02-01,1421.6,1329.0,1355.87
744,2024-02-02,1418.9,1341.45,1354.38


## Feature importances

In [31]:
feature_importances['Mean'] = feature_importances.mean(axis = 1)
(feature_importances.sort_values('Mean', ascending = False) * 100).round(1)

Unnamed: 0,Target 3D,Target 7D,Target 15D,Target 30D,Mean
PE,11.7,13.5,26.1,25.3,19.1
DayOfYear,9.4,8.6,15.5,16.6,12.5
Month,1.8,7.3,9.4,12.3,7.7
52W H,5.4,7.8,6.0,5.8,6.3
Range 60MA,2.5,2.7,3.5,7.4,4.0
VWAP 60MA,5.3,3.7,2.8,2.6,3.6
Close 60MA,4.7,3.7,2.6,1.8,3.2
Range 7MA,6.4,3.3,1.8,1.3,3.2
Range 15MA,5.8,2.2,2.7,1.7,3.1
Close 30MA,2.4,6.0,2.5,1.4,3.1


## Forecasts

In [32]:
stock_data.processed.filter(regex = "(Date)|(Close$)|(Pred.*)").iloc[-10:, :]

Unnamed: 0,Date,Close,Pred Target 3D,Pred Target 7D,Pred Target 15D,Pred Target 30D
765,2024-03-02,1408.15,1401.96,1380.56,1408.91,1454.51
766,2024-03-04,1386.35,1377.13,1372.94,1431.07,1447.98
767,2024-03-05,1388.45,1384.14,1376.19,1414.47,1450.21
768,2024-03-06,1374.2,1366.12,1374.99,1443.02,1436.79
769,2024-03-07,1363.55,1354.94,1364.49,1448.95,1432.51
770,2024-03-11,1367.85,1360.32,1366.21,1443.18,1432.15
771,2024-03-12,1337.2,1332.85,1341.09,1437.29,1416.96
772,2024-03-13,1308.65,1308.1,1313.98,1404.42,1410.02
773,2024-03-14,1329.0,1326.19,1331.97,1424.8,1415.03
774,2024-03-15,1341.45,1339.45,1351.57,1454.64,1429.64


In [33]:
latest_preds = stock_data.processed.iloc[-1]
print(f"Date: {latest_preds['Date'].date()}")
print(f"Close: {latest_preds['Close']}")

for pred in expected_errors.keys():
    print(f"{pred}: {latest_preds[pred]} ± {expected_errors[pred] * latest_preds['Close']:.2f}")

Date: 2024-03-15
Close: 1341.45
Pred Target 3D: 1339.45 ± 36.61
Pred Target 7D: 1351.57 ± 49.61
Pred Target 15D: 1454.64 ± 48.62
Pred Target 30D: 1429.64 ± 49.09
