# RandomForestRegressor, GridSearchCV
## Module import

In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt

## GridSearchCV training and testing

In [6]:
# Train a RandomForestRegressor with a cross-validated grid search for one
# offset, then evaluate the refit best model on the held-out test split.

# Set the directory containing the CSV files
input_dir = '../../1-preprocessing/data/8-splits/USD'

# Create DataFrames to store the results
# NOTE(review): neither frame is populated in this cell; presumably they are
# filled elsewhere in the notebook -- confirm.
oob_results = pd.DataFrame(columns=['Offset', 'OOB Error'])
r2_results = pd.DataFrame(columns=['Offset', 'R2'])

# Define the hyperparameter grid
# 1 * 2 * 3 * 3 * 3 = 54 candidates; with cv=5 below that is 270 fits plus
# one final refit of the best candidate.
param_grid = {
    'n_estimators': [100],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [1, 3, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Name of the sub-directory of input_dir holding this offset's split
offset = '30'

train_data = pd.read_csv(os.path.join(input_dir, offset, "train_data.csv"))
test_data = pd.read_csv(os.path.join(input_dir, offset, "test_data.csv"))

# Split the data into features and target: the last column is the target and
# every column except the first and the last is a feature.
# NOTE(review): the first column is skipped -- presumably an index/date
# column; confirm against the preprocessing step.
X_train = train_data.iloc[:, 1:-1]
y_train = train_data.iloc[:, -1]
X_test = test_data.iloc[:, 1:-1]
y_test = test_data.iloc[:, -1]

# Initialize model
# oob_score=True makes the fitted forest expose `oob_score_`;
# random_state=0 keeps the runs reproducible.
model = RandomForestRegressor(
    oob_score=True,
    random_state=0
)

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,             # 5-fold cross-validation
    scoring='r2',     # evaluation metric
    n_jobs=-1         # use all available CPU cores
)

# Run the grid search: fits every candidate on every fold, then refits the
# best candidate on the full training set (GridSearchCV's default refit=True).
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)

# Held-out performance. `grid_search.best_score_` is only the mean
# cross-validated R^2 on the training data, so it must not be read as the
# test result; compute the test-set R^2 explicitly.
test_r2 = r2_score(y_test, y_pred)

# Out-of-bag R^2 of the refit best model (available because oob_score=True)
oob_r2 = best_model.oob_score_

# Calculate the Directional Symmetry (hit rate): the fraction of test rows
# where prediction and target have the same sign (np.sign maps 0 to 0, so a
# zero only matches another zero).
hit_rate = (np.sign(y_pred) == np.sign(y_test)).mean()

print(f"Best parameters: {grid_search.best_params_}")
# Mean cross-validated R^2 of the best candidate (training data only)
print(f"Best R^2 score: {grid_search.best_score_:.3f}")
print(f"OOB R^2 score: {oob_r2:.3f}")
print(f"Test R^2 score: {test_r2:.3f}")
print(f"Directional Symmetry (hit rate): {hit_rate:.2f}")

KeyboardInterrupt: 

## Plot test-set targets against model predictions

In [None]:
# Overlay the held-out targets and the model's predictions on a single axis,
# both drawn against their positional index in the test set.
fig, ax = plt.subplots(figsize=(12, 6))

ax.plot(y_test.values, label='Test Set')
ax.plot(y_pred, label='Predictions')

ax.set_title(f'Test Set vs Predictions (Offset: {offset})')
ax.set_xlabel('Index')
ax.set_ylabel('Value')
ax.legend()

plt.show()