In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import numpy as np

def objective(params, X, y):
    """Evaluate one hyperparameter candidate for Hyperopt.

    A ``RandomForestRegressor`` is built from ``params``, trained on 80% of
    the supplied data and scored on the remaining 20%. Both the split and
    the forest use ``random_state=42``.

    Args:
        params: Mapping providing ``n_estimators``, ``max_depth``,
            ``min_samples_split`` and ``min_samples_leaf``; each value is
            truncated to ``int`` before being handed to the regressor.
        X: Feature matrix accepted by ``train_test_split``.
        y: Target values aligned with ``X``.

    Returns:
        ``{"loss": <hold-out RMSE>, "status": STATUS_OK}``.
    """
    integer_keys = (
        "n_estimators",
        "max_depth",
        "min_samples_split",
        "min_samples_leaf",
    )
    # The caller may supply floats; the regressor is given ints.
    forest_kwargs = {key: int(params[key]) for key in integer_keys}
    forest = RandomForestRegressor(random_state=42, n_jobs=-1, **forest_kwargs)

    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    forest.fit(X_fit, y_fit)

    holdout_mse = mean_squared_error(y_holdout, forest.predict(X_holdout))
    return {"loss": np.sqrt(holdout_mse), "status": STATUS_OK}

# Hyperopt search space. Each entry is (low, high, step) for a quantised
# uniform distribution; the hyperparameter name doubles as the Hyperopt label.
_QUNIFORM_BOUNDS = {
    "n_estimators": (50, 200, 10),
    "max_depth": (5, 20, 1),
    "min_samples_split": (2, 10, 1),
    "min_samples_leaf": (1, 5, 1),
}
space = {
    label: hp.quniform(label, low, high, step)
    for label, (low, high, step) in _QUNIFORM_BOUNDS.items()
}

# Load the two input tables. Only "company", "date" and "close" are taken
# from the combined table; everything else comes from the PCA table.
combined_df = pd.read_csv("data/combined_data_and_features.csv")
pca_df = pd.read_csv("data/pca_features.csv")

# Parse dates on both sides so the merge keys and the period masks below
# compare timestamps rather than strings.
combined_df["date"] = pd.to_datetime(combined_df["date"])
pca_df["date"] = pd.to_datetime(pca_df["date"])

# Inner join: keep only (company, date) pairs present in both tables, and
# attach the "close" column to the PCA features.
merged_data = pd.merge(
    pca_df,
    combined_df[["company", "date", "close"]],
    on=["company", "date"],
    how="inner",
)

# Period boundaries. The *_end values are the last calendar day of a period
# and are kept as module-level names; the masks below use the *_start values.
pre_covid_end = pd.to_datetime("2019-12-31")
covid_start = pd.to_datetime("2020-01-01")
covid_end = pd.to_datetime("2021-12-31")
post_covid_start = pd.to_datetime("2022-01-01")

# Half-open intervals [start, next_start). Comparing with "<= end" against a
# midnight timestamp would drop any row carrying a time-of-day on the final
# day of a period (e.g. 2019-12-31 16:00) from every period. For date-only
# data the result is the same as the closed-interval form.
periods = {
    "pre_covid": merged_data[merged_data["date"] < covid_start],
    "covid": merged_data[
        (merged_data["date"] >= covid_start)
        & (merged_data["date"] < post_covid_start)
    ],
    "post_covid": merged_data[merged_data["date"] >= post_covid_start],
}

# Run one Hyperopt search per period and collect the best parameters.
# Periods without enough usable rows are recorded with an empty string.
results = {}
for period_name, df in periods.items():
    print(f"Initial {period_name} data shape: {df.shape}")

    if df.empty:
        print(f"No sufficient data for {period_name}. Skipping Hyperopt.")
        results[period_name] = ""
        continue

    # Features are every column whose name contains "principal_component".
    feature_cols = [col for col in df.columns if "principal_component" in col]

    # Drop rows with a missing feature or target so X and y stay aligned.
    combined_xy = df[feature_cols + ["close"]].dropna()
    X = combined_xy[feature_cols]
    y = combined_xy["close"]

    # objective() holds out 20% of the rows; with a single row the train
    # split would be empty and train_test_split raises ValueError, so at
    # least two rows are required. X.empty also covers "no feature columns".
    if X.empty or len(X) < 2:
        print(f"No sufficient data for {period_name} after dropping NaNs. Skipping Hyperopt.")
        results[period_name] = ""
        continue

    trials = Trials()
    # fmin runs synchronously, so the lambda sees this iteration's X and y.
    best = fmin(
        fn=lambda p: objective(p, X, y),
        space=space,
        algo=tpe.suggest,
        max_evals=50,
        trials=trials,
    )

    results[period_name] = best
    print(f"Best hyperparameters for {period_name}: {best}")


# Summary of the best parameters (or the empty skip marker) per period.
for period, best_params in results.items():
    print(f"{period}: {best_params}")





--- Running Hyperopt for pre_covid period ---
Initial pre_covid data shape: (128, 42)
100%|██████████| 50/50 [00:03<00:00, 16.60trial/s, best loss: 507.0688780837199]
Best hyperparameters for pre_covid: {'max_depth': 9.0, 'min_samples_leaf': 3.0, 'min_samples_split': 4.0, 'n_estimators': 50.0}

--- Running Hyperopt for covid period ---
Initial covid data shape: (22, 42)
100%|██████████| 50/50 [00:03<00:00, 13.74trial/s, best loss: 615.7985162953983]
Best hyperparameters for covid: {'max_depth': 7.0, 'min_samples_leaf': 3.0, 'min_samples_split': 9.0, 'n_estimators': 160.0}

--- Running Hyperopt for post_covid period ---
Initial post_covid data shape: (0, 42)
No sufficient data for post_covid. Skipping Hyperopt.

--- Hyperopt Optimization Results ---
pre_covid: {'max_depth': 9.0, 'min_samples_leaf': 3.0, 'min_samples_split': 4.0, 'n_estimators': 50.0}
covid: {'max_depth': 7.0, 'min_samples_leaf': 3.0, 'min_samples_split': 9.0, 'n_estimators': 160.0}
post_covid: Skipped due to insufficie