In [4]:
import os
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# ============================================================
# 1. Load EDA-transformed daily file
# ============================================================

# Location of the EDA output, relative to the working directory.
# Adjust if the file lives elsewhere.
daily_csv_path = "../EDA/daily.csv"

# Read the table with `date` parsed as datetimes, order the rows by company
# and then by date within each company, and renumber the index 0..n-1 so it
# matches the sorted order.
daily = (
    pd.read_csv(daily_csv_path, parse_dates=["date"])
    .sort_values(by=["company_name", "date"])
    .reset_index(drop=True)
)

# ============================================================
# 2. Define features and targets
# ============================================================

# Columns the models are asked to predict.
target_cols = ["open", "high", "low", "close"]

# Candidate predictor columns. A name is kept only when the loaded table
# actually contains it, so an absent column is dropped here instead of
# raising a KeyError further down.
feature_cols = [
    name
    for name in (
        "log_return_1d", "log_return_5d", "log_return_20d",
        "ma_5d", "ma_20d", "ma_60d",
        "vol_20d", "vol_60d",
        "vol_sum_5d", "vol_sum_20d",
        "close_over_ma_20d",
        "drawdown",
        "volume",
        "day_of_week", "month", "year",
    )
    if name in daily.columns
]

print("Using features:", feature_cols)
print("Targets:", target_cols)

# Turn +inf / -inf feature values into NaN so that the per-company dropna
# further down removes those rows together with ordinary missing values.
daily[feature_cols] = daily[feature_cols].replace(
    to_replace=[np.inf, -np.inf], value=np.nan
)

# ============================================================
# 3. Per-company sequential 80:20 split and modelling
# ============================================================

def _finite_mape(y_true, y_pred):
    """Return the mean absolute percentage error, in percent.

    Entries whose percentage error is not finite are left out of the mean.
    A zero actual gives ``x / 0 = inf`` (or ``0 / 0 = nan``), and
    ``np.nanmean`` only skips NaN, so a single ``inf`` would otherwise turn
    the whole metric into ``inf``.

    Args:
        y_true: Array of actual values.
        y_pred: Array of predicted values, same shape as ``y_true``.

    Returns:
        The mean of ``|y_true - y_pred| / |y_true| * 100`` over the entries
        where that ratio is finite, or ``np.nan`` when there are none.
    """
    # The division warnings are expected for zero actuals; the offending
    # entries are filtered out right below.
    with np.errstate(divide="ignore", invalid="ignore"):
        ape = np.abs((y_true - y_pred) / y_true) * 100
    finite_ape = ape[np.isfinite(ape)]
    if finite_ape.size == 0:
        return np.nan
    return finite_ape.mean()


# Long-format prediction frames, one per (company, target) pair.
results = []
# One dict of evaluation metrics per company.
metrics_list = []

companies = daily["company_name"].unique()

# NOTE(review): features and targets are read from the same row. If columns
# such as ma_5d or close_over_ma_20d are derived from that day's close, the
# model is shown the value it is asked to predict -- confirm that the EDA
# step lags them before trusting the scores below.
for cname in companies:
    df_c = daily[daily["company_name"] == cname].copy()
    df_c = df_c.sort_values("date")

    # Drop rows with a missing feature or target. Infinite feature values
    # were converted to NaN earlier, so they are removed here as well.
    df_c = df_c.dropna(subset=feature_cols + target_cols, how="any")
    if len(df_c) < 10:
        print(f"Skipping {cname}: too few rows after dropping NaNs/infs.")
        continue

    # Chronological split: the earliest 80% of rows train the model and the
    # most recent 20% are held out, so evaluation never precedes training.
    n = len(df_c)
    train_size = int(n * 0.8)
    train_df = df_c.iloc[:train_size]
    test_df = df_c.iloc[train_size:]

    X_train = train_df[feature_cols].values.astype(float)
    X_test = test_df[feature_cols].values.astype(float)

    print(f"\n=== {cname} ===")
    print("Train size:", len(train_df), "Test size:", len(test_df))

    company_metrics = {"company_name": cname}

    # A separate forest is fitted for every target column.
    for target in target_cols:
        y_train = train_df[target].values.astype(float)
        y_test = test_df[target].values.astype(float)

        model = RandomForestRegressor(
            n_estimators=200,
            random_state=42,  # fixed seed for reproducible forests
            n_jobs=-1,  # use all available cores
        )
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        tmp = pd.DataFrame({
            "company_name": cname,
            "date": test_df["date"].values,
            "target": target,
            "y_true": y_test,
            "y_pred": y_pred,
        })
        results.append(tmp)

        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        mape = _finite_mape(y_test, y_pred)

        company_metrics[f"{target}_RMSE"] = rmse
        company_metrics[f"{target}_MAE"] = mae
        company_metrics[f"{target}_MAPE"] = mape
        company_metrics[f"{target}_R2"] = r2

        print(
            f"  {target}: RMSE={rmse:.4f}, MAE={mae:.4f}, "
            f"MAPE={mape:.2f}%, R2={r2:.4f}"
        )

    metrics_list.append(company_metrics)

# ============================================================
# 4. Combine and save predictions/metrics
# ============================================================

# Stack the per-company, per-target prediction frames into one long table.
# When every company was skipped there is nothing to stack, and pd.concat
# rejects an empty list, so fall back to an empty frame.
if results:
    predictions_df = pd.concat(results, ignore_index=True)
else:
    predictions_df = pd.DataFrame()
metrics_df = pd.DataFrame(metrics_list)

# Show the heading and the table on consecutive lines.
print("\nPrediction samples:", predictions_df.head(), sep="\n")
print("\nPer-company metrics:", metrics_df, sep="\n")

# Output locations; the directory is created on demand.
save_dir = "../Data_processed"
predictions_path = os.path.join(save_dir, "daily_predictions_all_companies.csv")
metrics_path = os.path.join(save_dir, "daily_model_metrics_all_companies.csv")

os.makedirs(save_dir, exist_ok=True)

# Write both tables without the row index, then report where they went.
predictions_df.to_csv(predictions_path, index=False)
metrics_df.to_csv(metrics_path, index=False)

print("\nSaved predictions to:", predictions_path)
print("Saved metrics to:", metrics_path)

Using features: ['log_return_1d', 'log_return_5d', 'log_return_20d', 'ma_5d', 'ma_20d', 'ma_60d', 'vol_20d', 'vol_60d', 'vol_sum_5d', 'vol_sum_20d', 'close_over_ma_20d', 'drawdown', 'volume', 'day_of_week', 'month', 'year']
Targets: ['open', 'high', 'low', 'close']

=== ADANIGREEN ===
Train size: 2060 Test size: 515
  open: RMSE=34.4677, MAE=23.0577, MAPE=1.75%, R2=0.9935
  high: RMSE=36.1404, MAE=24.2210, MAPE=1.71%, R2=0.9930
  low: RMSE=29.7278, MAE=19.3202, MAPE=1.47%, R2=0.9950
  close: RMSE=23.5491, MAE=12.3984, MAPE=0.94%, R2=0.9969

=== ADANIPOWER ===
Train size: 3045 Test size: 762
  open: RMSE=182.3694, MAE=152.0410, MAPE=25.38%, R2=-1.0722
  high: RMSE=191.6062, MAE=160.7418, MAPE=26.43%, R2=-1.2449
  low: RMSE=185.1903, MAE=154.5959, MAPE=26.29%, R2=-1.2232
  close: RMSE=198.5067, MAE=166.3702, MAPE=27.91%, R2=-1.5051

=== AMBUJACEM ===
Train size: 3045 Test size: 762
  open: RMSE=32.7453, MAE=19.3939, MAPE=3.16%, R2=0.8124
  high: RMSE=37.5234, MAE=23.2573, MAPE=3.74%, R2=