In [7]:
# data_script1_fixed_parse.py
import os
import pandas as pd

# === USER PATHS ===
base_dir = r"C:\Users\ns243\Documents\Academic\AI Master\Internship\Data\splits_SP500"
teacher_output_dir = r"C:\Users\ns243\Documents\Academic\AI Master\Internship\Data\output"
teacher_csv_dir = r"C:\Users\ns243\Documents\Academic\AI Master\Internship\Data\teacher_csvs"

os.makedirs(teacher_csv_dir, exist_ok=True)

def extract_class1(val):
    """Extract the 2nd number (class 1 logit/score) from strings like '[-0.07  0.08]'."""
    if not isinstance(val, str):
        return float('nan')
    val = val.strip().strip('[]').strip()
    parts = val.split()
    if len(parts) < 2:
        return float('nan')
    try:
        return float(parts[1])
    except Exception:
        return float('nan')

# === LOOP OVER ALL SP500 WINDOWS ===
for subfolder in os.listdir(base_dir):
    if not subfolder.startswith("SP500_"):
        continue

    label_path = os.path.join(base_dir, subfolder, "label.csv")
    if not os.path.exists(label_path):
        print(f"Skipping {subfolder}: label.csv not found.")
        continue

    # --- Parse date range ---
    try:
        date_part = subfolder.split("_", 1)[1]  # e.g. 2015-10-05_2018-06-03
    except IndexError:
        continue

    # --- Construct teacher output paths ---
    reg_pred_path = os.path.join(teacher_output_dir, f"Multitask_output_{date_part}", "regression", "regression_pred_last_step.csv")
    reg_true_path = os.path.join(teacher_output_dir, f"Multitask_output_{date_part}", "regression", "regression_label_last_step.csv")
    cls_pred_path = os.path.join(teacher_output_dir, f"Multitask_output_{date_part}", "classification", "classification_pred_last_step.csv")
    cls_true_path = os.path.join(teacher_output_dir, f"Multitask_output_{date_part}", "classification", "classification_label_last_step.csv")

    if not (os.path.exists(reg_pred_path) and os.path.exists(reg_true_path)
            and os.path.exists(cls_pred_path) and os.path.exists(cls_true_path)):
        print(f"⚠ Missing one or more teacher outputs for {subfolder}")
        continue

    # --- Read data ---
    df_label = pd.read_csv(label_path, index_col=0)
    df_label.index = pd.to_datetime(df_label.index)

    df_reg_pred = pd.read_csv(reg_pred_path, header=None)
    df_reg_true = pd.read_csv(reg_true_path, header=None)
    df_cls_pred_raw = pd.read_csv(cls_pred_path, header=None)
    df_cls_true = pd.read_csv(cls_true_path, header=None)

    # --- Extract teacher’s class 1 score ---
    df_cls_pred = df_cls_pred_raw.applymap(extract_class1)

    test_len = df_reg_pred.shape[0]
    df_test_dates = df_label.tail(test_len)

    if not (df_reg_pred.shape == df_reg_true.shape == df_cls_pred.shape == df_cls_true.shape == df_test_dates.shape):
        print(f"⚠ Shape mismatch for {subfolder}")
        continue

    # --- Melt to long format ---
    df_long = pd.DataFrame({
        "date": list(df_test_dates.index) * df_test_dates.shape[1],
        "stock": [col for col in df_test_dates.columns for _ in range(len(df_test_dates))],
        "Y_true_regression": df_reg_true.values.T.flatten(),
        "teacher_regression": df_reg_pred.values.T.flatten(),
        "Y_true_classification": df_cls_true.values.T.flatten(),
        "teacher_classification": df_cls_pred.values.T.flatten(),
    })

    # --- Save output ---
    out_dir = os.path.join(teacher_csv_dir, date_part)
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, "Y_teacher.csv")
    df_long.to_csv(out_path, index=False)
    print(f"✅ Saved teacher CSV: {out_path}")

print("\n=== All rolling teacher CSVs generated successfully (regression + classification, class1 logits parsed). ===")
print(f"Saved under: {teacher_csv_dir}")




  df_cls_pred = df_cls_pred_raw.applymap(extract_class1)


✅ Saved teacher CSV: C:\Users\ns243\Documents\Academic\AI Master\Internship\Data\teacher_csvs\2015-10-05_2018-06-03\Y_teacher.csv


  df_cls_pred = df_cls_pred_raw.applymap(extract_class1)


✅ Saved teacher CSV: C:\Users\ns243\Documents\Academic\AI Master\Internship\Data\teacher_csvs\2016-06-21_2019-02-18\Y_teacher.csv


  df_cls_pred = df_cls_pred_raw.applymap(extract_class1)


✅ Saved teacher CSV: C:\Users\ns243\Documents\Academic\AI Master\Internship\Data\teacher_csvs\2017-03-08_2019-11-05\Y_teacher.csv


  df_cls_pred = df_cls_pred_raw.applymap(extract_class1)


✅ Saved teacher CSV: C:\Users\ns243\Documents\Academic\AI Master\Internship\Data\teacher_csvs\2017-11-23_2020-07-22\Y_teacher.csv


  df_cls_pred = df_cls_pred_raw.applymap(extract_class1)


✅ Saved teacher CSV: C:\Users\ns243\Documents\Academic\AI Master\Internship\Data\teacher_csvs\2018-08-10_2021-04-08\Y_teacher.csv


  df_cls_pred = df_cls_pred_raw.applymap(extract_class1)


✅ Saved teacher CSV: C:\Users\ns243\Documents\Academic\AI Master\Internship\Data\teacher_csvs\2019-04-27_2021-12-24\Y_teacher.csv


  df_cls_pred = df_cls_pred_raw.applymap(extract_class1)


✅ Saved teacher CSV: C:\Users\ns243\Documents\Academic\AI Master\Internship\Data\teacher_csvs\2020-01-12_2022-09-10\Y_teacher.csv


  df_cls_pred = df_cls_pred_raw.applymap(extract_class1)


✅ Saved teacher CSV: C:\Users\ns243\Documents\Academic\AI Master\Internship\Data\teacher_csvs\2020-09-28_2023-05-28\Y_teacher.csv


  df_cls_pred = df_cls_pred_raw.applymap(extract_class1)


✅ Saved teacher CSV: C:\Users\ns243\Documents\Academic\AI Master\Internship\Data\teacher_csvs\2021-06-15_2024-02-12\Y_teacher.csv


  df_cls_pred = df_cls_pred_raw.applymap(extract_class1)


✅ Saved teacher CSV: C:\Users\ns243\Documents\Academic\AI Master\Internship\Data\teacher_csvs\2022-03-02_2024-10-29\Y_teacher.csv


  df_cls_pred = df_cls_pred_raw.applymap(extract_class1)


✅ Saved teacher CSV: C:\Users\ns243\Documents\Academic\AI Master\Internship\Data\teacher_csvs\2022-11-17_2025-07-16\Y_teacher.csv

=== All rolling teacher CSVs generated successfully (regression + classification, class1 logits parsed). ===
Saved under: C:\Users\ns243\Documents\Academic\AI Master\Internship\Data\teacher_csvs
