In [2]:
import os
import pandas as pd
from scipy.stats import pearsonr

# === INPUT PATHS ===
# Directory whose entries are scanned below; every subfolder that holds a
# "Y_teacher.csv" file contributes one teacher table.
teacher_csv_dir = r"C:\Users\ns243\Documents\Academic\AI Master\Internship\Data\teacher_csvs"
# CSV with the feature table that is merged with the teacher tables.
# NOTE(review): the folder name suggests these are neutralized factors - confirm.
feature_path = r"C:\Users\ns243\Documents\Academic\AI Master\Internship\Data\Neutralized\factors_neutralized.csv"

# === OUTPUT ===
# Destination CSV for the merged (features + teacher) table written at the end.
# NOTE(review): all three paths are absolute and machine-specific.
final_output_path = r"C:\Users\ns243\Documents\Academic\AI Master\Internship\Data\final_distillation_data_dual.csv"

# === STEP 1: CONCATENATE ALL Y_teacher FILES ===
# Each entry of teacher_csv_dir that contains a "Y_teacher.csv" is loaded and
# tagged with the entry name in a "window" column; the pieces are then stacked
# into a single frame with a fresh integer index.
teacher_all = []
# Sorted so the row order of the combined frame is reproducible; os.listdir
# returns entries in arbitrary order.
for subfolder in sorted(os.listdir(teacher_csv_dir)):
    folder_path = os.path.join(teacher_csv_dir, subfolder)
    y_path = os.path.join(folder_path, "Y_teacher.csv")
    # isfile (rather than exists) also skips a directory that happens to be
    # named "Y_teacher.csv", which read_csv could not open.
    if not os.path.isfile(y_path):
        continue

    df_teacher = pd.read_csv(y_path)
    df_teacher["window"] = subfolder
    teacher_all.append(df_teacher)

# pd.concat raises an unhelpful "No objects to concatenate" on an empty list;
# fail with a message that names the directory that was searched instead.
if not teacher_all:
    raise FileNotFoundError(
        f"No Y_teacher.csv found in any subfolder of: {teacher_csv_dir}"
    )

# NOTE(review): if windows overlap, the same (date, stock) pair can appear in
# several teacher files and will duplicate rows in the later merges - confirm
# whether de-duplication is wanted.
teacher_df = pd.concat(teacher_all, ignore_index=True)
print(f"✅ Combined teacher data shape: {teacher_df.shape}")
print(f"Columns: {teacher_df.columns.tolist()}")

# === STEP 2: LOAD FEATURES ===
# Read the feature table and rename its key columns to the names used by the
# teacher table ("date" / "stock").
features = pd.read_csv(feature_path).rename(
    columns={"datetime": "date", "instrument": "stock"}
)

# Parse the date columns of both tables as timezone-aware (UTC) timestamps so
# they can serve as merge keys against each other.
features = features.assign(date=pd.to_datetime(features["date"], utc=True))
teacher_df = teacher_df.assign(date=pd.to_datetime(teacher_df["date"], utc=True))

# === STEP 3: Alignment test ===
# Second candidate join key: the teacher date moved back by one calendar day.
# NOTE(review): this is a calendar-day shift, not a trading-day shift; rows
# whose previous calendar day has no feature row will not match - confirm
# this is intended.
teacher_df = teacher_df.assign(
    date_shifted=teacher_df["date"] - pd.Timedelta(days=1)
)

def safe_corr(a, b):
    """Return the Pearson correlation of *a* and *b*, or NaN if undefined.

    Only positions where both inputs are non-missing are correlated, so a
    few NaN entries no longer turn the whole result into NaN.

    Args:
        a: Sequence or pandas Series of numeric values.
        b: Sequence or pandas Series of numeric values, same length as *a*.

    Returns:
        The Pearson correlation coefficient as a float. NaN is returned when
        either input is empty, the lengths differ, the values cannot be
        converted to float, fewer than two complete pairs remain, or one of
        the inputs is constant over the complete pairs.
    """
    nan = float("nan")

    # Empty or mismatched inputs have no defined correlation.
    if len(a) == 0 or len(a) != len(b):
        return nan

    try:
        # reset_index makes the pairing positional even if the inputs are
        # Series carrying different indexes.
        x = pd.Series(a).reset_index(drop=True).astype("float64")
        y = pd.Series(b).reset_index(drop=True).astype("float64")
    except (TypeError, ValueError):
        return nan

    # Keep pairwise-complete observations only.
    complete = x.notna() & y.notna()
    x = x[complete]
    y = y[complete]

    # pearsonr needs at least two points and is undefined for constant input.
    if len(x) < 2 or x.nunique() < 2 or y.nunique() < 2:
        return nan

    try:
        return pearsonr(x.to_numpy(), y.to_numpy())[0]
    except (TypeError, ValueError):
        return nan

# Join the features to the teacher rows twice: once on identical dates and
# once on the teacher date moved back by one day. Both are inner joins, so
# only matching (date, stock) pairs survive.
join_keys = ["date", "stock"]
merged_direct = features.merge(teacher_df, how="inner", on=join_keys)
merged_shifted = features.merge(
    teacher_df,
    how="inner",
    left_on=join_keys,
    right_on=["date_shifted", "stock"],
)

# Correlate the true regression target with the teacher's regression output
# under each alignment; a missing column falls back to an empty list.
reg_cols = ("Y_true_regression", "teacher_regression")
corr_reg_direct = safe_corr(*(merged_direct.get(col, []) for col in reg_cols))
corr_reg_shifted = safe_corr(*(merged_shifted.get(col, []) for col in reg_cols))

print(f"\n📊 Pearson (Regression, no shift): {corr_reg_direct:.6f}")
print(f"📊 Pearson (Regression, 1-day shift): {corr_reg_shifted:.6f}")

# === STEP 4: Choose alignment ===
# Keep the alignment whose Pearson correlation has the larger magnitude.
# Any comparison against NaN is False, so a plain `abs(a) > abs(b)` test would
# pick the direct merge even when its correlation is NaN and the shifted one
# is valid. An undefined correlation must never win, so NaN is handled
# explicitly; when both are undefined the direct merge stays the default.
shifted_defined = pd.notna(corr_reg_shifted)
direct_defined = pd.notna(corr_reg_direct)
use_shifted = shifted_defined and (
    not direct_defined or abs(corr_reg_shifted) > abs(corr_reg_direct)
)

if use_shifted:
    print("✅ Using 1-day shifted alignment.")
    final = merged_shifted.copy()
    # Both sides of the shifted merge carry a "date" column, so pandas
    # suffixes them; the feature-side one (date_x) becomes the "date" column.
    if "date_x" in final.columns:
        final.rename(columns={"date_x": "date"}, inplace=True)
else:
    print("✅ Using direct alignment.")
    final = merged_direct.copy()

# === STEP 5: Clean + save ===
# Discard rows lacking either regression value, order the table by date and
# stock, and renumber the index from zero before writing it out.
required_cols = ["Y_true_regression", "teacher_regression"]
final = (
    final.dropna(subset=required_cols)
    .sort_values(["date", "stock"])
    .reset_index(drop=True)
)

# The index is not written; only the columns end up in the CSV.
final.to_csv(final_output_path, index=False)
print(f"\n✅ Final merged dataset (regression + classification) saved to: {final_output_path}")
print(f"Final shape: {final.shape}")




✅ Combined teacher data shape: (348076, 7)
Columns: ['date', 'stock', 'Y_true_regression', 'teacher_regression', 'Y_true_classification', 'teacher_classification', 'window']

📊 Pearson (Regression, no shift): -0.022249
📊 Pearson (Regression, 1-day shift): -0.025191
✅ Using 1-day shifted alignment.

✅ Final merged dataset (regression + classification) saved to: C:\Users\ns243\Documents\Academic\AI Master\Internship\Data\final_distillation_data_dual.csv
Final shape: (267699, 110)
