# 0. Preliminary 

In [None]:
pip install -q numpy pandas tqdm scikit-learn fastparquet

In [None]:
import os
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.linear_model import LinearRegression, HuberRegressor

In [None]:
def expanding_window_split(
    df: pd.DataFrame,
    train_size: int,
    val_size: int,
    test_size: int,
    step_size: int,
    start_date: str | None = None,
    end_date: str | None = None,
):
    """Expanding window split for time series data.

    Args:
        df (pd.DataFrame): DataFrame containing a 'month' column in datetime format.
        train_size (int): Number of months to include in the training set.
        val_size (int): Number of months to include in the validation set.
        test_size (int): Number of months to include in the test set.
        step_size (int): Number of months to step forward for each iteration.
        start_date (str | None, optional): Start date for the data split. Defaults to None.
        end_date (str | None, optional): End date for the data split. Defaults to None.

    Raises:
        TypeError: If 'month' column is not in datetime format.

    Yields:
        train, val, test (pd.DataFrame): DataFrames containing the train, validation, and test sets.
    """
    
    # Ensure 'month' column is in datetime format
    if not pd.api.types.is_datetime64_any_dtype(df["month"]):
        raise TypeError("'month' column must be in datetime format")

    # Apply date filters if provided
    mask = pd.Series(True, index=df.index)
    
    if start_date:
        mask &= df["month"] >= pd.Timestamp(start_date)
    if end_date:
        mask &= df["month"] <= pd.Timestamp(end_date)

    months = sorted(df.loc[mask, "month"].unique())

    # Set end index
    end_idx = train_size + val_size + test_size + 1
    
    # Create a while loop to iterate until the end index exceeds the number of unique months
    while end_idx <= len(months):
        train_months = months[: end_idx - (val_size + test_size)]
        val_months   = months[end_idx - (val_size + test_size) : end_idx - test_size]
        test_months  = months[end_idx - test_size : end_idx]

        # Slice firm-month panel
        train = df[df["month"].isin(train_months)]
        val = df[df["month"].isin(val_months)]
        test = df[df["month"].isin(test_months)]

        # Stream one result at a time
        yield train, val, test

        # Expand by step_size months
        end_idx += step_size

# 1. OLS

In [None]:
# ======================================================
# 4. OLS REGRESSION FUNCTION (FIXED, SELF-CONTAINED)
# ======================================================
def _oos_r2(y_true, y_pred):
    """Return 1 - SSE / sum(y_true**2), i.e. R² measured against a zero forecast."""
    return 1 - np.sum((y_true - y_pred)**2) / np.sum(y_true**2)


def OLS_regression(path, features=None, use_all_features=False, target="ret_excess", 
                   start_year=None, end_year=None, train_size=60, 
                   val_size=36, test_size=12, step_size=12):
    """
    Fit OLS on expanding windows and collect out-of-sample predictions + R².

    The panel is read from the parquet file at `path`, restricted to
    [`start_year`-01-01, `end_year`-12-31], and rows with a missing feature
    or target are dropped. For every split produced by
    `expanding_window_split`, a `LinearRegression` with intercept is fitted
    on the train and validation rows together (the validation rows are not
    used for tuning here) and evaluated on the test rows.

    Args:
        path: Parquet file to read. It must contain 'month', the model
            columns and the metadata columns 'permno', 'cik', 'prc',
            'shrout' and 'mktcap_lag'.
        features: Feature column names. Required unless `use_all_features`.
        use_all_features: If True, use every column that is neither in
            `EXCLUDE_COLS` nor the target.
        target: Name of the dependent variable.
        start_year, end_year: Optional inclusive year bounds for the sample.
        train_size, val_size, test_size, step_size: Window sizes in months,
            passed to `expanding_window_split`.

    Returns:
        dict with
          - "y_tests": list of (y_test, y_pred) arrays, one pair per window
          - "permno", "month", "cik", "prc", "shrout", "mktcap_lag":
            lists of per-window arrays aligned with "y_tests"
          - "r2_window": {"R2.<YYYY-MM of first test month>": R² of the window}
          - "R2_full": R² over all test observations pooled
          - "features", "start_year", "end_year": the settings used

    Raises:
        KeyError: If 'month' or any required column is missing.
        ValueError: If no feature set is specified, or the sample is too
            short to form a single train/val/test window.

    Note:
        R² is 1 - SSE / sum(y²): the benchmark is a zero forecast, not the
        sample mean of the target.
    """
    meta_cols = ["permno", "month", "cik", "prc", "shrout", "mktcap_lag"]

    # ================== LOAD DATA (INLINE LOAD_DATA) ==================
    df = pd.read_parquet(path)

    # ensure month is datetime
    if "month" not in df.columns:
        raise KeyError("'month' column not found in data.")
    if not pd.api.types.is_datetime64_any_dtype(df["month"]):
        df["month"] = pd.to_datetime(df["month"])

    # apply year filters
    if start_year is not None:
        df = df[df["month"] >= pd.Timestamp(f"{start_year}-01-01")]
    if end_year is not None:
        df = df[df["month"] <= pd.Timestamp(f"{end_year}-12-31")]
    # ================== END LOAD DATA PART ==================

    # choose feature set
    if use_all_features:
        # NOTE(review): EXCLUDE_COLS is not defined in this part of the
        # notebook; it must exist at module level for this branch to work.
        excluded = set(EXCLUDE_COLS) | {target}
        features = [c for c in df.columns if c not in excluded]
        print(f"Using ALL {len(features)} features.")
    elif not features:
        raise ValueError("Must specify `features` or set `use_all_features=True`.")
    features = list(features)
    model_cols = features + [target]

    # Fail before any fitting if a column needed later is absent.
    missing = [c for c in model_cols + meta_cols if c not in df.columns]
    if missing:
        raise KeyError(f"Columns not found in data: {missing}")

    # astype() with a mapping returns a new frame, so no values are assigned
    # into a filtered slice of the original data.
    df = df.dropna(subset=model_cols).astype({c: "float32" for c in model_cols})

    # generator (df is already year-filtered; the bounds are passed for symmetry)
    splits = expanding_window_split(
        df, train_size, val_size, test_size, step_size,
        start_date=f"{start_year}-01-01" if start_year is not None else None,
        end_date=f"{end_year}-12-31" if end_year is not None else None
    )

    # now also keep cik, prc, shrout, mktcap_lag
    results = {
        "y_tests":    [],
        "r2_window":  {},
        "permno":     [],
        "month":      [],
        "cik":        [],
        "prc":        [],
        "shrout":     [],
        "mktcap_lag": [],
    }

    # rolling loop
    for i, (train, val, test) in enumerate(splits, 1):

        X_fit = np.concatenate([train[features].values, val[features].values])
        y_fit = np.concatenate([train[target].values, val[target].values])

        X_test = test[features].values
        y_test = test[target].values

        ols = LinearRegression(fit_intercept=True).fit(X_fit, y_fit)
        y_pred = ols.predict(X_test)

        # store meta info for later saving
        for col in meta_cols:
            results[col].append(test[col].values)

        results["y_tests"].append((y_test, y_pred))

        # Label by the earliest test month; the first row of `test` is only
        # that month when the panel happens to be sorted by month.
        label = test["month"].min().strftime("%Y-%m")
        r2 = _oos_r2(y_test, y_pred)
        results["r2_window"][f"R2.{label}"] = r2

        print(f"[{i:02d}] {label} | R²(test) = {r2:.2%}")

    if not results["y_tests"]:
        raise ValueError(
            "No train/val/test window fits in the selected sample: "
            f"need at least {train_size + val_size + test_size} distinct months."
        )

    # full-sample R²
    all_y = np.concatenate([y for y, _ in results["y_tests"]])
    all_pred = np.concatenate([p for _, p in results["y_tests"]])
    results["R2_full"] = _oos_r2(all_y, all_pred)

    # save metadata
    results["features"]   = features
    results["start_year"] = start_year
    results["end_year"]   = end_year

    print(f"\n=== Full OOS R² (OLS): {results['R2_full']:.2%} ===\n")
    return results

## OLS without insider

In [None]:
# Panel used for the baseline run (no insider-trading signals among the features).
path = "/work/Thesis/Data/finalized_true.parquet"

# Expanding-window OLS on three `char_*` characteristics, sample 2005-2021:
# 60 training + 36 validation months, 12-month test block, moved 12 months per step.
res_ols_3 = OLS_regression(
    path=path,
    features=["char_mvel1", "char_mom12m", "char_bm"],
    use_all_features=False,
    target="ret_excess",
    start_year=2005, end_year=2021,
    train_size=60, val_size=36, test_size=12, step_size=12,
)

In [None]:
# ======================================================
# 6. SAVE OLS OUTPUTS (minimal columns)
# ======================================================
# NOTE(review): the next cell writes "ols_output.parquet" / "ols_R2.json";
# on a case-insensitive filesystem those names collide with the files
# written here.

# Per-window arrays collected by OLS_regression for the baseline run.
y_tests, permno_list, month_list = (
    res_ols_3[key] for key in ("y_tests", "permno", "month")
)

# Stack the windows into one long vector per column.
y_true = np.concatenate([pair[0] for pair in y_tests])
y_pred = np.concatenate([pair[1] for pair in y_tests])
permno = np.concatenate(permno_list)
months = pd.to_datetime(np.concatenate(month_list))

df_ols = pd.DataFrame(
    {"permno": permno, "month": months, "y_true": y_true, "ols_y_pred": y_pred}
)

print("\n===== OLS output preview =====")
print(df_ols.head())
print("=================================\n")

df_ols.to_parquet("OLS_output.parquet", index=False)
print("Saved OLS_output.parquet")

# R² summary (pooled and per window), converted to plain floats for JSON.
r2_metrics_ols = dict(
    model="OLS (baseline)",
    R2_full=float(res_ols_3["R2_full"]),
    R2_window={name: float(score) for name, score in res_ols_3["r2_window"].items()},
    start_year=res_ols_3["start_year"],
    end_year=res_ols_3["end_year"],
    features=res_ols_3["features"],
)

with open("OLS_R2.json", "w") as f:
    json.dump(r2_metrics_ols, f, indent=2)

print("Saved OLS_R2.json")

In [None]:
# ======================================================
# 6. SAVE OLS OUTPUTS (NN1-style columns)
# ======================================================
# Per-window arrays collected by OLS_regression for the baseline run.
(
    y_tests,
    permno_list,
    month_list,
    cik_list,
    prc_list,
    shrout_list,
    mktcap_list,
) = (
    res_ols_3[key]
    for key in ("y_tests", "permno", "month", "cik", "prc", "shrout", "mktcap_lag")
)

# Stack the windows into one long vector per column.
y_true = np.concatenate([pair[0] for pair in y_tests])
y_pred = np.concatenate([pair[1] for pair in y_tests])
permno, cik, prc, shrout, mktcap_lag = (
    np.concatenate(chunks)
    for chunks in (permno_list, cik_list, prc_list, shrout_list, mktcap_list)
)
months = pd.to_datetime(np.concatenate(month_list))

# Target and prediction columns use the same names as the NN1 output.
df_ols = pd.DataFrame(
    {
        "month": months,
        "cik": cik,
        "permno": permno,
        "ret_excess": y_true,
        "prc": prc,
        "shrout": shrout,
        "mktcap_lag": mktcap_lag,
        "pred_ret_excess": y_pred,
    }
)

print("\n===== OLS output preview =====")
print(df_ols.head())
print("=================================\n")

df_ols.to_parquet("ols_output.parquet", index=False)
print("Saved ols_output.parquet")

# R² summary (pooled and per window), converted to plain floats for JSON.
r2_metrics_ols = dict(
    model="ols (baseline)",
    R2_full=float(res_ols_3["R2_full"]),
    R2_window={name: float(score) for name, score in res_ols_3["r2_window"].items()},
    start_year=res_ols_3["start_year"],
    end_year=res_ols_3["end_year"],
    features=res_ols_3["features"],
)

with open("ols_R2.json", "w") as f:
    json.dump(r2_metrics_ols, f, indent=2)

print("Saved ols_R2.json")

## OLS with insider trading (Outsider)

In [None]:
# Panel for the "outsider" run: the three baseline characteristics plus
# the `is_*` insider-trading columns listed below.
path = "/work/Thesis/Data/2. Outsider/with_outsider.parquet"

# Same window settings as the baseline run; only the data file and the
# feature list differ.
res_ols_outsider = OLS_regression(
    path=path,
    features=[
        "char_mvel1",
        "char_mom12m",
        "char_bm",
        "is_txn_purchase_x_is_tit_ceo",
        "is_txn_purchase_x_is_tit_cfo",
        "is_txn_purchase_x_is_tit_coo",
        "is_txn_purchase_x_is_tit_director",
        "is_txn_purchase_x_is_tit_other_officer",
        "is_txn_purchase_x_is_tit_vice_president",
        "is_txn_sell_x_is_tit_ceo",
        "is_txn_sell_x_is_tit_ten_percent_owner",
        "is_opp_buy",
        "is_opp_sell",
        "is_rtn_buy",
        "is_rtn_sell",
        "is_npr_volume",
        "is_net_cluster",
    ],
    use_all_features=False,
    target="ret_excess",
    start_year=2005, end_year=2021,
    train_size=60, val_size=36, test_size=12, step_size=12,
)

In [None]:
import json
import numpy as np
import pandas as pd

# Short alias for the run being exported.
res = res_ols_outsider

# Per-window arrays collected by OLS_regression.
(
    y_tests,
    permno_list,
    month_list,
    cik_list,
    prc_list,
    shrout_list,
    mktcap_list,
) = (
    res[key]
    for key in ("y_tests", "permno", "month", "cik", "prc", "shrout", "mktcap_lag")
)

# Stack the windows into one long vector per column.
y_true = np.concatenate([pair[0] for pair in y_tests])
y_pred = np.concatenate([pair[1] for pair in y_tests])
permno, cik, prc, shrout, mktcap_lag = (
    np.concatenate(chunks)
    for chunks in (permno_list, cik_list, prc_list, shrout_list, mktcap_list)
)
months = pd.to_datetime(np.concatenate(month_list))

# One row per test observation: identifiers, realised and predicted return.
df_ols_outsider = pd.DataFrame(
    {
        "month": months,
        "cik": cik,
        "permno": permno,
        "ret_excess": y_true,
        "prc": prc,
        "shrout": shrout,
        "mktcap_lag": mktcap_lag,
        "pred_ret_excess": y_pred,
    }
)

print("\n===== OLS + OUTSIDER output preview =====")
print(df_ols_outsider.head())
print("=========================================\n")

# Predictions go to parquet.
df_ols_outsider.to_parquet("ols_outsider_output.parquet", index=False)
print("Saved ols_outsider_output.parquet")

# R² summary (pooled and per window), converted to plain floats for JSON.
r2_metrics_outsider = dict(
    model="ols + outsider",
    R2_full=float(res["R2_full"]),
    R2_window={name: float(score) for name, score in res["r2_window"].items()},
    start_year=res["start_year"],
    end_year=res["end_year"],
    features=res["features"],
)

with open("ols_outsider_R2.json", "w") as f:
    json.dump(r2_metrics_outsider, f, indent=2)

print("Saved ols_outsider_R2.json")

## OLS with insider trading (Insider)

In [None]:
# Panel for the "insider" run: the three baseline characteristics plus
# the `is_*` insider-trading columns listed below.
path = "/work/Thesis/Data/3. Insider/with_insider.parquet"

# Same window settings and feature names as the outsider run; only the
# data file differs.
res_ols_insider = OLS_regression(
    path=path,
    features=[
        "char_mvel1",
        "char_mom12m",
        "char_bm",
        "is_txn_purchase_x_is_tit_ceo",
        "is_txn_purchase_x_is_tit_cfo",
        "is_txn_purchase_x_is_tit_coo",
        "is_txn_purchase_x_is_tit_director",
        "is_txn_purchase_x_is_tit_other_officer",
        "is_txn_purchase_x_is_tit_vice_president",
        "is_txn_sell_x_is_tit_ceo",
        "is_txn_sell_x_is_tit_ten_percent_owner",
        "is_opp_buy",
        "is_opp_sell",
        "is_rtn_buy",
        "is_rtn_sell",
        "is_npr_volume",
        "is_net_cluster",
    ],
    use_all_features=False,
    target="ret_excess",
    start_year=2005, end_year=2021,
    train_size=60, val_size=36, test_size=12, step_size=12,
)

In [None]:
import json
import numpy as np
import pandas as pd

# Short alias for the run being exported.
res = res_ols_insider

# Per-window arrays collected by OLS_regression.
(
    y_tests,
    permno_list,
    month_list,
    cik_list,
    prc_list,
    shrout_list,
    mktcap_list,
) = (
    res[key]
    for key in ("y_tests", "permno", "month", "cik", "prc", "shrout", "mktcap_lag")
)

# Stack the windows into one long vector per column.
y_true = np.concatenate([pair[0] for pair in y_tests])
y_pred = np.concatenate([pair[1] for pair in y_tests])
permno, cik, prc, shrout, mktcap_lag = (
    np.concatenate(chunks)
    for chunks in (permno_list, cik_list, prc_list, shrout_list, mktcap_list)
)
months = pd.to_datetime(np.concatenate(month_list))

# One row per test observation: identifiers, realised and predicted return.
df_ols_insider = pd.DataFrame(
    {
        "month": months,
        "cik": cik,
        "permno": permno,
        "ret_excess": y_true,
        "prc": prc,
        "shrout": shrout,
        "mktcap_lag": mktcap_lag,
        "pred_ret_excess": y_pred,
    }
)

print("\n===== OLS + INSIDER output preview =====")
print(df_ols_insider.head())
print("========================================\n")

# Predictions go to parquet.
df_ols_insider.to_parquet("ols_insider_output.parquet", index=False)
print("Saved ols_insider_output.parquet")

# R² summary (pooled and per window), converted to plain floats for JSON.
r2_metrics_insider = dict(
    model="ols + insider",
    R2_full=float(res["R2_full"]),
    R2_window={name: float(score) for name, score in res["r2_window"].items()},
    start_year=res["start_year"],
    end_year=res["end_year"],
    features=res["features"],
)

with open("ols_insider_R2.json", "w") as f:
    json.dump(r2_metrics_insider, f, indent=2)

print("Saved ols_insider_R2.json")