# 0. Preliminary

In [None]:
!pip install numpy pandas matplotlib seaborn scikit-learn tensorflow keras-tuner tqdm fastparquet --quiet

In [None]:
import os, gc
import numpy as np
import json
import pandas as pd
from tqdm import tqdm
from joblib import Parallel, delayed

In [None]:
def expanding_window_split(
    df: pd.DataFrame,
    train_size: int,
    val_size: int,
    test_size: int,
    step_size: int,
    start_date: str | None = None,
    end_date: str | None = None,
):
    """Expanding window split for time series data.

    Args:
        df (pd.DataFrame): DataFrame containing a 'month' column in datetime format.
        train_size (int): Number of months to include in the training set.
        val_size (int): Number of months to include in the validation set.
        test_size (int): Number of months to include in the test set.
        step_size (int): Number of months to step forward for each iteration.
        start_date (str | None, optional): Start date for the data split. Defaults to None.
        end_date (str | None, optional): End date for the data split. Defaults to None.

    Raises:
        TypeError: If 'month' column is not in datetime format.

    Yields:
        train, val, test (pd.DataFrame): DataFrames containing the train, validation, and test sets.
    """
    
    # Ensure 'month' column is in datetime format
    if not pd.api.types.is_datetime64_any_dtype(df["month"]):
        raise TypeError("'month' column must be in datetime format")

    # Apply date filters if provided
    mask = pd.Series(True, index=df.index)
    
    if start_date:
        mask &= df["month"] >= pd.Timestamp(start_date)
    if end_date:
        mask &= df["month"] <= pd.Timestamp(end_date)

    months = sorted(df.loc[mask, "month"].unique())

    # Set end index
    end_idx = train_size + val_size + test_size + 1
    
    # Create a while loop to iterate until the end index exceeds the number of unique months
    while end_idx <= len(months):
        train_months = months[: end_idx - (val_size + test_size)]
        val_months   = months[end_idx - (val_size + test_size) : end_idx - test_size]
        test_months  = months[end_idx - test_size : end_idx]

        # Slice firm-month panel
        train = df[df["month"].isin(train_months)]
        val = df[df["month"].isin(val_months)]
        test = df[df["month"].isin(test_months)]

        # Stream one result at a time
        yield train, val, test

        # Expand by step_size months
        end_idx += step_size

# 2.0 Neural Network Setup

In [None]:
# ------------------------------------------------------------
# Silence TensorFlow CUDA and CPU feature logs
# ------------------------------------------------------------
# This cell sits before the cell that imports TensorFlow.
# NOTE(review): presumably the environment variable has to be set before
# TensorFlow is first imported to take effect — confirm.
import os, logging, absl.logging
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"  # Hide INFO/WARN/ERROR logs from TensorFlow
# Raise the absl logger and the Python "tensorflow" logger to ERROR so that
# only errors are emitted through those channels.
absl.logging.set_verbosity(absl.logging.ERROR)
logging.getLogger("tensorflow").setLevel(logging.ERROR)

In [None]:
import os
import gc
import numpy as np
import pandas as pd
from tqdm import tqdm
from joblib import Parallel, delayed

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import L1
from tensorflow.keras.callbacks import EarlyStopping
import keras_tuner as kt

# ===== CPU / Threading =====
# Thread-count and device settings exported through the process environment.
# NOTE(review): these assignments execute after `import numpy` and
# `import tensorflow` earlier in this cell. Libraries that read such variables
# once at load time may not see them in this process — confirm, or move the
# assignments above the imports.
# NOTE(review): the values are presumably inherited by the joblib worker
# processes started in neural_network(); 96 threads in each of several
# workers may oversubscribe the machine — verify on the target host.
os.environ["OMP_NUM_THREADS"] = "96"
os.environ["OPENBLAS_NUM_THREADS"] = "96"
os.environ["MKL_NUM_THREADS"] = "96"
os.environ["VECLIB_MAXIMUM_THREADS"] = "96"
os.environ["NUMEXPR_NUM_THREADS"] = "96"
os.environ["TF_NUM_INTRAOP_THREADS"] = "96"
os.environ["TF_NUM_INTEROP_THREADS"] = "8"
# Presumably hides every GPU so that TensorFlow runs on the CPU — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
# NOTE(review): this replaces the "3" assigned in the log-silencing cell.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

# Request the same intra-op / inter-op thread counts through the TensorFlow API.
tf.config.threading.set_intra_op_parallelism_threads(96)
tf.config.threading.set_inter_op_parallelism_threads(8)

# ===== Global hyperparameters =====
# Populated by neural_network() with the L1 penalty and learning rate chosen
# by its grid search; None until a search has run.
BEST_L1_GLOBAL = None
BEST_LR_GLOBAL = None


def neural_network(
    df,
    target,
    features=None,
    use_all_features=False,
    start_year=None,
    end_year=None,
    train_size=2,
    val_size=2,
    test_size=1,
    step_size=1,
    epochs=50,
    batch_size=128,
    lr=[0.0001, 0.001, 0.01, 0.1],
    l1_reg=[1e-5, 1e-4, 1e-3, 1e-2],
    hidden_depth=3,
    base_width=32,
    ensemble=10,
):
    """
    Rolling expanding-window NN with monitoring for losses and tuning.

    For every (train, val, test) split from ``expanding_window_split``:

    1. On the first split with test rows, the learning rate and L1 penalty
       are grid-searched (fit on train, scored by validation loss). The
       selected pair is reused for all later splits of this call.
    2. ``ensemble`` networks with different seeds are fitted in parallel on
       train + validation, with early stopping on a 5% Keras validation split.
    3. Test predictions and layer weights are averaged across the ensemble,
       and an out-of-sample R² is computed as ``1 - SSE / sum(y_test ** 2)``
       (the denominator is not demeaned).

    After the loop, one more network is fitted on the last window's
    train + validation data and returned for saving.

    Args:
        df: Panel with a 'month' column (parsed with ``pd.to_datetime``), the
            target column and the feature columns. A copy is modified.
        target: Name of the column to predict.
        features: Feature column names. Required unless ``use_all_features``.
        use_all_features: If True, use every column except 'month', 'cik',
            'permno', the target, 'prc', 'shrout' and 'mktcap'.
        start_year, end_year: Optional year bounds. Rows are filtered only
            when both are given; each is also passed on to the splitter as
            a date bound.
        train_size, val_size, test_size, step_size: Window sizes in months,
            passed to ``expanding_window_split``.
        epochs: Maximum epochs per fit (the tuner uses ``min(epochs, 50)``).
        batch_size: Mini-batch size for tuning and training.
        lr: Candidate learning rates for the grid search (not mutated).
        l1_reg: Candidate L1 penalties for the grid search (not mutated).
        hidden_depth: Number of hidden layers.
        base_width: Width of the first hidden layer; each following layer is
            half as wide, with a floor of 2 units.
        ensemble: Number of seeds trained per window.

    Returns:
        dict with keys 'R2_window', 'R2_full', 'y_fits', 'y_tests',
        'X_tests', 'dates', 'ensemble_weights' (one list of ensemble-averaged
        weight arrays per window), 'tuning_results', 'window_histories' and
        'model' (the final Keras model, or None when no window was produced).

    Raises:
        ValueError: If no features are given and ``use_all_features`` is False.

    Side effects:
        Prints progress, writes KerasTuner logs under 'tuning_logs', and
        stores the selected hyperparameters in the module globals
        ``BEST_L1_GLOBAL`` / ``BEST_LR_GLOBAL``.
    """
    global BEST_L1_GLOBAL, BEST_LR_GLOBAL

    # Forget the selection made by an earlier call. Without this, every call
    # after the first would skip tuning, ignore its `lr` / `l1_reg` grids and
    # reuse hyperparameters chosen for a different architecture or feature set.
    BEST_L1_GLOBAL = None
    BEST_LR_GLOBAL = None

    # ---------- Prepare df ----------
    df = df.copy()
    df["month"] = pd.to_datetime(df["month"], errors="coerce")

    # Year filter (applied only when both bounds are given)
    if (start_year is not None) and (end_year is not None):
        df = df[
            (df["month"].dt.year >= start_year)
            & (df["month"].dt.year <= end_year)
        ].copy()
        print(f"Using data from {start_year} to {end_year}")

    # ---------- Features ----------
    if use_all_features:
        features = [
            c
            for c in df.columns
            if c not in [
                "month",
                "cik",
                "permno",
                target,
                "prc",
                "shrout",
                "mktcap",
            ]
        ]
        print(f"Using all {len(features)} features.")
    elif not features:
        raise ValueError("Specify 'features' or set use_all_features=True.")

    df[features] = df[features].astype("float32", copy=False)
    df[target] = df[target].astype("float32", copy=False)

    # ---------- Architecture ----------
    def make_layers(depth, base=base_width):
        # Halve the width at each layer, never going below 2 units.
        return [max(2, base // (2**i)) for i in range(depth)]

    layer_config = make_layers(hidden_depth)
    print(f"NN{hidden_depth} architecture: {layer_config}")

    # ---------- Columns kept alongside the predictions ----------
    # Loop-invariant, so built once. dict.fromkeys drops repeated names
    # (a feature may also appear in the fixed list) while keeping order, so
    # the saved frames never carry duplicate column labels.
    cols_for_output = list(
        dict.fromkeys(
            [
                "permno",
                "month",
                "cik",
                "ret_excess",
                "prc",
                "shrout",
                "mktcap_lag",
                "mktcap",
                "macro_dp",
                "macro_ep",
                "macro_bm",
                "macro_ntis",
                "macro_tbl",
                "macro_tms",
                "macro_dfy",
                "macro_svar",
            ]
            + list(features)
        )
    )
    cols_for_output = [c for c in cols_for_output if c in df.columns]

    # ---------- Model builder for tuner ----------
    def build_model(hp):
        model = Sequential()
        model.add(Input(shape=(len(features),)))
        for units in layer_config:
            model.add(
                Dense(
                    units,
                    activation="relu",
                    kernel_regularizer=L1(hp.Choice("l1_reg", l1_reg)),
                    use_bias=False
                )
            )
            model.add(BatchNormalization())
        model.add(Dense(1, activation="linear"))
        model.compile(
            optimizer=Adam(learning_rate=hp.Choice("lr", lr)),
            loss="mse",
        )
        return model

    # ---------- Single ensemble member (runs in a worker process) ----------
    def train_one(seed, X_trval, y_trval, X_test, l1_value, lr_value):
        # Imports are repeated here because the function executes in a
        # separate joblib worker process.
        import tensorflow as tf
        from tensorflow.keras import Sequential
        from tensorflow.keras.layers import Dense, BatchNormalization, Input
        from tensorflow.keras.optimizers import Adam
        from tensorflow.keras.regularizers import L1
        from tensorflow.keras.callbacks import EarlyStopping
        import gc as _gc

        tf.keras.utils.set_random_seed(seed)

        model = Sequential()
        model.add(Input(shape=(X_trval.shape[1],)))
        for units in layer_config:
            model.add(
                Dense(
                    units,
                    activation="relu",
                    kernel_regularizer=L1(l1_value),
                    use_bias=False
                )
            )
            model.add(BatchNormalization())
        model.add(Dense(1, activation="linear"))
        model.compile(
            optimizer=Adam(learning_rate=lr_value),
            loss="mse",
        )

        early_stop = EarlyStopping(
            monitor="val_loss",
            patience=5,
            restore_best_weights=True,
        )

        # Keep the History object so per-epoch losses can be inspected later.
        history = model.fit(
            X_trval,
            y_trval,
            epochs=epochs,
            batch_size=batch_size,
            validation_split=0.05,
            callbacks=[early_stop],
            verbose=0,
        )

        pred_fit = model.predict(X_trval, verbose=0).flatten()
        pred_test = model.predict(X_test, verbose=0).flatten()
        weights = model.get_weights()

        tf.keras.backend.clear_session()
        _gc.collect()

        return pred_fit, pred_test, weights, history.history

    # ---------- Containers ----------
    y_fits, y_tests, dates, X_tests = [], [], [], []
    dic_r2_all = {}
    ensemble_weights = []

    # Containers for monitoring
    window_histories = {}
    tuning_results = []

    last_trval = None

    # ---------- Rolling loop ----------
    for window_id, (train, val, test) in enumerate(
        tqdm(
            expanding_window_split(
                df=df,
                train_size=train_size,
                val_size=val_size,
                test_size=test_size,
                step_size=step_size,
                start_date=f"{start_year}-01-01" if start_year else None,
                end_date=f"{end_year}-12-31" if end_year else None,
            ),
            desc=f"Rolling Neural Network NN{hidden_depth} windows",
            unit="window",
        ),
        start=1,
    ):
        if test.empty:
            continue

        # ----- Split arrays -----
        X_train, y_train = (
            train[features].to_numpy("float32"),
            train[target].to_numpy("float32"),
        )
        X_val, y_val = (
            val[features].to_numpy("float32"),
            val[target].to_numpy("float32"),
        )
        X_test, y_test = (
            test[features].to_numpy("float32"),
            test[target].to_numpy("float32"),
        )

        # Identifier / descriptive columns for the rows being predicted.
        X_tests.append(test[cols_for_output].copy())

        # ----- Hyperparameter tuning on the first window of this call -----
        if (BEST_L1_GLOBAL is None) or (BEST_LR_GLOBAL is None):
            print(f"\n--- Tuning Hyperparameters on Window {window_id} ---")
            train_ds = (
                tf.data.Dataset.from_tensor_slices((X_train, y_train))
                .batch(batch_size)
                .prefetch(tf.data.AUTOTUNE)
            )
            val_ds = (
                tf.data.Dataset.from_tensor_slices((X_val, y_val))
                .batch(batch_size)
                .prefetch(tf.data.AUTOTUNE)
            )

            tuner = kt.GridSearch(
                build_model,
                objective="val_loss",
                max_trials=len(l1_reg) * len(lr),
                directory="tuning_logs",
                project_name=f"NN_tune_window_{window_id}",
                overwrite=True
            )

            tuner.search(
                train_ds,
                validation_data=val_ds,
                epochs=min(epochs, 50),
                verbose=0,
            )

            # Record every trial so the selection can be audited.
            print("Tuning Results (Val Loss):")
            for trial in tuner.oracle.get_best_trials(num_trials=len(l1_reg) * len(lr)):
                hps = trial.hyperparameters.values
                score = trial.score
                print(f"  LR: {hps['lr']:.4f}, L1: {hps['l1_reg']:.5f} -> Loss: {score:.5f}")
                tuning_results.append({
                    "lr": hps['lr'],
                    "l1_reg": hps['l1_reg'],
                    "val_loss": score,
                    "window": window_id
                })

            best_hp = tuner.get_best_hyperparameters(1)[0]
            BEST_L1_GLOBAL = best_hp.get("l1_reg")
            BEST_LR_GLOBAL = best_hp.get("lr")
            print(f"Selected Best: λ₁={BEST_L1_GLOBAL}, LR={BEST_LR_GLOBAL}\n")

        # ----- Combine TRAIN + VALIDATION -----
        X_trval = np.concatenate([X_train, X_val], axis=0)
        y_trval = np.concatenate([y_train, y_val], axis=0)
        last_trval = (X_trval, y_trval)

        # ----- Ensemble training -----
        # The selected hyperparameters are passed as plain values, so the
        # workers do not depend on module-level state.
        results = Parallel(
            n_jobs=min(ensemble, 24),
            backend="loky",
        )(
            delayed(train_one)(
                12308 + e,
                X_trval,
                y_trval,
                X_test,
                BEST_L1_GLOBAL,
                BEST_LR_GLOBAL,
            )
            for e in range(ensemble)
        )

        preds_fit_ensemble, preds_test_ensemble, weights_ensemble, histories_ensemble = zip(*results)

        # Label the window by its earliest test month. Taking the minimum
        # (rather than the first row) keeps the label independent of row order.
        label = test["month"].min().strftime("%Y-%m")
        window_histories[label] = histories_ensemble

        # ----- Average predictions & weights -----
        y_pred_fit = np.mean(preds_fit_ensemble, axis=0)
        y_pred_test = np.mean(preds_test_ensemble, axis=0)
        avg_weights = [
            np.mean(np.stack(layer_w, axis=0), axis=0)
            for layer_w in zip(*weights_ensemble)
        ]
        ensemble_weights.append(avg_weights)

        # ----- Evaluate -----
        y_fits.append((y_trval, y_pred_fit))
        y_tests.append((y_test, y_pred_test))

        dates.append(label)

        ss_res = np.sum((y_test - y_pred_test) ** 2)
        ss_tot = np.sum(y_test ** 2)
        r2 = 1 - ss_res / ss_tot
        dic_r2_all[f"r2.{label}"] = r2

        print(f"[NN{hidden_depth}] Test period: {label} | Ensemble R² = {r2:.4f}")

        tf.keras.backend.clear_session()
        gc.collect()

    # ---------- Full-sample OOS R² ----------
    y_test_all = np.concatenate([y for (y, _) in y_tests]) if y_tests else np.array([])
    y_pred_all = np.concatenate([yhat for (_, yhat) in y_tests]) if y_tests else np.array([])

    R2OOS_Full = (
        1 - np.sum((y_test_all - y_pred_all) ** 2) / np.sum(y_test_all ** 2)
        if y_test_all.size > 0
        else np.nan
    )

    print("\n===========================================")
    print(f"Full Out-of-Sample R²: {R2OOS_Full:.4f}")
    print("===========================================\n")

    # ---------- Final model on the last window (for saving) ----------
    # Built only when at least one window was processed; otherwise no
    # hyperparameters or training data exist and the model stays None.
    last_model = None
    if last_trval is not None:
        print("Training final model on the last window (for saving)...")
        last_model = Sequential()
        last_model.add(Input(shape=(len(features),)))
        for units in layer_config:
            last_model.add(
                Dense(
                    units,
                    activation="relu",
                    kernel_regularizer=L1(BEST_L1_GLOBAL),
                    use_bias=False
                )
            )
            last_model.add(BatchNormalization())
        last_model.add(Dense(1, activation="linear"))

        last_model.compile(
            optimizer=Adam(learning_rate=BEST_LR_GLOBAL),
            loss="mse",
        )

        X_trval_last, y_trval_last = last_trval
        last_model.fit(
            X_trval_last,
            y_trval_last,
            epochs=epochs,
            batch_size=batch_size,
            validation_split=0.05,
            callbacks=[
                EarlyStopping(
                    monitor="val_loss",
                    patience=5,
                    restore_best_weights=True,
                )
            ],
            verbose=0,
        )

    return {
        "R2_window": dic_r2_all,
        "R2_full": R2OOS_Full,
        "y_fits": y_fits,
        "y_tests": y_tests,
        "X_tests": X_tests,
        "dates": dates,
        "ensemble_weights": ensemble_weights,
        "tuning_results": tuning_results,
        "window_histories": window_histories,
        "model": last_model
    }

# Neural Network Without Insider Trading

## 1.1 NN1

In [None]:
# ======================================================
# 1. Read the baseline feature names (one per line)
# ======================================================
feature_file = "/work/Thesis/Data/gkx_cols.txt"

with open(feature_file, "r") as f:
    # Trim surrounding whitespace and drop blank lines.
    baseline = [name for name in map(str.strip, f) if name]

print(f"Successfully loaded {len(baseline)}")

# ======================================================
# 2. Fit NN1 (one hidden layer) on the baseline features
# ======================================================
path = "/work/Thesis/Data/finalized_true.parquet"
df = pd.read_parquet(path)

res_nn1 = neural_network(
    df=df,
    target="ret_excess",
    features=baseline,
    use_all_features=False,
    # architecture
    hidden_depth=1,
    ensemble=10,
    # sample and window layout (sizes in months)
    start_year=2005,
    end_year=2021,
    train_size=60,
    val_size=36,
    test_size=12,
    step_size=12,
    # optimisation and tuning grid
    epochs=100,
    batch_size=10000,
    lr=[0.001, 0.01],
    l1_reg=[1e-5, 1e-3],
)

In [None]:
# Flatten the per-window (y_true, y_pred) pairs and the matching test rows.
y_true, y_pred = map(np.concatenate, zip(*res_nn1["y_tests"]))
x_all = pd.concat(res_nn1["X_tests"], ignore_index=True)

# Predictions together with the identifiers needed downstream.
pd.DataFrame({
    "month":           pd.to_datetime(x_all["month"]),
    "cik":             x_all["cik"],
    "permno":          x_all["permno"],
    "ret_excess":      y_true,
    "prc":             x_all["prc"],
    "shrout":          x_all["shrout"],
    "mktcap_lag":      x_all["mktcap_lag"],
    "pred_ret_excess": y_pred,
}).to_parquet("nn1_output.parquet", index=False)

# Save the model fitted on the last window.
res_nn1["model"].save("nn1_model.keras")

# Save the ensemble-averaged weights. Extra keyword arguments of
# np.savez_compressed are array names, so `allow_pickle=True` is not passed
# here: it is redundant and may be stored as an array of that name instead of
# acting as an option. Object arrays are pickled by default; reading them
# back requires np.load(..., allow_pickle=True).
np.savez_compressed(
    "nn1_ensemble_weights.npz",
    weights=np.array(res_nn1["ensemble_weights"], dtype=object),
)

# Save R² metrics as plain floats so they are JSON-serialisable.
with open("nn1_r2.json", "w") as f:
    json.dump({
        "r2_full": float(res_nn1["R2_full"]),
        "r2_window": {k: float(v) for k, v in res_nn1["R2_window"].items()}
    }, f, indent=2)

print("saved: nn1_output.parquet, nn1_model.keras, weights, and r2 json.")

## 1.2 NN2

In [None]:
# ======================================================
# 1. Read the baseline feature names (one per line)
# ======================================================
feature_file = "/work/Thesis/Data/gkx_cols.txt"

with open(feature_file, "r") as f:
    # Trim surrounding whitespace and drop blank lines.
    baseline = [name for name in map(str.strip, f) if name]

print(f"Successfully loaded {len(baseline)}")

# ======================================================
# 2. Fit NN2 (two hidden layers) on the baseline features
# ======================================================
path = "/work/Thesis/Data/finalized_true.parquet"
df = pd.read_parquet(path)

res_nn2 = neural_network(
    df=df,
    target="ret_excess",
    features=baseline,
    use_all_features=False,
    # architecture
    hidden_depth=2,
    ensemble=10,
    # sample and window layout (sizes in months)
    start_year=2005,
    end_year=2021,
    train_size=60,
    val_size=36,
    test_size=12,
    step_size=12,
    # optimisation and tuning grid
    epochs=100,
    batch_size=10000,
    lr=[0.001, 0.01],
    l1_reg=[1e-5, 1e-3],
)

In [None]:
# Flatten the per-window (y_true, y_pred) pairs and the matching test rows.
y_true, y_pred = map(np.concatenate, zip(*res_nn2["y_tests"]))
x_all = pd.concat(res_nn2["X_tests"], ignore_index=True)

# Predictions together with the identifiers needed downstream.
pd.DataFrame({
    "month":           pd.to_datetime(x_all["month"]),
    "cik":             x_all["cik"],
    "permno":          x_all["permno"],
    "ret_excess":      y_true,
    "prc":             x_all["prc"],
    "shrout":          x_all["shrout"],
    "mktcap_lag":      x_all["mktcap_lag"],
    "pred_ret_excess": y_pred,
}).to_parquet("nn2_output.parquet", index=False)

# Save the model fitted on the last window.
res_nn2["model"].save("nn2_model.keras")

# Save the ensemble-averaged weights. Extra keyword arguments of
# np.savez_compressed are array names, so `allow_pickle=True` is not passed
# here: it is redundant and may be stored as an array of that name instead of
# acting as an option. Object arrays are pickled by default; reading them
# back requires np.load(..., allow_pickle=True).
np.savez_compressed(
    "nn2_ensemble_weights.npz",
    weights=np.array(res_nn2["ensemble_weights"], dtype=object),
)

# Save R² metrics as plain floats so they are JSON-serialisable.
with open("nn2_r2.json", "w") as f:
    json.dump({
        "r2_full": float(res_nn2["R2_full"]),
        "r2_window": {k: float(v) for k, v in res_nn2["R2_window"].items()}
    }, f, indent=2)

print("saved: nn2_output.parquet, nn2_model.keras, weights, and r2 json.")

## 1.3 NN3

In [None]:
# ======================================================
# 1. Read the baseline feature names (one per line)
# ======================================================
feature_file = "/work/Thesis/Data/gkx_cols.txt"

with open(feature_file, "r") as f:
    # Trim surrounding whitespace and drop blank lines.
    baseline = [name for name in map(str.strip, f) if name]

print(f"Successfully loaded {len(baseline)}")

# ======================================================
# 2. Fit NN3 (three hidden layers) on the baseline features
# ======================================================
path = "/work/Thesis/Data/finalized_true.parquet"
df = pd.read_parquet(path)

res_nn3 = neural_network(
    df=df,
    target="ret_excess",
    features=baseline,
    use_all_features=False,
    # architecture
    hidden_depth=3,
    ensemble=10,
    # sample and window layout (sizes in months)
    start_year=2005,
    end_year=2021,
    train_size=60,
    val_size=36,
    test_size=12,
    step_size=12,
    # optimisation and tuning grid
    epochs=100,
    batch_size=10000,
    lr=[0.001, 0.01],
    l1_reg=[1e-5, 1e-3],
)

In [None]:
# Flatten the per-window (y_true, y_pred) pairs and the matching test rows.
y_true, y_pred = map(np.concatenate, zip(*res_nn3["y_tests"]))
x_all = pd.concat(res_nn3["X_tests"], ignore_index=True)

# Predictions together with the identifiers needed downstream.
pd.DataFrame({
    "month":           pd.to_datetime(x_all["month"]),
    "cik":             x_all["cik"],
    "permno":          x_all["permno"],
    "ret_excess":      y_true,
    "prc":             x_all["prc"],
    "shrout":          x_all["shrout"],
    "mktcap_lag":      x_all["mktcap_lag"],
    "pred_ret_excess": y_pred,
}).to_parquet("nn3_output.parquet", index=False)

# Save the model fitted on the last window.
res_nn3["model"].save("nn3_model.keras")

# Save the ensemble-averaged weights. Extra keyword arguments of
# np.savez_compressed are array names, so `allow_pickle=True` is not passed
# here: it is redundant and may be stored as an array of that name instead of
# acting as an option. Object arrays are pickled by default; reading them
# back requires np.load(..., allow_pickle=True).
np.savez_compressed(
    "nn3_ensemble_weights.npz",
    weights=np.array(res_nn3["ensemble_weights"], dtype=object),
)

# Save R² metrics as plain floats so they are JSON-serialisable.
with open("nn3_r2.json", "w") as f:
    json.dump({
        "r2_full": float(res_nn3["R2_full"]),
        "r2_window": {k: float(v) for k, v in res_nn3["R2_window"].items()}
    }, f, indent=2)

print("saved: nn3_output.parquet, nn3_model.keras, weights, and r2 json.")

## 1.4 NN4

In [None]:
# ======================================================
# 1. Read the baseline feature names (one per line)
# ======================================================
feature_file = "/work/Thesis/Data/gkx_cols.txt"

with open(feature_file, "r") as f:
    # Trim surrounding whitespace and drop blank lines.
    baseline = [name for name in map(str.strip, f) if name]

print(f"Successfully loaded {len(baseline)}")

# ======================================================
# 2. Fit NN4 (four hidden layers) on the baseline features
# ======================================================
path = "/work/Thesis/Data/finalized_true.parquet"
df = pd.read_parquet(path)

res_nn4 = neural_network(
    df=df,
    target="ret_excess",
    features=baseline,
    use_all_features=False,
    # architecture
    hidden_depth=4,
    ensemble=10,
    # sample and window layout (sizes in months)
    start_year=2005,
    end_year=2021,
    train_size=60,
    val_size=36,
    test_size=12,
    step_size=12,
    # optimisation and tuning grid
    epochs=100,
    batch_size=10000,
    lr=[0.001, 0.01],
    l1_reg=[1e-5, 1e-3],
)

In [None]:
# ======================================================
# 1. Flatten & Save Outputs
# ======================================================
y_true, y_pred = map(np.concatenate, zip(*res_nn4["y_tests"]))
x_all = pd.concat(res_nn4["X_tests"], ignore_index=True)

pd.DataFrame({
    "month":           pd.to_datetime(x_all["month"]),
    "cik":             x_all["cik"],
    "permno":          x_all["permno"],
    "ret_excess":      y_true,
    "prc":             x_all["prc"],
    "shrout":          x_all["shrout"],
    "mktcap_lag":      x_all["mktcap_lag"],
    "pred_ret_excess": y_pred,
}).to_parquet("nn4_output.parquet", index=False)


# ======================================================
# 2. Save NN4 Model
# ======================================================
res_nn4["model"].save("nn4_model.keras")


# ======================================================
# 3. Save Averaged Weights
#    - one key per test window: w_0, w_1, ...
# ======================================================
# neural_network() appends one list of ensemble-averaged weight arrays per
# window, so each entry is a window, not an individual ensemble member.
ensemble = res_nn4["ensemble_weights"]

weights_dict = {
    f"w_{i}": np.array(w, dtype=object)  # one window's weight arrays as an object array
    for i, w in enumerate(ensemble)
}

# Extra keyword arguments of np.savez_compressed are array names, so
# `allow_pickle=True` is not passed here: it is redundant and may be stored
# as an extra array next to w_0, w_1, ... Object arrays are pickled by
# default; reading them back requires np.load(..., allow_pickle=True).
np.savez_compressed(
    "nn4_ensemble_weights.npz",
    **weights_dict,
)

n_tensors = len(ensemble[0]) if ensemble else 0
print(f"Saved averaged weights for {len(ensemble)} windows, each with {n_tensors} weight tensors.")


# ======================================================
# 4. Save R² metrics
# ======================================================
with open("nn4_r2.json", "w") as f:
    json.dump({
        "r2_full": float(res_nn4["R2_full"]),
        "r2_window": {k: float(v) for k, v in res_nn4["R2_window"].items()}
    }, f, indent=2)

print("Saved: nn4_output.parquet, nn4_model.keras, nn4_ensemble_weights.npz, nn4_r2.json")

## 1.5 NN5

In [None]:
# ======================================================
# 1. Read the baseline feature names (one per line)
# ======================================================
feature_file = "/work/Thesis/Data/gkx_cols.txt"

with open(feature_file, "r") as f:
    # Trim surrounding whitespace and drop blank lines.
    baseline = [name for name in map(str.strip, f) if name]

print(f"Successfully loaded {len(baseline)}")

# ======================================================
# 2. Fit NN5 (five hidden layers) on the baseline features
# ======================================================
# NOTE(review): the NN1-NN4 cells load "finalized_true.parquet", while this
# cell loads "finalized.parquet" — confirm that the difference is intended.
path = "/work/Thesis/Data/finalized.parquet"
df = pd.read_parquet(path)

res_nn5 = neural_network(
    df=df,
    target="ret_excess",
    features=baseline,
    use_all_features=False,
    # architecture
    hidden_depth=5,
    ensemble=10,
    # sample and window layout (sizes in months)
    start_year=2005,
    end_year=2021,
    train_size=60,
    val_size=36,
    test_size=12,
    step_size=12,
    # optimisation and tuning grid
    epochs=100,
    batch_size=10000,
    lr=[0.001, 0.01],
    l1_reg=[1e-5, 1e-3],
)

In [None]:
# Flatten the per-window (y_true, y_pred) pairs and the matching test rows.
y_true, y_pred = map(np.concatenate, zip(*res_nn5["y_tests"]))
x_all = pd.concat(res_nn5["X_tests"], ignore_index=True)

# NOTE(review): this file uses a narrower schema (permno, month, y_true,
# nn5_y_pred) than the other nn*_output.parquet files written in this
# notebook — confirm that downstream readers expect it.
pd.DataFrame({
    "permno": x_all["permno"].values,
    "month":  pd.to_datetime(x_all["month"]),
    "y_true": y_true,
    "nn5_y_pred": y_pred
}).to_parquet("nn5_output.parquet", index=False)

# Save the model fitted on the last window.
res_nn5["model"].save("nn5_model.keras")

# Save the ensemble-averaged weights. Extra keyword arguments of
# np.savez_compressed are array names, so `allow_pickle=True` is not passed
# here: it is redundant and may be stored as an array of that name instead of
# acting as an option. Object arrays are pickled by default; reading them
# back requires np.load(..., allow_pickle=True).
np.savez_compressed(
    "nn5_ensemble_weights.npz",
    weights=np.array(res_nn5["ensemble_weights"], dtype=object),
)

# Save R² metrics as plain floats so they are JSON-serialisable.
with open("nn5_r2.json", "w") as f:
    json.dump({
        "r2_full": float(res_nn5["R2_full"]),
        "r2_window": {k: float(v) for k, v in res_nn5["R2_window"].items()}
    }, f, indent=2)

print("saved: nn5_output.parquet, nn5_model.keras, weights, and r2 json.")

# Neural Network With Insider Trading (Insider)

## 2.1 NN1

In [None]:
# ======================================================
# 1. Read both feature-name lists (one name per line)
# ======================================================
base_path = "/work/Thesis/Data/3. Insider/"

# Baseline (GKX) columns: trimmed, blank lines dropped.
with open(f"{base_path}gkx_cols.txt", "r") as f:
    baseline = [name for name in map(str.strip, f) if name]

# Insider-trading columns, read the same way.
with open(f"{base_path}insider_cols.txt", "r") as f:
    insider_cols = [name for name in map(str.strip, f) if name]

# ======================================================
# 2. Baseline features first, insider features after
# ======================================================
combined_features = [*baseline, *insider_cols]

print(f"Baseline features: {len(baseline)}")
print(f"Insider features:  {len(insider_cols)}")
print(f"Total input size:  {len(combined_features)}")

# ======================================================
# 3. Fit NN1 (one hidden layer) on the combined features
# ======================================================
path = "/work/Thesis/Data/3. Insider/with_insider.parquet"
df = pd.read_parquet(path)

res_nn1_insider = neural_network(
    df=df,
    target="ret_excess",
    features=combined_features,
    use_all_features=False,
    # architecture
    hidden_depth=1,
    ensemble=10,
    # sample and window layout (sizes in months)
    start_year=2005,
    end_year=2021,
    train_size=60,
    val_size=36,
    test_size=12,
    step_size=12,
    # optimisation and tuning grid (the grids can be widened if time allows)
    epochs=100,
    batch_size=10000,
    lr=[0.001, 0.01],
    l1_reg=[1e-5, 1e-3],
)

In [None]:
# Flatten the per-window (y_true, y_pred) pairs and the matching test rows.
y_true, y_pred = map(np.concatenate, zip(*res_nn1_insider["y_tests"]))
x_all = pd.concat(res_nn1_insider["X_tests"], ignore_index=True)

# Predictions together with the identifiers needed downstream.
pd.DataFrame({
    "month":           pd.to_datetime(x_all["month"]),
    "cik":             x_all["cik"],
    "permno":          x_all["permno"],
    "ret_excess":      y_true,
    "prc":             x_all["prc"],
    "shrout":          x_all["shrout"],
    "mktcap_lag":      x_all["mktcap_lag"],
    "pred_ret_excess": y_pred,
}).to_parquet("nn1_insider_output.parquet", index=False)

# Save the model fitted on the last window.
res_nn1_insider["model"].save("nn1_insider_model.keras")

# Save the ensemble-averaged weights. Extra keyword arguments of
# np.savez_compressed are array names, so `allow_pickle=True` is not passed
# here: it is redundant and may be stored as an array of that name instead of
# acting as an option. Object arrays are pickled by default; reading them
# back requires np.load(..., allow_pickle=True).
np.savez_compressed(
    "nn1_insider_ensemble_weights.npz",
    weights=np.array(res_nn1_insider["ensemble_weights"], dtype=object),
)

# Save R² metrics as plain floats so they are JSON-serialisable.
with open("nn1_insider_r2.json", "w") as f:
    json.dump({
        "r2_full": float(res_nn1_insider["R2_full"]),
        "r2_window": {k: float(v) for k, v in res_nn1_insider["R2_window"].items()}
    }, f, indent=2)

print("saved: nn1_insider_output.parquet, nn1_insider_model.keras, weights, and r2 json.")

## 2.2 NN2

In [None]:
# ======================================================
# 1. Read both feature-name lists (one name per line)
# ======================================================
base_path = "/work/Thesis/Data/3. Insider/"

# Baseline (GKX) columns: trimmed, blank lines dropped.
with open(f"{base_path}gkx_cols.txt", "r") as f:
    baseline = [name for name in map(str.strip, f) if name]

# Insider-trading columns, read the same way.
with open(f"{base_path}insider_cols.txt", "r") as f:
    insider_cols = [name for name in map(str.strip, f) if name]

# ======================================================
# 2. Baseline features first, insider features after
# ======================================================
combined_features = [*baseline, *insider_cols]

print(f"Baseline features: {len(baseline)}")
print(f"Insider features:  {len(insider_cols)}")
print(f"Total input size:  {len(combined_features)}")

# ======================================================
# 3. Fit NN2 (two hidden layers) on the combined features
# ======================================================
path = "/work/Thesis/Data/3. Insider/with_insider.parquet"
df = pd.read_parquet(path)

res_nn2_insider = neural_network(
    df=df,
    target="ret_excess",
    features=combined_features,
    use_all_features=False,
    # architecture
    hidden_depth=2,
    ensemble=10,
    # sample and window layout (sizes in months)
    start_year=2005,
    end_year=2021,
    train_size=60,
    val_size=36,
    test_size=12,
    step_size=12,
    # optimisation and tuning grid (the grids can be widened if time allows)
    epochs=100,
    batch_size=10000,
    lr=[0.001, 0.01],
    l1_reg=[1e-5, 1e-3],
)

In [None]:
# Flatten the per-window (y_true, y_pred) pairs and the matching test rows.
y_true, y_pred = map(np.concatenate, zip(*res_nn2_insider["y_tests"]))
x_all = pd.concat(res_nn2_insider["X_tests"], ignore_index=True)

# Predictions together with the identifiers needed downstream.
pd.DataFrame({
    "month":           pd.to_datetime(x_all["month"]),
    "cik":             x_all["cik"],
    "permno":          x_all["permno"],
    "ret_excess":      y_true,
    "prc":             x_all["prc"],
    "shrout":          x_all["shrout"],
    "mktcap_lag":      x_all["mktcap_lag"],
    "pred_ret_excess": y_pred,
}).to_parquet("nn2_insider_output.parquet", index=False)

# Save the model fitted on the last window.
res_nn2_insider["model"].save("nn2_insider_model.keras")

# Save the ensemble-averaged weights. Extra keyword arguments of
# np.savez_compressed are array names, so `allow_pickle=True` is not passed
# here: it is redundant and may be stored as an array of that name instead of
# acting as an option. Object arrays are pickled by default; reading them
# back requires np.load(..., allow_pickle=True).
np.savez_compressed(
    "nn2_insider_ensemble_weights.npz",
    weights=np.array(res_nn2_insider["ensemble_weights"], dtype=object),
)

# Save R² metrics as plain floats so they are JSON-serialisable.
with open("nn2_insider_r2.json", "w") as f:
    json.dump({
        "r2_full": float(res_nn2_insider["R2_full"]),
        "r2_window": {k: float(v) for k, v in res_nn2_insider["R2_window"].items()}
    }, f, indent=2)

print("saved: nn2_insider_output.parquet, nn2_insider_model.keras, weights, and r2 json.")

## 2.3 NN3

In [None]:
# ======================================================
# 1. Load Feature Lists from Text Files
# ======================================================
# Directory holding the insider dataset and its column-list text files.
base_path = "/work/Thesis/Data/3. Insider/"

# A. Load the Baseline (GKX) columns: one name per line, blank lines skipped
with open(base_path + "gkx_cols.txt", "r") as f:
    baseline = [line.strip() for line in f if line.strip()]

# B. Load the Reduced Insider columns (same one-name-per-line format)
with open(base_path + "insider_cols.txt", "r") as f:
    insider_cols = [line.strip() for line in f if line.strip()]

# ======================================================
# 2. Combine the Lists
# ======================================================
# Baseline columns come first; the insider columns are appended after them.
combined_features = baseline + insider_cols

print(f"Baseline features: {len(baseline)}")
print(f"Insider features:  {len(insider_cols)}")
print(f"Total input size:  {len(combined_features)}")

# ======================================================
# 3. Run Neural Network (NN3)
# ======================================================
path = "/work/Thesis/Data/3. Insider/with_insider.parquet"
df = pd.read_parquet(path)

res_nn3_insider = neural_network(
    df=df,
    features=combined_features,   # <--- Uses the combined list
    use_all_features=False,
    target="ret_excess",
    start_year=2005,
    end_year=2021,
    train_size=60,
    val_size=36,
    test_size=12,
    step_size=12,
    epochs=100,
    batch_size=10000,
    lr=[0.001, 0.01],             # Expand ranges if you have time!
    l1_reg=[1e-5, 1e-3],
    hidden_depth=3,               # <--- NN3
    ensemble=10
)

In [None]:
# flatten & save data
# Each entry of res_nn3_insider["y_tests"] unpacks into a (y_true, y_pred) pair;
# zip(*...) regroups them so true values and predictions are concatenated separately.
y_true, y_pred = map(np.concatenate, zip(*res_nn3_insider["y_tests"]))
# Stack the matching test frames row-wise with a fresh 0..n-1 index.
x_all = pd.concat(res_nn3_insider["X_tests"], ignore_index=True)

# NOTE(review): assumes the rows of x_all are in the same order as y_true / y_pred — confirm
pd.DataFrame({
    "month":           pd.to_datetime(x_all["month"]),
    "cik":             x_all["cik"],
    "permno":          x_all["permno"],
    "ret_excess":      y_true,
    "prc":             x_all["prc"],
    "shrout":          x_all["shrout"],
    "mktcap_lag":      x_all["mktcap_lag"],
    "pred_ret_excess": y_pred,
}).to_parquet("nn3_insider_output.parquet", index=False)

# save model & weights
res_nn3_insider["model"].save("nn3_insider_model.keras")

# dtype=object stores the nested weight structure as an object array.
# NOTE(review): on NumPy versions whose savez_compressed has no allow_pickle
# parameter, the keyword below is saved as an extra array named "allow_pickle" — confirm
np.savez_compressed(
    "nn3_insider_ensemble_weights.npz",
    weights=np.array(res_nn3_insider["ensemble_weights"], dtype=object),
    allow_pickle=True
)

# save r2 metrics (float() turns each value into a plain Python float for json)
with open("nn3_insider_r2.json", "w") as f:
    json.dump({
        "r2_full": float(res_nn3_insider["R2_full"]),
        "r2_window": {k: float(v) for k, v in res_nn3_insider["R2_window"].items()}
    }, f, indent=2)

print("saved: nn3_insider_output.parquet, nn3_insider_model.keras, weights, and r2 json.")

## 2.4 NN4

In [None]:
# ======================================================
# 1. Load Feature Lists from Text Files
# ======================================================
# Directory holding the insider dataset and its column-list text files.
base_path = "/work/Thesis/Data/3. Insider/"

# A. Load the Baseline (GKX) columns: one name per line, blank lines skipped
with open(base_path + "gkx_cols.txt", "r") as f:
    baseline = [line.strip() for line in f if line.strip()]

# B. Load the Reduced Insider columns (same one-name-per-line format)
with open(base_path + "insider_cols.txt", "r") as f:
    insider_cols = [line.strip() for line in f if line.strip()]

# ======================================================
# 2. Combine the Lists
# ======================================================
# Baseline columns come first; the insider columns are appended after them.
combined_features = baseline + insider_cols

print(f"Baseline features: {len(baseline)}")
print(f"Insider features:  {len(insider_cols)}")
print(f"Total input size:  {len(combined_features)}")

# ======================================================
# 3. Run Neural Network (NN4)
# ======================================================
path = "/work/Thesis/Data/3. Insider/with_insider.parquet"
df = pd.read_parquet(path)

res_nn4_insider = neural_network(
    df=df,
    features=combined_features,   # <--- Uses the combined list
    use_all_features=False,
    target="ret_excess",
    start_year=2005,
    end_year=2021,
    train_size=60,
    val_size=36,
    test_size=12,
    step_size=12,
    epochs=100,
    batch_size=10000,
    lr=[0.001, 0.01],             # Expand ranges if you have time!
    l1_reg=[1e-5, 1e-3],
    hidden_depth=4,               # <--- NN4
    ensemble=10
)

In [None]:
# ===================== NN4 (insider): flatten & save data =====================
# Each entry of res_nn4_insider["y_tests"] unpacks into a (y_true, y_pred) pair;
# zip(*...) regroups them so true values and predictions are concatenated separately.
y_true, y_pred = map(np.concatenate, zip(*res_nn4_insider["y_tests"]))
# Stack the matching test frames row-wise with a fresh 0..n-1 index.
x_all = pd.concat(res_nn4_insider["X_tests"], ignore_index=True)

# Keep the assembled frame in df_out. Chaining .to_parquet() onto the constructor
# would bind df_out to None, because to_parquet returns nothing when given a path.
# NOTE(review): assumes the rows of x_all are in the same order as y_true / y_pred — confirm
df_out = pd.DataFrame({
    "month":           pd.to_datetime(x_all["month"]),
    "cik":             x_all["cik"].values,
    "permno":          x_all["permno"].values,
    "ret_excess":      y_true,
    "prc":             x_all["prc"].values,
    "shrout":          x_all["shrout"].values,
    "mktcap_lag":      x_all["mktcap_lag"].values,
    "pred_ret_excess": y_pred
})
df_out.to_parquet("nn4_insider_output.parquet", index=False)

# ===================== Save model =====================
res_nn4_insider["model"].save("nn4_insider_model.keras")

# ===================== Save ensemble weights (simple version) =====================
# dtype=object stores the nested weight structure as an object array.
np.savez_compressed(
    "nn4_insider_ensemble_weights.npz",
    weights=np.array(res_nn4_insider["ensemble_weights"], dtype=object),
    allow_pickle=True
)

# ===================== Save R² metrics =====================
import json
# float() turns each value into a plain Python float so json can serialise it.
with open("nn4_insider_r2.json", "w") as f:
    json.dump({
        "r2_full": float(res_nn4_insider["R2_full"]),
        "r2_window": {k: float(v) for k, v in res_nn4_insider["R2_window"].items()}
    }, f, indent=2)

print("saved: nn4_insider_output.parquet, nn4_insider_model.keras, weights, and r2 json.")

In [None]:
# NOTE(review): this cell trains with hidden_depth=5 but stores the result in
# res_nn4_insider, replacing the NN4 result produced by the earlier NN4 cell.
# No save cell for a 5-layer model follows it in this section — confirm whether
# the variable should be res_nn5_insider or whether this cell is a leftover.
# ======================================================
# 1. Load Feature Lists from Text Files
# ======================================================
# Directory holding the insider dataset and its column-list text files.
base_path = "/work/Thesis/Data/3. Insider/"

# A. Load the Baseline (GKX) columns: one name per line, blank lines skipped
with open(base_path + "gkx_cols.txt", "r") as f:
    baseline = [line.strip() for line in f if line.strip()]

# B. Load the Reduced Insider columns (same one-name-per-line format)
with open(base_path + "insider_cols.txt", "r") as f:
    insider_cols = [line.strip() for line in f if line.strip()]

# ======================================================
# 2. Combine the Lists
# ======================================================
# Baseline columns come first; the insider columns are appended after them.
combined_features = baseline + insider_cols

print(f"Baseline features: {len(baseline)}")
print(f"Insider features:  {len(insider_cols)}")
print(f"Total input size:  {len(combined_features)}")

# ======================================================
# 3. Run Neural Network (hidden_depth=5)
# ======================================================
path = "/work/Thesis/Data/3. Insider/with_insider.parquet"
df = pd.read_parquet(path)

res_nn4_insider = neural_network(
    df=df,
    features=combined_features,   # <--- Uses the combined list
    use_all_features=False,
    target="ret_excess",
    start_year=2005,
    end_year=2021,
    train_size=60,
    val_size=36,
    test_size=12,
    step_size=12,
    epochs=100,
    batch_size=10000,
    lr=[0.001, 0.01],             # Expand ranges if you have time!
    l1_reg=[1e-5, 1e-3],
    hidden_depth=5,               # five-layer variant; see NOTE(review) at the top of this cell
    ensemble=10
)

# Neural Networks with insider trading (Outsider)

## 3.1 NN1

In [None]:
# ======================================================
# 1. Load Feature Lists from Text Files
# ======================================================
# Directory holding the outsider dataset and its column-list text files.
base_path = "/work/Thesis/Data/2. Outsider/"

# A. Load the Baseline (GKX) columns: one name per line, blank lines skipped
with open(base_path + "gkx_cols.txt", "r") as f:
    baseline = [line.strip() for line in f if line.strip()]

# B. Load the Reduced Insider columns (same one-name-per-line format)
with open(base_path + "insider_cols.txt", "r") as f:
    insider_cols = [line.strip() for line in f if line.strip()]

# ======================================================
# 2. Combine the Lists
# ======================================================
# Baseline columns come first; the insider columns are appended after them.
combined_features = baseline + insider_cols

print(f"Baseline features: {len(baseline)}")
print(f"Insider features:  {len(insider_cols)}")
print(f"Total input size:  {len(combined_features)}")

# ======================================================
# 3. Load the outsider dataset
# ======================================================
path = "/work/Thesis/Data/2. Outsider/with_outsider.parquet"
df = pd.read_parquet(path)

# ======================================================
# 4. Run Neural Network: NN1 (outsider data)
# ======================================================

# NN1 – shallow network
res_nn1_outsider = neural_network(
    df=df,
    features=combined_features,
    use_all_features=False,
    target="ret_excess",
    start_year=2005,
    end_year=2021,
    train_size=60,
    val_size=36,
    test_size=12,
    step_size=12,
    epochs=100,
    batch_size=10000,
    lr=[0.001, 0.01],
    l1_reg=[1e-5, 1e-3],
    hidden_depth=1,        # <--- NN1
    ensemble=10
)

In [None]:
# ===================== NN1: flatten & save data =====================
# Each entry of res_nn1_outsider["y_tests"] unpacks into a (y_true, y_pred) pair;
# zip(*...) regroups them so true values and predictions are concatenated separately.
y_true, y_pred = map(np.concatenate, zip(*res_nn1_outsider["y_tests"]))
# Stack the matching test frames row-wise with a fresh 0..n-1 index.
x_all = pd.concat(res_nn1_outsider["X_tests"], ignore_index=True)

# NOTE(review): assumes the rows of x_all are in the same order as y_true / y_pred — confirm
pd.DataFrame({
    "month":           pd.to_datetime(x_all["month"]),
    "cik":             x_all["cik"],
    "permno":          x_all["permno"],
    "ret_excess":      y_true,
    "prc":             x_all["prc"],
    "shrout":          x_all["shrout"],
    "mktcap_lag":      x_all["mktcap_lag"],
    "pred_ret_excess": y_pred,
}).to_parquet("nn1_outsider_output.parquet", index=False)

# save model & weights
res_nn1_outsider["model"].save("nn1_outsider_model.keras")

# dtype=object stores the nested weight structure as an object array.
# NOTE(review): on NumPy versions whose savez_compressed has no allow_pickle
# parameter, the keyword below is saved as an extra array named "allow_pickle" — confirm
np.savez_compressed(
    "nn1_outsider_ensemble_weights.npz",
    weights=np.array(res_nn1_outsider["ensemble_weights"], dtype=object),
    allow_pickle=True
)

# save r2 metrics (float() turns each value into a plain Python float for json)
with open("nn1_outsider_r2.json", "w") as f:
    json.dump({
        "r2_full": float(res_nn1_outsider["R2_full"]),
        "r2_window": {k: float(v) for k, v in res_nn1_outsider["R2_window"].items()}
    }, f, indent=2)

print("saved: nn1_outsider_output.parquet, nn1_outsider_model.keras, weights, and r2 json.")

## 3.2 NN2

In [None]:
# ======================================================
# 1. Load Feature Lists from Text Files
# ======================================================
# Directory holding the outsider dataset and its column-list text files.
base_path = "/work/Thesis/Data/2. Outsider/"

# A. Load the Baseline (GKX) columns: one name per line, blank lines skipped
with open(base_path + "gkx_cols.txt", "r") as f:
    baseline = [line.strip() for line in f if line.strip()]

# B. Load the Reduced Insider columns (same one-name-per-line format)
with open(base_path + "insider_cols.txt", "r") as f:
    insider_cols = [line.strip() for line in f if line.strip()]

# ======================================================
# 2. Combine the Lists
# ======================================================
# Baseline columns come first; the insider columns are appended after them.
combined_features = baseline + insider_cols

print(f"Baseline features: {len(baseline)}")
print(f"Insider features:  {len(insider_cols)}")
print(f"Total input size:  {len(combined_features)}")

# ======================================================
# 3. Run Neural Network (NN2, outsider data)
# ======================================================
path = "/work/Thesis/Data/2. Outsider/with_outsider.parquet"
df = pd.read_parquet(path)

# NN2 – two hidden layers
res_nn2_outsider = neural_network(
    df=df,
    features=combined_features,
    use_all_features=False,
    target="ret_excess",
    start_year=2005,
    end_year=2021,
    train_size=60,
    val_size=36,
    test_size=12,
    step_size=12,
    epochs=100,
    batch_size=10000,
    lr=[0.001, 0.01],
    l1_reg=[1e-5, 1e-3],
    hidden_depth=2,        # <--- NN2
    ensemble=10
)

In [None]:
# ===================== NN2: flatten & save data =====================
# Each entry of res_nn2_outsider["y_tests"] unpacks into a (y_true, y_pred) pair;
# zip(*...) regroups them so true values and predictions are concatenated separately.
y_true, y_pred = map(np.concatenate, zip(*res_nn2_outsider["y_tests"]))
# Stack the matching test frames row-wise with a fresh 0..n-1 index.
x_all = pd.concat(res_nn2_outsider["X_tests"], ignore_index=True)

# NOTE(review): assumes the rows of x_all are in the same order as y_true / y_pred — confirm
pd.DataFrame({
    "month":           pd.to_datetime(x_all["month"]),
    "cik":             x_all["cik"],
    "permno":          x_all["permno"],
    "ret_excess":      y_true,
    "prc":             x_all["prc"],
    "shrout":          x_all["shrout"],
    "mktcap_lag":      x_all["mktcap_lag"],
    "pred_ret_excess": y_pred,
}).to_parquet("nn2_outsider_output.parquet", index=False)

# save model & weights
res_nn2_outsider["model"].save("nn2_outsider_model.keras")

# dtype=object stores the nested weight structure as an object array.
# NOTE(review): on NumPy versions whose savez_compressed has no allow_pickle
# parameter, the keyword below is saved as an extra array named "allow_pickle" — confirm
np.savez_compressed(
    "nn2_outsider_ensemble_weights.npz",
    weights=np.array(res_nn2_outsider["ensemble_weights"], dtype=object),
    allow_pickle=True
)

# save r2 metrics (float() turns each value into a plain Python float for json)
with open("nn2_outsider_r2.json", "w") as f:
    json.dump({
        "r2_full": float(res_nn2_outsider["R2_full"]),
        "r2_window": {k: float(v) for k, v in res_nn2_outsider["R2_window"].items()}
    }, f, indent=2)

print("saved: nn2_outsider_output.parquet, nn2_outsider_model.keras, weights, and r2 json.")

## 3.3 NN3

In [None]:
# ======================================================
# 1. Load Feature Lists from Text Files
# ======================================================
# Directory holding the outsider dataset and its column-list text files.
base_path = "/work/Thesis/Data/2. Outsider/"

# A. Load the Baseline (GKX) columns: one name per line, blank lines skipped
with open(base_path + "gkx_cols.txt", "r") as f:
    baseline = [line.strip() for line in f if line.strip()]

# B. Load the Reduced Insider columns (same one-name-per-line format)
with open(base_path + "insider_cols.txt", "r") as f:
    insider_cols = [line.strip() for line in f if line.strip()]

# ======================================================
# 2. Combine the Lists
# ======================================================
# Baseline columns come first; the insider columns are appended after them.
combined_features = baseline + insider_cols

print(f"Baseline features: {len(baseline)}")
print(f"Insider features:  {len(insider_cols)}")
print(f"Total input size:  {len(combined_features)}")

# ======================================================
# 3. Run Neural Network (NN3, outsider data)
# ======================================================
path = "/work/Thesis/Data/2. Outsider/with_outsider.parquet"
df = pd.read_parquet(path)

# NN3 – three hidden layers
res_nn3_outsider = neural_network(
    df=df,
    features=combined_features,
    use_all_features=False,
    target="ret_excess",
    start_year=2005,
    end_year=2021,
    train_size=60,
    val_size=36,
    test_size=12,
    step_size=12,
    epochs=100,
    batch_size=10000,
    lr=[0.001, 0.01],
    l1_reg=[1e-5, 1e-3],
    hidden_depth=3,        # <--- NN3
    ensemble=10
)

In [None]:
# ===================== NN3: flatten & save data =====================
# Each entry of res_nn3_outsider["y_tests"] unpacks into a (y_true, y_pred) pair;
# zip(*...) regroups them so true values and predictions are concatenated separately.
y_true, y_pred = map(np.concatenate, zip(*res_nn3_outsider["y_tests"]))
# Stack the matching test frames row-wise with a fresh 0..n-1 index.
x_all = pd.concat(res_nn3_outsider["X_tests"], ignore_index=True)

# NOTE(review): assumes the rows of x_all are in the same order as y_true / y_pred — confirm
pd.DataFrame({
    "month":           pd.to_datetime(x_all["month"]),
    "cik":             x_all["cik"],
    "permno":          x_all["permno"],
    "ret_excess":      y_true,
    "prc":             x_all["prc"],
    "shrout":          x_all["shrout"],
    "mktcap_lag":      x_all["mktcap_lag"],
    "pred_ret_excess": y_pred,
}).to_parquet("nn3_outsider_output.parquet", index=False)

# save model & weights
res_nn3_outsider["model"].save("nn3_outsider_model.keras")

# dtype=object stores the nested weight structure as an object array.
# NOTE(review): on NumPy versions whose savez_compressed has no allow_pickle
# parameter, the keyword below is saved as an extra array named "allow_pickle" — confirm
np.savez_compressed(
    "nn3_outsider_ensemble_weights.npz",
    weights=np.array(res_nn3_outsider["ensemble_weights"], dtype=object),
    allow_pickle=True
)

# save r2 metrics (float() turns each value into a plain Python float for json)
with open("nn3_outsider_r2.json", "w") as f:
    json.dump({
        "r2_full": float(res_nn3_outsider["R2_full"]),
        "r2_window": {k: float(v) for k, v in res_nn3_outsider["R2_window"].items()}
    }, f, indent=2)

print("saved: nn3_outsider_output.parquet, nn3_outsider_model.keras, weights, and r2 json.")

## 3.4 NN4

In [None]:
# ======================================================
# 1. Load Feature Lists from Text Files
# ======================================================
# Directory holding the outsider dataset and its column-list text files.
base_path = "/work/Thesis/Data/2. Outsider/"

# A. Load the Baseline (GKX) columns: one name per line, blank lines skipped
with open(base_path + "gkx_cols.txt", "r") as f:
    baseline = [line.strip() for line in f if line.strip()]

# B. Load the Reduced Insider columns (same one-name-per-line format)
with open(base_path + "insider_cols.txt", "r") as f:
    insider_cols = [line.strip() for line in f if line.strip()]

# ======================================================
# 2. Combine the Lists
# ======================================================
# Baseline columns come first; the insider columns are appended after them.
combined_features = baseline + insider_cols

print(f"Baseline features: {len(baseline)}")
print(f"Insider features:  {len(insider_cols)}")
print(f"Total input size:  {len(combined_features)}")

# ======================================================
# 3. Run Neural Network (NN4, outsider data)
# ======================================================
path = "/work/Thesis/Data/2. Outsider/with_outsider.parquet"
df = pd.read_parquet(path)

res_nn4_outsider = neural_network(
    df=df,
    features=combined_features,   # <--- Uses the combined list
    use_all_features=False,
    target="ret_excess",
    start_year=2005,
    end_year=2021,
    train_size=60,
    val_size=36,
    test_size=12,
    step_size=12,
    epochs=100,
    batch_size=10000,
    lr=[0.001, 0.01],             # Expand ranges if you have time!
    l1_reg=[1e-5, 1e-3],
    hidden_depth=4,               # <--- NN4
    ensemble=10
)

In [None]:
# ===================== NN4 (outsider): flatten & save data =====================
# Each entry of res_nn4_outsider["y_tests"] unpacks into a (y_true, y_pred) pair;
# zip(*...) regroups them so true values and predictions are concatenated separately.
y_true, y_pred = map(np.concatenate, zip(*res_nn4_outsider["y_tests"]))
# Stack the matching test frames row-wise with a fresh 0..n-1 index.
x_all = pd.concat(res_nn4_outsider["X_tests"], ignore_index=True)

# Keep the assembled frame in df_out. Chaining .to_parquet() onto the constructor
# would bind df_out to None, because to_parquet returns nothing when given a path.
# NOTE(review): assumes the rows of x_all are in the same order as y_true / y_pred — confirm
df_out = pd.DataFrame({
    "month":           pd.to_datetime(x_all["month"]),
    "cik":             x_all["cik"].values,
    "permno":          x_all["permno"].values,
    "ret_excess":      y_true,
    "prc":             x_all["prc"].values,
    "shrout":          x_all["shrout"].values,
    "mktcap_lag":      x_all["mktcap_lag"].values,
    "pred_ret_excess": y_pred
})
df_out.to_parquet("nn4_outsider_output.parquet", index=False)

# ===================== Save model =====================
res_nn4_outsider["model"].save("nn4_outsider_model.keras")

# ===================== Save ensemble weights (simple version) =====================
# dtype=object stores the nested weight structure as an object array.
np.savez_compressed(
    "nn4_outsider_ensemble_weights.npz",
    weights=np.array(res_nn4_outsider["ensemble_weights"], dtype=object),
    allow_pickle=True
)

# ===================== Save R² metrics =====================
import json
# float() turns each value into a plain Python float so json can serialise it.
with open("nn4_outsider_r2.json", "w") as f:
    json.dump({
        "r2_full": float(res_nn4_outsider["R2_full"]),
        "r2_window": {k: float(v) for k, v in res_nn4_outsider["R2_window"].items()}
    }, f, indent=2)

print("saved: nn4_outsider_output.parquet, nn4_outsider_model.keras, weights, and r2 json.")

# Variable Importance

In [None]:
# ==============================================================
# BLOCK-WISE VARIABLE IMPORTANCE FOR NN4 (BASELINE + INTERACTIONS)
# ==============================================================

import os

# Process-level settings for the variable-importance run. They are set before the
# `import tensorflow` further down in this cell.
# NOTE(review): they only influence TensorFlow if it has not already been imported
# in this kernel — confirm, or restart the kernel before running this section.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"   # hide all GPUs -> CPU-only execution
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"    # was misspelled "..._LOG_LOG_LEVEL", which TensorFlow ignores
# Cap the native thread pools at one thread each.
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"

import numpy as np
import pandas as pd
import tensorflow as tf
import joblib
from joblib import Parallel, delayed


# ==============================================================
# GKX R² FUNCTION
# ==============================================================

def r2_gkx(y_true, y_pred):
    """
    Uncentred R² in the GKX convention.

    Computes 1 - sum((y - y_hat)²) / sum(y²); the denominator uses the raw
    targets and is NOT demeaned. Both inputs are converted to float64 arrays.

    Returns NaN when the sum of squared targets is exactly zero.
    """
    actual = np.asarray(y_true, dtype="float64")
    fitted = np.asarray(y_pred, dtype="float64")

    resid_ss = np.square(actual - fitted).sum()
    total_ss = np.square(actual).sum()

    # A zero denominator leaves the ratio undefined.
    return np.nan if total_ss == 0 else 1 - resid_ss / total_ss

# ==============================================================
# WORKER: ΔR² FOR ONE BASELINE FEATURE (MAIN + INTERACTIONS)
# ==============================================================

def _compute_feature_drop(
    feat, idx_list, X_train, y_train, R2_full, SST_w,
    window_id, nn_model_path, window_weights
):
    """
    Compute ΔR² for ONE baseline feature in ONE window,
    zeroing BOTH the main effect and all its interaction columns.

    Args:
        feat: Name of the baseline feature; copied into the result row.
        idx_list: Column positions in X_train to zero (main effect + interactions).
        X_train: 2-D feature array; it is copied, never modified in place.
        y_train: Targets aligned with the rows of X_train.
        R2_full: R² of the unmodified model on this window.
        SST_w: Sum of squared targets for this window; copied into the result row.
        window_id: Window identifier; copied into the result row.
        nn_model_path: Path of the saved Keras model to load.
        window_weights: Weights for this window; must provide ``.tolist()``, whose
            result is passed to ``model.set_weights``.

    Returns:
        dict with keys "window", "feature", "R2_baseline", "R2_removed",
        "R2_drop" (= R2_full - R2_removed), "SST" and "n_obs".
    """

    # Configure TensorFlow before it is imported by this call: hide GPUs and
    # quiet the C++ logger. (The logging variable was previously misspelled
    # "TF_CPP_MIN_LOG_LOG_LEVEL", which TensorFlow does not read.)
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

    # Imported here, after the environment variables above are in place.
    import tensorflow as tf

    # Rebuild the network from disk and load this window's weights.
    model = tf.keras.models.load_model(nn_model_path, compile=False)
    model.set_weights(window_weights.tolist())

    X_mod = X_train.copy()
    X_mod[:, idx_list] = 0.0  # zero ALL related cols

    y_hat_zero = model.predict(X_mod, verbose=0).ravel()
    R2_zero = r2_gkx(y_train, y_hat_zero)

    # Positive drop => the model fits worse once the block is removed.
    dR2 = R2_full - R2_zero

    return {
        "window": window_id,
        "feature": feat,
        "R2_baseline": R2_full,
        "R2_removed": R2_zero,
        "R2_drop": dR2,
        "SST": SST_w,
        "n_obs": len(y_train),
    }


# ==============================================================
# MAIN FUNCTION — BLOCK-WISE VI (BASELINE + INTERACTIONS)
# ==============================================================

def variable_importance(
    df_path,
    target,
    features_vi,
    start_year,
    end_year,
    train_size,
    val_size,
    test_size,
    step_size,
    nn_model_path=None,
    nn_weights_path=None,
    max_rows_per_window=None,
    random_state=None,
    n_jobs=None,
    verbose=True,
):
    """
    Block-wise variable importance for baseline characteristics.

    For each baseline feature f in features_vi:
    - Identify block B_f = [f] + all interaction columns 'inter_f_x_*'
    - Zero ALL columns in B_f and recompute R² (train-only) for each window.
    - ΔR²_f = drop in R² when B_f is zeroed.
    - Aggregate across windows with SST weights, and normalize to VI weights
      ONLY across the requested baseline features (not all model inputs).

    Args:
        df_path: Parquet file with a 'month' column, the target and the features.
        target: Name of the target column.
        features_vi: Baseline feature names to evaluate. Names that are not model
            inputs are ignored; repeated names are evaluated once.
        start_year, end_year: Inclusive calendar-year filter on 'month'.
        train_size, val_size, test_size, step_size: Forwarded to
            expanding_window_split.
        nn_model_path: Path of the saved Keras model (required).
        nn_weights_path: Path of the .npz file whose "weights" entry holds one
            weight set per window (required).
        max_rows_per_window: If set, training rows per window are randomly
            subsampled (without replacement) down to this many.
        random_state: Seed for the subsampling generator.
        n_jobs: Parallel workers; defaults to joblib.cpu_count().
        verbose: Print per-block and per-window progress.

    Returns:
        (vi_global, dfw): vi_global has one row per feature with columns
        "feature", "R2_reduction_global" and "VI_weight", sorted by
        R2_reduction_global descending; dfw has one row per (window, feature).
        VI_weight is NaN for every feature when no feature reduces R².

    Raises:
        ValueError: if a model/weights path is missing, the data has fewer
            feature columns than the model expects, none of features_vi is a
            model input, or no window produced any result.

    NOTE(review): model inputs are taken to be the first `input_dim` non-identifier
    columns of the parquet file, in file order. Confirm that this matches the
    feature list and order the saved model was trained with.
    """

    # Fail early with a clear message instead of deep inside the loaders.
    if nn_model_path is None or nn_weights_path is None:
        raise ValueError("nn_model_path and nn_weights_path must both be provided.")

    rng = np.random.default_rng(random_state)

    # ----------------------------------------------------------
    # 1) Load data and filter time
    # ----------------------------------------------------------
    df = pd.read_parquet(df_path)
    df["month"] = pd.to_datetime(df["month"])

    years = df["month"].dt.year
    df = df[(years >= start_year) & (years <= end_year)].copy()

    # Non-feature columns
    drop_cols = ["month", "year", "cik", "permno", target,
                 "prc", "shrout", "mktcap_lag"]
    drop_cols = [c for c in drop_cols if c in df.columns]

    # Candidate features in file order
    features_all = [c for c in df.columns if c not in drop_cols]

    # ----------------------------------------------------------
    # 2) Load model + weights and align input dimension
    # ----------------------------------------------------------
    base_model = tf.keras.models.load_model(nn_model_path, compile=False)
    weights_arr = np.load(nn_weights_path, allow_pickle=True)["weights"]

    input_dim = base_model.input_shape[1]

    if len(features_all) < input_dim:
        raise ValueError(
            f"Only {len(features_all)} feature columns, but model expects {input_dim}"
        )
    if len(features_all) > input_dim:
        # Keep only the first input_dim columns (see NOTE(review) in the docstring)
        features_all = features_all[:input_dim]

    # ----------------------------------------------------------
    # 3) Build block mapping: baseline feature -> [main + interactions]
    # ----------------------------------------------------------
    # Position of each input column; setdefault keeps the FIRST occurrence,
    # which is what list.index() would return.
    col_position = {}
    for pos, col in enumerate(features_all):
        col_position.setdefault(col, pos)

    # Keep only requested features that are model inputs. dict.fromkeys drops
    # repeats (order preserved) so a feature is never counted twice.
    features_vi = [f for f in dict.fromkeys(features_vi) if f in col_position]
    if len(features_vi) == 0:
        raise ValueError("None of the requested VI baseline features are in features_all.")

    drop_index_map = {}

    for f in features_vi:
        # main effect first, then interactions named inter_<f>_x_*
        prefix = f"inter_{f}_x_"
        related_cols = [f] + [col for col in features_all if col.startswith(prefix)]

        idx_list = [col_position[col] for col in related_cols]
        drop_index_map[f] = idx_list

        if verbose:
            print(f"[Block] {f}: {len(idx_list)} columns (main + interactions)")

    if n_jobs is None:
        n_jobs = joblib.cpu_count()

    rows_window = []
    sst_total = 0.0

    # ----------------------------------------------------------
    # 4) Rolling windows (train-only for VI)
    # ----------------------------------------------------------
    windows = expanding_window_split(
        df=df,
        train_size=train_size,
        val_size=val_size,
        test_size=test_size,
        step_size=step_size,
        start_date=f"{start_year}-01-01",
        end_date=f"{end_year}-12-31",
    )

    for window_id, (train, _val, _test) in enumerate(windows):

        # Stop once there are no saved weights left for further windows.
        if window_id >= len(weights_arr):
            break
        if train.empty:
            continue

        # ----- TRAIN ONLY -----
        X_train_df = train[features_all]
        y_train = train[target].to_numpy("float32")

        # Optional row subsampling for safety
        if max_rows_per_window and len(X_train_df) > max_rows_per_window:
            idx = rng.choice(len(X_train_df), max_rows_per_window, replace=False)
            X_train_df = X_train_df.iloc[idx]
            y_train = y_train[idx]

        X_train = X_train_df.to_numpy("float32")

        # Set window weights once and score the unmodified inputs
        base_model.set_weights(weights_arr[window_id].tolist())
        y_hat_full = base_model.predict(X_train, verbose=0).ravel()

        R2_full = r2_gkx(y_train, y_hat_full)
        SST_w = float(np.sum(y_train**2))

        if np.isnan(R2_full) or SST_w == 0:
            if verbose:
                print(f"[Window {window_id}] skipped (NaN R² or zero SST).")
            continue

        sst_total += SST_w

        if verbose:
            print(f"[Window {window_id}] R²_full(train) = {R2_full:.4f}, n={len(y_train)}")

        # ------------------------------------------------------
        # 5) Parallel block-wise feature removal
        # ------------------------------------------------------
        parallel_out = Parallel(n_jobs=n_jobs, backend="loky")(
            delayed(_compute_feature_drop)(
                feat,
                drop_index_map[feat],   # main + interactions
                X_train,
                y_train,
                R2_full,
                SST_w,
                window_id,
                nn_model_path,
                weights_arr[window_id],
            )
            for feat in features_vi
        )

        rows_window.extend(parallel_out)

    # ----------------------------------------------------------
    # 6) Aggregate across windows — ONLY TRUE REDUCTIONS
    # ----------------------------------------------------------
    dfw = pd.DataFrame(rows_window)

    if dfw.empty:
        raise ValueError("No VI rows computed — check filters / windows / data.")

    # Negative drops (R² improved after zeroing) count as zero importance.
    dfw["True_Reduction"] = dfw["R2_drop"].clip(lower=0)

    # SST-weighted sum of reductions per feature. A plain grouped sum avoids
    # groupby.apply, which warns about operating on the grouping column.
    weighted_drop = dfw["SST"] * dfw["True_Reduction"]
    num = weighted_drop.groupby(dfw["feature"]).sum()

    R2_reduction_global = num / sst_total

    # Normalize ONLY across the requested baseline features
    R2_pos = np.clip(R2_reduction_global, 0, None)
    VI_weight = R2_pos / R2_pos.sum()

    vi_global = pd.DataFrame({
        "feature": num.index,
        "R2_reduction_global": R2_reduction_global.values,
        "VI_weight": VI_weight.values,
    }).sort_values("R2_reduction_global", ascending=False)

    # Show globally excluded vars (never reduce R²); printed regardless of `verbose`
    tol = 1e-12
    excluded = vi_global.loc[
        vi_global["R2_reduction_global"] <= tol, "feature"
    ].tolist()

    print("\n==============================================")
    print(" VARIABLES EXCLUDED GLOBALLY (never reduce R²)")
    print("==============================================")
    for v in excluded:
        print("  -", v)
    print("==============================================\n")

    return vi_global, dfw


# NN4

## Without insider trading

In [None]:
# ==============================================================
# BASELINE PREDICTORS (94) — each name is evaluated as its own block
# ==============================================================

baseline_predictors = [
    "char_mvel1", "char_beta", "char_betasq", "char_chmom", "char_dolvol",
    "char_idiovol", "char_indmom", "char_mom1m", "char_mom6m", "char_mom12m",
    "char_mom36m", "char_pricedelay", "char_turn", "char_absacc", "char_acc",
    "char_age", "char_agr", "char_bm", "char_bm_ia", "char_cashdebt",
    "char_cashpr", "char_cfp", "char_cfp_ia", "char_chatoia", "char_chcsho",
    "char_chempia", "char_chinv", "char_chpmia", "char_convind", "char_currat",
    "char_depr", "char_divi", "char_divo", "char_dy", "char_egr",
    "char_ep", "char_gma", "char_grcapx", "char_grltnoa", "char_herf",
    "char_hire", "char_invest", "char_lev", "char_lgr", "char_mve_ia",
    "char_operprof", "char_orgcap", "char_pchcapx_ia", "char_pchcurrat", "char_pchdepr",
    "char_pchgm_pchsale", "char_pchquick", "char_pchsale_pchinvt",
    "char_pchsale_pchrect", "char_pchsale_pchxsga",
    "char_pchsaleinv", "char_pctacc", "char_ps", "char_quick", "char_rd",
    "char_rd_mve", "char_rd_sale", "char_realestate", "char_roic", "char_salecash",
    "char_saleinv", "char_salerec", "char_secured", "char_securedind", "char_sgr",
    "char_sin", "char_sp", "char_tang", "char_tb", "char_aeavol",
    "char_cash", "char_chtx", "char_cinvest", "char_ear", "char_nincr",
    "char_roaq", "char_roavol", "char_roeq", "char_rsup", "char_stdacc",
    "char_stdcf", "char_ms", "char_baspread", "char_ill", "char_maxret",
    "char_retvol", "char_std_dolvol", "char_std_turn", "char_zerotrade",
]

In [None]:
# Variable importance for the NN4 model on the baseline dataset (no insider data).
path = "/work/Thesis/Data/finalized_true.parquet"
target_col = "ret_excess"

# The window settings below are forwarded to expanding_window_split.
# NOTE(review): they presumably need to match the settings the model was trained with — confirm
vi_global, vi_window = variable_importance(
    df_path=path,
    target=target_col,
    features_vi=baseline_predictors,
    start_year=2005,
    end_year=2021,
    train_size=60,
    val_size=36,
    test_size=12,
    step_size=12,
    nn_model_path="nn4_model.keras",
    nn_weights_path="nn4_ensemble_weights.npz",
    max_rows_per_window=None,   # or 20000 if kernel dies
    random_state=42,
    n_jobs=25,                   # parallel workers; each one loads its own copy of the model, so lower this if memory is tight
    verbose=True,
)

print("=== Global VI (block-wise, train-only) ===")
print(vi_global.head(30))

In [None]:
# Save the global and per-window variable-importance tables as CSV.
# NOTE(review): these results come from the NN4 model ("nn4_model.keras") used in
# the cell above, but the files carry an "NN1_" prefix — confirm the intended names.
vi_global.to_csv("NN1_VI_global.csv", index=False)
vi_window.to_csv("NN1_VI_window.csv", index=False)

## With insider trading (outsider)

In [None]:
# Features evaluated for the model trained with insider-trading inputs:
# the 94 baseline characteristics followed by the two insider variables.
baseline_insider = [
    "char_mvel1", "char_beta", "char_betasq", "char_chmom", "char_dolvol",
    "char_idiovol", "char_indmom", "char_mom1m", "char_mom6m", "char_mom12m",
    "char_mom36m", "char_pricedelay", "char_turn", "char_absacc", "char_acc",
    "char_age", "char_agr", "char_bm", "char_bm_ia", "char_cashdebt",
    "char_cashpr", "char_cfp", "char_cfp_ia", "char_chatoia", "char_chcsho",
    "char_chempia", "char_chinv", "char_chpmia", "char_convind", "char_currat",
    "char_depr", "char_divi", "char_divo", "char_dy", "char_egr",
    "char_ep", "char_gma", "char_grcapx", "char_grltnoa", "char_herf",
    "char_hire", "char_invest", "char_lev", "char_lgr", "char_mve_ia",
    "char_operprof", "char_orgcap", "char_pchcapx_ia", "char_pchcurrat", "char_pchdepr",
    "char_pchgm_pchsale", "char_pchquick", "char_pchsale_pchinvt",
    "char_pchsale_pchrect", "char_pchsale_pchxsga",
    "char_pchsaleinv", "char_pctacc", "char_ps", "char_quick", "char_rd",
    "char_rd_mve", "char_rd_sale", "char_realestate", "char_roic", "char_salecash",
    "char_saleinv", "char_salerec", "char_secured", "char_securedind", "char_sgr",
    "char_sin", "char_sp", "char_tang", "char_tb", "char_aeavol",
    "char_cash", "char_chtx", "char_cinvest", "char_ear", "char_nincr",
    "char_roaq", "char_roavol", "char_roeq", "char_rsup", "char_stdacc",
    "char_stdcf", "char_ms", "char_baspread", "char_ill", "char_maxret",
    "char_retvol", "char_std_dolvol", "char_std_turn", "char_zerotrade",
    # Insider trading data
    "is_npr_volume", "is_net_cluster",
]

In [None]:
target_col = "ret_excess"
path = "/work/Thesis/Data/2. Outsider/with_outsider.parquet"

# Variable-importance run for the saved "nn4_outsider_model.keras" network,
# using the feature list that includes the insider-trading columns.
vi_global_outsider, vi_window_outsider = variable_importance(
    # --- inputs: data file and saved network artefacts ---
    df_path=path,
    nn_model_path="nn4_outsider_model.keras",
    nn_weights_path="nn4_outsider_ensemble_weights.npz",
    # --- what is being explained ---
    target=target_col,
    features_vi=baseline_insider,
    # --- sample period and window sizes ---
    # (sizes presumably in months, as in expanding_window_split — confirm)
    start_year=2005,
    end_year=2021,
    train_size=60,
    val_size=36,
    test_size=12,
    step_size=12,
    # --- execution settings ---
    max_rows_per_window=None,  # author's fallback: 20000 if the kernel dies
    random_state=42,
    n_jobs=40,  # author's note advises starting small (4) rather than 60
    verbose=True,
)

# Show the leading 30 entries of the global result.
print("=== Global VI (block-wise, train-only) ===")
print(vi_global_outsider.head(30))

In [None]:
# Persist both nn4 "outsider" variable-importance results as CSV
# (index column omitted), global first, then per-window.
for _vi_table, _csv_file in (
    (vi_global_outsider, "nn4_outsider_VI_global.csv"),
    (vi_window_outsider, "nn4_outsider_VI_window.csv"),
):
    _vi_table.to_csv(_csv_file, index=False)

## With insider trading (insider)

In [None]:
target_col = "ret_excess"
path = "/work/Thesis/Data/3. Insider/with_insider.parquet"

# Variable-importance run for the saved "nn4_insider_model.keras" network,
# using the feature list that includes the insider-trading columns.
vi_global_insider, vi_window_insider = variable_importance(
    # --- inputs: data file and saved network artefacts ---
    df_path=path,
    nn_model_path="nn4_insider_model.keras",
    nn_weights_path="nn4_insider_ensemble_weights.npz",
    # --- what is being explained ---
    target=target_col,
    features_vi=baseline_insider,
    # --- sample period and window sizes ---
    # (sizes presumably in months, as in expanding_window_split — confirm)
    start_year=2005,
    end_year=2021,
    train_size=60,
    val_size=36,
    test_size=12,
    step_size=12,
    # --- execution settings ---
    max_rows_per_window=None,  # author's fallback: 20000 if the kernel dies
    random_state=42,
    n_jobs=35,  # author's note advises starting small (4) rather than 60
    verbose=True,
)

# Show the leading 30 entries of the global result.
print("=== Global VI (block-wise, train-only) ===")
print(vi_global_insider.head(30))

In [None]:
# Extended view (50 entries) of the global variable-importance result from
# the nn4 insider run. The previous name `vi_global_insider_non_lag` is not
# assigned by any cell in this section, so a fresh run would raise NameError;
# the result of the insider run (`vi_global_insider`) is shown instead.
print("=== Global VI (block-wise, train-only) ===")
print(vi_global_insider.head(50))

In [None]:
# Persist both nn4 "insider" variable-importance results as CSV
# (index column omitted), global first, then per-window.
for _vi_table, _csv_file in (
    (vi_global_insider, "nn4_insider_VI_global.csv"),
    (vi_window_insider, "nn4_insider_VI_window.csv"),
):
    _vi_table.to_csv(_csv_file, index=False)

# NN3

## Without insider trading

In [None]:
# --------------------------------------------------------------
# Baseline predictor set: 94 `char_*` column names.
# Per the original note, each predictor forms its own block.
# --------------------------------------------------------------

baseline_predictors = [
    "char_mvel1",
    "char_beta",
    "char_betasq",
    "char_chmom",
    "char_dolvol",
    "char_idiovol",
    "char_indmom",
    "char_mom1m",
    "char_mom6m",
    "char_mom12m",
    "char_mom36m",
    "char_pricedelay",
    "char_turn",
    "char_absacc",
    "char_acc",
    "char_age",
    "char_agr",
    "char_bm",
    "char_bm_ia",
    "char_cashdebt",
    "char_cashpr",
    "char_cfp",
    "char_cfp_ia",
    "char_chatoia",
    "char_chcsho",
    "char_chempia",
    "char_chinv",
    "char_chpmia",
    "char_convind",
    "char_currat",
    "char_depr",
    "char_divi",
    "char_divo",
    "char_dy",
    "char_egr",
    "char_ep",
    "char_gma",
    "char_grcapx",
    "char_grltnoa",
    "char_herf",
    "char_hire",
    "char_invest",
    "char_lev",
    "char_lgr",
    "char_mve_ia",
    "char_operprof",
    "char_orgcap",
    "char_pchcapx_ia",
    "char_pchcurrat",
    "char_pchdepr",
    "char_pchgm_pchsale",
    "char_pchquick",
    "char_pchsale_pchinvt",
    "char_pchsale_pchrect",
    "char_pchsale_pchxsga",
    "char_pchsaleinv",
    "char_pctacc",
    "char_ps",
    "char_quick",
    "char_rd",
    "char_rd_mve",
    "char_rd_sale",
    "char_realestate",
    "char_roic",
    "char_salecash",
    "char_saleinv",
    "char_salerec",
    "char_secured",
    "char_securedind",
    "char_sgr",
    "char_sin",
    "char_sp",
    "char_tang",
    "char_tb",
    "char_aeavol",
    "char_cash",
    "char_chtx",
    "char_cinvest",
    "char_ear",
    "char_nincr",
    "char_roaq",
    "char_roavol",
    "char_roeq",
    "char_rsup",
    "char_stdacc",
    "char_stdcf",
    "char_ms",
    "char_baspread",
    "char_ill",
    "char_maxret",
    "char_retvol",
    "char_std_dolvol",
    "char_std_turn",
    "char_zerotrade",
]

In [None]:
path = "/work/Thesis/Data/2. Outsider/with_outsider.parquet"
target_col = "ret_excess"

# Variable-importance run for the saved "nn3_model.keras" network using the
# baseline predictor list; the two results are unpacked into vi_global_nn3
# and vi_window_nn3.
# NOTE(review): this "without insider trading" run reads
# with_outsider.parquet, whereas the earlier nn4_model.keras cell reads
# finalized_true.parquet — confirm the dataset is the intended one.
vi_global_nn3, vi_window_nn3 = variable_importance(
    df_path=path,
    target=target_col,
    features_vi=baseline_predictors,
    start_year=2005,
    end_year=2021,
    # window sizes presumably in months, as in expanding_window_split — confirm
    train_size=60,
    val_size=36,
    test_size=12,
    step_size=12,
    nn_model_path="nn3_model.keras",
    nn_weights_path="nn3_ensemble_weights.npz",
    max_rows_per_window=None,   # author's fallback: 20000 if the kernel dies
    random_state=42,
    n_jobs=25,                  # author's note advises starting small (4) rather than 60
    verbose=True,
)

# Show the leading 30 entries of the global result. The original line used
# the garbled name `vi_gvi_global_nn3lobal`, which is not defined and raised
# NameError after the (expensive) run had finished.
print("=== Global VI (block-wise, train-only) ===")
print(vi_global_nn3.head(30))

In [None]:
# Persist both nn3 baseline variable-importance results as CSV
# (index column omitted), global first, then per-window.
for _vi_table, _csv_file in (
    (vi_global_nn3, "NN3_VI_global.csv"),
    (vi_window_nn3, "NN3_VI_window.csv"),
):
    _vi_table.to_csv(_csv_file, index=False)

In [None]:
# Re-display the leading 30 entries of the nn3 global variable-importance result.
print(vi_global_nn3.head(30))

## With insider trading (outsider)

In [None]:
# Feature list for the insider-trading runs (re-declared for the NN3 section):
# the 94 `char_*` predictors followed by the two `is_*` insider-trading
# columns (96 names in total).
baseline_insider = [
    "char_mvel1",
    "char_beta",
    "char_betasq",
    "char_chmom",
    "char_dolvol",
    "char_idiovol",
    "char_indmom",
    "char_mom1m",
    "char_mom6m",
    "char_mom12m",
    "char_mom36m",
    "char_pricedelay",
    "char_turn",
    "char_absacc",
    "char_acc",
    "char_age",
    "char_agr",
    "char_bm",
    "char_bm_ia",
    "char_cashdebt",
    "char_cashpr",
    "char_cfp",
    "char_cfp_ia",
    "char_chatoia",
    "char_chcsho",
    "char_chempia",
    "char_chinv",
    "char_chpmia",
    "char_convind",
    "char_currat",
    "char_depr",
    "char_divi",
    "char_divo",
    "char_dy",
    "char_egr",
    "char_ep",
    "char_gma",
    "char_grcapx",
    "char_grltnoa",
    "char_herf",
    "char_hire",
    "char_invest",
    "char_lev",
    "char_lgr",
    "char_mve_ia",
    "char_operprof",
    "char_orgcap",
    "char_pchcapx_ia",
    "char_pchcurrat",
    "char_pchdepr",
    "char_pchgm_pchsale",
    "char_pchquick",
    "char_pchsale_pchinvt",
    "char_pchsale_pchrect",
    "char_pchsale_pchxsga",
    "char_pchsaleinv",
    "char_pctacc",
    "char_ps",
    "char_quick",
    "char_rd",
    "char_rd_mve",
    "char_rd_sale",
    "char_realestate",
    "char_roic",
    "char_salecash",
    "char_saleinv",
    "char_salerec",
    "char_secured",
    "char_securedind",
    "char_sgr",
    "char_sin",
    "char_sp",
    "char_tang",
    "char_tb",
    "char_aeavol",
    "char_cash",
    "char_chtx",
    "char_cinvest",
    "char_ear",
    "char_nincr",
    "char_roaq",
    "char_roavol",
    "char_roeq",
    "char_rsup",
    "char_stdacc",
    "char_stdcf",
    "char_ms",
    "char_baspread",
    "char_ill",
    "char_maxret",
    "char_retvol",
    "char_std_dolvol",
    "char_std_turn",
    "char_zerotrade",
    # Insider-trading columns
    "is_npr_volume",
    "is_net_cluster",
]

In [None]:
target_col = "ret_excess"
path = "/work/Thesis/Data/2. Outsider/with_outsider.parquet"

# Variable-importance run for the saved "nn3_outsider_model.keras" network,
# using the feature list that includes the insider-trading columns.
vi_global_outsider_nn3, vi_window_outsider_nn3 = variable_importance(
    # --- inputs: data file and saved network artefacts ---
    df_path=path,
    nn_model_path="nn3_outsider_model.keras",
    nn_weights_path="nn3_outsider_ensemble_weights.npz",
    # --- what is being explained ---
    target=target_col,
    features_vi=baseline_insider,
    # --- sample period and window sizes ---
    # (sizes presumably in months, as in expanding_window_split — confirm)
    start_year=2005,
    end_year=2021,
    train_size=60,
    val_size=36,
    test_size=12,
    step_size=12,
    # --- execution settings ---
    max_rows_per_window=None,  # author's fallback: 20000 if the kernel dies
    random_state=42,
    n_jobs=40,  # author's note advises starting small (4) rather than 60
    verbose=True,
)

# Show the leading 30 entries of the global result.
print("=== Global VI (block-wise, train-only) ===")
print(vi_global_outsider_nn3.head(30))

In [None]:
# Persist both nn3 "outsider" variable-importance results as CSV
# (index column omitted).
vi_global_outsider_nn3.to_csv("nn3_outsider_VI_global.csv", index=False)
# The per-window file must come from the per-window result; the original
# wrote the global result to both files, so the window output was lost.
vi_window_outsider_nn3.to_csv("nn3_outsider_VI_window.csv", index=False)

## With insider trading (insider)

In [None]:
path = "/work/Thesis/Data/3. Insider/with_insider.parquet"
target_col = "ret_excess"

# Variable-importance run for the saved "nn3_insider_model.keras" network,
# using the feature list that includes the insider-trading columns; the two
# results are unpacked into vi_global_insider_nn3 and vi_window_insider_nn3.
vi_global_insider_nn3, vi_window_insider_nn3 = variable_importance(
    df_path=path,
    target=target_col,
    features_vi=baseline_insider,
    start_year=2005,
    end_year=2021,
    # window sizes presumably in months, as in expanding_window_split — confirm
    train_size=60,
    val_size=36,
    test_size=12,
    step_size=12,
    nn_model_path="nn3_insider_model.keras",
    nn_weights_path="nn3_insider_ensemble_weights.npz",
    max_rows_per_window=None,   # author's fallback: 20000 if the kernel dies
    random_state=42,
    n_jobs=35,                  # author's note advises starting small (4) rather than 60
    verbose=True,
)

# The banner announces the global result, so print the global object; the
# original printed vi_window_insider_nn3 under this heading.
print("=== Global VI (block-wise, train-only) ===")
print(vi_global_insider_nn3.head(30))

In [None]:
# Persist both nn3 "insider" variable-importance results as CSV
# (index column omitted).
vi_global_insider_nn3.to_csv("nn3_insider_VI_global.csv", index=False)
# The per-window file must come from the per-window result; the original
# wrote the global result to both files, so the window output was lost.
vi_window_insider_nn3.to_csv("nn3_insider_VI_window.csv", index=False)