In [None]:
pip install -q numpy pandas tqdm scikit-learn fastparquet seaborn doubleml tensorflow scikeras

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

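Reference (DoubleML user guide, partially linear models): https://docs.doubleml.org/stable/guide/models.html#partially-linear-models-plm

## Partially linear regression model (PLR)

Worked example (CATE with a PLR model): https://docs.doubleml.org/stable/examples/py_double_ml_cate_plr.html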

In [None]:
import pandas as pd
import numpy as np
import os
import doubleml as dml
from sklearn.neural_network import MLPRegressor
# Removed make_pipeline and StandardScaler imports as requested

# ======================================================
# 1. Setup Paths & Load Feature Lists
# ======================================================
base_path = "/work/Thesis/Data/2. Insider"
data_path = "/work/Thesis/Data/2. Insider/with_insider.parquet"
results_path = "/work/Thesis/Results/"
os.makedirs(results_path, exist_ok=True)


def _read_feature_list(file_name):
    """Return the stripped, non-blank lines of a text file located in ``base_path``."""
    with open(os.path.join(base_path, file_name), "r") as handle:
        return [name for name in map(str.strip, handle) if name]


print("Loading feature lists...")

# Roles of the columns in the DoubleML setup below:
#   baseline_cols -> controls (X), insider_cols -> treatments (D), target_col -> outcome (Y).
baseline_cols = _read_feature_list("gkx_cols_ex inter.txt")
insider_cols = _read_feature_list("insider_cols_ex inter.txt")
target_col = "ret_excess"
# Column used for the time filter; change it if the dataset names it differently.
date_col = "month"

print(f"-> Baseline features (Controls): {len(baseline_cols)}")
print(f"-> Insider features (Treatments): {len(insider_cols)}")
print(f"-> Target Variable: {target_col}")

# ======================================================
# 2. Load Data & Filter First 8 Years
# ======================================================
print(f"Loading data from {data_path}...")
df = pd.read_parquet(data_path)

# 2a. Sanity checks: fail early with a readable message instead of a KeyError
#     raised later from inside dropna() or DoubleML.
if target_col not in df.columns:
    raise ValueError(f"CRITICAL ERROR: Target column '{target_col}' not found!")
if date_col not in df.columns:
    raise ValueError(f"CRITICAL ERROR: Date column '{date_col}' not found! Update 'date_col' variable.")
missing_controls = [col for col in baseline_cols if col not in df.columns]
if missing_controls:
    raise ValueError(
        f"CRITICAL ERROR: {len(missing_controls)} baseline column(s) not found, "
        f"e.g. {missing_controls[:10]}"
    )

# 2b. Keep only rows dated within 8 years of the earliest observation
#     (the cutoff itself is excluded).
print("Filtering data to the first 8 years...")

df[date_col] = pd.to_datetime(df[date_col])
df = df.sort_values(date_col)

start_date = df[date_col].min()
cutoff_date = start_date + pd.DateOffset(years=8)

df = df[df[date_col] < cutoff_date].copy()
rows_in_window = len(df)

print(f"-> Date Range: {start_date.date()} to {cutoff_date.date()}")

# 2c. Drop rows with a missing target or control value (the original note states
#     that DoubleML fails on NaNs). The row count is reported AFTER this step so
#     that it reflects the sample actually used for estimation.
cols_to_check = [target_col] + baseline_cols
df = df.dropna(subset=cols_to_check).reset_index(drop=True)

print(f"-> Data ready: {len(df)} rows.")
print(f"-> Dropped {rows_in_window - len(df)} rows with missing target/controls.")

# ======================================================
# 3. Define the Learners (Neural Networks)
# ======================================================
# A single-hidden-layer MLP (64 ReLU units) used as the nuisance learner.
# No feature scaling is applied in front of the network.
learner_nn = MLPRegressor(
    # One-element tuple: `(64)` without the trailing comma is just the int 64.
    hidden_layer_sizes=(64,),
    activation='relu',
    solver='adam',
    alpha=0.0001,
    learning_rate_init=0.001,
    max_iter=60,
    early_stopping=True,
    # Fixed seed for the network itself.
    random_state=42
)

# ======================================================
# 4. Run the DoubleML Horse Race
# ======================================================
# For every insider signal, fit a partially linear regression with the signal as
# the treatment, `baseline_cols` as controls and `target_col` as the outcome,
# and collect coefficient, t-statistic and p-value.
print("\n" + "="*80)
print(f"STARTING AUTO-DML HORSE RACE ({len(insider_cols)} Signals)")
print("="*80)
print(f"{'Signal':<40} | {'Coef':<10} | {'t-stat':<10} | {'P-value':<10}")
print("-" * 80)

# The cross-fitting folds are drawn randomly; seed NumPy's global RNG (as the
# DoubleML examples do) so that a re-run produces the same sample splits.
np.random.seed(42)

results_list = []

for treatment_var in insider_cols:

    # 1. Skip signals that are listed in the feature file but absent from the data.
    if treatment_var not in df.columns:
        print(f"Skipping {treatment_var:<40} (Not found)")
        continue

    # 2. Drop rows where this particular treatment is missing.
    df_sub = df.dropna(subset=[treatment_var])

    if len(df_sub) < 500:  # minimum number of usable rows for a signal
        print(f"Skipping {treatment_var:<40} (Too sparse: {len(df_sub)})")
        continue

    # 3. Build the data object and the model, then fit. Everything lives inside the
    #    try block so that a problem with one signal (including one raised while
    #    constructing the DoubleML objects) does not abort the whole horse race.
    try:
        dml_data = dml.DoubleMLData(
            df_sub,
            y_col=target_col,
            d_cols=treatment_var,
            x_cols=baseline_cols
        )

        dml_plr = dml.DoubleMLPLR(
            dml_data,
            ml_l=learner_nn,
            ml_m=learner_nn,
            n_folds=5,
            score='partialling out'
        )

        dml_plr.fit()

        summary = dml_plr.summary
        coef = summary.loc[treatment_var, "coef"]
        pval = summary.loc[treatment_var, "P>|t|"]
        tstat = summary.loc[treatment_var, "t"]

    except Exception as e:
        print(f"Error fitting {treatment_var}: {e}")

    else:
        # Live print; column widths match the header row above.
        print(f"{treatment_var:<40} | {coef:<10.4f} | {tstat:<10.4f} | {pval:<10.4f}")

        # NOTE: "Significant" is an unadjusted 5% test per signal; no
        # multiple-testing correction is applied across signals.
        results_list.append({
            "Signal": treatment_var,
            "Coefficient": coef,
            "t_stat": tstat,
            "P_value": pval,
            "Significant": pval < 0.05
        })

# ======================================================
# 5. Save Final Results
# ======================================================
if not results_list:
    print("\nNo results generated. Check column names or data.")
else:
    # Rank signals by the magnitude of their t-statistic, largest first.
    df_results = pd.DataFrame(results_list).sort_values(
        "t_stat", ascending=False, key=lambda col: col.abs()
    )

    banner = "=" * 80
    print("\n" + banner)
    print("FINAL DML RESULTS (Sorted by Significance)")
    print(banner)
    print(df_results.to_string(index=False))

    # Persist the ranked table as CSV inside the results directory.
    out_file = os.path.join(results_path, "dml_insider_inference_8years.csv")
    df_results.to_csv(out_file, index=False)
    print(f"\nSaved detailed results to: {out_file}")

In [None]:
import os

# ======================================================
# 0. CPU / Threading Configuration (MAX SPEED)
# ======================================================
# Thread counts used for every setting below.
N_INTRA_OP_THREADS = 96
N_INTER_OP_THREADS = 8

# Set the environment variables BEFORE the numerical libraries are imported:
# thread-pool and logging variables are typically read when a library is loaded,
# so assigning them after `import numpy` / `import tensorflow` comes too late.
# NOTE: if NumPy was already imported earlier in this kernel session, the
# BLAS/OpenMP settings may only take effect after a kernel restart.
os.environ["OMP_NUM_THREADS"] = str(N_INTRA_OP_THREADS)
os.environ["OPENBLAS_NUM_THREADS"] = str(N_INTRA_OP_THREADS)
os.environ["MKL_NUM_THREADS"] = str(N_INTRA_OP_THREADS)
os.environ["VECLIB_MAXIMUM_THREADS"] = str(N_INTRA_OP_THREADS)
os.environ["NUMEXPR_NUM_THREADS"] = str(N_INTRA_OP_THREADS)
# TensorFlow specific
os.environ["TF_NUM_INTRAOP_THREADS"] = str(N_INTRA_OP_THREADS)
os.environ["TF_NUM_INTEROP_THREADS"] = str(N_INTER_OP_THREADS)
os.environ["CUDA_VISIBLE_DEVICES"] = "-1" # Force CPU
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

import gc
import numpy as np
import pandas as pd
import tensorflow as tf
import doubleml as dml

# Import Keras components
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import L1
# Note: You need 'scikeras' installed: pip install scikeras
from scikeras.wrappers import KerasRegressor

# Configure TensorFlow threading. These setters raise RuntimeError once the
# TensorFlow runtime has been initialized (e.g. when this cell is re-run in the
# same kernel), so report that instead of aborting the cell.
try:
    tf.config.threading.set_intra_op_parallelism_threads(N_INTRA_OP_THREADS)
    tf.config.threading.set_inter_op_parallelism_threads(N_INTER_OP_THREADS)
except RuntimeError as exc:
    print(f"TensorFlow threading left unchanged (runtime already initialized): {exc}")
else:
    print(f"CPU Threading Configured: {N_INTRA_OP_THREADS} Intra-op / {N_INTER_OP_THREADS} Inter-op threads.")

# ======================================================
# 1. Setup Paths & Load Feature Lists
# ======================================================
# NOTE(review): the earlier cell of this notebook reads from
# "/work/Thesis/Data/2. Insider"; confirm which location is current.
base_path = "/work/Thesis/Data/1. Pre Variable Selection/2. Insider"
data_path = "/work/Thesis/Data/1. Pre Variable Selection/2. Insider/with_insider.parquet"
results_path = "/work/Thesis/Results/"
os.makedirs(results_path, exist_ok=True)

print("Loading feature lists...")

# Each list file holds one column name per line; blank lines are ignored.
feature_lists = []
for list_file in ("gkx_cols_ex inter.txt", "insider_cols_ex inter.txt"):
    with open(os.path.join(base_path, list_file), "r") as handle:
        stripped = (raw.strip() for raw in handle)
        feature_lists.append([entry for entry in stripped if entry])

# Controls first, treatments second (same order as the file names above).
baseline_cols, insider_cols = feature_lists

target_col = "ret_excess"
date_col = "month"

print(f"-> Baseline features (Controls): {len(baseline_cols)}")
print(f"-> Insider features (Treatments): {len(insider_cols)}")

# ======================================================
# 2. Load Data & Filter First 8 Years
# ======================================================
print(f"Loading data from {data_path}...")
df = pd.read_parquet(data_path)

# Sanity checks: fail early with a readable message instead of a KeyError raised
# later from the date conversion, dropna() or DoubleML.
if target_col not in df.columns:
    raise ValueError(f"CRITICAL ERROR: Target column '{target_col}' not found!")
if date_col not in df.columns:
    raise ValueError(f"CRITICAL ERROR: Date column '{date_col}' not found! Update 'date_col' variable.")
missing_controls = [col for col in baseline_cols if col not in df.columns]
if missing_controls:
    raise ValueError(
        f"CRITICAL ERROR: {len(missing_controls)} baseline column(s) not found, "
        f"e.g. {missing_controls[:10]}"
    )

# Keep only rows dated within 8 years of the earliest observation
# (the cutoff itself is excluded).
df[date_col] = pd.to_datetime(df[date_col])
df = df.sort_values(date_col)

start_date = df[date_col].min()
cutoff_date = start_date + pd.DateOffset(years=8)
df = df[df[date_col] < cutoff_date].copy()
rows_in_window = len(df)

print(f"-> Date Range: {start_date.date()} to {cutoff_date.date()}")

# Drop rows with a missing target or control value. The row count is reported
# AFTER this step so that it reflects the sample actually used for estimation.
cols_to_check = [target_col] + baseline_cols
df = df.dropna(subset=cols_to_check).reset_index(drop=True)

print(f"-> Data ready: {len(df)} rows.")
print(f"-> Dropped {rows_in_window - len(df)} rows with missing target/controls.")

# ======================================================
# 3. Define the Keras Model (With L1 & Batch Norm)
# ======================================================
def create_keras_model(input_dim, l1_reg=1e-4, lr=0.001):
    """Build and compile the feed-forward regression network.

    Architecture: Dense(64, relu) -> BatchNorm -> Dense(32, relu) -> BatchNorm
    -> Dense(1, linear). Both hidden layers have no bias term and carry an L1
    kernel penalty.

    Args:
        input_dim: Number of input features expected by the first layer.
        l1_reg: L1 penalty factor applied to the hidden-layer kernels.
        lr: Learning rate passed to the Adam optimizer.

    Returns:
        The compiled ``Sequential`` model (Adam optimizer, MSE loss).
    """
    def hidden_layer(units, **extra):
        # Shared recipe for the two hidden layers.
        return Dense(units, activation='relu',
                     kernel_regularizer=L1(l1_reg), use_bias=False, **extra)

    network = Sequential([
        hidden_layer(64, input_shape=(input_dim,)),
        BatchNormalization(),
        hidden_layer(32),
        BatchNormalization(),
        Dense(1, activation='linear'),
    ])

    network.compile(optimizer=Adam(learning_rate=lr), loss='mse')
    return network

# Early stopping: watch the validation loss, allow 5 epochs without improvement,
# then restore the best weights seen.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=5, restore_best_weights=True
)

# The network's input width is set to the number of control columns and is handed
# to create_keras_model explicitly through the `model__` prefixed arguments.
input_dim = len(baseline_cols)

learner_settings = dict(
    model=create_keras_model,
    model__input_dim=input_dim,
    model__l1_reg=0.0001,
    model__lr=0.001,
    epochs=100,
    batch_size=128,
    verbose=0,
    callbacks=[early_stop],
    validation_split=0.1,  # presumably the source of `val_loss` for early stopping — confirm
    random_state=42,
)
learner_nn = KerasRegressor(**learner_settings)

print("Neural Network Learner initialized (L1=0.0001, LR=0.001)")

# ======================================================
# 4. Run the DoubleML Horse Race
# ======================================================
# For every insider signal, fit a partially linear regression with the signal as
# the treatment, `baseline_cols` as controls and `target_col` as the outcome,
# and collect coefficient, t-statistic and p-value.
print("\n" + "="*80)
print(f"STARTING AUTO-DML HORSE RACE ({len(insider_cols)} Signals)")
print("="*80)
print(f"{'Signal':<40} | {'Coef':<10} | {'t-stat':<10} | {'P-value':<10}")
print("-" * 80)

# The cross-fitting folds are drawn randomly; seed NumPy's global RNG (as the
# DoubleML examples do) so that a re-run produces the same sample splits.
np.random.seed(42)

results_list = []

for treatment_var in insider_cols:

    # Report skipped signals instead of dropping them silently.
    if treatment_var not in df.columns:
        print(f"Skipping {treatment_var:<40} (Not found)")
        continue

    # Drop NaNs specific to this treatment
    df_sub = df.dropna(subset=[treatment_var])

    if len(df_sub) < 500:  # minimum number of usable rows for a signal
        print(f"Skipping {treatment_var:<40} (Too sparse: {len(df_sub)})")
        continue

    # Construction, fitting and extraction all live inside the try block so that a
    # problem with one signal does not abort the whole horse race.
    try:
        dml_data = dml.DoubleMLData(
            df_sub,
            y_col=target_col,
            d_cols=treatment_var,
            x_cols=baseline_cols
        )

        # The same learner specification is used for both nuisance functions.
        dml_plr = dml.DoubleMLPLR(
            dml_data,
            ml_l=learner_nn,
            ml_m=learner_nn,
            n_folds=5,
            score='partialling out'
        )

        dml_plr.fit()

        summary = dml_plr.summary
        coef = summary.loc[treatment_var, "coef"]
        pval = summary.loc[treatment_var, "P>|t|"]
        tstat = summary.loc[treatment_var, "t"]

    except Exception as e:
        print(f"Error fitting {treatment_var}: {e}")

    else:
        # Column widths match the header row above.
        print(f"{treatment_var:<40} | {coef:<10.4f} | {tstat:<10.4f} | {pval:<10.4f}")

        # NOTE: "Significant" is an unadjusted 5% test per signal; no
        # multiple-testing correction is applied across signals.
        results_list.append({
            "Signal": treatment_var,
            "Coefficient": coef,
            "t_stat": tstat,
            "P_value": pval,
            "Significant": pval < 0.05
        })

    finally:
        # Release Keras state after EVERY attempt, including failed fits, so
        # that memory does not accumulate across iterations.
        tf.keras.backend.clear_session()
        gc.collect()

# ======================================================
# 5. Save Final Results
# ======================================================
if not results_list:
    print("\nNo results generated.")
else:
    # Rank signals by the magnitude of their t-statistic, largest first.
    df_results = pd.DataFrame(results_list).sort_values(
        "t_stat", ascending=False, key=lambda col: col.abs()
    )

    banner = "=" * 80
    print("\n" + banner)
    print("FINAL DML RESULTS")
    print(banner)
    print(df_results.to_string(index=False))

    # NOTE(review): the earlier cell of this notebook writes to the same file name
    # in the same results directory, so this call replaces that output — confirm
    # this is intended.
    out_file = os.path.join(results_path, "dml_insider_inference_8years.csv")
    df_results.to_csv(out_file, index=False)
    print(f"\nSaved results to: {out_file}")