In [2]:
# neutralize_factors_sp500.py
"""
Neutralize factors by market cap (log) and industry dummies, per date.

Inputs (expected in ./output/):
 - factors_merged.csv       : merged factors (datetime,instrument,factor_... )
 - df_ltsz.csv              : market_cap pivot (index=datetime, columns=code)
 - industry_dummies.csv     : industry dummy matrix (index=code)

Output:
 - factors_neutralized.csv  : neutralized & standardized factors with columns:
     datetime, instrument, <factor columns (neutralized)>
"""

import os
import pandas as pd
import numpy as np
import statsmodels.api as sm
from tqdm import tqdm

# All inputs are read from, and the result is written to, this directory
# (resolved against the current working directory at import time).
OUTPUT_DIR = os.path.abspath("./output")


def _output_path(filename):
    """Return the path of *filename* inside OUTPUT_DIR."""
    return os.path.join(OUTPUT_DIR, filename)


# Input files
FACTORS_MERGED = _output_path("factors_merged.csv")
DF_LTSZ = _output_path("df_ltsz.csv")
INDUS = _output_path("industry_dummies.csv")
# Output file
OUT_NEUT = _output_path("factors_neutralized.csv")

def load_inputs():
    """
    Read the three input CSV files.

    Returns a tuple (factors, market_cap_pivot, industry_dummies):
     - factors          : FACTORS_MERGED with the 'datetime' column parsed as dates
     - market_cap_pivot : DF_LTSZ indexed by its first column, index converted to UTC datetimes
     - industry_dummies : INDUS indexed by its first column
    """
    print("Loading inputs...")
    factors = pd.read_csv(FACTORS_MERGED, parse_dates=['datetime'])
    market_caps = pd.read_csv(DF_LTSZ, index_col=0, parse_dates=True)
    industries = pd.read_csv(INDUS, index_col=0)

    # Make the market-cap index timezone-aware (UTC) datetimes.
    utc_index = pd.to_datetime(market_caps.index, utc=True)
    market_caps.index = utc_index

    return factors, market_caps, industries

def neutralize_per_date(fac_df, ltsz_pivot, indus_df):
    """
    Neutralize every factor against log market cap and industry dummies, one date at a time.

    For each date and each factor column an OLS regression
    ``factor ~ log_mktcap + industry_dummies`` (no explicit intercept) is fitted on the
    instruments that have both a factor value and a complete set of regressors. The
    factor is replaced by the regression residual and the cross-section is then z-scored
    (population std, ddof=0).

    Parameters
    ----------
    fac_df : long-format DataFrame with columns ['datetime', 'instrument', <factor columns>].
        It is not modified.
    ltsz_pivot : market-cap DataFrame, index=datetime, columns=tickers. Dates missing from
        the index are treated as "no market cap available".
    indus_df : DataFrame indexed by ticker whose columns are industry dummies. Tickers
        missing from it get all-zero dummies.

    Returns
    -------
    DataFrame with columns ['datetime', 'instrument', <factor columns>], one row per
    (date, instrument) of ``fac_df``, dates in ascending order (UTC).

    Notes
    -----
    - Instruments whose regressors are incomplete (e.g. no market cap) keep their raw
      factor value, which is then z-scored together with the residuals of the others.
    - A factor cross-section with fewer than two finite values, or with zero dispersion,
      is returned as all-NaN.
    - If a regression fails numerically a RuntimeWarning is emitted and the raw factor
      values are used for that date/factor.
    """
    import warnings  # local import: only needed here and not part of the file's import block

    factor_cols = [c for c in fac_df.columns if c not in ('datetime', 'instrument')]
    out_cols = ['datetime', 'instrument'] + factor_cols
    infinities = [np.inf, -np.inf]

    # Convert to UTC on a separate Series so the caller's frame is left untouched,
    # and group on it directly (groups are iterated in ascending date order).
    dates = pd.to_datetime(fac_df['datetime'], utc=True)
    grouped = fac_df.groupby(dates)

    out_rows = []
    for date, group in tqdm(grouped, total=grouped.ngroups, desc="Neutralizing dates"):
        group = group.set_index('instrument')

        # Market caps for this date; fall back to all-NaN when the date is unknown.
        try:
            mc = ltsz_pivot.loc[date]
        except KeyError:
            mc = pd.Series(index=group.index, dtype=float)

        # Log market cap; non-numeric and non-positive caps become NaN.
        mc = pd.to_numeric(mc, errors='coerce')
        lmc = np.log(mc.where(mc > 0))
        lmc.name = 'log_mktcap'

        # Industry dummies aligned to this date's instruments (unknown tickers -> zeros).
        ind_sub = indus_df.reindex(group.index).fillna(0)

        # The regressor matrix only depends on the date, so build it once per date.
        # No constant column is added; the dummies play that role only if they cover
        # every instrument.
        X = pd.concat([lmc.reindex(group.index), ind_sub], axis=1).astype(float)
        X = X.replace(infinities, np.nan)
        X = X.loc[:, X.notna().any(axis=0)]  # drop regressors with no data at all
        has_regressors = X.shape[1] > 0
        x_complete = X.notna().all(axis=1)

        result_factors = {}
        for fcol in factor_cols:
            # Infinite factor values are treated as missing.
            y = group[fcol].astype(float).replace(infinities, np.nan)

            # Start from the raw values and overwrite the rows that can be regressed.
            resid = y.copy()
            usable = (x_complete & y.notna()).to_numpy()
            if has_regressors and usable.any():
                try:
                    res = sm.OLS(y.loc[usable], X.loc[usable]).fit()
                except (np.linalg.LinAlgError, ValueError) as err:
                    # Best effort: keep the raw factor, but do not hide the failure.
                    warnings.warn(
                        f"OLS failed for factor {fcol!r} on {date}: {err}; "
                        "using un-neutralized values",
                        RuntimeWarning,
                    )
                else:
                    resid.loc[usable] = np.asarray(res.resid)

            # Standardize (z-score); undefined when there are < 2 values or no dispersion.
            scale = resid.std(ddof=0)
            if resid.notna().sum() >= 2 and scale > 0:
                resid = (resid - resid.mean()) / scale
            else:
                resid = pd.Series(np.nan, index=group.index, dtype=float)

            result_factors[fcol] = resid

        # Combine into a DataFrame for this date.
        df_out_date = pd.DataFrame(result_factors)
        df_out_date.index.name = 'instrument'
        df_out_date = df_out_date.reset_index()
        df_out_date['datetime'] = date
        out_rows.append(df_out_date)

    if not out_rows:
        # Nothing to neutralize: return an empty frame with the expected layout.
        return pd.DataFrame(columns=out_cols)

    out_df = pd.concat(out_rows, ignore_index=True, sort=False)
    return out_df[[c for c in out_cols if c in out_df.columns]]

if __name__ == "__main__":
    # Script entry point: load the inputs, neutralize, write the result to OUT_NEUT.
    factors, market_caps, industries = load_inputs()

    print("Running neutralization...")
    neutralized = neutralize_per_date(factors, market_caps, industries)

    # Written without the row index.
    neutralized.to_csv(OUT_NEUT, index=False)
    print("Saved neutralized factors to", OUT_NEUT)

    print("Finished.")


Loading inputs...
Running neutralization...


Neutralizing dates: 100%|██████████| 2515/2515 [57:18<00:00,  1.37s/it]  


Saved neutralized factors to c:\Users\ns243\Documents\Academic\AI Master\Internship\Codes\output\factors_neutralized.csv
Finished.
