In [1]:
import numpy as np
import pandas as pd
from scipy.stats import rankdata


In [2]:
# Load the firm-characteristics panel from CSV and display it.
FIRMS_CSV = 'Predictors/CompFirmCharac_sanitized.csv'
firms = pd.read_csv(FIRMS_CSV)
firms

Unnamed: 0,date,gvkey,acchgy,aolochy,aqcy,capxy,chechy,cibegniy,cicurry,cidergly,...,txbcoy,txdcy,txty,txwy,xidocy,xidoy,xinty,xiy,xopry,xsgay
0,2000-01-31,1013,0.0,-4.824,17.963,46.785,21.138,,,,...,,,15.300,,0.000,0.000,,0.0,496.900,205.600
1,2000-01-31,1082,0.0,-1.734,0.000,9.597,-2.956,,,,...,,1.061,0.590,,2.975,-2.563,4.084,0.0,123.541,6.468
2,2000-01-31,1173,0.0,0.882,0.000,0.558,-0.754,,,,...,,-0.073,0.222,,0.000,0.000,0.416,0.0,21.847,7.457
3,2000-01-31,1183,0.0,-5.582,0.000,1.091,22.224,,,,...,,0.000,-3.286,,0.741,0.741,,0.0,9.732,
4,2000-01-31,1189,0.0,-64.102,0.888,17.181,-12.630,,,,...,,6.349,12.136,,-0.011,-0.007,8.095,0.0,724.161,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1159050,2025-03-31,323463,,,,,,,,,...,,,,,,,,,,
1159051,2025-03-31,328087,,,,,,,,,...,,,,,,,,,,
1159052,2025-03-31,330942,,,,,,,,,...,,,,,,,,,,
1159053,2025-03-31,354003,,,,,,,,,...,,,,,,,,,,


🔧 1. preprocess_characteristics(X_raw)

Purpose: Normalize raw firm characteristics per date and handle missing values.

    Filters out stocks with too many missing values (default threshold: 30%).

    For each date:

        Drops date and gvkey.

        Ranks each feature across firms, normalizes the ranks to [-0.5, 0.5) via (rank - 1) / n - 0.5, and fills missing values with 0.

        Processes in chunks if data is large.

    Returns a dictionary {date: processed_dataframe_with_gvkey}.

🌀 2. generate_random_features_flat(X_processed_dict, P=1200, ...)

Purpose: Generate nonlinear random features (e.g., random Fourier features) per date.

    For each date:

        Multiplies the normalized feature matrix with random weights from a normal distribution scaled by gamma.

        Applies cos and sin to produce transformed features.

        Concatenates all gamma-transformed features → total of P features per stock.

        Rank-normalizes across stocks.

    Returns a single DataFrame of features indexed by a (date, gvkey) MultiIndex.

📈 3. compute_random_factors(S_dict, returns_df)

Purpose: Compute random-signal-based risk factors from features and future returns.

    For each date:

        Looks up next month's returns (ret) for the same firms.

        Computes a factor vector: the sum of each feature across firms, weighted by next-month returns.

        Normalizes by √N.

    Returns a DataFrame: rows = dates, columns = random factors.

✅ Summary:

This pipeline is designed for large-scale financial datasets and performs:

    Cross-sectional rank normalization of firm characteristics on each date.

    Random nonlinear transformation of characteristics.

    Factor construction using future returns.

It’s optimized for performance and memory efficiency.

In [3]:
import numpy as np
import pandas as pd
from scipy.stats import rankdata

def preprocess_characteristics(X_raw, missing_threshold=0.3, chunk_size=5000):
    """Rank-normalize firm characteristics cross-sectionally, one date at a time.

    For every distinct value of ``X_raw['date']`` the function drops firms whose
    fraction of missing characteristics exceeds ``missing_threshold``, passes
    the remaining rows to ``process_chunk`` and attaches the firms' ``gvkey``
    as the last column of the result.

    Parameters
    ----------
    X_raw : pandas.DataFrame
        Must contain the columns ``'date'`` and ``'gvkey'``; every other
        column is treated as a characteristic.
    missing_threshold : float, default 0.3
        Largest allowed fraction of missing characteristics for a firm to be
        kept on a given date.
    chunk_size : int, default 5000
        Accepted for backward compatibility only. Ranks are only meaningful
        when computed over the whole cross-section of a date, so the rows of
        a date are never split into separately ranked chunks.

    Returns
    -------
    dict
        ``{date: DataFrame}``. Each frame holds the processed characteristics
        plus a ``'gvkey'`` column and keeps the row index of ``X_raw``. Dates
        without any eligible firm are omitted.
    """
    results = {}

    for date in X_raw['date'].unique():
        date_rows = X_raw[X_raw['date'] == date]
        X_date = date_rows.drop(columns=['date', 'gvkey'])

        # Keep only firms whose share of missing characteristics is acceptable.
        eligible_mask = X_date.isna().mean(axis=1) <= missing_threshold
        if not eligible_mask.any():
            continue  # Skip dates with no eligible stocks

        # Rank across *all* eligible firms of the date in one call: ranking
        # row chunks independently would restart the ranks in every chunk.
        processed = process_chunk(X_date[eligible_mask])
        processed['gvkey'] = date_rows.loc[eligible_mask, 'gvkey'].values
        results[date] = processed

    return results

def process_chunk(chunk):
    """Rank-normalize every column of ``chunk`` and zero-fill missing entries.

    Within each column the non-missing values are ranked (ties receive their
    average rank) and mapped to ``(rank - 1) / n - 0.5``, where ``n`` is the
    number of non-missing values in that column, so results lie in
    ``[-0.5, 0.5)``. Missing entries, including whole columns that are
    entirely missing, become ``0``.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Numeric columns only; the input is not modified.

    Returns
    -------
    pandas.DataFrame
        New float64 frame with the same index and columns as ``chunk``.
    """
    # Work in float so integer-typed columns can hold fractional ranks without
    # relying on an implicit dtype upcast during assignment.
    values = chunk.astype('float64')

    # DataFrame.rank leaves NaN in place, so missing entries do not take part
    # in the ranking of their column.
    zero_based_ranks = values.rank(axis=0, method='average') - 1
    non_missing_counts = values.notna().sum(axis=0).to_numpy()

    # Positional broadcast over columns; an all-missing column gives NaN / 0,
    # which stays NaN and is zero-filled below.
    normalized = zero_based_ranks / non_missing_counts - 0.5
    return normalized.fillna(0.0)
    
def generate_random_features_flat(X_processed_dict, P=1200, gamma_grid=[0.5, 0.6, 0.7, 0.8, 0.9, 1.0], seed=None):
    """Build rank-normalized random cos/sin features for every (date, gvkey).

    One weight matrix per entry of ``gamma_grid`` is drawn once from
    ``N(0, gamma)`` (``gamma`` is the standard deviation) and reused on all
    dates. For each date the characteristics are multiplied by every weight
    matrix, passed through ``cos`` and ``sin``, concatenated and finally
    rank-normalized across the firms of that date with
    ``(rank - 1) / N_t - 0.5``.

    Parameters
    ----------
    X_processed_dict : dict
        ``{date: DataFrame}`` where every frame has a ``'gvkey'`` column and
        otherwise the same characteristic columns.
    P : int, default 1200
        Requested number of features. Each gamma contributes an equal number
        of cos/sin pairs, so the number actually produced is
        ``2 * len(gamma_grid) * ((P // len(gamma_grid)) // 2)``; this equals
        ``P`` whenever ``P`` is a multiple of ``2 * len(gamma_grid)`` and is
        rounded down otherwise.
    gamma_grid : sequence of float
        Scales of the random weights. The default list is never mutated.
    seed : int or None, default None
        When given, ``np.random.seed(seed)`` is called before drawing weights.

    Returns
    -------
    pandas.DataFrame
        float32 features named ``f0, f1, ...`` indexed by a
        ``(date, gvkey)`` MultiIndex. An empty input yields an empty frame
        with the same columns.
    """
    if seed is not None:
        np.random.seed(seed)

    G = len(gamma_grid)
    pairs_per_gamma = (P // G) // 2
    # Label columns by what is really produced; using P directly breaks the
    # DataFrame construction whenever P is not a multiple of 2 * G.
    n_features = 2 * pairs_per_gamma * G
    columns = [f'f{i}' for i in range(n_features)]
    index_names = ['date', 'gvkey']

    if not X_processed_dict:
        empty_index = pd.MultiIndex.from_arrays([[], []], names=index_names)
        return pd.DataFrame(
            np.empty((0, n_features), dtype=np.float32),
            index=empty_index,
            columns=columns,
        )

    # Draw the weights once, in grid order, so every date shares them. A list
    # (rather than a dict keyed by gamma) keeps repeated gamma values distinct.
    first_frame = next(iter(X_processed_dict.values()))
    d = len(first_frame.drop(columns=['gvkey']).columns)
    weights = [np.random.normal(0, gamma, size=(d, pairs_per_gamma)) for gamma in gamma_grid]

    blocks = []
    date_labels = []
    gvkey_labels = []

    for date, X_processed in X_processed_dict.items():
        gvkeys = X_processed['gvkey'].values
        X = np.nan_to_num(X_processed.drop(columns=['gvkey']).values.astype(np.float32))
        N_t = X.shape[0]

        parts = []
        for W_g in weights:
            XW = X @ W_g
            parts.append(np.cos(XW))
            parts.append(np.sin(XW))
        S_hat = np.concatenate(parts, axis=1)

        # Cross-sectional rank normalization of every feature on this date.
        S = (rankdata(S_hat, axis=0, method='average') - 1) / N_t - 0.5

        blocks.append(S.astype(np.float32))
        date_labels.extend([date] * N_t)
        gvkey_labels.extend(gvkeys)

    feature_matrix = np.vstack(blocks)
    index = pd.MultiIndex.from_arrays([date_labels, gvkey_labels], names=index_names)
    return pd.DataFrame(feature_matrix, index=index, columns=columns)

    
def compute_random_factors(S_dict, returns_df):
    """Compute one random-factor row per date from features and next-month returns.

    For each date the function looks up the returns dated at the end of the
    following month, keeps the firms present in both the features and the
    returns, and computes ``R' S / sqrt(N)`` where ``R`` holds the returns,
    ``S`` the feature matrix and ``N`` the number of matched firms.

    Parameters
    ----------
    S_dict : dict or pandas.DataFrame
        Either ``{date: DataFrame indexed by gvkey}`` or a single DataFrame
        whose index is a two-level MultiIndex with the date in level 0 and
        the gvkey in level 1.
    returns_df : pandas.DataFrame
        Must contain the columns ``'date'``, ``'gvkey'`` and ``'ret'``.
        ``'date'`` is parsed with ``pd.to_datetime`` so string dates work.

    Returns
    -------
    pandas.DataFrame
        One row per date that has matching next-month returns (row label is
        the date as given in ``S_dict``); columns are the integer positions
        of the features. Empty when no date can be matched.

    Notes
    -----
    Missing returns are dropped before matching, and when ``returns_df`` has
    several rows for the same ``(date, gvkey)`` the first non-missing one is
    used, so returns and features always stay aligned row by row.
    """
    # Accept the flat (date, gvkey)-indexed frame as well as the dict form.
    if isinstance(S_dict, pd.DataFrame):
        cross_sections = (
            (date, block.droplevel(0))
            for date, block in S_dict.groupby(level=0, sort=False)
        )
    else:
        cross_sections = S_dict.items()

    # Split the returns by date once instead of rescanning the full frame
    # for every feature date.
    return_dates = pd.to_datetime(returns_df['date']).to_numpy()
    returns_by_date = {
        ret_date: group.set_index('gvkey')['ret']
        for ret_date, group in returns_df.groupby(return_dates)
    }

    factors = []
    for date, S_df in cross_sections:
        # MonthEnd(0) first snaps a mid-month date to its own month end, so
        # the following MonthEnd(1) always lands in the *next* month.
        next_date = pd.to_datetime(date) + pd.offsets.MonthEnd(0) + pd.offsets.MonthEnd(1)

        next_returns = returns_by_date.get(next_date)
        if next_returns is None:
            continue

        # A single NaN would turn the whole factor row into NaN, and repeated
        # gvkeys would make the return vector longer than the feature matrix.
        next_returns = next_returns.dropna()
        next_returns = next_returns[~next_returns.index.duplicated(keep='first')]

        S_aligned = S_df[S_df.index.isin(next_returns.index)]
        if len(S_aligned) == 0:
            continue

        R_next = next_returns.loc[S_aligned.index].to_numpy(dtype=float)
        F = (R_next @ S_aligned.to_numpy()) / np.sqrt(len(R_next))
        factors.append(pd.Series(F, name=date))

    return pd.DataFrame(factors)

In [4]:
# Rank-normalize the firm characteristics date by date; `results` maps each date to a DataFrame.
results = preprocess_characteristics(X_raw=firms, chunk_size=5000)
# Downstream steps, currently disabled:
# features = generate_random_features_flat(results, P=60)
# factors = compute_random_factors(features, returns_df)

In [5]:
# Display the {date: DataFrame} mapping returned by preprocess_characteristics.
results

{'2000-01-31':         acchgy   aolochy      aqcy     capxy    chechy  cibegniy  cicurry  \
 0     0.012485 -0.362069  0.439394  0.389697  0.412010       0.0      0.0   
 1     0.012485 -0.274078 -0.095152  0.194545 -0.291914       0.0      0.0   
 2     0.012485  0.262188 -0.095152 -0.161818 -0.191439       0.0      0.0   
 3     0.012485 -0.375149 -0.095152 -0.080606  0.415577       0.0      0.0   
 4     0.012485 -0.485731  0.346061  0.270909 -0.397741       0.0      0.0   
 ...        ...       ...       ...       ...       ...       ...      ...   
 1105  0.012485 -0.314507 -0.095152  0.228485 -0.163496       0.0      0.0   
 1108  0.012485 -0.039834 -0.095152  0.226061  0.252675       0.0      0.0   
 1109  0.012485  0.456005 -0.095152  0.314545  0.187277       0.0      0.0   
 1111  0.012485  0.142687 -0.095152  0.287879  0.322830       0.0      0.0   
 1112  0.012485  0.106421 -0.095152 -0.205455  0.049941       0.0      0.0   
 
       cidergly  ciothery  cipeny  ...     txdcy

In [7]:
# Write `features` to CSV.
# NOTE(review): `features` is not assigned in any cell visible above (its only
# assignment, in cell In [4], is commented out) — confirm it is defined before
# this cell when the notebook is run top to bottom on a fresh kernel.
features.to_csv('Predictors/preprocessed_features.csv')