In [None]:
import os
import time
import numpy as np
import pandas as pd

from lifelines import CoxPHFitter
from joblib import Parallel, delayed
from statsmodels.stats.multitest import multipletests

In [None]:
# ---------------------------------------------------------------------------
# Configuration: every value a reader may need to tune lives in this cell.
# ---------------------------------------------------------------------------
DATA_PATH = "your_data.csv"          # Input data file
TIME_COL = "time"             # Follow-up time column
EVENT_COL = "status"          # Event indicator (0/1)
# Columns that are never treated as candidate features.
# NOTE(review): the load step drops "Participant ID" (with a space) while this
# list uses "Participant.ID" (with a dot) -- confirm which spelling the data uses.
EXCLUDE_COLS = ["Participant.ID"]

MIN_SAMPLES = 100             # Minimum complete rows required to fit a feature
PENALIZER = 0.1               # L2 regularization strength
CHUNK_SIZE = 100              # Number of features per batch
# Leave two cores free but always keep at least one worker.
# os.cpu_count() returns None when the core count cannot be determined, so
# fall back to 1 instead of failing with a TypeError on `None - 2`.
N_JOBS = max((os.cpu_count() or 1) - 2, 1)

OUTPUT_FULL = "cox_results_full_with_fdr.csv"   # All successfully fitted features
OUTPUT_SIG = "cox_results_significant.csv"      # Features with FDR < 0.05 only

In [None]:
def standardize(series: pd.Series) -> pd.Series:
    """Return ``series`` rescaled to z-scores.

    The series mean is subtracted from every value and the result is divided
    by the series standard deviation as computed by ``Series.std()``.
    The index of the input is preserved.
    """
    center = series.mean()
    spread = series.std()
    return (series - center) / spread


def run_single_cox(df: pd.DataFrame, feature: str) -> dict:
    """
    Fit a penalized univariate Cox model for one feature column.

    Rows with a missing value in the time, event or feature column are
    dropped, the feature is z-scored with ``standardize`` and a
    ``CoxPHFitter(penalizer=PENALIZER)`` is fitted on those three columns.

    Returns
    -------
    dict
        On success, the keys ``feature``, ``HR``, ``CI_lower``, ``CI_upper``,
        ``p_value``, ``n_samples`` and ``-log10(p)``.
        On any exception (including fewer than ``MIN_SAMPLES`` complete
        rows), only ``feature`` and ``error`` (the exception message).
    """
    try:
        subset = df[[TIME_COL, EVENT_COL, feature]].dropna()
        n_rows = len(subset)

        # Too few complete rows: reported through the same error path as a
        # failed fit.
        if n_rows < MIN_SAMPLES:
            raise ValueError(f"Insufficient samples (n={n_rows})")

        subset[feature] = standardize(subset[feature])

        model = CoxPHFitter(penalizer=PENALIZER)
        model.fit(subset, duration_col=TIME_COL, event_col=EVENT_COL)

        hazard_ratio = model.hazard_ratios_[feature]
        # Exponentiate the model's confidence bounds for this feature; the
        # resulting two values unpack as (lower, upper).
        exp_bounds = np.exp(model.confidence_intervals_.loc[feature])
        lower, upper = exp_bounds
        pval = model.summary.loc[feature, "p"]

        outcome = {"feature": feature}
        outcome["HR"] = hazard_ratio
        outcome["CI_lower"] = lower
        outcome["CI_upper"] = upper
        outcome["p_value"] = pval
        outcome["n_samples"] = n_rows
        outcome["-log10(p)"] = -np.log10(pval)
        return outcome

    except Exception as exc:
        # Best effort: one failing feature must not abort the whole scan, so
        # the failure is handed back to the caller as data.
        return {"feature": feature, "error": str(exc)}


def process_feature_chunk(df: pd.DataFrame, features: list) -> list:
    """Apply ``run_single_cox`` to each feature in ``features``.

    Returns a list with one result dictionary per feature, in the same order
    as ``features``.
    """
    chunk_results = []
    for name in features:
        chunk_results.append(run_single_cox(df, name))
    return chunk_results

In [None]:
def large_scale_cox(df: pd.DataFrame) -> pd.DataFrame:
    """
    Run univariate Cox regression for all eligible features.

    Every column of ``df`` other than ``TIME_COL``, ``EVENT_COL`` and the
    entries of ``EXCLUDE_COLS`` is treated as a candidate feature. Features
    are split into batches of ``CHUNK_SIZE`` and fitted in parallel with
    ``N_JOBS`` joblib workers.

    Features whose fit returned an ``error`` record are left out of the
    result; their count and a few example messages are printed so they are
    not lost silently.

    Returns
    -------
    pd.DataFrame
        One row per successfully fitted feature, sorted by ``p_value``, with
        columns ``feature``, ``HR``, ``CI_lower``, ``CI_upper``, ``p_value``,
        ``n_samples``, ``-log10(p)``, ``FDR`` (Benjamini-Hochberg, computed
        over the non-missing p-values) and ``significance`` (``"**"`` for
        FDR < 0.05, ``"*"`` for p < 0.05, otherwise ``""``).
        The frame is empty, but still has these columns, when no feature
        could be fitted.
    """
    start_time = time.time()

    # Identify candidate features
    reserved = {TIME_COL, EVENT_COL, *EXCLUDE_COLS}
    features = [col for col in df.columns if col not in reserved]

    print(f"Total features to analyze: {len(features)}")

    # Split into chunks
    feature_chunks = [
        features[i:i + CHUNK_SIZE]
        for i in range(0, len(features), CHUNK_SIZE)
    ]

    # Parallel execution
    results_nested = Parallel(n_jobs=N_JOBS, verbose=10)(
        delayed(process_feature_chunk)(df, chunk)
        for chunk in feature_chunks
    )

    # Flatten results and separate successful fits from failures
    all_results = [r for chunk in results_nested for r in chunk]
    failures = [r for r in all_results if "error" in r]
    results = [r for r in all_results if "error" not in r]

    if failures:
        print(
            f"Skipped {len(failures)} of {len(all_results)} features "
            "that could not be fitted"
        )
        for failed in failures[:5]:
            print(f"  {failed['feature']}: {failed['error']}")

    # Fixing the columns keeps the schema stable even when nothing was
    # fitted; otherwise the "p_value" lookups below raise KeyError.
    result_columns = [
        "feature", "HR", "CI_lower", "CI_upper",
        "p_value", "n_samples", "-log10(p)",
    ]
    results_df = pd.DataFrame(results, columns=result_columns)

    # FDR correction, restricted to non-missing p-values: the correction is
    # skipped entirely when there are none, and rows without a p-value keep
    # FDR = NaN.
    results_df["FDR"] = np.nan
    has_p = results_df["p_value"].notna().to_numpy()
    if has_p.any():
        _, fdr, _, _ = multipletests(
            results_df.loc[has_p, "p_value"].to_numpy(dtype=float),
            method="fdr_bh"
        )
        results_df.loc[has_p, "FDR"] = fdr

    # Significance labels
    results_df["significance"] = np.where(
        results_df["FDR"] < 0.05, "**",
        np.where(results_df["p_value"] < 0.05, "*", "")
    )

    elapsed = (time.time() - start_time) / 3600
    print(f"Analysis completed in {elapsed:.2f} hours")

    return results_df.sort_values("p_value")


In [None]:
if __name__ == "__main__":

    # Read the input table named in the configuration.
    df = pd.read_csv(DATA_PATH)

    # Remove the identifier column when the file contains it.
    df = df.drop(columns=["Participant ID"], errors="ignore")

    # Fit one Cox model per candidate feature.
    cox_results = large_scale_cox(df)

    # Write every fitted feature to disk.
    cox_results.to_csv(OUTPUT_FULL, index=False)

    # Keep only the rows below the FDR threshold and write them separately.
    significant = cox_results.loc[cox_results["FDR"] < 0.05]
    significant.to_csv(OUTPUT_SIG, index=False)

    # Show the leading rows of the significant set.
    print("Top 20 significant features:")
    print(
        significant.head(20)[
            ["feature", "HR", "CI_lower", "CI_upper", "p_value", "FDR"]
        ]
    )