In [1]:
"""
test_full_pipeline.py

This script performs a FULL END-TO-END test of the project pipeline using ONLY
one input file: synthetic_full_data.csv

It tests:
    1. Follow-up time calculation
    2. Unimodal XGBoost training
    3. Unimodal prediction + metrics
    4. Cox large-scale regression
"""

import pandas as pd
import numpy as np
import json
import os

# ===== Import project modules =====
from src.statistics.compute_followup import compute_followup_pipeline
from src.models.train_unimodal import train_unimodal_from_config
from src.models.predict_unimodal import main_from_config
from src.statistics.cox_analysis import run_large_scale_cox   # you already generated
# (If your cox module file name differs, adjust the import)

In [2]:
# ================================================================
# Step 0 — Load the ONLY required file
# ================================================================
FULL_DATA = "synthetic_full_data.csv"

assert os.path.exists(FULL_DATA), \
    "ERROR: synthetic_full_data.csv not found. Please generate it first."

df_full = pd.read_csv(FULL_DATA)

print("\n====== 0. Loaded full dataset ======")
print(df_full.head())


   feature_1  feature_2  feature_3  feature_4  feature_5  feature_6  \
0   0.496714  -0.138264   0.647689   1.523030  -0.234153  -0.234137   
1   0.324084  -0.385082  -0.676922   0.611676   1.031000   0.931280   
2  -1.415371  -0.420645  -0.342715  -0.802277  -0.161286   0.404051   
3   0.250493   0.346448  -0.680025   0.232254   0.293072  -0.714351   
4   0.357787   0.560785   1.083051   1.053802  -1.377669  -0.937825   

   feature_7  feature_8  feature_9  feature_10  ...  feature_46  feature_47  \
0   1.579213   0.767435  -0.469474    0.542560  ...   -0.719844   -0.460639   
1  -0.839218  -0.309212   0.331263    0.975545  ...   -1.463515    0.296120   
2   1.886186   0.174578   0.257550   -0.074446  ...    0.781823   -1.236951   
3   1.865775   0.473833  -1.191303    0.656554  ...    0.385317   -0.883857   
4   0.515035   0.513786   0.515048    3.852731  ...   -0.334501   -0.474945   

   feature_48  feature_49  feature_50  Participant ID  status    Region  \
0    1.057122    0.343

In [3]:
# ================================================================
# Step 1 — Compute Follow-up Time
# ================================================================
print("\n====== 1. Computing Follow-up ======")

followup = compute_followup_pipeline(
    df_full[["Participant ID",
             "Region",
             "baseline time",
             "Date of death",
             "status"]]
)

followup.to_csv("test_followup.csv", index=False)
print("Follow-up saved to test_followup.csv")
print(followup.head())


Follow-up saved to test_followup.csv
   Participant ID  time  status
0          100000  6050       0
1          100001  4959       0
2          100002  5364       0
3          100003  5504       0
4          100004  5221       0


In [6]:
# ================================================================
# Step 2 — Prepare merged dataset for XGBoost training
# ================================================================
print("\n====== 2. Preparing data for XGBoost ======")

df_train = df_full.merge(
    followup[["Participant ID", "status", "time"]],
    on="Participant ID",
    how="left",
    suffixes=("", "_dup")
)

feature_cols = [col for col in df_full.columns
                if col.startswith("feature_")]

train_input = df_train[["Participant ID"] + feature_cols + ["status"]]
train_input.to_csv("test_unimodal_features.csv", index=False)

print("Training file saved as test_unimodal_features.csv")



Training file saved as test_unimodal_features.csv


In [7]:
# ================================================================
# Step 3 — Train Unimodal XGBoost
# ================================================================
print("\n====== 3. Training Unimodal XGBoost ======")

train_config = {
    "feature_file": "test_unimodal_features.csv",
    "label_column": "status",
    "index_col": "Participant ID",
    "output_model": "test_xgb_model.joblib",
    "n_folds": 3
}

train_unimodal_from_config(train_config)
print("Model saved: test_xgb_model.joblib")


[Fold 1/3] AUC = 0.6686
[Fold 2/3] AUC = 0.6541
[Fold 3/3] AUC = 0.6212

Mean CV AUC = 0.6479983660130718
Model saved => test_xgb_model.joblib
Model saved: test_xgb_model.joblib


In [8]:
# ================================================================
# Step 4 — Predict using Unimodal XGBoost
# ================================================================
print("\n====== 4. Predicting + Evaluating ======")

predict_config = {
    "feature_file": "test_unimodal_features.csv",
    "model_file": "test_xgb_model.joblib",
    "output_predictions": "test_predictions.csv",
    "label_column": "status",
    "index_col": "Participant ID",
    "save_metrics": "test_metrics.json"
}

pred_df, metrics = main_from_config(predict_config)

print("Predictions saved to test_predictions.csv")
print("Metrics:")
print(metrics)


Predictions saved => test_predictions.csv
Metrics saved => test_metrics.json
{'AUC': 0.9139111111111111, 'Average Precision': 0.8245580407033725, 'Accuracy': 0.968}
Predictions saved to test_predictions.csv
Metrics:
{'AUC': 0.9139111111111111, 'Average Precision': 0.8245580407033725, 'Accuracy': 0.968}


In [9]:
# ================================================================
# Step 5 — Run Cox large-scale analysis
# ================================================================
print("\n====== 5. Running Cox Regression ======")

# Cox needs time + status + features
df_cox = df_train.copy()

cox_results = run_large_scale_cox(
    df=df_cox,
    time_col="time",
    status_col="status",
    exclude_cols=["Participant ID"]
)

cox_results.to_csv("test_cox_results.csv", index=False)

print("Cox results saved to test_cox_results.csv")
print(cox_results.head())


print("\n======= FULL PIPELINE TEST COMPLETED SUCCESSFULLY =======")




[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.


Cox analysis successfully finished. Time used: 0.00 hours.
Cox results saved to test_cox_results.csv
      variable        HR  CI_lower  CI_upper       p_value  n_samples  \
50  status_dup  3.159224  2.700789  3.695473  6.866400e-47        500   
6    feature_7  0.828995  0.681564  1.008317  6.050978e-02        500   
26  feature_27  0.843630  0.691729  1.028889  9.319370e-02        500   
47  feature_48  1.176992  0.968656  1.430135  1.010957e-01        500   
13  feature_14  0.864262  0.710196  1.051751  1.453166e-01        500   

    -log10(p)           FDR significance  
50  46.163271  3.501864e-45           **  
6    1.218174  9.714066e-01               
26   1.030613  9.714066e-01               
47   0.995267  9.714066e-01               
13   0.837685  9.714066e-01               



[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    7.3s
