In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sksurv.util import Surv
from sksurv.metrics import cumulative_dynamic_auc
from lifelines import WeibullFitter, ExponentialFitter, LogNormalFitter, LogLogisticFitter, LogNormalAFTFitter

In [2]:
# Load the prepared dataset from a CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer="data_ready_45.csv")

In [3]:
# Disabled experiment: candidate interaction terms (pairwise products of existing columns).
# data["AGE_CMV"] = data["AGE"] * data["CMV_STATUS"]
# data["VIR_CO_INF"] = data["CMV_STATUS"] * data["EBV_SEROSTATUS"]
# data["AGE_BMI_DON"] = data["AGE_DON"] * data["BMI_DON_CALC"]
# data["AGE_BMI"] = data["AGE"] * data["BMI_CALC"]
# data["DIAB_BMI"] = data["YRS_DIAB"] * data["BMI_CALC"]

In [4]:
# Shift the follow-up time by one unit and rescale it by 1/12 in a single step.
# NOTE(review): presumably this converts months to years - confirm, because the
# evaluation grid used for the dynamic AUC is built from multiples of 12.
# Caution: `time_frame` is overwritten in place, so re-running this cell without
# reloading `data` applies the shift and rescale a second time.
data["time_frame"] = (data["time_frame"] + 1) / 12

In [5]:
# Hold out 20% of the rows as a test set. The split is stratified on the event
# indicator (GRF_STAT_PA) and uses a fixed seed so it is reproducible.
train, test = train_test_split(
    data,
    test_size=0.2,
    stratify=data["GRF_STAT_PA"],
    random_state=42,
)

In [6]:
# Training outcome (duration + event flag), its scikit-survival structured form,
# and the covariate matrix with the two outcome columns removed.
y_train = train.loc[:, ["time_frame", "GRF_STAT_PA"]]
y_real_train = Surv.from_dataframe(event="GRF_STAT_PA", time="time_frame", data=y_train)
x_train = train.drop(columns=["time_frame", "GRF_STAT_PA"])

In [7]:
# Test outcome (duration + event flag) and the matching covariate matrix.
y = test.loc[:, ["time_frame", "GRF_STAT_PA"]]
# `columns=` already identifies the axis, so no separate `axis` argument is needed.
x_test = test.drop(columns=["time_frame", "GRF_STAT_PA"])

In [8]:
from sklearn.linear_model import ElasticNetCV

# Feature selection: elastic net tuned by 50-fold cross-validation over several
# L1/L2 mixing ratios.
# NOTE(review): the regression target is the raw `time_frame` column only, so
# censoring (GRF_STAT_PA) is not taken into account in this selection step.
elastic_net = ElasticNetCV(
    l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99],
    cv=50,
)
elastic_net.fit(x_train, y_train['time_frame'])

# Keep only the covariates whose fitted coefficient is non-zero.
selected_features = x_train.columns[elastic_net.coef_ != 0]
print("Selected features:", selected_features)

# Log-normal AFT model on the selected covariates plus the two outcome columns.
aft = LogNormalAFTFitter()
features = [*selected_features, 'time_frame', 'GRF_STAT_PA']
aft.fit(
    train[features],
    duration_col='time_frame',
    event_col='GRF_STAT_PA',
)

Selected features: Index(['PERIP_VASC', 'CREAT_TRR', 'AGE_DON', 'DDAVP_DON', 'CMV_DON', 'BUN_DON',
       'SGOT_DON', 'SGPT_DON', 'TBILI_DON', 'CLIN_INFECT_DON',
       'HIST_OTH_DRUG_DON', 'HGT_CM_DON_CALC', 'WGT_KG_DON_CALC', 'AGE',
       'DUCT_MGMT_2', 'PA_PRESERV_TM', 'DIAG_PA_is5001', 'DAYSWAIT_CHRON_PA',
       'ORGAN_isKP', 'CMV_IGG', 'EBV_SEROSTATUS', 'CMV_STATUS',
       'MED_COND_TRR_is3', 'HGT_CM_CALC', 'WGT_KG_CALC', 'PROTEIN_URINE',
       'LIPASE', 'AMYLASE', 'RESUSCIT_DUR', 'INOTROP_SUPPORT_DON', 'YRS_DIAB'],
      dtype='object')


<lifelines.LogNormalAFTFitter: fitted with 16908 total observations, 12979 right-censored observations>

In [9]:
# Structured-array form of the held-out outcome for scikit-survival.
y_real = Surv.from_dataframe(event="GRF_STAT_PA", time="time_frame", data=y)

# Evaluation grid: 12, 24, ..., 108.
# NOTE(review): `time_frame` was divided by 12 in an earlier cell, while this grid
# is in multiples of 12 - confirm both are expressed in the same time unit.
time_points = np.arange(12, 120, 12)

# Cumulative hazard at each grid time (rows = times, columns = test subjects).
preds = aft.predict_cumulative_hazard(x_test, times=time_points)

# Transposed so rows are subjects and columns are times; the cumulative hazard
# serves as the per-time risk score.
auc, mean_auc = cumulative_dynamic_auc(
    survival_train=y_real_train,
    survival_test=y_real,
    estimate=preds.T,
    times=time_points,
)

In [10]:
# Report the time-dependent AUC at each evaluation time, followed by the mean.
print("Dynamic AUC values at different time points:")
# The loop variable deliberately has its own name: reusing `auc` would rebind the
# AUC array to its last element, so the cell could not be re-run.
for t, auc_t in zip(time_points, auc):
    print(f"Time {t:.2f}: AUC = {auc_t:.3f}")

print(f"\nMean Dynamic AUC: {mean_auc:.3f}")

Dynamic AUC values at different time points:
Time 12.00: AUC = 0.757
Time 24.00: AUC = 0.750
Time 36.00: AUC = 0.727
Time 48.00: AUC = 0.736
Time 60.00: AUC = 0.745
Time 72.00: AUC = 0.744
Time 84.00: AUC = 0.730
Time 96.00: AUC = 0.724
Time 108.00: AUC = 0.723

Mean Dynamic AUC: 0.738


In [11]:
# AFT-predicted expectation for every subject, stored in a single "AFT" column.
preds_train = pd.DataFrame(data=aft.predict_expectation(x_train), columns=["AFT"])
preds_test = pd.DataFrame(data=aft.predict_expectation(x_test), columns=["AFT"])

# Append the AFT prediction as an extra column (rows are aligned on the index).
new_train = pd.concat(objs=[train[features], preds_train], axis="columns")
new_test = pd.concat(objs=[test[features], preds_test], axis="columns")

In [12]:
from lifelines import CoxPHFitter
from lifelines.utils import concordance_index

# Penalized Cox model fitted on the selected covariates plus the AFT prediction.
cox_model = CoxPHFitter(penalizer=0.1).fit(
    new_train,
    duration_col='time_frame',
    event_col='GRF_STAT_PA',
)

In [13]:
# Cox-predicted expectation for every subject, stored in a single "COX" column.
preds_cox_train = pd.DataFrame(data=cox_model.predict_expectation(new_train), columns=["COX"])
preds_cox_test = pd.DataFrame(data=cox_model.predict_expectation(new_test), columns=["COX"])

# Append the Cox prediction next to the existing columns (rows are aligned on the index).
new_cox_train = pd.concat(objs=[new_train, preds_cox_train], axis="columns")
new_cox_test = pd.concat(objs=[new_test, preds_cox_test], axis="columns")

In [14]:
# Persist both augmented splits as CSV files, without the index column.
new_cox_train.to_csv(path_or_buf="train_aft.csv", index=False)
new_cox_test.to_csv(path_or_buf="test_aft.csv", index=False)

In [None]:
# Exploratory: restrict the comparison to the variables shown in the main table.
# Remove this section for the actual analysis.

In [24]:
# Columns kept for the train/test comparison table, in display order.
new_cols = [
    # recipient
    "AGE",
    "GENDER",
    "BMI_CALC",
    "YRS_DIAB",
    # donor
    "AGE_DON",
    "GENDER_DON",
    "BMI_DON_CALC",
    # remaining covariates
    "ORGAN_isKP",
    "DUCT_MGMT_2",
    "PA_PRESERV_TM",
    # outcome: event indicator and duration
    "GRF_STAT_PA",
    "time_frame",
]

In [26]:
# Restrict both splits to the comparison columns.
# Note: this rebinds new_cox_train / new_cox_test, replacing the frames built earlier.
new_cox_train = train.loc[:, new_cols]
new_cox_test = test.loc[:, new_cols]

In [27]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind, chi2_contingency
from statsmodels.multivariate.manova import MANOVA

# ============================ Validate Inputs ============================
# Works on the new_cox_train / new_cox_test frames built in the previous cell;
# nothing is read from disk here.

# Ensure both datasets have the same columns
assert set(new_cox_train.columns) == set(new_cox_test.columns), "Train and test columns do not match!"

# Stack both splits, tagging every row with its origin in a `group` column.
combined_df = pd.concat([new_cox_train.assign(group="train"), new_cox_test.assign(group="test")])

# Identify numerical and binary (categorical) features.
# Both lists come from the same pooled unique-value counts (NaN is not counted),
# so a column can never be classified as numerical by one split and binary by the other.
unique_counts = combined_df[list(new_cox_train.columns)].nunique()
num_features = [col for col in new_cox_train.columns if unique_counts[col] > 2]  # More than 2 unique values
binary_features = [col for col in new_cox_train.columns if unique_counts[col] == 2]  # Exactly 2 unique values

print(f"\nIdentified {len(num_features)} numerical variables and {len(binary_features)} binary variables.")

# ============================ MANOVA (Multivariate Test) ============================
print("\n=== Running MANOVA ===")
# All numerical variables form the multivariate response; `group` (train vs test) is the factor.
formula = " + ".join(num_features)  # Only numerical variables for MANOVA

manova = MANOVA.from_formula(f"{formula} ~ group", data=combined_df)
print(manova.mv_test())


Identified 7 numerical variables and 5 binary variables.

=== Running MANOVA ===
                     Multivariate linear model
                                                                   
-------------------------------------------------------------------
       Intercept         Value  Num DF   Den DF    F Value   Pr > F
-------------------------------------------------------------------
          Wilks' lambda  0.0487 7.0000 21128.0000 58950.9978 0.0000
         Pillai's trace  0.9513 7.0000 21128.0000 58950.9978 0.0000
 Hotelling-Lawley trace 19.5313 7.0000 21128.0000 58950.9978 0.0000
    Roy's greatest root 19.5313 7.0000 21128.0000 58950.9978 0.0000
-------------------------------------------------------------------
                                                                   
-------------------------------------------------------------------
             group          Value  Num DF   Den DF   F Value Pr > F
-------------------------------------------------------

In [31]:
# ============================ Formatting Helpers ============================
def _mean_sd(values):
    """Format a column as 'mean (sd)', both rounded to 2 decimals."""
    return f"{round(values.mean(), 2)} ({round(values.std(), 2)})"


def _percent(values):
    """Format the column mean times 100, rounded to 2 decimals, with a '%' suffix."""
    return f"{round(values.mean() * 100, 2)}%"


# Frames in the order of the result columns: combined, train, test.
_frames = (combined_df, new_cox_train, new_cox_test)

# ============================ Univariate Analysis (Numerical Variables) ============================
num_results = {}
for col in num_features:
    # Welch's t-test (unequal variances) between train and test, ignoring NaNs.
    _stat, p = ttest_ind(new_cox_train[col], new_cox_test[col], equal_var=False, nan_policy='omit')
    p = round(p, 4)  # p-value rounded to 4 decimal places
    num_results[col] = [_mean_sd(frame[col]) for frame in _frames] + [p]

# ============================ Univariate Analysis (Binary Variables) ============================
bin_results = {}
for col in binary_features:
    # Chi-square test on the value-by-group contingency table.
    contingency_table = pd.crosstab(combined_df[col], combined_df["group"])
    _chi2, p, _dof, _expected = chi2_contingency(contingency_table)
    p = round(p, 4)  # p-value rounded to 4 decimal places
    # The mean reads as "percentage of 1s" only if the column is coded 0/1 - TODO confirm coding.
    bin_results[col] = [_percent(frame[col]) for frame in _frames] + [p]

# ============================ Create Final Table with Preserved Row Order ============================
# Walk the columns in their original order; numerical results take precedence
# when a column appears in both dictionaries, and unclassified columns are skipped.
final_results = [
    [col] + (num_results[col] if col in num_results else bin_results[col])
    for col in new_cox_train.columns
    if col in num_results or col in bin_results
]

# One row per variable, in the original column order.
final_table = pd.DataFrame(
    final_results,
    columns=["Variable", "Combined (Mean ± SD or %)", "Train (Mean ± SD or %)", "Test (Mean ± SD or %)", "p-value"]
)

# Display the final table
final_table

Unnamed: 0,Variable,Combined (Mean ± SD or %),Train (Mean ± SD or %),Test (Mean ± SD or %),p-value
0,AGE,42.23 (9.12),42.28 (9.11),42.05 (9.17),0.1423
1,GENDER,40.73%,40.77%,40.59%,0.8439
2,BMI_CALC,25.48 (4.01),25.5 (4.01),25.38 (3.99),0.0816
3,YRS_DIAB,26.15 (10.32),26.14 (10.32),26.21 (10.33),0.6899
4,AGE_DON,24.82 (9.07),24.84 (9.08),24.76 (9.02),0.5981
5,GENDER_DON,30.82%,30.95%,30.32%,0.4398
6,BMI_DON_CALC,23.9 (3.91),23.9 (3.91),23.88 (3.9),0.718
7,ORGAN_isKP,76.63%,76.56%,76.92%,0.6349
8,DUCT_MGMT_2,70.6%,70.72%,70.08%,0.4222
9,PA_PRESERV_TM,11.22 (4.92),11.22 (4.95),11.19 (4.8),0.677
