In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sksurv.util import Surv
from sksurv.metrics import cumulative_dynamic_auc
from lifelines import WeibullFitter, ExponentialFitter, LogNormalFitter, LogLogisticFitter, LogNormalAFTFitter

In [6]:
# Read the modelling table from the CSV that sits next to the notebook.
DATA_PATH = "data_ready_45.csv"
data = pd.read_csv(DATA_PATH)

In [7]:
# data["AGE_CMV"] = data["AGE"] * data["CMV_STATUS"]
# data["VIR_CO_INF"] = data["CMV_STATUS"] * data["EBV_SEROSTATUS"]
# data["AGE_BMI_DON"] = data["AGE_DON"] * data["BMI_DON_CALC"]
# data["AGE_BMI"] = data["AGE"] * data["BMI_CALC"]
# data["DIAB_BMI"] = data["YRS_DIAB"] * data["BMI_CALC"]

In [8]:
# Shift the recorded duration by one unit, then divide by 12.
# NOTE(review): the column is overwritten in place, so running this cell a
# second time applies the shift and the scaling again.
data["time_frame"] = (data["time_frame"] + 1) / 12

In [9]:
# 80/20 hold-out split, stratified on GRF_STAT_PA (the event column used by
# the survival models further down); the fixed seed keeps it reproducible.
train, test = train_test_split(
    data,
    test_size=0.2,
    stratify=data["GRF_STAT_PA"],
    random_state=42,
)

In [10]:
# Covariates: every training column except the two outcome columns.
x_train = train.drop(columns=["time_frame", "GRF_STAT_PA"])

# Outcome, both as a plain frame and as the structured array built by Surv.
y_train = train[["time_frame", "GRF_STAT_PA"]]
y_real_train = Surv.from_dataframe("GRF_STAT_PA", "time_frame", y_train)

In [11]:
# Same outcome / covariate separation for the held-out split.
x_test = test.drop(columns=["time_frame", "GRF_STAT_PA"])
y = test[["time_frame", "GRF_STAT_PA"]]

In [12]:
from sklearn.linear_model import ElasticNetCV

# Feature selection: cross-validated elastic net over a grid of L1 ratios.
# NOTE(review): the regression target is the observed time only; the event
# flag GRF_STAT_PA is not used in this step, so censored rows are treated as
# if their recorded time were the outcome.
elastic_net = ElasticNetCV(l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99], cv=50)
elastic_net.fit(x_train, y_train['time_frame'])

# Keep only the columns whose coefficient is not exactly zero
nonzero_mask = elastic_net.coef_ != 0
selected_features = x_train.columns[nonzero_mask]
print("Selected features:", selected_features)

# Log-normal AFT model on the selected covariates plus the two outcome columns
features = [*selected_features, 'time_frame', 'GRF_STAT_PA']
aft = LogNormalAFTFitter()
aft.fit(train[features], duration_col='time_frame', event_col='GRF_STAT_PA')

Selected features: Index(['PERIP_VASC', 'CREAT_TRR', 'AGE_DON', 'DDAVP_DON', 'CMV_DON', 'BUN_DON',
       'SGOT_DON', 'SGPT_DON', 'TBILI_DON', 'CLIN_INFECT_DON',
       'HIST_OTH_DRUG_DON', 'HGT_CM_DON_CALC', 'WGT_KG_DON_CALC', 'AGE',
       'DUCT_MGMT_2', 'PA_PRESERV_TM', 'DIAG_PA_is5001', 'DAYSWAIT_CHRON_PA',
       'ORGAN_isKP', 'CMV_IGG', 'EBV_SEROSTATUS', 'CMV_STATUS',
       'MED_COND_TRR_is3', 'HGT_CM_CALC', 'WGT_KG_CALC', 'PROTEIN_URINE',
       'LIPASE', 'AMYLASE', 'RESUSCIT_DUR', 'INOTROP_SUPPORT_DON', 'YRS_DIAB'],
      dtype='object')


<lifelines.LogNormalAFTFitter: fitted with 16908 total observations, 12979 right-censored observations>

In [13]:
# Structured survival array for the held-out split
y_real = Surv.from_dataframe("GRF_STAT_PA", "time_frame", y)

# Evaluation times 12, 24, ..., 108.
# NOTE(review): time_frame was divided by 12 in an earlier cell while these
# points step by 12 -- confirm both are expressed in the same time unit.
time_points = np.arange(1, 10) * 12

# Cumulative hazard at each evaluation time; lifelines returns times as rows,
# so the frame is transposed to one row per test subject for the AUC call.
preds = aft.predict_cumulative_hazard(df=x_test, times=time_points)
risk_by_subject = preds.transpose()

auc, mean_auc = cumulative_dynamic_auc(y_real_train, y_real, risk_by_subject, time_points)

In [14]:
# Report the time-dependent AUC at every evaluation time, then the mean.
# The loop variable is deliberately not named `auc`: a `for` target stays
# bound after the loop, so reusing the name would replace the `auc` array
# with its last scalar and make a re-run of this cell fail inside zip().
print("Dynamic AUC values at different time points:")
for t, auc_t in zip(time_points, auc):
    print(f"Time {t:.2f}: AUC = {auc_t:.3f}")

print(f"\nMean Dynamic AUC: {mean_auc:.3f}")

Dynamic AUC values at different time points:
Time 12.00: AUC = 0.757
Time 24.00: AUC = 0.750
Time 36.00: AUC = 0.727
Time 48.00: AUC = 0.736
Time 60.00: AUC = 0.745
Time 72.00: AUC = 0.744
Time 84.00: AUC = 0.730
Time 96.00: AUC = 0.724
Time 108.00: AUC = 0.723

Mean Dynamic AUC: 0.738


In [15]:
# Append the AFT model's predicted expectation as an extra "AFT" column.
# pd.concat(axis=1) aligns on the row index, so this relies on the prediction
# keeping the index of the frame it was computed from.

# -- training split
preds_train = pd.DataFrame(aft.predict_expectation(x_train), columns=["AFT"])
new_train = pd.concat([train[features], preds_train], axis=1)

# -- held-out split
preds_test = pd.DataFrame(aft.predict_expectation(x_test), columns=["AFT"])
new_test = pd.concat([test[features], preds_test], axis=1)

In [16]:
from lifelines import CoxPHFitter
from lifelines.utils import concordance_index

# Penalised Cox model fitted on the selected covariates plus the "AFT" column
cox_model = CoxPHFitter(penalizer=0.1).fit(
    new_train,
    duration_col="time_frame",
    event_col="GRF_STAT_PA",
)

In [17]:
# Append the Cox model's predicted expectation as an extra "COX" column,
# again relying on index alignment in pd.concat(axis=1).

# -- training split
preds_cox_train = pd.DataFrame(cox_model.predict_expectation(new_train), columns=["COX"])
new_cox_train = pd.concat([new_train, preds_cox_train], axis=1)

# -- held-out split
preds_cox_test = pd.DataFrame(cox_model.predict_expectation(new_test), columns=["COX"])
new_cox_test = pd.concat([new_test, preds_cox_test], axis=1)

In [18]:
# Persist both augmented splits without the row index
for frame, out_path in ((new_cox_train, "train_aft.csv"), (new_cox_test, "test_aft.csv")):
    frame.to_csv(out_path, index=False)

In [19]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind, chi2_contingency
from statsmodels.multivariate.manova import MANOVA

# ============================ Input Data ============================
# The frames built in the earlier cells (new_cox_train / new_cox_test) are
# reused here; nothing is read from disk.

# Ensure both datasets have the same columns
assert set(new_cox_train.columns) == set(new_cox_test.columns), "Train and test columns do not match!"

# Stack both splits with a `group` label; this frame feeds the MANOVA below
# and the per-feature contingency tables of the chi-square tests.
combined_df = pd.concat([new_cox_train.assign(group="train"), new_cox_test.assign(group="test")])  # Merge for MANOVA

# Identify numerical and binary (categorical) features from ONE data source.
# Counting unique values on train for one list and on test for the other
# could put a column in both lists, or in neither when a rare level is
# missing from the smaller split; counting on the stacked frame avoids that.
unique_counts = combined_df.drop(columns="group").nunique()
num_features = unique_counts[unique_counts > 2].index.tolist()  # More than 2 unique values
binary_features = unique_counts[unique_counts == 2].index.tolist()  # Exactly 2 unique values

print(f"\nIdentified {len(num_features)} numerical variables and {len(binary_features)} binary variables.")

# ============================ MANOVA (Multivariate Test) ============================
print("\n=== Running MANOVA ===")
formula = " + ".join(num_features)  # Only numerical variables for MANOVA

# All numerical features on the left-hand side, split membership as the factor
manova = MANOVA.from_formula(f"{formula} ~ group", data=combined_df)
print(manova.mv_test())


Identified 20 numerical variables and 15 binary variables.

=== Running MANOVA ===
                      Multivariate linear model
                                                                      
----------------------------------------------------------------------
       Intercept         Value    Num DF   Den DF     F Value   Pr > F
----------------------------------------------------------------------
          Wilks' lambda   0.0050 20.0000 21115.0000 209809.5500 0.0000
         Pillai's trace   0.9950 20.0000 21115.0000 209809.5500 0.0000
 Hotelling-Lawley trace 198.7303 20.0000 21115.0000 209809.5500 0.0000
    Roy's greatest root 198.7303 20.0000 21115.0000 209809.5500 0.0000
----------------------------------------------------------------------
                                                                      
-----------------------------------------------------------------------
           group           Value    Num DF    Den DF    F Value  Pr > F
--------------

In [22]:
print("\n=== Univariate Analysis: T-tests for Numerical Variables ===")
# feature name -> (t statistic, p-value), train vs. test
ttest_results = {}
for feature in num_features:
    # equal_var=False selects Welch's t-test; NaNs are left out of the computation
    t_stat, p_value = ttest_ind(
        new_cox_train[feature],
        new_cox_test[feature],
        equal_var=False,
        nan_policy='omit',
    )
    ttest_results[feature] = (t_stat, p_value)
    print(f"{feature}: t-stat={t_stat:.3f}, p-value={p_value:.3f}")

# ============================ Univariate Analysis (Binary Variables) ============================
print("\n=== Univariate Analysis: Chi-Square Tests for Binary Variables ===")
# feature name -> (chi-square statistic, p-value), feature level vs. split membership
chi2_results = {}
for feature in binary_features:
    level_by_group = pd.crosstab(combined_df[feature], combined_df["group"])
    chi2_stat, p_value, _, _ = chi2_contingency(level_by_group)
    chi2_results[feature] = (chi2_stat, p_value)
    print(f"{feature}: Chi2={chi2_stat:.3f}, p-value={p_value:.3f}")


=== Univariate Analysis: T-tests for Numerical Variables ===
CREAT_TRR: t-stat=-1.190, p-value=0.234
AGE_DON: t-stat=0.527, p-value=0.598
BUN_DON: t-stat=0.560, p-value=0.576
SGOT_DON: t-stat=1.765, p-value=0.078
SGPT_DON: t-stat=1.607, p-value=0.108
TBILI_DON: t-stat=-0.528, p-value=0.598
HGT_CM_DON_CALC: t-stat=-1.325, p-value=0.185
WGT_KG_DON_CALC: t-stat=-0.360, p-value=0.719
AGE: t-stat=1.467, p-value=0.142
PA_PRESERV_TM: t-stat=0.417, p-value=0.677
DAYSWAIT_CHRON_PA: t-stat=-0.503, p-value=0.615
HGT_CM_CALC: t-stat=-0.546, p-value=0.585
WGT_KG_CALC: t-stat=1.163, p-value=0.245
LIPASE: t-stat=0.892, p-value=0.373
AMYLASE: t-stat=-0.302, p-value=0.762
RESUSCIT_DUR: t-stat=-0.337, p-value=0.736
YRS_DIAB: t-stat=-0.399, p-value=0.690
time_frame: t-stat=0.439, p-value=0.661
AFT: t-stat=-0.380, p-value=0.704
COX: t-stat=-0.491, p-value=0.624

=== Univariate Analysis: Chi-Square Tests for Binary Variables ===
PERIP_VASC: Chi2=0.001, p-value=0.972
DDAVP_DON: Chi2=2.835, p-value=0.092
CM