In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from lifelines import CoxPHFitter
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold


In [2]:
# Load the fused feature matrix and the matching patient identifiers.
# The archive is opened as a context manager so its file handle is released
# as soon as both arrays have been read; `npz_data` must not be indexed
# after this block because the archive is closed by then.
with np.load('fusion_features.npz') as npz_data:
    features = npz_data['features']  # Shape (144, 1024)
    patient_ids = npz_data['patient_ids']  # Shape (144,)

# Per-patient table that carries (among other columns) the 'Case ID' key and
# the 'Time to Event' / 'Event' survival outcome used further down.
event_data = pd.read_csv('processed_data.csv')

In [3]:
# Wrap the raw feature matrix in a DataFrame with one named column per
# feature dimension (feature_0 ... feature_{n-1}), then append the patient
# identifier as the last column so the frame can be joined on 'Case ID'.
n_feature_dims = features.shape[1]
feature_columns = ['feature_' + str(i) for i in range(n_feature_dims)]
features_df = pd.DataFrame(features, columns=feature_columns).assign(
    **{'Case ID': patient_ids}
)
features_df.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_1015,feature_1016,feature_1017,feature_1018,feature_1019,feature_1020,feature_1021,feature_1022,feature_1023,Case ID
0,0.0,0.0,0.0,0.0,0.0,2.188303,0.0,0.0,0.0,0.218016,...,0.0,0.0,0.0,0.0,0.609032,0.0,0.0,2.118962,0.0,lung_001
1,0.0,0.0,0.821452,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.072781,0.0,1.842743,0.0,lung_002
2,0.949904,0.019688,1.282775,0.0,0.0,1.265904,0.0,0.0,0.993333,0.0,...,0.0,1.956519,0.0,0.0,0.478154,1.516294,0.0,1.078762,0.0,lung_003
3,0.189173,0.196573,1.5099,0.0,0.350664,1.77023,0.193847,0.010835,0.0,0.0,...,0.391427,0.059741,1.224578,0.0,1.500176,0.213759,0.0,0.843074,0.0,lung_004
4,0.0,0.0,0.0,0.842689,0.706076,0.990063,0.0,0.0,0.0,0.0,...,0.0,0.0,2.151799,0.0,0.0,0.4584,0.0,0.601398,0.0,lung_005


In [4]:
# Preview the first five rows of the clinical/outcome table.
event_data.head(n=5)

Unnamed: 0,Case ID,Age,Weight (lbs),Gender,Ethnicity,Smoking status,%GG,Tumor Location (choice=RUL),Tumor Location (choice=RML),Tumor Location (choice=RLL),...,Std,Min,Max,Median,SurfaceArea,Elongation,Flatness,Roundness,Time to Event,Event
0,lung_001,79,146.0,0,2,1,0,0,1,1,...,194.164635,-812,154,-56.236328,826.137989,1.436361,1.446431,0.699813,3078,0
1,lung_002,65,195.0,0,1,2,0,0,1,1,...,173.439744,-829,144,-36.021484,1037.374063,1.367921,1.112439,0.781205,70,0
2,lung_003,65,173.5,1,2,0,0,1,1,1,...,172.357348,-815,290,23.177734,755.268235,1.387373,1.089084,0.822009,666,0
3,lung_004,67,173.5,1,2,1,0,1,1,1,...,254.147443,-1024,366,21.595703,912.514223,1.593605,1.63112,0.639694,1172,0
4,lung_005,84,145.0,1,4,1,0,1,0,1,...,107.583454,-783,391,7.496094,2432.30509,1.27808,1.16629,0.735654,1456,1


In [5]:
# Keep only the join key and the survival outcome (duration + event flag),
# then attach them to each patient's feature row. The inner join keeps only
# patients present in both tables; the row count is displayed as a check.
outcome_cols = ['Case ID', 'Time to Event', 'Event']
event_data_subset = event_data[outcome_cols]
combined_df = features_df.merge(event_data_subset, how='inner', on='Case ID')
len(combined_df)

144

In [6]:
# Partition the merged frame's columns into model inputs ('feature_*') and
# everything else (identifier + survival outcome).
is_feature_col = combined_df.columns.str.startswith('feature_')
features_cols = combined_df.columns[is_feature_col].tolist()
metadata_cols = combined_df.columns[~is_feature_col].tolist()

metadata = combined_df[metadata_cols]
X = combined_df[features_cols]

In [7]:
# Remove low-variance features using a variance cut-off of 0.01.
# NOTE(review): the selector is fitted on every row of X, i.e. before the
# train/test split that happens later in the notebook, so the held-out rows
# influence which features are kept. Confirm this is acceptable.
var_threshold = VarianceThreshold(threshold=0.01)  # variance threshold
X_filtered = var_threshold.fit_transform(X)

# Names of the columns that survived the filter, in their original order.
filtered_features_cols = np.array(features_cols)[var_threshold.get_support()]

# Carry X's index over to the filtered frame. Without it the frame gets a
# fresh 0..n-1 index, and any later index-aligned operation against
# `metadata` (e.g. a column-wise concat) would only line up by coincidence.
X_filtered_df = pd.DataFrame(
    X_filtered,
    columns=filtered_features_cols,
    index=X.index,
)

In [8]:
# Put the retained feature columns and the identifier/outcome columns back
# side by side (aligned on the row index).
frames_to_join = [X_filtered_df, metadata]
filtered_df = pd.concat(frames_to_join, axis='columns')

In [9]:
# Report how many features the variance filter kept and removed.
n_features_before = len(features_cols)
n_features_after = len(filtered_features_cols)
n_features_removed = n_features_before - n_features_after

print(f"Số lượng features ban đầu: {n_features_before}")
print(f"Số lượng features sau khi lọc: {n_features_after}")
print(f"Các features đã bị loại bỏ: {n_features_removed}")

Số lượng features ban đầu: 1024
Số lượng features sau khi lọc: 810
Các features đã bị loại bỏ: 214


In [10]:
# Hold out 20% of the patients for evaluation; the fixed seed keeps the
# partition reproducible across runs.
train_df, test_df = train_test_split(
    filtered_df,
    random_state=42,
    test_size=0.2,
)

In [11]:
# The patient identifier is not a covariate, so remove it before fitting.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# has already been dropped is a no-op instead of raising KeyError.
train_df = train_df.drop(columns=['Case ID'], errors='ignore')
test_df = test_df.drop(columns=['Case ID'], errors='ignore')

In [12]:
# Fit a penalized Cox proportional-hazards model on the training rows.
# penalizer=0.1 with l1_ratio=0.5 requests an even mix of L1 and L2
# regularization. The fitted estimator is the cell's displayed value.
cph = CoxPHFitter(penalizer=0.1, l1_ratio=0.5)
cph.fit(
    df=train_df,
    duration_col='Time to Event',
    event_col='Event',
    show_progress=True,
)

Iteration 1: norm_delta = 2.82e+00, step_size = 0.9500, log_lik = -5148.57068, newton_decrement = 1.41e+02, seconds_since_start = 1.1
Iteration 2: norm_delta = 2.09e+00, step_size = 0.9500, log_lik = -3922.50286, newton_decrement = 3.23e+01, seconds_since_start = 2.2
Iteration 3: norm_delta = 3.08e+00, step_size = 0.9500, log_lik = -3046.78469, newton_decrement = 6.43e+01, seconds_since_start = 3.3
Iteration 4: norm_delta = 6.08e+00, step_size = 0.9310, log_lik = -2445.05343, newton_decrement = 2.54e+02, seconds_since_start = 4.5
Iteration 5: norm_delta = 3.05e+00, step_size = 0.2281, log_lik = -1872.69962, newton_decrement = 7.85e+01, seconds_since_start = 5.6
Iteration 6: norm_delta = 1.56e+00, step_size = 0.2906, log_lik = -1446.76844, newton_decrement = 2.48e+01, seconds_since_start = 6.8
Iteration 7: norm_delta = 2.73e-01, step_size = 0.4911, log_lik = -1127.53095, newton_decrement = 2.40e+00, seconds_since_start = 8.0
Iteration 8: norm_delta = 1.63e-01, step_size = 0.8300, log_li

<lifelines.CoxPHFitter: fitted with 115 total observations, 73 right-censored observations>

In [13]:
cph.print_summary()  # access the individual results using cph.summary

# Persist the coefficient table and the model's concordance index.
# The encoding is pinned so the file's bytes do not depend on the platform's
# default locale encoding.
# NOTE: `concordance_index_` is the score on the data the model was fitted
# on (the training split), not the held-out test score.
with open("results_CoxPH.txt", "w", encoding="utf-8") as f:
    f.write(cph.summary.to_string() + "\n")
    f.write(f"Concordance Index: {cph.concordance_index_}\n")

0,1
model,lifelines.CoxPHFitter
duration col,'Time to Event'
event col,'Event'
penalizer,0.1
l1 ratio,0.5
baseline estimation,breslow
number of observations,115
number of events observed,42
partial log-likelihood,-160.52
time fit was run,2025-04-26 16:42:34 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
feature_0,-0.0,1.0,0.0,-0.0,0.0,1.0,1.0,0.0,-0.0,1.0,0.0
feature_1,-0.0,1.0,0.0,-0.0,0.0,1.0,1.0,0.0,-0.0,1.0,0.0
feature_2,0.0,1.0,0.0,-0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
feature_3,-0.05,0.96,1.68,-3.34,3.25,0.04,25.75,0.0,-0.03,0.98,0.03
feature_4,0.0,1.0,0.0,-0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
feature_5,-0.0,1.0,0.0,-0.0,0.0,1.0,1.0,0.0,-0.0,1.0,0.0
feature_6,0.0,1.0,0.0,-0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
feature_8,-0.0,1.0,0.0,-0.0,0.0,1.0,1.0,0.0,-0.0,1.0,0.0
feature_9,0.0,1.0,0.0,-0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
feature_10,0.0,1.0,0.0,-0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0

0,1
Concordance,0.96
Partial AIC,1941.04
log-likelihood ratio test,42.76 on 810 df
-log2(p) of ll-ratio test,-0.00


In [14]:
# Evaluate the fitted model on the held-out rows using the concordance index.
test_ci = cph.score(df=test_df, scoring_method="concordance_index")
print(test_ci)

0.6486486486486487
