In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import mode
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score
from lifelines import KaplanMeierFitter, CoxPHFitter
from lifelines.utils import concordance_index

In [10]:
# Load the clinical table from its CSV export and preview the first rows.
clinical_csv_path = "E:/DATN_LVTN/SA/Kaplan_Meier/clinical_K.csv"
df = pd.read_csv(clinical_csv_path)
df.head()

Unnamed: 0,Case ID,Patient affiliation,Age at Histological Diagnosis,Weight (lbs),Gender,Ethnicity,Smoking status,Pack Years,Quit Smoking Year,%GG,...,Recurrence,Recurrence Location,Date of Recurrence,Date of Last Known Alive,Survival Status,Date of Death,Time to Death (days),CT Date,Days between CT and surgery,PET Date
0,R01-001,Stanford,79,146,Female,Caucasian,Former,37.0,1967.0,0%,...,no,,,2/9/1999,Alive,,,9/6/1990,42,10/17/1990
1,R01-002,Stanford,65,195,Female,Asian,Nonsmoker,,,0%,...,no,,,11/29/1990,Alive,,,9/20/1990,23,10/11/1990
2,R01-003,VA,65,Not Collected,Male,Caucasian,Current,38.0,,0%,...,no,,,10/6/1993,Alive,,,12/10/1991,25,12/19/1991
3,R01-004,VA,67,Not Collected,Male,Caucasian,Former,10.0,1965.0,0%,...,yes,distant,9/29/1993,11/24/1994,Alive,,,9/9/1991,89,10/17/1991
4,R01-005,Stanford,84,145,Male,Native Hawaiian/Pacific Islander,Former,20.0,1951.0,0%,...,yes,distant,1/19/1993,6/3/1995,Dead,6/3/1995,1456.0,3/8/1991,92,4/19/1991


In [None]:

# Parse the date columns used below. Missing or unparseable death dates become
# NaT rather than raising (errors="coerce").
df["Date of Last Known Alive"] = pd.to_datetime(df["Date of Last Known Alive"])
df["CT Date"] = pd.to_datetime(df["CT Date"])
df["Date of Death"] = pd.to_datetime(df["Date of Death"], errors="coerce")

# Patients who are still alive have no "Time to Death (days)"; fill it with their
# follow-up time. Accepting 0 as well as "Alive" keeps this cell safe to re-run
# after "Survival Status" has already been encoded further down.
mask_alive = df["Survival Status"].isin(["Alive", 0])

# The recorded "Time to Death (days)" values are counted from surgery, not from
# the CT scan (for case R01-005, CT date + 92 days up to the death date is
# exactly the recorded 1456 days). Subtract the CT-to-surgery gap so censored
# follow-up times share that origin. Where the gap is missing or non-numeric,
# fall back to counting from the CT date.
ct_to_surgery_days = pd.to_numeric(
    df["Days between CT and surgery"], errors="coerce"
).fillna(0)
followup_days = (
    df["Date of Last Known Alive"] - df["CT Date"]
).dt.days - ct_to_surgery_days
df.loc[mask_alive, "Time to Death (days)"] = followup_days[mask_alive]

# Encode "Survival Status" in place as the event indicator (1 = dead, 0 = alive).
# The numeric keys let a re-run map already-encoded values to themselves instead
# of turning the whole column into NaN.
status_codes = {"Alive": 0, "Dead": 1, 0: 0, 1: 1}
df["Survival Status"] = df["Survival Status"].map(status_codes)

# Save the processed table.
df.to_csv("processed_data.csv", index=False)

df.head()


   Case ID Patient affiliation  Age at Histological Diagnosis   Weight (lbs)  \
0  R01-001            Stanford                             79            146   
1  R01-002            Stanford                             65            195   
2  R01-003                  VA                             65  Not Collected   
3  R01-004                  VA                             67  Not Collected   
4  R01-005            Stanford                             84            145   

   Gender                         Ethnicity Smoking status Pack Years  \
0  Female                         Caucasian         Former         37   
1  Female                             Asian      Nonsmoker        NaN   
2    Male                         Caucasian        Current         38   
3    Male                         Caucasian         Former         10   
4    Male  Native Hawaiian/Pacific Islander         Former         20   

   Quit Smoking Year %GG  ... Recurrence Recurrence Location  \
0             19