# COX REGRESSION (sklearn adapter)

In [25]:
# Step 1: get all prerand patients
import pandas as pd

# Read the two workbooks: the feature table and the label table.
# Both are expected to carry a 'PP' column, which is the join key below.
vgg_df = pd.read_excel("vgg_final.xlsx", engine='openpyxl')
target_df = pd.read_excel("vgg_labels.xlsx", engine='openpyxl')

# Join features and labels on 'PP' (pandas' default inner join), then
# promote 'PP' to the row index so it is no longer treated as a column.
merged_df = vgg_df.merge(target_df, on='PP').set_index('PP')

# Last expression of the cell: shown as the cell output.
(vgg_df.shape, target_df.shape)

((102, 4097), (102, 3))

## Modeling with Train and Test splits

### variance threshold, scaling, pca

#### repeated cross validation

In [26]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from lifelines.utils.sklearn_adapter import sklearn_adapter
from lifelines import CoxPHFitter
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold

# Repeated k-fold evaluation of a Cox proportional-hazards model.
# Per fold: standardise the features and reduce them with PCA (both fitted on
# the training rows only), re-attach the 'Event' column, fit the wrapped
# CoxPHFitter and score it on the held-out rows.

X = merged_df.drop('Duration', axis=1)  # keep as a dataframe (still contains 'Event')
y = merged_df['Duration']

# Evaluation settings and the list that collects one score per fold
num_folds = 4
num_repeats = 10
n_components = 24  # number of principal components passed to the Cox model
penalizer = 1e-5
scores = []

# The adapter class does not depend on the fold, so build it once.
CoxRegression = sklearn_adapter(CoxPHFitter, event_col='Event')

for r in range(num_repeats):
    print(f"--- Repeat: {r + 1} ---")

    # A different seed per repeat gives a different shuffle per repeat.
    # NOTE: PCA below has random_state=None and therefore draws from this same
    # global NumPy state whenever its randomized solver is selected, so this
    # seed is also what keeps the PCA step repeatable.
    np.random.seed(r)
    indices = np.arange(len(X))
    np.random.shuffle(indices)

    # Contiguous chunks of the shuffled positions; when len(X) is not divisible
    # by num_folds, np.array_split gives the first few chunks one extra element.
    folds = np.array_split(indices, num_folds)

    # Perform cross-validation
    for i, test_idx in enumerate(folds):
        print(f"Fold: {i + 1}")

        # Training rows are every other chunk, kept in shuffled order.
        train_idx = np.concatenate(folds[:i] + folds[i + 1:])

        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # 'Event' is not a feature: keep it out of scaling and PCA.
        train_features = X_train.drop(columns='Event')
        test_features = X_test.drop(columns='Event')

        # Scale the data (statistics come from the training rows only)
        scaler = StandardScaler()
        train_scaled = scaler.fit_transform(train_features)
        test_scaled = scaler.transform(test_features)

        # Perform PCA (components come from the training rows only)
        pca = PCA(n_components=n_components)
        train_reduced = pca.fit_transform(train_scaled)
        test_reduced = pca.transform(test_scaled)

        # Back to DataFrames carrying the original row index, so that the
        # 'Event' assignment below aligns row by row.
        X_train_new = pd.DataFrame(train_reduced, index=X_train.index)
        X_test_new = pd.DataFrame(test_reduced, index=X_test.index)

        # Add 'Event' column back; the adapter reads it via event_col
        X_train_new['Event'] = X_train['Event']
        X_test_new['Event'] = X_test['Event']

        # Fit a fresh model for this fold
        sk_cph = CoxRegression(penalizer=penalizer)
        sk_cph.fit(X_train_new, y_train)

        # Calculate and store the score
        score = sk_cph.score(X_test_new, y_test)
        scores.append(score)

        print(f"Score: {score}")
        print("-----------------------------")

# Calculate mean over all folds of all repeats
mean_score = np.mean(scores)

print(f"Mean Score: {mean_score}")

--- Repeat: 1 ---
Fold: 1
Score: 0.6192170818505338
-----------------------------
Fold: 2




Score: 0.7541666666666667
-----------------------------
Fold: 3




Score: 0.4927536231884058
-----------------------------
Fold: 4
Score: 0.5901060070671378
-----------------------------
--- Repeat: 2 ---
Fold: 1
Score: 0.6328502415458938
-----------------------------
Fold: 2




Score: 0.6408450704225352
-----------------------------
Fold: 3




Score: 0.5555555555555556
-----------------------------
Fold: 4




Score: 0.647887323943662
-----------------------------
--- Repeat: 3 ---
Fold: 1
Score: 0.447098976109215
-----------------------------
Fold: 2




Score: 0.6357615894039735
-----------------------------
Fold: 3
Score: 0.684
-----------------------------
Fold: 4




Score: 0.5731225296442688
-----------------------------
--- Repeat: 4 ---
Fold: 1
Score: 0.6711864406779661
-----------------------------
Fold: 2




Score: 0.5223367697594502
-----------------------------
Fold: 3




Score: 0.6615384615384615
-----------------------------
Fold: 4
Score: 0.5910931174089069
-----------------------------
--- Repeat: 5 ---
Fold: 1
Score: 0.504885993485342
-----------------------------
Fold: 2
Score: 0.5166666666666667
-----------------------------
Fold: 3
Score: 0.604
-----------------------------
Fold: 4
Score: 0.6785714285714286
-----------------------------
--- Repeat: 6 ---
Fold: 1




Score: 0.6779026217228464
-----------------------------
Fold: 2
Score: 0.5949820788530465
-----------------------------
Fold: 3
Score: 0.50390625
-----------------------------
Fold: 4
Score: 0.5878136200716846
-----------------------------
--- Repeat: 7 ---
Fold: 1
Score: 0.5709459459459459
-----------------------------
Fold: 2




Score: 0.6593959731543624
-----------------------------
Fold: 3
Score: 0.6883116883116883
-----------------------------
Fold: 4
Score: 0.5227272727272727
-----------------------------
--- Repeat: 8 ---
Fold: 1
Score: 0.5945945945945946
-----------------------------
Fold: 2




Score: 0.6153846153846154
-----------------------------
Fold: 3




Score: 0.6371308016877637
-----------------------------
Fold: 4
Score: 0.6039215686274509
-----------------------------
--- Repeat: 9 ---
Fold: 1
Score: 0.55893536121673
-----------------------------
Fold: 2




Score: 0.565359477124183
-----------------------------
Fold: 3
Score: 0.3916349809885932
-----------------------------
Fold: 4
Score: 0.5419847328244275
-----------------------------
--- Repeat: 10 ---
Fold: 1
Score: 0.6749116607773852
-----------------------------
Fold: 2
Score: 0.5372549019607843
-----------------------------
Fold: 3




Score: 0.5977443609022557
-----------------------------
Fold: 4
Score: 0.5035460992907801
-----------------------------
Mean Score: 0.591300803741812




# Random Forest Survival

#### 10 repeats of 5-fold cross-validation

In [27]:
import numpy as np
import pandas as pd
from sklearn.model_selection import RepeatedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import VarianceThreshold
from sksurv.ensemble import RandomSurvivalForest
from sklearn.decomposition import PCA

# Repeated k-fold evaluation of a Random Survival Forest.
# Per fold: min-max scale the features and reduce them with PCA (both fitted
# on the training rows only), fit the forest and score it on the held-out rows.

X = merged_df.drop(['Duration', 'Event'], axis=1)  # keep as a dataframe
y = merged_df[['Event','Duration']]

# change y to a structured array of (Event, Duration) records
# NOTE(review): the second field is declared as int, so non-integer durations
# would be truncated here -- confirm that 'Duration' holds whole numbers.
y = np.array([(bool(arr[0]), arr[1]) for arr in y.values], dtype=[('boolean', bool), ('integer', int)])

# Define the number of folds, repeats and an empty list to store the scores
num_folds = 5
num_repeats = 10
n_components = 20  # number of principal components passed to the forest
random_state = 1   # one seed for the splitter, PCA and the forest
scores = []

# Create the RepeatedKFold object
rkf = RepeatedKFold(n_splits=num_folds, n_repeats=num_repeats, random_state=random_state)

# Perform Repeated K-Fold cross-validation
for i, (train_index, test_index) in enumerate(rkf.split(X)):
    print(f"Fold: {(i % num_folds) + 1}, Repeat: {i // num_folds + 1}")

    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Scale the data. The original cell created a StandardScaler and then
    # immediately replaced it with MinMaxScaler; only the scaler that actually
    # took effect is kept.
    scaler = MinMaxScaler()
    X_train_new = scaler.fit_transform(X_train)
    X_test_new = scaler.transform(X_test)

    # Perform PCA; seeded so a randomized solver, if selected, is repeatable
    pca = PCA(n_components=n_components, random_state=random_state)
    X_train_new = pca.fit_transform(X_train_new)
    X_test_new = pca.transform(X_test_new)

    # Convert back to DataFrames that carry the original row index
    X_train_new = pd.DataFrame(X_train_new, index=X_train.index)
    X_test_new = pd.DataFrame(X_test_new, index=X_test.index)

    # Fit a fresh, seeded forest for this fold so scores are reproducible
    rsf = RandomSurvivalForest(random_state=random_state)
    rsf.fit(X_train_new, y_train)

    # Calculate and store the score
    score = rsf.score(X_test_new, y_test)
    scores.append(score)

    print(f"Score: {score}")
    print("-----------------------------")

# Calculate the mean of the scores over all folds of all repeats
mean_score = np.mean(scores)

print(f"Mean Score: {mean_score}")


Fold: 1, Repeat: 1
Score: 0.5912408759124088
-----------------------------
Fold: 2, Repeat: 1
Score: 0.6097560975609756
-----------------------------
Fold: 3, Repeat: 1
Score: 0.46368715083798884
-----------------------------
Fold: 4, Repeat: 1
Score: 0.4519774011299435
-----------------------------
Fold: 5, Repeat: 1
Score: 0.6666666666666666
-----------------------------
Fold: 1, Repeat: 2
Score: 0.485
-----------------------------
Fold: 2, Repeat: 2
Score: 0.5487804878048781
-----------------------------
Fold: 3, Repeat: 2
Score: 0.5611111111111111
-----------------------------
Fold: 4, Repeat: 2
Score: 0.5471698113207547
-----------------------------
Fold: 5, Repeat: 2
Score: 0.41721854304635764
-----------------------------
Fold: 1, Repeat: 3
Score: 0.6228070175438597
-----------------------------
Fold: 2, Repeat: 3
Score: 0.6410256410256411
-----------------------------
Fold: 3, Repeat: 3
Score: 0.5054945054945055
-----------------------------
Fold: 4, Repeat: 3
Score: 0.54545454

# GDB survival (ComponentwiseGradientBoostingSurvivalAnalysis)

#### 10 repeats of 5-fold cross-validation

In [28]:
import numpy as np
import pandas as pd
from sklearn.model_selection import RepeatedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import VarianceThreshold
from sksurv.ensemble import ComponentwiseGradientBoostingSurvivalAnalysis
from sklearn.decomposition import PCA

# Repeated k-fold evaluation of component-wise gradient boosting with a Cox
# loss. Per fold: standardise the features and reduce them with PCA (both
# fitted on the training rows only), fit the model and score the held-out rows.

X = merged_df.drop(['Duration', 'Event'], axis=1)  # keep as a dataframe
y = merged_df[['Event','Duration']]

# change y to a structured array of (Event, Duration) records
# NOTE(review): the second field is declared as int, so non-integer durations
# would be truncated here -- confirm that 'Duration' holds whole numbers.
y = np.array([(bool(arr[0]), arr[1]) for arr in y.values], dtype=[('boolean', bool), ('integer', int)])

# Define the number of folds, repeats and an empty list to store the scores
num_folds = 5
num_repeats = 10
n_components = 22  # number of principal components passed to the model
random_state = 1   # one seed for the splitter and PCA
scores = []

# Create the RepeatedKFold object
rkf = RepeatedKFold(n_splits=num_folds, n_repeats=num_repeats, random_state=random_state)

# Perform Repeated K-Fold cross-validation
for i, (train_index, test_index) in enumerate(rkf.split(X)):
    print(f"Fold: {(i % num_folds) + 1}, Repeat: {i // num_folds + 1}")

    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Scale the data (statistics come from the training rows only)
    scaler = StandardScaler()
    X_train_new = scaler.fit_transform(X_train)
    X_test_new = scaler.transform(X_test)

    # Perform PCA; seeded so a randomized solver, if selected, is repeatable
    pca = PCA(n_components=n_components, random_state=random_state)
    X_train_new = pca.fit_transform(X_train_new)
    X_test_new = pca.transform(X_test_new)

    # Convert back to DataFrames that carry the original row index
    X_train_new = pd.DataFrame(X_train_new, index=X_train.index)
    X_test_new = pd.DataFrame(X_test_new, index=X_test.index)

    # Fit a fresh model for this fold
    gdb = ComponentwiseGradientBoostingSurvivalAnalysis(loss="coxph")
    gdb.fit(X_train_new, y_train)

    # Calculate and store the score
    score = gdb.score(X_test_new, y_test)
    scores.append(score)

    print(f"Score: {score}")
    print("-----------------------------")

# Calculate the mean of the scores over all folds of all repeats
mean_score = np.mean(scores)

print(f"Mean Score: {mean_score}")

Fold: 1, Repeat: 1
Score: 0.656934306569343
-----------------------------
Fold: 2, Repeat: 1
Score: 0.524390243902439
-----------------------------
Fold: 3, Repeat: 1
Score: 0.659217877094972
-----------------------------
Fold: 4, Repeat: 1
Score: 0.4293785310734463
-----------------------------
Fold: 5, Repeat: 1
Score: 0.7043010752688172
-----------------------------
Fold: 1, Repeat: 2
Score: 0.54
-----------------------------
Fold: 2, Repeat: 2
Score: 0.4329268292682927
-----------------------------
Fold: 3, Repeat: 2
Score: 0.6444444444444445
-----------------------------
Fold: 4, Repeat: 2
Score: 0.5723270440251572
-----------------------------
Fold: 5, Repeat: 2
Score: 0.3708609271523179
-----------------------------
Fold: 1, Repeat: 3
Score: 0.5175438596491229
-----------------------------
Fold: 2, Repeat: 3
Score: 0.6923076923076923
-----------------------------
Fold: 3, Repeat: 3
Score: 0.5934065934065934
-----------------------------
Fold: 4, Repeat: 3
Score: 0.57792207792207