# COX REGRESSION (sklearn adapter)

In [20]:
# BEST data (98 patients)
import pandas as pd

radiomics_df = pd.read_excel("survival_radiomics.xlsx", engine='openpyxl')
target_df = pd.read_excel("db_basis_survival.xlsx", engine='openpyxl')

common_indices = radiomics_df['PP']

## EHR

In [21]:
directory = r"L:\basic\divi\jstoker\slicer_pdac\Master Students SS 23\Mattia\survival.xlsx"
ehr = pd.read_excel(directory, usecols='B,C,H,I,J,K,L', nrows=137).iloc[1:,:]

#change to int
ehr['Randomisatie nummer'] = ehr['Randomisatie nummer'].astype('int')

#create new_column_id
ehr['PP'] = ehr['Studie'] + '-' + ehr['Randomisatie nummer'].astype(str)

#drop unecessary columnns
ehr = ehr.drop(['Randomisatie nummer', 'Studie'], axis=1)

ehr = ehr.set_index('PP')

ehr = ehr.loc[common_indices]

new_columns = {ehr.columns[0]: 'age', ehr.columns[1]: 'tumor diameter', ehr.columns[2]: 'tumor location', ehr.columns[3]: 'resection margin', ehr.columns[4]: 'nat'}
ehr = ehr.rename(columns = new_columns)

ehr

Unnamed: 0_level_0,age,tumor diameter,tumor location,resection margin,nat
PP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
PP1-125,63.468493,3.5,1,none,3
PP1-136,68.898630,3.5,1,R0,1
PP1-137,59.961644,2.7,1,R1,3
PP1-145,68.693151,4.0,4,R0,3
PP1-15,73.613699,3.0,1,R0,3
...,...,...,...,...,...
PP2-77,63.106849,3.5,4,R1,2
PP2-8,66.153425,,1,R0,3
PP2-87,54.260274,4.7,2,R0,3
PP2-9,63.109589,2.4,1,R1,3


#### tumor location

In [22]:
#change 2.6 with mode
mode = int(ehr['tumor location'].mode().values)
ehr.loc['PP2-321', 'tumor location'] = mode
ehr.loc['PP2-321']

age                 60.380822
tumor diameter            1.0
tumor location              1
resection margin           R0
nat                         2
Name: PP2-321, dtype: object

In [23]:
ehr['tumor location'].unique()

array([1, 4, '1 *Groove', 1.3, 2, 3, 1.2], dtype=object)

In [24]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Replacing '1 *Groove' with 1
ehr['tumor location'] = ehr['tumor location'].replace('1 *Groove', 1)

# Creating two new columns '2' and '3' for categories 2 and 3
ehr['2'] = 0
ehr['3'] = 0

# Update these columns whenever we encounter 1.2 or 1.3
ehr.loc[ehr['tumor location'] == 1.2, '2'] = 1
ehr.loc[ehr['tumor location'] == 1.3, '3'] = 1

# Also replacing 1.2 and 1.3 with 1
ehr['tumor location'] = ehr['tumor location'].replace([1.2, 1.3], 1)

# One-hot-encode the 'tumor location'
onehotencoder = OneHotEncoder()
ehr_encoded = onehotencoder.fit_transform(ehr['tumor location'].values.reshape(-1,1)).toarray()

# Creating a DataFrame from our encoded array
ehr_encoded = pd.DataFrame(ehr_encoded, columns = ["tumor_location_"+str(int(i)) for i in range(ehr_encoded.shape[1])])

# set same index also for ehr_encoded
ehr_encoded = ehr_encoded.set_index(ehr.index)

# Concatenating the original DataFrame and the one-hot-encoded DataFrame
ehr = pd.concat([ehr, ehr_encoded], axis=1)

#adding values from column 2 and 3
ehr['tumor_location_2'] = ehr['tumor_location_2'] + ehr['2']
ehr['tumor_location_3'] = ehr['tumor_location_3'] + ehr['3']

# Dropping the '2' and '3' columns
ehr = ehr.drop(columns=['2', '3', 'tumor location'])

#### resection margin

In [25]:
ehr['resection margin'].unique()

array(['none', 'R0', 'R1'], dtype=object)

In [26]:
onehotencoder = OneHotEncoder()
ehr_encoded = onehotencoder.fit_transform(ehr['resection margin'].values.reshape(-1,1)).toarray()

# Creating a DataFrame from our encoded array
ehr_encoded = pd.DataFrame(ehr_encoded, columns = ["resection_margin_"+str(int(i)) for i in range(ehr_encoded.shape[1])])

# set same index also for ehr_encoded
ehr_encoded = ehr_encoded.set_index(ehr.index)

# Concatenating the original DataFrame and the one-hot-encoded DataFrame
ehr = pd.concat([ehr, ehr_encoded], axis=1)

# drop
ehr = ehr.drop(columns=['resection margin'])

#### nat

In [27]:
ehr['nat'].unique()

array([3, 1, 2, nan], dtype=object)

In [28]:
# Calculate the mode of the 'nat' column
mode_value = ehr['nat'].mode()[0]

# Fill NaN values with the mode value
ehr['nat'] = ehr['nat'].fillna(mode_value)

onehotencoder = OneHotEncoder()
ehr_encoded = onehotencoder.fit_transform(ehr['nat'].values.reshape(-1,1)).toarray()

# Creating a DataFrame from our encoded array
ehr_encoded = pd.DataFrame(ehr_encoded, columns = ["nat_"+str(int(i)) for i in range(ehr_encoded.shape[1])])

# set same index also for ehr_encoded
ehr_encoded = ehr_encoded.set_index(ehr.index)

# Concatenating the original DataFrame and the one-hot-encoded DataFrame
ehr = pd.concat([ehr, ehr_encoded], axis=1)

# drop
ehr = ehr.drop(columns=['nat'])

In [29]:
ehr.head()

Unnamed: 0_level_0,age,tumor diameter,tumor_location_0,tumor_location_1,tumor_location_2,tumor_location_3,resection_margin_0,resection_margin_1,resection_margin_2,nat_0,nat_1,nat_2
PP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
PP1-125,63.468493,3.5,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
PP1-136,68.89863,3.5,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
PP1-137,59.961644,2.7,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
PP1-145,68.693151,4.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
PP1-15,73.613699,3.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [30]:
#WEIIIIIRDDDDDD
ehr[ehr['nat_0']==1]

Unnamed: 0_level_0,age,tumor diameter,tumor_location_0,tumor_location_1,tumor_location_2,tumor_location_3,resection_margin_0,resection_margin_1,resection_margin_2,nat_0,nat_1,nat_2
PP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
PP1-136,68.89863,3.5,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
PP1-150,55.838356,2.8,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
PP1-16,67.109589,3.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
PP1-74,64.487671,3.7,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
PP1-96,56.224658,2.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
PP2-164,76.076712,4.2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
PP2-21,73.887671,2.9,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
PP2-25,63.178082,3.4,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
PP2-28,63.106849,4.3,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
PP2-375,76.824658,3.6,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


In [31]:
ehr.info()

<class 'pandas.core.frame.DataFrame'>
Index: 98 entries, PP1-125 to PP2-98
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   age                 98 non-null     float64
 1   tumor diameter      97 non-null     float64
 2   tumor_location_0    98 non-null     float64
 3   tumor_location_1    98 non-null     float64
 4   tumor_location_2    98 non-null     float64
 5   tumor_location_3    98 non-null     float64
 6   resection_margin_0  98 non-null     float64
 7   resection_margin_1  98 non-null     float64
 8   resection_margin_2  98 non-null     float64
 9   nat_0               98 non-null     float64
 10  nat_1               98 non-null     float64
 11  nat_2               98 non-null     float64
dtypes: float64(12)
memory usage: 12.0+ KB


#### tumor diamater

In [32]:
# Calculate the mean of the 'nat' column
mean_value = ehr['tumor diameter'].mean()

# Fill NaN values with the mode value
ehr['tumor diameter'] = ehr['tumor diameter'].fillna(mean_value)

### Concatenate with dataset

In [33]:
merged_df = pd.concat([radiomics_df.set_index('PP'), ehr, target_df.set_index('PP')], axis=1)

In [34]:
merged_df.head()

Unnamed: 0_level_0,original_shape_Elongation,original_shape_Flatness,original_shape_LeastAxisLength,original_shape_MajorAxisLength,original_shape_Maximum2DDiameterColumn,original_shape_Maximum2DDiameterRow,original_shape_Maximum2DDiameterSlice,original_shape_Maximum3DDiameter,original_shape_MeshVolume,original_shape_MinorAxisLength,...,tumor_location_2,tumor_location_3,resection_margin_0,resection_margin_1,resection_margin_2,nat_0,nat_1,nat_2,Event,Duration
PP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
PP1-125,0.865911,0.748555,21.837019,29.172213,38.483233,32.381093,32.789316,39.096507,10859.239912,25.260528,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1,91
PP1-136,0.935666,0.637434,20.390476,31.98839,38.869955,38.472968,37.509999,40.147221,12548.056447,29.930447,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1,58
PP1-137,0.773641,0.540651,19.654688,36.353718,37.639068,44.020531,42.106296,44.535432,12536.421418,28.124714,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1,1479
PP1-145,0.698209,0.607823,25.259802,41.557809,48.118506,45.542161,42.675846,48.199269,18673.257574,29.016045,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1,1627
PP1-15,0.667218,0.510924,13.471777,26.367465,27.666261,28.074574,21.950195,29.468954,3908.542665,17.592842,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0,3489


In [35]:
merged_df.iloc[:,1108:]

Unnamed: 0_level_0,tumor_location_0,tumor_location_1,tumor_location_2,tumor_location_3,resection_margin_0,resection_margin_1,resection_margin_2,nat_0,nat_1,nat_2,Event,Duration
PP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
PP1-125,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1,91
PP1-136,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1,58
PP1-137,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1,1479
PP1-145,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1,1627
PP1-15,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0,3489
...,...,...,...,...,...,...,...,...,...,...,...,...
PP2-77,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1,414
PP2-8,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1,408
PP2-87,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1,368
PP2-9,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1,1314


## Modeling with Train and Test splits

### variance threshold, scaling, pca

#### only on one test split

#### repeated cross validation

In [36]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from lifelines.utils.sklearn_adapter import sklearn_adapter
from lifelines import CoxPHFitter
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold

X = merged_df.drop('Duration', axis=1)  # keep as a dataframe
y = merged_df['Duration']

# Define the number of folds and repeats, and an empty list to store the scores
num_folds = 5
num_repeats = 10
scores = []

for r in range(num_repeats):
    print(f"--- Repeat: {r + 1} ---")
    
    # Generate indices for splits
    np.random.seed(r+420)  # change the seed for each repeat
    indices = np.arange(len(X))
    np.random.shuffle(indices)
    fold_sizes = (len(X) // num_folds) * np.ones(num_folds, dtype=int)  # equally divide indices
    fold_sizes[:len(X) % num_folds] += 1  # if len(X) is not exactly divisible by num_folds, assign remainder to first few

    current = 0
    splits = []
    for fold_size in fold_sizes:
        start, stop = current, current + fold_size
        splits.append((indices[start:stop], np.concatenate((indices[:start], indices[stop:]))))  # (test, train)
        current = stop

    # Perform cross-validation
    for i, (test_idx, train_idx) in enumerate(splits):
        print(f"Fold: {i + 1}")
        
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        
        # Drop 'Event' column
        X_train_new = X_train.drop('Event', axis=1)
        X_test_new = X_test.drop('Event', axis=1)
        
        #change to array
        X_train_new = X_train_new.values
        X_test_new = X_test_new.values
        
        # Scale the data
        scaler = StandardScaler()
        scaler = MinMaxScaler()
        X_train_new[:, :1108] = scaler.fit_transform(X_train_new[:, :1108])
        X_test_new[:, :1108] = scaler.transform(X_test_new[:, :1108])
        
        # Perform PCA
        pca = PCA(n_components=30)
        X_train_new_pca = pca.fit_transform(X_train_new[:, :1106])
        X_test_new_pca = pca.transform(X_test_new[:, :1106])
        X_train_new = np.concatenate((X_train_new_pca, X_train_new[:,1106:]), axis=1)
        X_test_new = np.concatenate((X_test_new_pca, X_test_new[:,1106:]), axis=1)
        
        # Convert back to DataFrame
        X_train_new = pd.DataFrame(X_train_new)
        X_test_new = pd.DataFrame(X_test_new)
        
        # Add back indices
        X_train_new.index = X_train.index
        X_test_new.index = X_test.index
        
        # Add 'Event' column back
        X_train_new['Event'] = X_train['Event']
        X_test_new['Event'] = X_test['Event']
        
        # Instantiate CoxRegression object
        CoxRegression = sklearn_adapter(CoxPHFitter, event_col='Event')
        sk_cph = CoxRegression(penalizer=1e-5)
        sk_cph.fit(X_train_new, y_train)
        
        # Calculate and store the score
        score = sk_cph.score(X_test_new, y_test)
        scores.append(score)

        print(f"Score: {score}")
        print("-----------------------------")

# Calculate mean
mean_score = np.mean(scores)

print(f"Mean Score: {mean_score}")

--- Repeat: 1 ---
Fold: 1
Score: 0.6453488372093024
-----------------------------
Fold: 2
Score: 0.63125
-----------------------------
Fold: 3
Score: 0.6153846153846154
-----------------------------
Fold: 4
Score: 0.7194244604316546
-----------------------------
Fold: 5



>>> events = df['Event'].astype(bool)
>>> print(df.loc[events, '38'].var())
>>> print(df.loc[~events, '38'].var())

A very low variance means that the column 38 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.



Score: 0.7006369426751592
-----------------------------
--- Repeat: 2 ---
Fold: 1
Score: 0.69375
-----------------------------
Fold: 2
Score: 0.47191011235955055
-----------------------------
Fold: 3
Score: 0.6319018404907976
-----------------------------
Fold: 4
Score: 0.695364238410596
-----------------------------
Fold: 5
Score: 0.6938775510204082
-----------------------------
--- Repeat: 3 ---
Fold: 1
Score: 0.7378048780487805
-----------------------------
Fold: 2
Score: 0.7077922077922078
-----------------------------
Fold: 3
Score: 0.6569767441860465
-----------------------------
Fold: 4
Score: 0.717948717948718
-----------------------------
Fold: 5
Score: 0.6644736842105263
-----------------------------
--- Repeat: 4 ---
Fold: 1
Score: 0.64375
-----------------------------
Fold: 2
Score: 0.7176470588235294
-----------------------------
Fold: 3
Score: 0.5636363636363636
-----------------------------
Fold: 4
Score: 0.5902777777777778
-----------------------------
Fold: 5
Score: 0.

# Random Forest Survival

#### 10 different cv with 5 folds

In [37]:
import numpy as np
from sklearn.model_selection import RepeatedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sksurv.ensemble import RandomSurvivalForest
from sklearn.decomposition import PCA

X = merged_df.drop(['Duration', 'Event'], axis=1)  # keep as a dataframe
y = merged_df[['Event','Duration']]

#change y to array of tuples (Event, Duration)
y = np.array([(bool(arr[0]), arr[1]) for arr in y.values], dtype=[('boolean', bool), ('integer', int)])

# Define the number of folds, repeats and an empty list to store the scores
num_folds = 5
num_repeats = 10
scores = []

# Create the RepeatedKFold object
rkf = RepeatedKFold(n_splits=num_folds, n_repeats=num_repeats, random_state=1)

# Perform Repeated K-Fold cross-validation
for i, (train_index, test_index) in enumerate(rkf.split(X)):
    print(f"Fold: {(i % num_folds) + 1}, Repeat: {i // num_folds + 1}")
    
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
     #change to array
    X_train_new = X_train.values
    X_test_new = X_test.values
    
    # Scale the data
    scaler = StandardScaler()
    scaler = MinMaxScaler()
    X_train_new[:, :1108] = scaler.fit_transform(X_train_new[:, :1108])
    X_test_new[:, :1108] = scaler.transform(X_test_new[:, :1108])
    
    # Perform PCA
    pca = PCA(n_components=30)
    X_train_new_pca = pca.fit_transform(X_train_new[:, :1106])
    X_test_new_pca = pca.transform(X_test_new[:, :1106])
    X_train_new = np.concatenate((X_train_new_pca, X_train_new[:,1106:]), axis=1)
    X_test_new = np.concatenate((X_test_new_pca, X_test_new[:,1106:]), axis=1)
    
    # Convert back to DataFrame
    X_train_new = pd.DataFrame(X_train_new)
    X_test_new = pd.DataFrame(X_test_new)
    
    # Add back indices
    X_train_new.index = X_train.index
    X_test_new.index = X_test.index
    
    # Instantiate RandomSurvivalForest object
    rsf = RandomSurvivalForest()
    rsf.fit(X_train_new, y_train)
    
    # Calculate and store the score
    score = rsf.score(X_test_new, y_test)
    scores.append(score)
    
    print(f"Score: {score}")
    print("-----------------------------")

# Calculate mean and 95% confidence interval of the scores
mean_score = np.mean(scores)

print(f"Mean Score: {mean_score}")

Fold: 1, Repeat: 1
Score: 0.7172413793103448
-----------------------------
Fold: 2, Repeat: 1
Score: 0.6133333333333333
-----------------------------
Fold: 3, Repeat: 1
Score: 0.6397849462365591
-----------------------------
Fold: 4, Repeat: 1
Score: 0.6075949367088608
-----------------------------
Fold: 5, Repeat: 1
Score: 0.7612903225806451
-----------------------------
Fold: 1, Repeat: 2
Score: 0.7777777777777778
-----------------------------
Fold: 2, Repeat: 2
Score: 0.6939890710382514
-----------------------------
Fold: 3, Repeat: 2
Score: 0.5418994413407822
-----------------------------
Fold: 4, Repeat: 2
Score: 0.5743243243243243
-----------------------------
Fold: 5, Repeat: 2
Score: 0.6379310344827587
-----------------------------
Fold: 1, Repeat: 3
Score: 0.7714285714285715
-----------------------------
Fold: 2, Repeat: 3
Score: 0.6277777777777778
-----------------------------
Fold: 3, Repeat: 3
Score: 0.569620253164557
-----------------------------
Fold: 4, Repeat: 3
Score: 

# GDB survival (ComponentwiseGradientBoostingSurvivalAnalysis)

#### 10 different cv with 5 folds

In [38]:
import numpy as np
from sklearn.model_selection import RepeatedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sksurv.ensemble import ComponentwiseGradientBoostingSurvivalAnalysis
from sklearn.decomposition import PCA

X = merged_df.drop(['Duration', 'Event'], axis=1)  # keep as a dataframe
y = merged_df[['Event','Duration']]

#change y to array of tuples (Event, Duration)
y = np.array([(bool(arr[0]), arr[1]) for arr in y.values], dtype=[('boolean', bool), ('integer', int)])

# Define the number of folds, repeats and an empty list to store the scores
num_folds = 5
num_repeats = 10
scores = []

# Create the RepeatedKFold object
rkf = RepeatedKFold(n_splits=num_folds, n_repeats=num_repeats, random_state=1)

# Perform Repeated K-Fold cross-validation
for i, (train_index, test_index) in enumerate(rkf.split(X)):
    print(f"Fold: {(i % num_folds) + 1}, Repeat: {i // num_folds + 1}")
    
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    #change to array
    X_train_new = X_train.values
    X_test_new = X_test.values
    
    # Scale the data
    scaler = StandardScaler()
    scaler = MinMaxScaler()
    X_train_new[:, :1108] = scaler.fit_transform(X_train_new[:, :1108])
    X_test_new[:, :1108] = scaler.transform(X_test_new[:, :1108])
    
    # Perform PCA
    pca = PCA(n_components=30)
    X_train_new_pca = pca.fit_transform(X_train_new[:, :1106])
    X_test_new_pca = pca.transform(X_test_new[:, :1106])
    X_train_new = np.concatenate((X_train_new_pca, X_train_new[:,1106:]), axis=1)
    X_test_new = np.concatenate((X_test_new_pca, X_test_new[:,1106:]), axis=1)
    
    # Convert back to DataFrame
    X_train_new = pd.DataFrame(X_train_new)
    X_test_new = pd.DataFrame(X_test_new)
    
    # Add back indices
    X_train_new.index = X_train.index
    X_test_new.index = X_test.index
    
    # Instantiate GB object
    gdb = ComponentwiseGradientBoostingSurvivalAnalysis(loss="coxph")
    gdb.fit(X_train_new, y_train)
    
    # Calculate and store the score
    score = gdb.score(X_test_new, y_test)
    scores.append(score)
    
    print(f"Score: {score}")
    print("-----------------------------")

# Calculate mean and 95% confidence interval of the scores
mean_score = np.mean(scores)

print(f"Mean Score: {mean_score}")

Fold: 1, Repeat: 1
Score: 0.6689655172413793
-----------------------------
Fold: 2, Repeat: 1
Score: 0.5266666666666666
-----------------------------
Fold: 3, Repeat: 1
Score: 0.6505376344086021
-----------------------------
Fold: 4, Repeat: 1
Score: 0.4810126582278481
-----------------------------
Fold: 5, Repeat: 1
Score: 0.6709677419354839
-----------------------------
Fold: 1, Repeat: 2
Score: 0.6783625730994152
-----------------------------
Fold: 2, Repeat: 2
Score: 0.73224043715847
-----------------------------
Fold: 3, Repeat: 2
Score: 0.5977653631284916
-----------------------------
Fold: 4, Repeat: 2
Score: 0.5202702702702703
-----------------------------
Fold: 5, Repeat: 2
Score: 0.8189655172413793
-----------------------------
Fold: 1, Repeat: 3
Score: 0.7085714285714285
-----------------------------
Fold: 2, Repeat: 3
Score: 0.6
-----------------------------
Fold: 3, Repeat: 3
Score: 0.6645569620253164
-----------------------------
Fold: 4, Repeat: 3
Score: 0.68382352941176