In [583]:
import pandas as pd
import numpy as np

data = pd.read_csv('IFR_Extract_with_selected_columns_15-5-23.csv')

obreak_date = pd.to_datetime(data.obreak_date)
datebone = pd.to_datetime(data.datebone)
y = ( abs( datebone - obreak_date))
X = data.drop(["obreak_date","datebone"],axis=1)
selectedColumns = [ 'PatientAge', "PatientGender",'parentbreak', 'alcohol',
                'arthritis', 'diabetes',
                'oralster', 'smoke', 'obreak']



In [584]:
dropList = []
for i in data:
    if data[i].dtypes == 'O':
        dropList.append(data[i].name)
dropList.append("CompletedSurveyId")
dropList.append("PatientId")
X = data.drop(dropList,axis=1)
X.fillna(0,inplace=True)
y = pd.DataFrame({"time":y})



In [585]:
y.time

y['event'] = y.time.apply(lambda x: x.days != 0 )
structured_array = y.to_records(index=False)

swapped = pd.DataFrame({
    "event": y.event,
    "time": y.time.apply(lambda x: x.days)
})
(swapped.event).value_counts()

True     792
False      8
Name: event, dtype: int64

In [586]:
swapped.event = swapped.event.astype(bool)
swapped.event
structured_array = np.rec.array(swapped.to_records(index=False))

In [587]:
swapped

Unnamed: 0,event,time
0,True,524
1,True,2046
2,True,15455
3,True,4354
4,True,2207
...,...,...
795,True,579
796,True,5109
797,True,2125
798,True,518


In [588]:
mergedBeforeEncoding = pd.concat([X[selectedColumns],swapped],axis=1)
mergedBeforeEncoding

Unnamed: 0,PatientAge,PatientGender,event,time
0,53,1,True,524
1,85,1,True,2046
2,90,1,True,15455
3,81,1,True,4354
4,60,1,True,2207
...,...,...,...,...
795,83,1,True,579
796,60,1,True,5109
797,76,2,True,2125
798,61,1,True,518


In [589]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sksurv.ensemble import RandomSurvivalForest
from sksurv.util import Surv
import random

# Define the number of synthetic samples
num_samples = 200

# Get the column types for each column in mergedBeforeEncoding
column_types = {}
for column in mergedBeforeEncoding.columns:
    column_types[column] = mergedBeforeEncoding[column].dtype

# Shuffle the feature names
feature_names = list(mergedBeforeEncoding.columns)
random.shuffle(feature_names)

# Initialize an empty DataFrame to store the selected features and their performance
selected_features = pd.DataFrame(columns=["Feature"])

# Create a synthetic data DataFrame with the same columns as mergedBeforeEncoding
synthetic_data = pd.DataFrame(columns=mergedBeforeEncoding.columns)

# Generate synthetic data for each feature
for feature in feature_names:
    column_type = column_types[feature]

    if column_type == bool:
        synthetic_data[feature] = np.random.choice([False, True], size=num_samples)
    else:
        # Sample values from the existing data to maintain the distribution
        existing_data_values = mergedBeforeEncoding[feature].dropna().values
        synthetic_data[feature] = np.random.choice(existing_data_values, size=num_samples)

    synthetic_data[feature] = synthetic_data[feature].astype(column_type)

# Add additional columns to the synthetic data
synthetic_data["obreak"] = 1
synthetic_data["event"] = False
synthetic_data["time"] = 0

augmented_data = pd.concat([mergedBeforeEncoding, synthetic_data], ignore_index=True)



# Store the selected features
selected_features["Feature"] = feature_names



In [590]:
cat_features = ['parentbreak', 'alcohol',
                'arthritis', 'diabetes',
                'oralster', 'smoke', 'obreak'
                # These features were determined to apply minimal impact even
                # 'respdisease', 'hbp','heartdisease',
                # 'ptunsteady', 'wasfractdue2fall', 'cholesterol',
                # 'ptfall', 'shoulder', 'wrist', 'bmdtest_10yr_caroc'
                ]
for feature in cat_features:
    if augmented_data is not None:
        if feature in augmented_data.columns:
            cat_one_hot = pd.get_dummies(augmented_data[feature], prefix=f'{feature}', drop_first=False)
            augmented_data = augmented_data.drop(feature, axis=1)
            augmented_data = augmented_data.join(cat_one_hot)

In [591]:
augmented_data.columns

Index(['PatientAge', 'PatientGender', 'event', 'time', 'obreak_1.0'], dtype='object')

In [592]:
X = augmented_data.drop(['event','time'],axis=1)
y = augmented_data[['event','time']]

In [593]:
X

Unnamed: 0,PatientAge,PatientGender,obreak_1.0
0,53,1,0
1,85,1,0
2,90,1,0
3,81,1,0
4,60,1,0
...,...,...,...
995,52,1,1
996,78,2,1
997,76,1,1
998,76,2,1


In [594]:
y = np.rec.array(y.to_records(index=False))


In [595]:
from sksurv.ensemble import RandomSurvivalForest
from sklearn.model_selection import train_test_split
from lifelines.utils import concordance_index


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Create an instance of the RandomSurvivalForest model
model = RandomSurvivalForest(random_state=20)

# Fit the model on the training data
model.fit(X_train, y_train)

# Calculate the baseline performance
baseline_score = concordance_index(y_test['time'], -model.predict(X_test), y_test['event'])

# Initialize an array to store the feature importances
feature_importances = np.zeros(X_train.shape[1])

# Perform feature importance calculation
for i in range(X_train.shape[1]):
    # Make a copy of the test set
    X_permuted = X_test.copy()

    # Permute the values of the feature at index i
    X_permuted.iloc[:, i] = np.random.permutation(X_permuted.iloc[:, i])

    # Calculate the permuted score
    permuted_score = concordance_index(y_test['time'], -model.predict(X_permuted), y_test['event'])

    # Calculate the feature importance as the difference between the baseline score and permuted score
    feature_importances[i] = baseline_score - permuted_score

# Normalize the feature importances
feature_importances /= np.sum(feature_importances)

# Print the feature importances
feature_names = X_train.columns

#for feature_name, importance in zip(feature_names, feature_importances):
    #print(f"Feature: {feature_name}, Importance: {importance}")

df = pd.DataFrame()
for name, importance in zip(feature_names, feature_importances):
    df = pd.concat([df, pd.DataFrame({'Feature Name': [name], 'Feature Importance': [importance]})], ignore_index=True)

df = df.sort_values('Feature Importance', ascending=False)

df

# Calculate the c-index on the test set
c_index = concordance_index(y_test['time'], -model.predict(X_test), y_test['event'])
print("C-index:", c_index)

C-index: 0.4993591007690791


In [596]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sksurv.ensemble import GradientBoostingSurvivalAnalysis

# Initialize the GBST survival regressor
gbst = GradientBoostingSurvivalAnalysis()

# Fit the model to the training data
gbst.fit(X_train, y_train)

# Predict the survival times for the testing data
survival_times = gbst.predict(X_test)

# Compute the concordance index to evaluate the model performance
c_index = concordance_index(y_test['time'], -survival_times, y_test['event'])
print(f"Concordance Index: {c_index}")

Concordance Index: 0.510441727469927


In [381]:
import matplotlib.pyplot as plt

pred_surv = model.predict_survival_function(X.loc[:15])
time_points = np.arange(1, 1000)
for i, surv_func in enumerate(pred_surv):
    plt.step(time_points, surv_func(time_points), where="post",
             label="Sample %d" % (i + 1))
plt.ylabel("est. probability of survival $\hat{S}(t)$")
plt.xlabel("time $t$")
plt.legend(loc="best")

ValueError: x must be within [3.000000; 17689.000000]