In [62]:
'''''
This notebook describes the standardisation of the ECG features and the split of internal and external datasets
Step 1. Get the original dataframe that contains the extracted ECG features
Step 2. Select the patients with sufficient follow-up for the label the model is developed for
Author: MZH Kolk, 2022 Oct
'''''

import pandas as pd
import numpy as np
import warnings

#Import the dataframe comprising clinical data (standardised) and tsfresh data (unstandardised)---> only ICM/DCM/NICM
dff = pd.read_csv('baselinedata_newselection.csv', index_col=0)

ts_fresh = pd.read_csv('extracted_featuresefficient.csv',  index_col=0)
# The merge key is the 0-based row position of each tsfresh row, not the index read from the CSV.
# NOTE(review): presumably the extracted features are ordered so that row position equals
# Python_ID_Index -- TODO confirm, otherwise the left merge attaches features to the wrong patient.
ts_fresh['Python_ID_Index'] = ts_fresh.reset_index().index
merged_set = pd.merge(dff, ts_fresh, how="left", on=["Python_ID_Index"])
merged_set.to_csv('data_newselection.csv')

# From here on dff refers to the merged (clinical + tsfresh) frame
dff = merged_set


def _follow_up_mask(frame, years):
    """Return a boolean mask of the rows counted as having `years` years of follow-up.

    A row is selected when MtillDeath_censored exceeds 12 * years, or when
    ICDnonbenefit_<years>Y equals 1, or when AppropriateTherapy<years>Y equals 1.

    Parameters
    ----------
    frame : pd.DataFrame
        Frame holding the 'MtillDeath_censored', 'ICDnonbenefit_<years>Y' and
        'AppropriateTherapy<years>Y' columns.
    years : int
        Follow-up horizon in years; it selects both the month threshold and the
        label columns.

    Returns
    -------
    pd.Series
        Boolean Series aligned with frame.index.
    """
    return (
        (frame['MtillDeath_censored'] > 12 * years)
        | (frame[f'ICDnonbenefit_{years}Y'] == 1)
        | (frame[f'AppropriateTherapy{years}Y'] == 1)
    )


#Get a list of patients with full FU for the selected label (later will be used for row selection)
lst = dff.index[_follow_up_mask(dff, 3)].tolist()

# Report the cohort size for every follow-up horizon that has label columns
for _years in (1, 2, 3, 4):
    print(f"Patients with {_years}Y follow-up", len(dff.index[_follow_up_mask(dff, _years)]))

# df is a second name for the same object as dff (no copy is made)
df = dff
print(df.shape)
print(dff.shape)
print(len(lst))

Patients with 1Y follow-up 1185
Patients with 2Y follow-up 1103
Patients with 3Y follow-up 1010
Patients with 4Y follow-up 922
(1269, 9481)
(1269, 9481)
1010


In [64]:
'''''
Split the dataset into an internal and an external cohort. The external cohort is kept isolated until external validation.

Step 1. Patients with ID < 1798 form the internal dataset (Hospital A)
Step 2. Patients with ID > 1797 form the external dataset (Hospital B)
Step 3. The X and y dataframes are split based on the patient ID

Author: MZH Kolk, 2022 Oct
'''''
# Switch off pandas' chained-assignment (SettingWithCopy) check for the rest of the session
pd.set_option('mode.chained_assignment', None)

# X and y are plain aliases of x and y_listed (no copy is made), so every
# change made through X / y below is also visible through x / y_listed.
X, y = x, y_listed

#Save the X dataset
X.to_csv('X_3Y_NS.csv')

#Save the y dataset
y.to_csv('y_3Y_NS.csv')

#Convert the Patient ID to integer (only done for X; y keeps its original dtype)
_patient_ids_as_int = X['Python_ID_Index'].astype(int)
X['Python_ID_Index'] = _patient_ids_as_int

#Function to split the dataset into an internal and external cohort based on the patientID
def External_cohort(dfObj):
    """Label one row by the cohort its patient ID falls in.

    Note that the labels are strings, not booleans, and that despite the
    function name 'True' marks the IDs below 1798 (described as the internal
    dataset in the cell header).

    Parameters
    ----------
    dfObj : mapping or pd.Series
        Anything that supports dfObj['Python_ID_Index'], e.g. a DataFrame row
        passed in by DataFrame.apply(..., axis=1).

    Returns
    -------
    str or None
        'True' when Python_ID_Index < 1798, 'False' when Python_ID_Index > 1797,
        and None when neither comparison holds (e.g. the ID is NaN).
    """
    patient_id = dfObj['Python_ID_Index']
    if patient_id < 1798:
        return 'True'  #DISTANT
    if patient_id > 1797:
        return 'False'  #DEEPRISK
    # Neither comparison succeeded: no label for this row
    return None

#Split X and y into the internal and external cohorts based on the patient ID
def _split_by_cohort(frame, id_column='Python_ID_Index', first_external_id=1798):
    """Split a frame into its internal and external cohort by patient ID.

    The comparison is done on the ID column as a whole instead of row by row,
    and the ID column of `frame` itself is left untouched. This keeps the
    patient ID available afterwards and lets the cell be re-run: overwriting it
    with 'True'/'False' labels made the integer cast of the ID fail on a second
    run.

    Parameters
    ----------
    frame : pd.DataFrame
        Frame containing the numeric patient ID column `id_column`.
    id_column : str
        Name of the patient ID column; it is removed from both returned frames.
    first_external_id : int
        Smallest patient ID that belongs to the external cohort. IDs below it
        are internal; rows whose ID is NaN end up in neither frame.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame)
        (internal, external), both new frames without `id_column`.
    """
    patient_ids = frame[id_column]
    # drop() returns a new frame, so no in-place edit of a filtered slice is needed
    internal = frame[patient_ids < first_external_id].drop(columns=[id_column])
    external = frame[patient_ids >= first_external_id].drop(columns=[id_column])
    return internal, external


X_internal, X_external = _split_by_cohort(X)
y_internal, y_external = _split_by_cohort(y)

#Save the internal and external datasets
X_internal.to_csv('X_internal_3Y_NS.csv')
X_external.to_csv('X_external_3Y_NS.csv')
y_external.to_csv('y_external_3Y_NS.csv')
y_internal.to_csv('y_internal_3Y_NS.csv')

print(y_internal.shape)
print(y_external.shape)
print(X_external.shape)
print(X_internal.shape)


(550, 16)
(460, 16)
(460, 9434)
(550, 9434)
