In [20]:
'''
Step 1. Remove highly correlated variables (threshold 0.95)
Step 2. Remove features with low variance (threshold=0.01)
Step 3. Extract relevant features using the Benjamini-Yekutieli procedure (Benjamini Y, Yekutieli D. (2001))
Step 4. The endpoint y (ICD non-benefit) can be changed to a different endpoint, e.g. appropriate ICD therapy or all-cause mortality
Step 5. The Benjamini-Yekutieli procedure outputs a list of 250 feature names, which are saved
Author: MZH Kolk, 2022 Oct
'''
import pandas as pd
import numpy as np

# Load the feature matrix; the first CSV column becomes the index.
# The 'Python_ID_Index' column is removed before feature selection. drop() returns
# a new frame, so the freshly loaded data is never mutated in place.
X = pd.read_csv('X_internal_3Y_NS.csv', index_col=0).drop(columns=['Python_ID_Index'])

# Load the outcome and keep only the endpoint column. Selecting one column already
# yields a Series, so no further drop()/squeeze() is needed (squeeze() would even
# collapse a one-row Series into a scalar).
y = pd.read_csv('y_internal_3Y_NS.csv', index_col=0)['ICDnonbenefit_3Y']

# Column position at which X switches from clinical to time-series variables.
# Both slices below use this single constant so they can never overlap or leave
# a gap if the split point is changed.
N_CLINICAL_COLUMNS = 50

#Select the time-series variables from the dataset
X_ecg = X.iloc[:, N_CLINICAL_COLUMNS:]
#Select the clinical variables from the dataset
X_clin = X.iloc[:, :N_CLINICAL_COLUMNS]

print("Number of variables with selected leads", X_ecg.shape)

# Step 1: filter (remove) all but one of the features with correlation of 0.95 or more.
# The helper is imported by name rather than via `import *` so its origin is explicit.
# NOTE(review): assumes drop_input_corr_columns is defined in correlation_script - confirm.
from correlation_script import drop_input_corr_columns

X_ecg, to_drop = drop_input_corr_columns(X_ecg, 0.95)
# Keep the second return value (presumably the removed column names) as a plain list.
to_drop = list(to_drop)
print("Number of variables with high correlarity removed", X_ecg.shape)

# Step 2: remove features with LOW variance (threshold 0.01).
from sklearn.feature_selection import VarianceThreshold
v_threshold = VarianceThreshold(threshold=0.01)
X_ecg_variancethreshold = v_threshold.fit_transform(X_ecg)

# Boolean mask of the columns the selector keeps. It is computed once here;
# rebuilding it inside a per-column membership test is quadratic in the number
# of columns.
keep_mask = v_threshold.get_support()

# Names of the columns rejected by the variance filter.
variance_column = list(X_ecg.columns[~keep_mask])

# Select with the mask itself instead of dropping by label, so the result stays
# correct even if two columns share the same label.
X_ecg = X_ecg.loc[:, keep_mask]
print("Number of variables with variance removed", X_ecg.shape)

# Step 3: extract the features that are relevant for the endpoint y.
# The helper is imported by name rather than via `import *` so its origin is explicit.
# NOTE(review): assumes extract_relevant_features is defined in correlation_script - confirm.
from correlation_script import extract_relevant_features

# The third argument is presumably the number of features to keep - confirm in the helper.
X_ecg = extract_relevant_features(X_ecg, y, 250)
print(X_ecg.shape)

#Get list of features that are to be used
list_X_ecg = list(X_ecg.columns)
# Same names under the second identifier, as an independent list object.
relevant_features = list(list_X_ecg)
print("Number of features", len(list_X_ecg))


Number of variables with selected leads (1010, 9384)
Number of variables with high correlarity removed (1010, 7035)
Number of variables with variance removed (1010, 6779)
(1010, 250)
Number of features 250


In [21]:
# Save the list of 250 features (pickle).
# The file holds the selected feature names in list_X_ecg.
# NOTE: only unpickle this file from a trusted location; pickle.load can execute
# arbitrary code.
import pickle

with open("list_X_ecg_3YNB_NS.bin", "wb") as output:
    pickle.dump(
        list_X_ecg,
        output,
    )