# Benchmarks

## Initialize

In [1]:
import os
import math
import pathlib
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.feather as feather
from tqdm.auto import tqdm
from IPython.display import clear_output

import warnings
from lifelines.utils import CensoringType
from lifelines.utils import concordance_index

In [2]:
# import ray
# ray.shutdown()

In [3]:
node = !hostname
if "sc" in node[0]:
    base_path = "/sc-projects/sc-proj-ukb-cvd"
else: 
    base_path = "/data/analysis/ag-reils/ag-reils-shared/cardioRS"
print(base_path)

project_label = "22_retina_phewas"
project_path = f"{base_path}/results/projects/{project_label}"
figure_path = f"{project_path}/figures"
output_path = f"{project_path}/data"

pathlib.Path(figure_path).mkdir(parents=True, exist_ok=True)
pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)

experiment = '220812_test'
experiment_path = f"{output_path}/{experiment}"
pathlib.Path(experiment_path).mkdir(parents=True, exist_ok=True)


name_dict = {
    "predictions_cropratio0.66": "ConvNextSmall(Retina)+MLP_cropratio0.66",
}

partitions = [i for i in range(22)]
partitions

/sc-projects/sc-proj-ukb-cvd


[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]

In [4]:
import ray
# ray start --head --port=6379 --num-cpus 64
#ray.init(num_cpus=24, include_dashboard=False)#, dashboard_port=24762, dashboard_host="0.0.0.0", include_dashboard=True)#, webui_url="0.0.0.0"))
ray.init(address='auto')

RayContext(dashboard_url='', python_version='3.9.7', ray_version='1.13.0', ray_commit='e4ce38d001dbbe09cd21c497fedd03d692b2be3e', address_info={'node_ip_address': '10.32.105.14', 'raylet_ip_address': '10.32.105.14', 'redis_address': None, 'object_store_address': '/tmp/ray/session_2022-08-25_09-57-05_054645_3933326/sockets/plasma_store', 'raylet_socket_name': '/tmp/ray/session_2022-08-25_09-57-05_054645_3933326/sockets/raylet', 'webui_url': '', 'session_dir': '/tmp/ray/session_2022-08-25_09-57-05_054645_3933326', 'metrics_export_port': 60554, 'gcs_address': '10.32.105.14:6321', 'address': '10.32.105.14:6321', 'node_id': 'aa0f99a02d648e105988a8de7155afbfc253c61e7021ae9fe8b851e0'})

In [5]:
import pandas as pd
endpoints = sorted([l.replace('_prevalent', '') for l in list(pd.read_csv('/sc-projects/sc-proj-ukb-cvd/results/projects/22_retinal_risk/data/220602/endpoints.csv').endpoint.values)])

In [6]:
import glob, os
img_root = '/sc-projects/sc-proj-ukb-cvd/data/retina/preprocessed/preprocessed'
img_visit = 0
img_file_extension = '.png'
eids_with_retinapic = [int(fp.split('/')[-1].split('_')[0]) for fp in sorted( glob.glob(os.path.join(img_root, f'*{img_file_extension}' 
                       if img_file_extension is not None else '*'))) 
                       if f'_{img_visit}_' in fp]
len(eids_with_retinapic)

113122

In [7]:
data_covariates = pd.read_feather(f"{output_path}/data_covariates_full.feather").set_index("eid")

In [8]:
# data_covariates = data_covariates[['age', 'sex', 'ethnic_background']]

In [9]:
AgeSex = ["age", "sex"]

SCORE2 = [
    "age", 
    "sex",
    "smoking_status", # current smoker
    "systolic_blood_pressure",
    "cholesterol",
    "hdl_cholesterol",

] 

ASCVD = [
    "age", 
    "sex",
    "ethnic_background",
    "smoking_status", # current smoker
    "diabetes", # diabetes
    "antihypertensives", 
    "systolic_blood_pressure",
    "cholesterol",
    "hdl_cholesterol",
] 

QRISK3 = [
    "age", 
    "sex",
    "ethnic_background",
    "smoking_status", # current smoker
    "bmi",
    "diabetes1", # type 1 diabetes
    "diabetes2", # type 1 diabetes
    "fh_heart_disease",
    "renal_failure", 
    "atrial_fibrillation", 
    "migraine",
    "rheumatoid_arthritis", 
    "systemic_lupus_erythematosus", 
    "schizophrenia", 
    "bipolar_disorder", 
    "major_depressive_disorder", 
    "male_erectile_dysfunction", 
    "antihypertensives", 
    "corticosteroids",
    "psycholeptics",
    "systolic_blood_pressure",
    "cholesterol",
    "hdl_cholesterol",

]

# assert, that all variables are available
covariates_scores = sorted(list(set(AgeSex + SCORE2 + ASCVD + QRISK3)))
if not set(covariates_scores).issubset(data_covariates.columns.to_list()):
    print("Not all required covariates are prepared!", list(set(covariates_scores).difference(data_covariates.columns.to_list())))
else:
    print("Success, all required covariates are prepared!")
    data_covariates = data_covariates[covariates_scores]

Success, all required covariates are prepared!


In [10]:
variables_cont = data_covariates.select_dtypes(include=["int32", "float32", "float64"]).columns.to_list()#dtypes.to_frame().rename(columns={0:"dtype"}).query("dtype!='bool'")
variables_cat = data_covariates.select_dtypes(include=["category"]).columns.to_list()
variables_bool = data_covariates.select_dtypes(include=["bool"]).columns.to_list()#dtypes.to_frame().rename(columns={0:"dtype"}).query("dtype!='bool'")
print("Cont: ", variables_cont)
print("Cat: ", variables_cat)
print("Bool: ", variables_bool)

variables_to_norm = variables_cont + endpoints

Cont:  ['age', 'bmi', 'cholesterol', 'hdl_cholesterol', 'systolic_blood_pressure']
Cat:  ['ethnic_background', 'sex', 'smoking_status']
Bool:  ['antihypertensives', 'atrial_fibrillation', 'bipolar_disorder', 'corticosteroids', 'diabetes', 'diabetes1', 'diabetes2', 'fh_heart_disease', 'major_depressive_disorder', 'male_erectile_dysfunction', 'migraine', 'psycholeptics', 'renal_failure', 'rheumatoid_arthritis', 'schizophrenia', 'systemic_lupus_erythematosus']


In [11]:
in_path = pathlib.Path(f"{experiment_path}/loghs")
in_path.mkdir(parents=True, exist_ok=True)

out_path = f"{experiment_path}/coxph/input"
pathlib.Path(out_path).mkdir(parents=True, exist_ok=True)

In [12]:
in_path

PosixPath('/sc-projects/sc-proj-ukb-cvd/results/projects/22_retina_phewas/data/220812_test/loghs')

In [13]:
models = [f.name for f in in_path.iterdir() if f.is_dir() and "ipynb_checkpoints" not in str(f)]
for model in models:
    pathlib.Path(os.path.join(out_path, model)).mkdir(parents=True, exist_ok=True)
    for p in partitions:
        pathlib.Path(os.path.join(out_path, model, str(p))).mkdir(parents=True, exist_ok=True)
models

['ImageTraining_[]_ConvNeXt_MLPHead_predictions_cropratio0.66']

In [14]:
from sklearn.preprocessing import StandardScaler
import miceforest
import pickle
import zstandard
import glob
import os

# def find_retina_eid_intersection():
#     img_root = '/sc-projects/sc-proj-ukb-cvd/data/retina/preprocessed/preprocessed'
#     img_visit = 0
#     img_file_extension = '.png'
#     eids_with_retinapic = [int(fp.split('/')[-1].split('_')[0]) for fp in sorted( glob.glob(os.path.join(img_root, f'*{img_file_extension}' 
#                            if img_file_extension is not None else '*'))) 
#                            if f'_{img_visit}_' in fp]
#     len(eids_with_retinapic)
    
#     d = []
#     for endpoint in tqdm(endpoints):
#         s = data_outcomes[f'{endpoint}_event'].loc[np.intersect1d(eids_dict[endpoint], eids_with_retinapic)]   # .loc[eids_dict[endpoint]]
#         n = s.sum()
#         freq = s.mean()
#         d.append({"endpoint": endpoint, "eligable":len(np.intersect1d(eids_dict[endpoint], eids_with_retinapic)), "n": n, "freq": freq})
        
#     endpoints_freqs = pd.DataFrame().from_dict(d)
#     endpoints_ds = endpoints_freqs.query("n>100").sort_values("endpoint")#.reset_index(drop=True)

#     return endpoints_ds # TODO

def read_merge_data(fp_in, split, data_covariates):
    temp = pd.read_feather(f"{fp_in}/{split}.feather").set_index("eid")
    if 'split' in temp.columns:
        temp.drop('split', axis=1, inplace=True)
    temp = temp.merge(data_covariates, left_index=True, right_index=True, how="left")
    
    return temp   

def load_pickle(fp):
    with open(fp, "rb") as fh:
        dctx = zstandard.ZstdDecompressor()
        with dctx.stream_reader(fh) as decompressor:
            data = pickle.loads(decompressor.read())
    return data
    
def save_pickle(data, data_path):
    with open(data_path, "wb") as fh:
        cctx = zstandard.ZstdCompressor()
        with cctx.stream_writer(fh) as compressor:
            compressor.write(pickle.dumps(data, protocol=pickle.HIGHEST_PROTOCOL))
            
def get_variable_schema(data):
    
    missing = data.columns[data.isna().any()].to_list()
    
    print('Missing columns:', missing)
    
    variable_schema = {}
    for m in missing:
        variable_schema[m] = [x for x in data.columns if x != m]
    
    return variable_schema

def tune_imputer(data):
    
    variable_schema = get_variable_schema(data)
        
    kernel = miceforest.ImputationKernel(data,
                                         datasets=1,
                                         random_state=42)#, train_nonmissing=True)
    
    best_hps, losses = kernel.tune_parameters(dataset=0, n_jobs=96, optimization_steps=5, verbose=True) # add bootstrrapping! 
  
    return best_hps

def get_imputer_hps(data_covariates, model, partition, samples):

    fp_in = f"{in_path}/{model}/{partition}"
    fp_out = f"{out_path}/{model}" # fp_out = f"{out_path}/{model}/"
    
    temp = read_merge_data(fp_in, "train", data_covariates.sample(samples))
    
    print("tune hps")
    best_hps = tune_imputer(temp)
    save_pickle(best_hps, f"{fp_out}/imputer_best_hps.p")
    
    return best_hps

def fit_imputer(data, best_hps=None):
    
    variable_schema = get_variable_schema(data)
        
    kernel = miceforest.ImputationKernel(data,
                                         datasets=1,
                                         random_state=42)#, train_nonmissing=True)

    # Run the MICE algorithm for 3 iterations
    kernel.mice(5, n_jobs=16, 
                variable_parameters=best_hps,
                verbose=True)
    
    return kernel
    
@ray.remote
def scale_encode_save_feather(partition, split, temp_df, scaler, variables_cont, variables_cat, fp_out):
    print(partition, split, f"scale {split}")
    temp_df[variables_cont] = scaler.transform(temp_df[variables_cont].values)

    print(partition, split, f"onehotencode {split}")
    temp_df = pd.get_dummies(temp_df, columns=variables_cat, prefix=variables_cat)

    # save imputed and standardized file
    print(partition, split, f"save {split}")
    temp_df.reset_index(drop=False).to_feather(f"{fp_out}/{split}.feather")
    

def impute_norm_variables(data_covariates, model, partition, variables_cont, variables_cat, samples):

    fp_in = f"{in_path}/{model}/{partition}"
    fp_out = f"{out_path}/{model}/{partition}"
      
    if pathlib.Path(fp_in).is_dir():
        if not pathlib.Path(fp_out).is_dir():
            pathlib.Path(fp_out).mkdir(parents=True, exist_ok=True)
            
    for split in tqdm(["train", "valid", "test"]):
        
        print(partition, split, "read and merge data")
        temp = read_merge_data(fp_in, split, data_covariates)
        
        temp = temp.loc[np.intersect1d(temp.index.values, eids_with_retinapic)]

        
        if split=="train": 
            # fit and save imputer
            print(partition, split, "fit imputer")
            print(partition, split, "fit imputer: load hps")
            best_hps = load_pickle(f"{out_path}/{model}/imputer_best_hps.p")
            print(partition, split, "fit imputer: fit imputer")
            print(temp.isna().sum()
                 )
            imputer = fit_imputer(temp.sample(samples),
#                                   best_hps
                                 )
            print(partition, split, "fit imputer: save imputer")
            save_pickle(imputer, f"{fp_out}/imputer.p")
            
            # check imputer and log results
#             print(partition, split, "check imputer: plot distributions")
#             print(imputer.plot_imputed_distributions(wspace=0.3,hspace=0.3))
            #plt.savefig(f"{fp_out}/imputed_dists.png")
            
        # apply imputer and scaler
        print(partition, split, f"impute {split}")
        
        if temp.isna().sum().sum() > 0:
            temp = imputer.impute_new_data(new_data=temp, verbose=True).complete_data(0)            
            
        
        if split=="train": 
            
            # fit and save standardscaler
            print(partition, split, "fit scaler")
            scaler = StandardScaler(with_mean=True, with_std=True, copy=True).fit(temp[variables_cont].values)
            save_pickle(scaler, f"{fp_out}/scaler.p")
            
        scale_encode_save_feather.remote(partition, split, temp, scaler, variables_cont, variables_cat, fp_out)
        
    return True

In [15]:
# only execute once to make sure we have a good set of lightgmb parameters
#get_imputer_hps(data_covariates, models[0], partitions[0], variables_to_norm, samples=10000)

In [16]:
#impute_norm_variables(data_covariates, models[0], partitions[0], variables_to_norm, 10000)

In [17]:
def norm_logh_and_extra(data_covariates, variables_cont, variables_cat, samples):
    
    print(f"Tune and fit imputation with {samples} samples")
    
#     for model in models:
        # instead of models[0]
#         hps_path = f"{out_path}/{model}/imputer_best_hps.p"
#         if not pathlib.Path(hps_path).is_file():
#             print(f"No HPs found, estimating new HPs...")
#             get_imputer_hps(data_covariates, model, partitions[0], samples)
#         else:
#             print(f"Use {hps_path}")

    progress = []
    for model in models:
        for partition in tqdm(partitions):
            progress.append(impute_norm_variables(data_covariates, 
                                                  model, partition,
                                                  variables_cont, 
                                                  variables_cat,
                                                  samples))
    #[ray.get(s) for s in tqdm(progress)]

In [18]:
# encode_categorical

In [19]:
norm_logh_and_extra(data_covariates, variables_cont, variables_cat, 15000)

Tune and fit imputation with 15000 samples


  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

0 train read and merge data
0 train fit imputer
0 train fit imputer: load hps
0 train fit imputer: fit imputer
OMOP_4306655                      0
phecode_002                       0
phecode_002-1                     0
phecode_003                       0
phecode_004                       0
                               ... 
schizophrenia                     0
sex                               0
smoking_status                  289
systemic_lupus_erythematosus      0
systolic_blood_pressure         181
Length: 1198, dtype: int64
Missing columns: ['bmi', 'cholesterol', 'ethnic_background', 'hdl_cholesterol', 'smoking_status', 'systolic_blood_pressure']
Dataset 0
1  | systolic_blood_pressure | smoking_status | bmi | ethnic_background | cholesterol | hdl_cholesterol
2  | systolic_blood_pressure | smoking_status | bmi | ethnic_background | cholesterol | hdl_cholesterol
3  | systolic_blood_pressure | smoking_status | bmi | ethnic_background | cholesterol | hdl_cholesterol
4  | systolic_blood

  0%|          | 0/3 [00:00<?, ?it/s]

1 train read and merge data
[2m[36m(scale_encode_save_feather pid=375284)[0m 0 test scale test
[2m[36m(scale_encode_save_feather pid=375284)[0m 0 test onehotencode test
[2m[36m(scale_encode_save_feather pid=375284)[0m 0 test save test
1 train fit imputer
1 train fit imputer: load hps
1 train fit imputer: fit imputer
OMOP_4306655                      0
phecode_002                       0
phecode_002-1                     0
phecode_003                       0
phecode_004                       0
                               ... 
schizophrenia                     0
sex                               0
smoking_status                  291
systemic_lupus_erythematosus      0
systolic_blood_pressure         181
Length: 1198, dtype: int64
Missing columns: ['bmi', 'cholesterol', 'ethnic_background', 'hdl_cholesterol', 'smoking_status', 'systolic_blood_pressure']
Dataset 0
1  | systolic_blood_pressure | bmi | smoking_status | ethnic_background | cholesterol | hdl_cholesterol
2  | systol

  0%|          | 0/3 [00:00<?, ?it/s]

2 train read and merge data
[2m[36m(scale_encode_save_feather pid=375284)[0m 1 test save test
2 train fit imputer
2 train fit imputer: load hps
2 train fit imputer: fit imputer
OMOP_4306655                      0
phecode_002                       0
phecode_002-1                     0
phecode_003                       0
phecode_004                       0
                               ... 
schizophrenia                     0
sex                               0
smoking_status                  294
systemic_lupus_erythematosus      0
systolic_blood_pressure         186
Length: 1198, dtype: int64
Missing columns: ['bmi', 'cholesterol', 'ethnic_background', 'hdl_cholesterol', 'smoking_status', 'systolic_blood_pressure']
Dataset 0
1  | systolic_blood_pressure | bmi | smoking_status | ethnic_background | cholesterol | hdl_cholesterol
2  | systolic_blood_pressure | bmi | smoking_status | ethnic_background | cholesterol | hdl_cholesterol
3  | systolic_blood_pressure | bmi | smoking_status | 

  0%|          | 0/3 [00:00<?, ?it/s]

3 train read and merge data
[2m[36m(scale_encode_save_feather pid=375284)[0m 2 test scale test
[2m[36m(scale_encode_save_feather pid=375284)[0m 2 test onehotencode test
[2m[36m(scale_encode_save_feather pid=375284)[0m 2 test save test
3 train fit imputer
3 train fit imputer: load hps
3 train fit imputer: fit imputer
OMOP_4306655                      0
phecode_002                       0
phecode_002-1                     0
phecode_003                       0
phecode_004                       0
                               ... 
schizophrenia                     0
sex                               0
smoking_status                  297
systemic_lupus_erythematosus      0
systolic_blood_pressure         187
Length: 1198, dtype: int64
Missing columns: ['bmi', 'cholesterol', 'ethnic_background', 'hdl_cholesterol', 'smoking_status', 'systolic_blood_pressure']
Dataset 0
1  | systolic_blood_pressure | smoking_status | bmi | ethnic_background | cholesterol | hdl_cholesterol
2  | systol

  0%|          | 0/3 [00:00<?, ?it/s]

4 train read and merge data
[2m[36m(scale_encode_save_feather pid=375284)[0m 3 test onehotencode test
[2m[36m(scale_encode_save_feather pid=375284)[0m 3 test save test
4 train fit imputer
4 train fit imputer: load hps
4 train fit imputer: fit imputer
OMOP_4306655                      0
phecode_002                       0
phecode_002-1                     0
phecode_003                       0
phecode_004                       0
                               ... 
schizophrenia                     0
sex                               0
smoking_status                  298
systemic_lupus_erythematosus      0
systolic_blood_pressure         187
Length: 1198, dtype: int64
Missing columns: ['bmi', 'cholesterol', 'ethnic_background', 'hdl_cholesterol', 'smoking_status', 'systolic_blood_pressure']
Dataset 0
1  | systolic_blood_pressure | bmi | smoking_status | ethnic_background | cholesterol | hdl_cholesterol
2  | systolic_blood_pressure | bmi | smoking_status | ethnic_background | cholest

  0%|          | 0/3 [00:00<?, ?it/s]

5 train read and merge data
5 train fit imputer
5 train fit imputer: load hps
5 train fit imputer: fit imputer
OMOP_4306655                      0
phecode_002                       0
phecode_002-1                     0
phecode_003                       0
phecode_004                       0
                               ... 
schizophrenia                     0
sex                               0
smoking_status                  299
systemic_lupus_erythematosus      0
systolic_blood_pressure         188
Length: 1197, dtype: int64
Missing columns: ['bmi', 'cholesterol', 'ethnic_background', 'hdl_cholesterol', 'smoking_status', 'systolic_blood_pressure']
Dataset 0
1  | systolic_blood_pressure | bmi | smoking_status | ethnic_background | cholesterol | hdl_cholesterol
2  | systolic_blood_pressure | bmi | smoking_status | ethnic_background | cholesterol | hdl_cholesterol
3  | systolic_blood_pressure | bmi | smoking_status | ethnic_background | cholesterol | hdl_cholesterol
4  | systolic_blood

  0%|          | 0/3 [00:00<?, ?it/s]

6 train read and merge data
[2m[36m(scale_encode_save_feather pid=375284)[0m 5 test scale test
[2m[36m(scale_encode_save_feather pid=375284)[0m 5 test onehotencode test
[2m[36m(scale_encode_save_feather pid=375284)[0m 5 test save test
6 train fit imputer
6 train fit imputer: load hps
6 train fit imputer: fit imputer
OMOP_4306655                      0
phecode_002                       0
phecode_002-1                     0
phecode_003                       0
phecode_004                       0
                               ... 
schizophrenia                     0
sex                               0
smoking_status                  299
systemic_lupus_erythematosus      0
systolic_blood_pressure         188
Length: 1197, dtype: int64
Missing columns: ['bmi', 'cholesterol', 'ethnic_background', 'hdl_cholesterol', 'smoking_status', 'systolic_blood_pressure']
Dataset 0
1  | systolic_blood_pressure | bmi | smoking_status | ethnic_background | cholesterol | hdl_cholesterol
2  | systol

  0%|          | 0/3 [00:00<?, ?it/s]

7 train read and merge data
[2m[36m(scale_encode_save_feather pid=375284)[0m 6 test scale test
[2m[36m(scale_encode_save_feather pid=375284)[0m 6 test onehotencode test
[2m[36m(scale_encode_save_feather pid=375284)[0m 6 test save test
7 train fit imputer
7 train fit imputer: load hps
7 train fit imputer: fit imputer
OMOP_4306655                      0
phecode_002                       0
phecode_002-1                     0
phecode_003                       0
phecode_004                       0
                               ... 
schizophrenia                     0
sex                               0
smoking_status                  300
systemic_lupus_erythematosus      0
systolic_blood_pressure         188
Length: 1198, dtype: int64
Missing columns: ['bmi', 'cholesterol', 'ethnic_background', 'hdl_cholesterol', 'smoking_status', 'systolic_blood_pressure']
Dataset 0
1  | systolic_blood_pressure | smoking_status | bmi | ethnic_background | cholesterol | hdl_cholesterol
2  | systol

  0%|          | 0/3 [00:00<?, ?it/s]

8 train read and merge data
[2m[36m(scale_encode_save_feather pid=375284)[0m 7 test scale test
[2m[36m(scale_encode_save_feather pid=375284)[0m 7 test onehotencode test
[2m[36m(scale_encode_save_feather pid=375284)[0m 7 test save test
8 train fit imputer
8 train fit imputer: load hps
8 train fit imputer: fit imputer
OMOP_4306655                      0
phecode_002                       0
phecode_002-1                     0
phecode_003                       0
phecode_004                       0
                               ... 
schizophrenia                     0
sex                               0
smoking_status                  300
systemic_lupus_erythematosus      0
systolic_blood_pressure         188
Length: 1198, dtype: int64
Missing columns: ['bmi', 'cholesterol', 'ethnic_background', 'hdl_cholesterol', 'smoking_status', 'systolic_blood_pressure']
Dataset 0
1  | systolic_blood_pressure | smoking_status | bmi | ethnic_background | cholesterol | hdl_cholesterol
2  | systol

  0%|          | 0/3 [00:00<?, ?it/s]

9 train read and merge data
[2m[36m(scale_encode_save_feather pid=375284)[0m 8 test onehotencode test
[2m[36m(scale_encode_save_feather pid=375284)[0m 8 test save test
9 train fit imputer
9 train fit imputer: load hps
9 train fit imputer: fit imputer
OMOP_4306655                      0
phecode_002                       0
phecode_002-1                     0
phecode_003                       0
phecode_004                       0
                               ... 
schizophrenia                     0
sex                               0
smoking_status                  301
systemic_lupus_erythematosus      0
systolic_blood_pressure         188
Length: 1198, dtype: int64
Missing columns: ['bmi', 'cholesterol', 'ethnic_background', 'hdl_cholesterol', 'smoking_status', 'systolic_blood_pressure']
Dataset 0
1  | systolic_blood_pressure | bmi | smoking_status | ethnic_background | cholesterol | hdl_cholesterol
2  | systolic_blood_pressure | bmi | smoking_status | ethnic_background | cholest

  0%|          | 0/3 [00:00<?, ?it/s]

10 train read and merge data
[2m[36m(scale_encode_save_feather pid=375284)[0m 9 test scale test
[2m[36m(scale_encode_save_feather pid=375284)[0m 9 test onehotencode test
[2m[36m(scale_encode_save_feather pid=375284)[0m 9 test save test
10 train fit imputer
10 train fit imputer: load hps
10 train fit imputer: fit imputer
OMOP_4306655                      0
phecode_002                       0
phecode_002-1                     0
phecode_003                       0
phecode_004                       0
                               ... 
schizophrenia                     0
sex                               0
smoking_status                  301
systemic_lupus_erythematosus      0
systolic_blood_pressure         188
Length: 1198, dtype: int64
Missing columns: ['bmi', 'cholesterol', 'ethnic_background', 'hdl_cholesterol', 'smoking_status', 'systolic_blood_pressure']
Dataset 0
1  | systolic_blood_pressure | bmi | smoking_status | ethnic_background | cholesterol | hdl_cholesterol
2  | sy

  0%|          | 0/3 [00:00<?, ?it/s]

11 train read and merge data
[2m[36m(scale_encode_save_feather pid=375284)[0m 10 test scale test
[2m[36m(scale_encode_save_feather pid=375284)[0m 10 test onehotencode test
[2m[36m(scale_encode_save_feather pid=375284)[0m 10 test save test
11 train fit imputer
11 train fit imputer: load hps
11 train fit imputer: fit imputer
OMOP_4306655                      0
phecode_002                       0
phecode_002-1                     0
phecode_003                       0
phecode_004                       0
                               ... 
schizophrenia                     0
sex                               0
smoking_status                  302
systemic_lupus_erythematosus      0
systolic_blood_pressure         189
Length: 1198, dtype: int64
Missing columns: ['bmi', 'cholesterol', 'ethnic_background', 'hdl_cholesterol', 'smoking_status', 'systolic_blood_pressure']
Dataset 0
1  | systolic_blood_pressure | bmi | smoking_status | ethnic_background | cholesterol | hdl_cholesterol
2  |

  0%|          | 0/3 [00:00<?, ?it/s]

[2m[36m(scale_encode_save_feather pid=375284)[0m 11 test scale test
[2m[36m(scale_encode_save_feather pid=375284)[0m 11 test onehotencode test
[2m[36m(scale_encode_save_feather pid=375284)[0m 11 test save test
12 train read and merge data
12 train fit imputer
12 train fit imputer: load hps
12 train fit imputer: fit imputer
OMOP_4306655                      0
phecode_002                       0
phecode_002-1                     0
phecode_003                       0
phecode_004                       0
                               ... 
schizophrenia                     0
sex                               0
smoking_status                  302
systemic_lupus_erythematosus      0
systolic_blood_pressure         189
Length: 1198, dtype: int64
Missing columns: ['bmi', 'cholesterol', 'ethnic_background', 'hdl_cholesterol', 'smoking_status', 'systolic_blood_pressure']
Dataset 0
1  | systolic_blood_pressure | bmi | smoking_status | ethnic_background | cholesterol | hdl_cholesterol
2  |

  0%|          | 0/3 [00:00<?, ?it/s]

13 train read and merge data
[2m[36m(scale_encode_save_feather pid=375284)[0m 12 test scale test
[2m[36m(scale_encode_save_feather pid=375284)[0m 12 test onehotencode test
[2m[36m(scale_encode_save_feather pid=375284)[0m 12 test save test
13 train fit imputer
13 train fit imputer: load hps
13 train fit imputer: fit imputer
OMOP_4306655                      0
phecode_002                       0
phecode_002-1                     0
phecode_003                       0
phecode_004                       0
                               ... 
schizophrenia                     0
sex                               0
smoking_status                  303
systemic_lupus_erythematosus      0
systolic_blood_pressure         189
Length: 1198, dtype: int64
Missing columns: ['bmi', 'cholesterol', 'ethnic_background', 'hdl_cholesterol', 'smoking_status', 'systolic_blood_pressure']
Dataset 0
1  | systolic_blood_pressure | bmi | smoking_status | ethnic_background | cholesterol | hdl_cholesterol
2  |

  0%|          | 0/3 [00:00<?, ?it/s]

14 train read and merge data
[2m[36m(scale_encode_save_feather pid=375284)[0m 13 test scale test
[2m[36m(scale_encode_save_feather pid=375284)[0m 13 test onehotencode test
[2m[36m(scale_encode_save_feather pid=375284)[0m 13 test save test
14 train fit imputer
14 train fit imputer: load hps
14 train fit imputer: fit imputer
OMOP_4306655                      0
phecode_002                       0
phecode_002-1                     0
phecode_003                       0
phecode_004                       0
                               ... 
schizophrenia                     0
sex                               0
smoking_status                  305
systemic_lupus_erythematosus      0
systolic_blood_pressure         189
Length: 1198, dtype: int64
Missing columns: ['bmi', 'cholesterol', 'ethnic_background', 'hdl_cholesterol', 'smoking_status', 'systolic_blood_pressure']
Dataset 0
1  | systolic_blood_pressure | smoking_status | bmi | ethnic_background | cholesterol | hdl_cholesterol
2  |

  0%|          | 0/3 [00:00<?, ?it/s]

[2m[36m(scale_encode_save_feather pid=375284)[0m 14 test scale test
[2m[36m(scale_encode_save_feather pid=375284)[0m 14 test onehotencode test
[2m[36m(scale_encode_save_feather pid=375284)[0m 14 test save test
15 train read and merge data
15 train fit imputer
15 train fit imputer: load hps
15 train fit imputer: fit imputer
OMOP_4306655                      0
phecode_002                       0
phecode_002-1                     0
phecode_003                       0
phecode_004                       0
                               ... 
schizophrenia                     0
sex                               0
smoking_status                  305
systemic_lupus_erythematosus      0
systolic_blood_pressure         189
Length: 1198, dtype: int64
Missing columns: ['bmi', 'cholesterol', 'ethnic_background', 'hdl_cholesterol', 'smoking_status', 'systolic_blood_pressure']
Dataset 0
1  | systolic_blood_pressure | smoking_status | bmi | ethnic_background | cholesterol | hdl_cholesterol
2  |

  0%|          | 0/3 [00:00<?, ?it/s]

16 train read and merge data
[2m[36m(scale_encode_save_feather pid=375284)[0m 15 test scale test
[2m[36m(scale_encode_save_feather pid=375284)[0m 15 test onehotencode test
[2m[36m(scale_encode_save_feather pid=375284)[0m 15 test save test
16 train fit imputer
16 train fit imputer: load hps
16 train fit imputer: fit imputer
OMOP_4306655                      0
phecode_002                       0
phecode_002-1                     0
phecode_003                       0
phecode_004                       0
                               ... 
schizophrenia                     0
sex                               0
smoking_status                  307
systemic_lupus_erythematosus      0
systolic_blood_pressure         189
Length: 1197, dtype: int64
Missing columns: ['bmi', 'cholesterol', 'ethnic_background', 'hdl_cholesterol', 'smoking_status', 'systolic_blood_pressure']
Dataset 0
1  | systolic_blood_pressure | bmi | smoking_status | ethnic_background | cholesterol | hdl_cholesterol
2  |

  0%|          | 0/3 [00:00<?, ?it/s]

17 train read and merge data
17 train fit imputer
17 train fit imputer: load hps
17 train fit imputer: fit imputer
OMOP_4306655                      0
phecode_002                       0
phecode_002-1                     0
phecode_003                       0
phecode_004                       0
                               ... 
schizophrenia                     0
sex                               0
smoking_status                  307
systemic_lupus_erythematosus      0
systolic_blood_pressure         189
Length: 1197, dtype: int64
Missing columns: ['bmi', 'cholesterol', 'ethnic_background', 'hdl_cholesterol', 'smoking_status', 'systolic_blood_pressure']
Dataset 0
1  | systolic_blood_pressure | smoking_status | bmi | ethnic_background | cholesterol | hdl_cholesterol
2  | systolic_blood_pressure | smoking_status | bmi | ethnic_background | cholesterol | hdl_cholesterol
3  | systolic_blood_pressure | smoking_status | bmi | ethnic_background | cholesterol | hdl_cholesterol
4  | systolic_b

  0%|          | 0/3 [00:00<?, ?it/s]

18 train read and merge data
18 train fit imputer
18 train fit imputer: load hps
18 train fit imputer: fit imputer
OMOP_4306655                      0
phecode_002                       0
phecode_002-1                     0
phecode_003                       0
phecode_004                       0
                               ... 
schizophrenia                     0
sex                               0
smoking_status                  308
systemic_lupus_erythematosus      0
systolic_blood_pressure         189
Length: 1197, dtype: int64
Missing columns: ['bmi', 'cholesterol', 'ethnic_background', 'hdl_cholesterol', 'smoking_status', 'systolic_blood_pressure']
Dataset 0
1  | systolic_blood_pressure | bmi | smoking_status | ethnic_background | cholesterol | hdl_cholesterol
2  | systolic_blood_pressure | bmi | smoking_status | ethnic_background | cholesterol | hdl_cholesterol
3  | systolic_blood_pressure | bmi | smoking_status | ethnic_background | cholesterol | hdl_cholesterol
4  | systolic_b

  0%|          | 0/3 [00:00<?, ?it/s]

19 train read and merge data
[2m[36m(scale_encode_save_feather pid=375284)[0m 18 test scale test
[2m[36m(scale_encode_save_feather pid=375284)[0m 18 test onehotencode test
[2m[36m(scale_encode_save_feather pid=375284)[0m 18 test save test
19 train fit imputer
19 train fit imputer: load hps
19 train fit imputer: fit imputer
OMOP_4306655                      0
phecode_002                       0
phecode_002-1                     0
phecode_003                       0
phecode_004                       0
                               ... 
schizophrenia                     0
sex                               0
smoking_status                  313
systemic_lupus_erythematosus      0
systolic_blood_pressure         190
Length: 1197, dtype: int64
Missing columns: ['bmi', 'cholesterol', 'ethnic_background', 'hdl_cholesterol', 'smoking_status', 'systolic_blood_pressure']
Dataset 0
1  | systolic_blood_pressure | bmi | smoking_status | ethnic_background | cholesterol | hdl_cholesterol
2  |

  0%|          | 0/3 [00:00<?, ?it/s]

20 train read and merge data
[2m[36m(scale_encode_save_feather pid=375284)[0m 19 test scale test
[2m[36m(scale_encode_save_feather pid=375284)[0m 19 test onehotencode test
[2m[36m(scale_encode_save_feather pid=375284)[0m 19 test save test
20 train fit imputer
20 train fit imputer: load hps
20 train fit imputer: fit imputer
OMOP_4306655                      0
phecode_002                       0
phecode_002-1                     0
phecode_003                       0
phecode_004                       0
                               ... 
schizophrenia                     0
sex                               0
smoking_status                  313
systemic_lupus_erythematosus      0
systolic_blood_pressure         191
Length: 1197, dtype: int64
Missing columns: ['bmi', 'cholesterol', 'ethnic_background', 'hdl_cholesterol', 'smoking_status', 'systolic_blood_pressure']
Dataset 0
1  | systolic_blood_pressure | bmi | smoking_status | ethnic_background | cholesterol | hdl_cholesterol
2  |

  0%|          | 0/3 [00:00<?, ?it/s]

21 train read and merge data
[2m[36m(scale_encode_save_feather pid=375284)[0m 20 test scale test
[2m[36m(scale_encode_save_feather pid=375284)[0m 20 test onehotencode test
[2m[36m(scale_encode_save_feather pid=375284)[0m 20 test save test
21 train fit imputer
21 train fit imputer: load hps
21 train fit imputer: fit imputer
OMOP_4306655                      0
phecode_002                       0
phecode_002-1                     0
phecode_003                       0
phecode_004                       0
                               ... 
schizophrenia                     0
sex                               0
smoking_status                  315
systemic_lupus_erythematosus      0
systolic_blood_pressure         192
Length: 1197, dtype: int64
Missing columns: ['bmi', 'cholesterol', 'ethnic_background', 'hdl_cholesterol', 'smoking_status', 'systolic_blood_pressure']
Dataset 0
1  | systolic_blood_pressure | bmi | smoking_status | ethnic_background | cholesterol | hdl_cholesterol
2  |

In [20]:
import sklearn
sklearn.__version__

'1.0.1'

In [21]:
1+1

2