In [1]:
import pandas as pd
from pathlib import Path
import numpy as np

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
import pickle

In [3]:
# Folder with the project's CSV files; resolved against the working directory
# at the moment this cell runs.
DATA_PATH = Path.cwd().joinpath('Data')

In [4]:
# Seed handed to the train/test splitter so that re-running the notebook
# reproduces the same split.
RAND_STATE: int = 0

#### Creating test and train sets

In [5]:
# Load the merged dataset and discard the 'AlkPhos_UL' column when it is
# present (errors='ignore' makes the drop a no-op otherwise).
full_df = (
    pd.read_csv(DATA_PATH/'Final_data.csv', low_memory=False)
      .drop(columns='AlkPhos_UL', errors='ignore')
)

In [6]:
# Bucket AGE into AGE_GROUP_AMOUNT bins (labelled 0..AGE_GROUP_AMOUNT-1) so
# that the train/test split below can be stratified on age.
AGE_GROUP_AMOUNT = 8
full_df['AGE_GROUP'] = pd.cut(
    full_df['AGE'],
    bins=AGE_GROUP_AMOUNT,
    labels=range(AGE_GROUP_AMOUNT),
)

# 90/10 split that keeps the age-group proportions in both parts.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.1,
                               random_state=RAND_STATE)
# n_splits=1, so the generator yields exactly one pair of positional indices.
train_index, test_index = next(split.split(full_df, full_df['AGE_GROUP']))
strat_train_set = full_df.iloc[train_index]
strat_test_set = full_df.iloc[test_index]

In [7]:
# Columns to standardise: everything except the first two and the last two
# columns of the frame.
# NOTE(review): this positional slice assumes the layout
# [SEQN, GENDER, <biomarkers...>, AGE, AGE_GROUP] -- confirm against
# Final_data.csv.
scaled_columns = strat_train_set.columns[2:-2].tolist()

# Output order follows the transformer order below: encoded GENDER, the
# scaled columns, then AGE unchanged. SEQN and AGE_GROUP are dropped.
full_transform = ColumnTransformer(transformers=[
    ('encoder', OrdinalEncoder(), ['GENDER']),
    ('drop', 'drop', ['SEQN', 'AGE_GROUP']),
    ('scaler', StandardScaler(), scaled_columns),
    ('passthrough', 'passthrough', ["AGE"]),
])

# Fit the transformer on the training split and keep the transformed array.
trans_train = full_transform.fit_transform(strat_train_set)

In [8]:
# Column labels for the transformed arrays: the original columns without the
# first (dropped SEQN) and the last (dropped AGE_GROUP) one.
feature_columns = strat_train_set.columns[1:-1]

# Fit the encoder/scaler on the training split only ...
train = pd.DataFrame(full_transform.fit_transform(strat_train_set),
                     columns=feature_columns)
# ... and re-use those fitted parameters for the test split. Calling
# fit_transform here would re-estimate the scaling statistics from the test
# data, so train and test features would end up on different scales.
test = pd.DataFrame(full_transform.transform(strat_test_set),
                    columns=feature_columns)

In [9]:
# Write both splits next to the input data; going through DATA_PATH keeps the
# output location in sync with the folder the source CSV was read from.
train.to_csv(DATA_PATH/'train_data.csv', index=False)
test.to_csv(DATA_PATH/'test_data.csv', index=False)

In [10]:
# Stack the transformed train and test rows into one frame with a fresh
# 0..n-1 index. Note that this rebinds `full_df`, replacing the raw frame
# loaded from Final_data.csv.
full_df = pd.concat([train, test], ignore_index=True)

In [11]:
# Persist the combined (train + test) transformed data; the path is built
# from DATA_PATH so it follows the same data folder as the input file.
full_df.to_csv(DATA_PATH/'full_data.csv', index=False)

#### Creating zipped biomarkers and their units of measurement

In [12]:
# Reference-range sheet, one biomarker per line in the form
#   <name> [ <blank input box> ] <reference range and unit>
# The cells below split each line on '[' to recover the name and the range.
# NOTE(review): the creatinine range 53 - 97 is normally quoted in
# micromole/l rather than mmole/l -- confirm the unit with the data source.
a = '\n'.join((
    'Albumin**                        [                    ] 35 - 52 g/l',
    'Glucose**                        [                    ] 3.9 - 5.8 mmole/l',
    'Urea**(BUN)                      [                    ] 2.5 - 6.4 mmole/l',
    'Cholesterol**                    [                    ] 3.37 - 5.96 mmole/l',
    'Protein total**                  [                    ] 64 - 83 g/l',
    'Sodium**                         [                    ] 136 - 146 mmole/l',
    'Creatinine**                     [                    ] 53 - 97 mmole/l',
    'Hemoglobin**                     [                    ] 11.7 - 15.5 g/dl',
    'Bilirubin total                  [                    ] 1.7 - 21 mcmole/l',
    'Triglycerides                    [                    ] 0.68 - 6 mmole/l',
    'HDL Cholesterol                  [                    ] < 3.3 mmole/l',
    'LDL cholesterol (by Friedewald)  [                    ] 1.81- 4.04 mmole/l',
    'Calcium                          [                    ] 2.15 - 2.65 mmole/l',
    'Potassium                        [                    ] 3.4 - 5.1 mmole/l',
    'Hematocrit                       [                    ] 37 - 50 %',
    'MCHC                             [                    ] 31.5 - 35.7 g/dL',
    'MCV                              [                    ] 82 - 95 fl',
    'Platelets                        [                    ] 150 - 450 10^3 /mcl',
    'Erythrocytes (RBC)               [                    ] 3.5 - 5.5 10^6 /mcl',
))

In [13]:
# Turn every sheet line into a (name, remainder) pair: cut at the opening
# bracket and trim the padding on both halves. Unpacking into two names
# makes a line with zero or several '[' fail loudly.
lines = [
    (name_part.strip(), range_part.strip())
    for name_part, range_part in (row.split('[') for row in a.splitlines())
]

In [14]:
# Biomarker names, in sheet order.
biomarkers = [name for name, _ in lines]
# Reference range + unit; the first two characters (the leftover "] " of the
# input box) are sliced off.
measures = [remainder[2:] for _, remainder in lines]

In [15]:
# Override the total-protein reference range with its g/dl equivalent.
# The row is located by biomarker name instead of the hard-coded position 4,
# so reordering or inserting sheet lines cannot silently patch the wrong
# entry (a missing name raises ValueError).
measures[biomarkers.index('Protein total**')] = '6.4 - 8.3 g/dl'

In [16]:
# Pair every biomarker name with its reference range / unit string.
z = [(name, unit) for name, unit in zip(biomarkers, measures)]

In [17]:
# CSV text: header row followed by one "name,unit" row per biomarker, each
# terminated by a newline.
s = 'Biomarker,Unit\n' + ''.join(name + ',' + unit + '\n' for name, unit in z)

In [18]:
# Save the biomarker/unit table (without the trailing newline). The path is
# derived from DATA_PATH so it stays consistent with the other data files,
# and the encoding is spelled out so the output does not depend on the
# platform default.
with open(DATA_PATH/'non_NHANES'/'markers_units.csv', "w",
          encoding="utf-8") as text_file:
    text_file.write(s.strip())