In [1]:
import pandas as pd
from pathlib import Path
import numpy as np

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

from pickle import dump

In [2]:
def split_transform_save(input_path = Path('../nivis_data/data_bio_albcr.csv'),
                         output_path = Path('../nivis_data/'),
                         target_names = ['AGE'],
                         cat_names = ['GENDER'],
                         drop_names = ['SEQN', 'AlkPhos_UL'],
                         age_groups = 8,
                         rand_state = 0,
                         test_size = 0.1):
    '''Split a biomarker CSV into stratified train/test sets, transform, and save.

    The CSV is loaded, `drop_names` columns are removed, and the remaining
    columns are arranged as categorical, continuous, target. A single
    stratified shuffle split is drawn using equal-width bins of the 'AGE'
    column as strata. The transformer (ordinal-encode categorical columns,
    standard-scale continuous columns, pass target columns through) is fitted
    on the training rows only and then applied to the test rows.

    Parameters:
        input_path: CSV file to read.
        output_path: directory that receives 'train_data.csv',
            'test_data.csv' and 'full_data.csv' (train rows followed by test
            rows, all transformed).
        target_names: target column names, passed through unchanged.
        cat_names: categorical column names, ordinal-encoded.
        drop_names: column names removed right after loading.
        age_groups: number of equal-width 'AGE' bins used for stratification.
        rand_state: random seed of the shuffle split.
        test_size: share of rows placed in the test set.

    Side effects:
        Writes the three CSV files into `output_path` and pickles the fitted
        transformer to 'transformer.pkl' in the current working directory
        (not in `output_path`).

    Returns:
        None.
    '''
    output_path = Path(output_path)
    # Copy the name lists so the (mutable) default arguments are never shared.
    cat_names, target_names = list(cat_names), list(target_names)

    # Loading and preprocessing a dataset
    raw_df = pd.read_csv(input_path, low_memory=False).drop(columns=list(drop_names))
    cont_names = [name for name in raw_df.columns
                  if name not in cat_names + target_names]

    # The transformer addresses columns by position and emits them in the
    # order encoder, scaler, passthrough. Putting the frame into that same
    # order guarantees the labels attached below match the data, whatever the
    # column order in the CSV is.
    ordered_names = cat_names + cont_names + target_names
    ordered_df = raw_df[ordered_names]

    # Categorical age groups used only as strata for the train/test split
    age_group = pd.cut(ordered_df['AGE'],
                       bins=age_groups,
                       labels=range(age_groups))

    # Making train test split with proportional age groups
    splitter = StratifiedShuffleSplit(n_splits=1,
                                      test_size=test_size,
                                      random_state=rand_state)
    # n_splits=1, so the generator yields exactly one (train, test) pair.
    train_index, test_index = next(splitter.split(ordered_df, age_group))
    strat_train_set = ordered_df.iloc[train_index]
    strat_test_set = ordered_df.iloc[test_index]

    # Positional column blocks derived from the name lists, so more than one
    # categorical or target column is handled as well.
    n_cat, n_cont = len(cat_names), len(cont_names)
    full_transform = ColumnTransformer([
        ('encoder', OrdinalEncoder(), list(range(n_cat))),
        ('scaler', StandardScaler(), slice(n_cat, n_cat + n_cont)),
        ('passthrough', 'passthrough',
         list(range(n_cat + n_cont, len(ordered_names)))),
    ])

    # Fit on the training rows only; the test rows reuse the fitted statistics.
    train_df = pd.DataFrame(full_transform.fit_transform(strat_train_set),
                            columns=ordered_names)
    test_df = pd.DataFrame(full_transform.transform(strat_test_set),
                           columns=ordered_names)
    full_df = pd.concat([train_df, test_df]).reset_index(drop=True)

    train_df.to_csv(output_path/'train_data.csv', index=False)
    test_df.to_csv(output_path/'test_data.csv', index=False)
    full_df.to_csv(output_path/'full_data.csv', index=False)

    # Context manager closes the pickle file deterministically.
    with open('transformer.pkl', 'wb') as pickle_file:
        dump(full_transform, pickle_file)

In [3]:
def make_biomarkers_units(to_path=Path('non_NHANES/markers_units.csv')):
    '''Write a two-column CSV ("Biomarker,Unit") parsed from an embedded lab form.

    Each line of the embedded text has the shape
    "<name> [ <blank field> ] <reference range and unit>". The text before
    '[' becomes the biomarker name; the text after the closing '] ' becomes
    the "Unit" value, so that column holds the reference range together with
    its unit. The fifth entry (Protein total) is replaced by
    '6.4 - 8.3 g/dl' before writing.

    Parameters:
        to_path: destination file; it is overwritten, and the written text
            has no trailing newline.

    Returns:
        None.
    '''
    form_text='''Albumin**                   [                ] 35 - 52 g/l
    Glucose**                        [                ] 3.9 - 5.8 mmole/l
    Urea**(BUN)                      [                ] 2.5 - 6.4 mmole/l
    Cholesterol**                    [                ] 3.37 - 5.96 mmole/l
    Protein total**                  [                ] 64 - 83 g/l
    Sodium**                         [                ] 136 - 146 mmole/l
    Creatinine**                     [                ] 53 - 97 mmole/l
    Hemoglobin**                     [                ] 11.7 - 15.5 g/dl
    Bilirubin total                  [                ] 1.7 - 21 mcmole/l
    Triglycerides                    [                ] 0.68 - 6 mmole/l
    HDL Cholesterol                  [                ] < 3.3 mmole/l
    LDL cholesterol (by Friedewald)  [                ] 1.81- 4.04 mmole/l
    Calcium                          [                ] 2.15 - 2.65 mmole/l
    Potassium                        [                ] 3.4 - 5.1 mmole/l
    Hematocrit                       [                ] 37 - 50 %
    MCHC                             [                ] 31.5 - 35.7 g/dL
    MCV                              [                ] 82 - 95 fl
    Platelets                        [                ] 150 - 450 10^3 /mcl
    Erythrocytes (RBC)               [                ] 3.5 - 5.5 10^6 /mcl'''

    marker_names = []
    marker_units = []
    for form_row in form_text.splitlines():
        name_part, value_part = form_row.split('[')
        marker_names.append(name_part.strip())
        # Once stripped, the right-hand part starts with '] '; skip those
        # two characters to keep only the range and unit.
        marker_units.append(value_part.strip()[2:])

    # Protein total: use this value instead of the one parsed from the form.
    marker_units[4] = '6.4 - 8.3 g/dl'

    csv_text = 'Biomarker,Unit\n' + ''.join(
        name + ',' + unit + '\n'
        for name, unit in zip(marker_names, marker_units)
    )

    with open(to_path, 'w') as csv_file:
        csv_file.write(csv_text.strip())

In [4]:
def make_samples(template_path=Path('test_data.csv'),
                 to_path=Path('non_NHANES/samples_from_aging.csv')):
    '''Write five hard-coded sample records to a CSV using a template's header.

    Only the column names are taken from `template_path`; its rows are not
    used, so the template may contain any number of rows. The records are
    written in the order FEMALE31, FEMALE61, MALE29, MALE40, MALE69.

    Parameters:
        template_path: CSV whose header supplies the output column names.
            It must have as many columns as each sample has values (21).
        to_path: destination CSV; it is overwritten and written without an
            index column.

    Raises:
        ValueError: if the template's column count differs from the number
            of values per sample.

    Returns:
        None.
    '''
    MALE69 = [1.0, 37.0, 5.16, 3.89, 4.73, 5.9, 140.0, 88.4, 14.4, 18.81, 1.11,
              1.05, 3.13, 2.05, 4.4, 44.2, 32.6, 91.0, 188.0, 4.86, 69.0]
    MALE40 = [1.0, 49.21, 5.28, 7.3, 7.04, 7.0, 143.0, 100.5, 16.5, 17.2, 1.35,
              1.6, 4.83, 2.48, 4.7, 46.7, 35.4, 86.5, 177.0, 5.41, 40.0]
    MALE29 = [1.0, 47.64, 5.06, 6.0, 5.35, 8.2, 139.0, 72.7, 13.6, 10.9, 0.74,
              1.38, 3.63, 2.62, 4.7, 40.8, 33.3, 73.2, 206.0, 5.57, 29.0]
    FEMALE31 = [0.0, 41, 4.27, 3, 3.24, 6.9, 140, 53.04, 13.3, 22.23, 0.56,
                0.92, 1.85, 2.12, 3.7, 38.4, 34.6, 88.3, 164, 4.35, 31.0]
    FEMALE61 = [0.0, 38.72, 5.65, 6.4, 6.54, 6.4, 141, 67, 13.5, 7.2, 1.48,
                1.37, 4.5, 2.33, 4.2, 37.3, 36.2, 84.4, 437, 4.42, 61.0]

    people = [FEMALE31, FEMALE61, MALE29, MALE40, MALE69]

    # nrows=0 reads just the header, so no template rows need to exist and
    # none can silently limit how many samples are written.
    column_names = pd.read_csv(template_path, nrows=0).columns
    values_per_sample = len(people[0])
    if len(column_names) != values_per_sample:
        raise ValueError(
            'template has {} columns but each sample has {} values'.format(
                len(column_names), values_per_sample))

    # dtype=float keeps integer literals (e.g. 41) formatted as floats.
    samples_df = pd.DataFrame(people, columns=column_names, dtype=float)
    samples_df.to_csv(to_path, index=False)

In [5]:
# Run the load/split/transform/save step with all of its default arguments.
split_transform_save()

In [6]:
# Write the biomarker/unit CSV to the function's default destination.
make_biomarkers_units()

In [7]:
# Write the hard-coded sample records using the function's default file locations.
make_samples()