In [46]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Dense, LeakyReLU, Dropout, BatchNormalization
from tensorflow.keras import optimizers
from sklearn.model_selection import train_test_split
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [4]:
# Load the pre-transformed dataset and drop the 'Unnamed: 0' column
# (presumably a row index that was saved with the CSV -- confirm).
df = pd.read_csv('../Data/df_transformed.csv').drop(columns='Unnamed: 0')

In [5]:
# Show every column in the loaded frame so the feature lists below can be
# checked against what is actually available.
df.columns

Index(['LBXTC', 'RIAGENDR', 'RIDRETH3', 'RIDAGEYR', 'BMXWT', 'BMXHT', 'BMXBMI',
       'BPXPLS', 'BPXPULS', 'DR1TKCAL', 'DR1TPROT', 'DR1TCARB', 'DR1TSUGR',
       'DR1TFIBE', 'DR1TTFAT', 'DR1TSFAT', 'DR1TMFAT', 'DR1TPFAT', 'DR1TCHOL',
       'DR1TSODI', 'DR1TVD', 'DR1TCALC', 'DR1TIRON', 'DR1TPOTA', 'DR1_300',
       'ALQ120Q', 'ALQ120U', 'ALQ130', 'BPQ020', 'BPQ050A', 'BPQ080',
       'BPQ100D', 'DIQ010', 'MCQ170M', 'SMQ681', 'PAQ610', 'PAD615', 'PAQ625',
       'PAD630', 'PAQ655', 'PAD660', 'PAQ670', 'PAD675', 'Systolic',
       'Diastolic', 'outlier', 'MET_work', 'MET_rec', 'DR1TKCAL_t',
       'DR1TPROT_t', 'DR1TCARB_t', 'DR1TSUGR_t', 'DR1TFIBE_t', 'DR1TTFAT_t',
       'DR1TSFAT_t', 'DR1TMFAT_t', 'DR1TPFAT_t', 'DR1TCHOL_t', 'DR1TSODI_t',
       'DR1TVD_t', 'DR1TCALC_t', 'DR1TIRON_t', 'DR1TPOTA_t', 'avgALC',
       'log_LBXTC'],
      dtype='object')

In [16]:

# Columns treated as categorical predictors.
cat_variables = [
    'RIAGENDR', 'RIDRETH3', 'ALQ120U', 'BPQ020', 'BPQ050A', 'BPQ080',
    'BPQ100D', 'DIQ010', 'MCQ170M', 'SMQ681',
]
# Columns treated as numeric predictors.
numeric_variables = [
    'RIDAGEYR', 'BMXWT', 'BMXHT', 'BPXPLS', 'Systolic', 'Diastolic',
    'ALQ120Q', 'ALQ130', 'MET_rec',
    'DR1TKCAL_t', 'DR1TPROT_t', 'DR1TCARB_t', 'DR1TSUGR_t', 'DR1TFIBE_t',
    'DR1TTFAT_t', 'DR1TSFAT_t', 'DR1TMFAT_t', 'DR1TPFAT_t', 'DR1TCHOL_t',
    'DR1TSODI_t', 'DR1TVD_t', 'DR1TCALC_t', 'DR1TIRON_t', 'DR1TPOTA_t',
]
# Keep only the predictors plus the LBXTC target, then discard any row with
# a missing value in one of those columns. `df` itself is left untouched.
df2 = df[cat_variables + numeric_variables + ['LBXTC']].dropna()
df2.shape

(3551, 35)

In [12]:
# Split the cleaned frame into the predictor matrix and the regression target.
X = df2.drop(columns='LBXTC')
y = df2['LBXTC']

In [14]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=675
)

In [38]:
def build_model(hidden_units=1024, n_hidden_layers=3, dropout_rate=0.3,
                loss='mean_squared_error', optimizer='adam'):
    """Build and compile the fully connected regression network.

    The network is a stack of identical hidden blocks
    (Dense/ReLU -> Dropout -> BatchNormalization) followed by a single
    linear output unit. Called with no arguments it reproduces the
    original fixed architecture (3 blocks of 1024 units, dropout 0.3,
    MSE loss, Adam optimizer), so it can still be passed as a
    zero-argument ``build_fn``.

    Parameters
    ----------
    hidden_units : int
        Number of units in every hidden Dense layer.
    n_hidden_layers : int
        Number of hidden blocks stacked before the output layer.
    dropout_rate : float
        Dropout rate applied after each hidden Dense layer.
    loss : str or callable
        Loss passed to ``model.compile``.
    optimizer : str or optimizer instance
        Optimizer passed to ``model.compile``.

    Returns
    -------
    A compiled ``Sequential`` model with one output unit. No input shape
    is declared here, so the weights are created on the first call to fit.
    """
    layers = []
    for _ in range(n_hidden_layers):
        # Layer order inside a block is kept exactly as in the original
        # hand-written stack: Dense, then Dropout, then BatchNormalization.
        layers.extend([
            Dense(hidden_units, activation='relu'),
            Dropout(dropout_rate),
            BatchNormalization(),
        ])
    # Single linear unit: the model predicts one continuous value.
    layers.append(Dense(1))

    model = Sequential(layers)
    model.compile(loss=loss, optimizer=optimizer)
    return model

In [41]:
# Numeric columns are standardised; categorical columns are one-hot encoded,
# with categories unseen during fit encoded as all zeros instead of raising.
numeric_transformer = Pipeline(steps=[('scalar', StandardScaler())])
cat_transformer = Pipeline(steps=[('one_hot', OneHotEncoder(handle_unknown='ignore'))])

# OneHotEncoder produces a sparse matrix by default, and ColumnTransformer
# only densifies the stacked result when its overall density exceeds
# `sparse_threshold`. The Keras regressor is fed the result directly, so
# force a dense array rather than relying on how dense this particular
# dataset happens to be.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_variables),
        ('cat', cat_transformer, cat_variables),
    ],
    sparse_threshold=0,
)

# Preprocessing and the network live in one pipeline so that the scaler and
# encoder are re-fitted on each training fold during cross-validation.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('mlp', KerasRegressor(build_fn=build_model, epochs=25, batch_size=5, verbose=2)),
])

In [43]:
# Cross-validate the whole pipeline on the training split only. The scorer
# returns negated MAE, so values closer to zero are better.
results = cross_val_score(
    estimator=pipeline,
    X=X_train,
    y=y_train,
    scoring='neg_mean_absolute_error',
)

Epoch 1/25
455/455 - 4s - loss: 18809.0703
Epoch 2/25
455/455 - 4s - loss: 1871.0674
Epoch 3/25
455/455 - 4s - loss: 1797.2344
Epoch 4/25
455/455 - 4s - loss: 1723.2404
Epoch 5/25
455/455 - 4s - loss: 1738.3856
Epoch 6/25
455/455 - 4s - loss: 1731.2789
Epoch 7/25
455/455 - 4s - loss: 1675.0461
Epoch 8/25
455/455 - 4s - loss: 1683.8945
Epoch 9/25
455/455 - 4s - loss: 1713.8785
Epoch 10/25
455/455 - 4s - loss: 1680.7505
Epoch 11/25
455/455 - 4s - loss: 1624.0707
Epoch 12/25
455/455 - 4s - loss: 1626.6125
Epoch 13/25
455/455 - 4s - loss: 1657.3723
Epoch 14/25
455/455 - 4s - loss: 1600.5458
Epoch 15/25
455/455 - 4s - loss: 1618.4526
Epoch 16/25
455/455 - 4s - loss: 1611.5377
Epoch 17/25
455/455 - 4s - loss: 1611.4709
Epoch 18/25
455/455 - 4s - loss: 1603.5452
Epoch 19/25
455/455 - 4s - loss: 1579.4937
Epoch 20/25
455/455 - 4s - loss: 1580.7325
Epoch 21/25
455/455 - 4s - loss: 1549.8821
Epoch 22/25
455/455 - 4s - loss: 1594.8655
Epoch 23/25
455/455 - 4s - loss: 1555.9110
Epoch 24/25
455/455

In [44]:
# Mean and standard deviation of the per-fold scores (negated MAE).
print(np.mean(results), np.std(results))

-33.78652339082369 1.067943715341327


In [45]:
# Refit the preprocessing steps and the network on the full training split.
pipeline.fit(X=X_train, y=y_train)

Epoch 1/25
568/568 - 5s - loss: 15207.7607
Epoch 2/25
568/568 - 4s - loss: 1782.8706
Epoch 3/25
568/568 - 5s - loss: 1738.2010
Epoch 4/25
568/568 - 4s - loss: 1704.3073
Epoch 5/25
568/568 - 4s - loss: 1711.0558
Epoch 6/25
568/568 - 4s - loss: 1677.7723
Epoch 7/25
568/568 - 5s - loss: 1645.2993
Epoch 8/25
568/568 - 4s - loss: 1645.1549
Epoch 9/25
568/568 - 4s - loss: 1635.8140
Epoch 10/25
568/568 - 5s - loss: 1644.3810
Epoch 11/25
568/568 - 5s - loss: 1619.8826
Epoch 12/25
568/568 - 5s - loss: 1635.8679
Epoch 13/25
568/568 - 5s - loss: 1616.8405
Epoch 14/25
568/568 - 5s - loss: 1605.3528
Epoch 15/25
568/568 - 5s - loss: 1561.1179
Epoch 16/25
568/568 - 5s - loss: 1584.1115
Epoch 17/25
568/568 - 5s - loss: 1593.2266
Epoch 18/25
568/568 - 5s - loss: 1561.6509
Epoch 19/25
568/568 - 5s - loss: 1563.2583
Epoch 20/25
568/568 - 5s - loss: 1569.5507
Epoch 21/25
568/568 - 5s - loss: 1583.7426
Epoch 22/25
568/568 - 5s - loss: 1562.8809
Epoch 23/25
568/568 - 5s - loss: 1530.9633
Epoch 24/25
568/568

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scalar',
                                                                   StandardScaler())]),
                                                  ['RIDAGEYR', 'BMXWT', 'BMXHT',
                                                   'BPXPLS', 'Systolic',
                                                   'Diastolic', 'ALQ120Q',
                                                   'ALQ130', 'MET_rec',
                                                   'DR1TKCAL_t', 'DR1TPROT_t',
                                                   'DR1TCARB_t', 'DR1TSUGR_t',
                                                   'DR1TFIBE_t', 'DR1TTFAT_t',
                                                   'DR1TSFAT_t', 'DR1TMFAT_t',
                                                   'DR1TPFAT_t', 'DR1TCHOL_t',
                                                 

In [47]:
# Predict on the same rows the pipeline was fitted on (the training split).
y_preds = pipeline.predict(X=X_train)

568/568 - 1s


In [48]:
# In-sample (training) MAE, for comparison with the cross-validated score.
mean_absolute_error(y_true=y_train, y_pred=y_preds)

32.00487282981335